vector-search

Vector Search for Construction

Safety Notice

This listing is imported from skills.sh public index metadata. Review upstream SKILL.md and repository scripts before running.

Copy this and send it to your AI assistant to learn

Install skill "vector-search" with this command: npx skills add datadrivenconstruction/ddc_skills_for_ai_agents_in_construction/datadrivenconstruction-ddc-skills-for-ai-agents-in-construction-vector-search

Vector Search for Construction

Overview

Based on DDC methodology (Chapter 4.4), this skill implements semantic vector search for construction data. Move beyond keyword matching - find documents and data by meaning, not just words.

Book Reference: "Современные технологии работы с данными" / "Modern Data Technologies"

"Векторные базы данных позволяют находить семантически похожие документы, даже если они используют разную терминологию." — DDC Book, Chapter 4.4

Quick Start

from sentence_transformers import SentenceTransformer from qdrant_client import QdrantClient from qdrant_client.models import VectorParams, Distance, PointStruct

Initialize embedding model

model = SentenceTransformer('all-MiniLM-L6-v2')

Create Qdrant client (in-memory for demo)

client = QdrantClient(":memory:")

Create collection

client.create_collection( collection_name="construction_docs", vectors_config=VectorParams(size=384, distance=Distance.COSINE) )

Sample construction documents

documents = [ "Concrete mix design for C30 grade with water-cement ratio 0.45", "Steel reinforcement specifications for structural columns", "Waterproofing membrane installation for basement walls", "Fire-rated door specifications for escape routes" ]

Index documents

for idx, doc in enumerate(documents): embedding = model.encode(doc).tolist() client.upsert( collection_name="construction_docs", points=[PointStruct(id=idx, vector=embedding, payload={"text": doc})] )

Search

query = "basement moisture protection" query_vector = model.encode(query).tolist() results = client.search( collection_name="construction_docs", query_vector=query_vector, limit=3 )

for result in results: print(f"Score: {result.score:.3f} - {result.payload['text']}")

Vector Database Setup

Qdrant Setup

from qdrant_client import QdrantClient from qdrant_client.models import ( VectorParams, Distance, PointStruct, Filter, FieldCondition, MatchValue ) import uuid

class ConstructionVectorDB: """Vector database for construction documents and data"""

def __init__(self, host="localhost", port=6333, in_memory=False):
    if in_memory:
        self.client = QdrantClient(":memory:")
    else:
        self.client = QdrantClient(host=host, port=port)

    self.model = SentenceTransformer('all-MiniLM-L6-v2')
    self.collections = {}

def create_collection(self, name, description=None):
    """Create a new collection"""
    self.client.create_collection(
        collection_name=name,
        vectors_config=VectorParams(
            size=384,  # Dimension for all-MiniLM-L6-v2
            distance=Distance.COSINE
        )
    )
    self.collections[name] = description

def index_documents(self, collection_name, documents, metadata=None):
    """Index documents with embeddings"""
    points = []

    for idx, doc in enumerate(documents):
        embedding = self.model.encode(doc).tolist()

        payload = {"text": doc}
        if metadata and idx < len(metadata):
            payload.update(metadata[idx])

        points.append(PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload=payload
        ))

    self.client.upsert(
        collection_name=collection_name,
        points=points
    )

    return len(points)

def search(self, collection_name, query, limit=5, filters=None):
    """Semantic search"""
    query_vector = self.model.encode(query).tolist()

    search_filter = None
    if filters:
        conditions = [
            FieldCondition(key=k, match=MatchValue(value=v))
            for k, v in filters.items()
        ]
        search_filter = Filter(must=conditions)

    results = self.client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=limit,
        query_filter=search_filter
    )

    return [
        {
            'score': r.score,
            'text': r.payload.get('text'),
            'metadata': {k: v for k, v in r.payload.items() if k != 'text'}
        }
        for r in results
    ]

def hybrid_search(self, collection_name, query, keyword_filter=None, limit=5):
    """Combine semantic search with keyword filtering"""
    # First semantic search
    semantic_results = self.search(collection_name, query, limit=limit*2)

    # Then keyword filter if provided
    if keyword_filter:
        filtered = [
            r for r in semantic_results
            if keyword_filter.lower() in r['text'].lower()
        ]
        return filtered[:limit]

    return semantic_results[:limit]

ChromaDB Alternative

import chromadb from chromadb.utils import embedding_functions

class ChromaConstructionDB: """ChromaDB-based vector search for construction"""

def __init__(self, persist_directory=None):
    if persist_directory:
        self.client = chromadb.PersistentClient(path=persist_directory)
    else:
        self.client = chromadb.Client()

    self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )

def create_collection(self, name):
    """Create or get collection"""
    return self.client.get_or_create_collection(
        name=name,
        embedding_function=self.embedding_fn
    )

def index_specifications(self, collection_name, specs):
    """Index construction specifications"""
    collection = self.create_collection(collection_name)

    ids = [f"spec_{i}" for i in range(len(specs))]
    documents = [s['text'] for s in specs]
    metadatas = [{k: v for k, v in s.items() if k != 'text'} for s in specs]

    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas
    )

    return len(specs)

def search(self, collection_name, query, n_results=5, where=None):
    """Search specifications"""
    collection = self.create_collection(collection_name)

    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        where=where
    )

    return [
        {
            'id': results['ids'][0][i],
            'text': results['documents'][0][i],
            'metadata': results['metadatas'][0][i] if results['metadatas'] else {},
            'distance': results['distances'][0][i] if results['distances'] else None
        }
        for i in range(len(results['ids'][0]))
    ]

Construction-Specific Applications

Specification Search

class SpecificationSearchEngine: """Search engine for construction specifications"""

def __init__(self, db: ConstructionVectorDB):
    self.db = db
    self.collection = "specifications"

def index_specifications(self, specs_df):
    """Index specifications from DataFrame"""
    self.db.create_collection(self.collection, "Construction specifications")

    documents = specs_df['description'].tolist()
    metadata = specs_df.drop('description', axis=1).to_dict('records')

    return self.db.index_documents(self.collection, documents, metadata)

def find_similar_specs(self, query, category=None, limit=5):
    """Find similar specifications"""
    filters = {'category': category} if category else None
    return self.db.search(self.collection, query, limit=limit, filters=filters)

def find_related_materials(self, material_name, limit=10):
    """Find specifications related to a material"""
    query = f"specifications for {material_name} materials"
    return self.db.search(self.collection, query, limit=limit)

def search_by_requirement(self, requirement, limit=5):
    """Search by performance requirement"""
    query = f"specification meeting requirement: {requirement}"
    return self.db.search(self.collection, query, limit=limit)

Standards and Codes Search

class StandardsSearchEngine: """Search engine for building standards and codes"""

def __init__(self, db: ConstructionVectorDB):
    self.db = db
    self.collection = "standards"

def index_standards(self, standards):
    """Index building standards

    Args:
        standards: List of dicts with 'code', 'title', 'section', 'text'
    """
    self.db.create_collection(self.collection, "Building standards and codes")

    documents = [s['text'] for s in standards]
    metadata = [{k: v for k, v in s.items() if k != 'text'} for s in standards]

    return self.db.index_documents(self.collection, documents, metadata)

def find_applicable_standards(self, context, limit=5):
    """Find standards applicable to a given context"""
    return self.db.search(self.collection, context, limit=limit)

def search_fire_codes(self, query):
    """Search fire safety codes"""
    full_query = f"fire safety code requirement: {query}"
    return self.db.search(
        self.collection,
        full_query,
        limit=10,
        filters={'category': 'fire_safety'}
    )

def search_accessibility(self, query):
    """Search accessibility standards (ADA, etc.)"""
    full_query = f"accessibility requirement: {query}"
    return self.db.search(
        self.collection,
        full_query,
        limit=10,
        filters={'category': 'accessibility'}
    )

Work Item Search (OpenConstructionEstimate)

class WorkItemSearchEngine: """Search engine for construction work items and unit prices"""

def __init__(self, db: ConstructionVectorDB):
    self.db = db
    self.collection = "work_items"

def index_work_items(self, items_df):
    """Index work items database

    Args:
        items_df: DataFrame with columns:
            - code: Work item code
            - description: Work description
            - unit: Unit of measure
            - unit_price: Price per unit
            - category: Work category
    """
    self.db.create_collection(self.collection, "Construction work items")

    documents = items_df['description'].tolist()
    metadata = items_df.drop('description', axis=1).to_dict('records')

    return self.db.index_documents(self.collection, documents, metadata)

def find_similar_work(self, description, limit=10):
    """Find similar work items by description"""
    results = self.db.search(self.collection, description, limit=limit)

    return [
        {
            'description': r['text'],
            'code': r['metadata'].get('code'),
            'unit': r['metadata'].get('unit'),
            'unit_price': r['metadata'].get('unit_price'),
            'similarity': r['score']
        }
        for r in results
    ]

def estimate_from_description(self, work_description, quantity):
    """Get cost estimate from work description"""
    matches = self.find_similar_work(work_description, limit=3)

    if not matches:
        return None

    best_match = matches[0]
    unit_price = best_match.get('unit_price', 0)

    return {
        'matched_item': best_match['description'],
        'code': best_match['code'],
        'unit': best_match['unit'],
        'unit_price': unit_price,
        'quantity': quantity,
        'total_cost': unit_price * quantity,
        'similarity_score': best_match['similarity']
    }

RAG for Construction

Retrieval Augmented Generation

from openai import OpenAI

class ConstructionRAG: """RAG system for construction queries"""

def __init__(self, vector_db: ConstructionVectorDB, openai_client=None):
    self.db = vector_db
    self.llm = openai_client or OpenAI()

def answer_query(self, query, collection, n_context=5):
    """Answer query using RAG"""
    # Retrieve relevant context
    context_docs = self.db.search(collection, query, limit=n_context)

    # Build context string
    context = "\n\n".join([
        f"Document {i+1}:\n{doc['text']}"
        for i, doc in enumerate(context_docs)
    ])

    # Generate answer
    prompt = f"""Based on the following construction documents, answer the query.

Context: {context}

Query: {query}

Provide a detailed, accurate answer based only on the provided context. If the context doesn't contain enough information, say so."""

    response = self.llm.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a construction industry expert."},
            {"role": "user", "content": prompt}
        ]
    )

    return {
        'answer': response.choices[0].message.content,
        'sources': context_docs,
        'query': query
    }

def summarize_specifications(self, topic, collection="specifications"):
    """Summarize specifications on a topic"""
    docs = self.db.search(collection, topic, limit=10)

    context = "\n".join([doc['text'] for doc in docs])

    prompt = f"""Summarize the following construction specifications related to: {topic}

Specifications: {context}

Provide a structured summary with key requirements and recommendations."""

    response = self.llm.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )

    return {
        'summary': response.choices[0].message.content,
        'source_count': len(docs)
    }

Document Indexing Pipeline

import os import pdfplumber from typing import List, Dict

class DocumentIndexingPipeline: """Pipeline for indexing construction documents"""

def __init__(self, vector_db: ConstructionVectorDB):
    self.db = vector_db
    self.chunk_size = 500
    self.chunk_overlap = 50

def chunk_text(self, text: str) -> List[str]:
    """Split text into chunks"""
    words = text.split()
    chunks = []

    for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
        chunk = ' '.join(words[i:i + self.chunk_size])
        if len(chunk) > 50:  # Skip very small chunks
            chunks.append(chunk)

    return chunks

def extract_pdf_text(self, pdf_path: str) -> str:
    """Extract text from PDF"""
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

def index_document(self, file_path: str, collection: str, metadata: Dict = None):
    """Index a single document"""
    # Extract text
    if file_path.endswith('.pdf'):
        text = self.extract_pdf_text(file_path)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    # Chunk text
    chunks = self.chunk_text(text)

    # Build metadata for each chunk
    base_metadata = metadata or {}
    base_metadata['source_file'] = os.path.basename(file_path)

    chunk_metadata = [
        {**base_metadata, 'chunk_index': i}
        for i in range(len(chunks))
    ]

    # Index
    return self.db.index_documents(collection, chunks, chunk_metadata)

def index_directory(self, directory: str, collection: str, extensions=None):
    """Index all documents in a directory"""
    if extensions is None:
        extensions = ['.pdf', '.txt', '.md']

    total_indexed = 0

    for root, _, files in os.walk(directory):
        for file in files:
            if any(file.endswith(ext) for ext in extensions):
                file_path = os.path.join(root, file)
                try:
                    count = self.index_document(file_path, collection)
                    total_indexed += count
                    print(f"Indexed: {file} ({count} chunks)")
                except Exception as e:
                    print(f"Error indexing {file}: {e}")

    return total_indexed

Quick Reference

Component Description Use Case

Qdrant High-performance vector DB Production deployments

ChromaDB Simple embedded vector DB Development/testing

SentenceTransformers Embedding models Text to vectors

RAG Retrieval + Generation Q&A over documents

Embedding Models for Construction

Recommended models by use case

EMBEDDING_MODELS = { 'general': 'all-MiniLM-L6-v2', # Fast, 384 dim 'multilingual': 'paraphrase-multilingual-MiniLM-L12-v2', # Multi-language 'quality': 'all-mpnet-base-v2', # Better quality, 768 dim 'construction': 'allenai/scibert_scivocab_uncased' # Technical texts }

Resources

Next Steps

  • See llm-data-automation for LLM integration

  • See document-classification-nlp for document categorization

  • See rag-construction for RAG applications

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.

Related Skills

Related by shared tags or category signals.

Automation

cad-to-data

No summary provided by upstream source.

Repository SourceNeeds Review
Automation

drawing-analyzer

No summary provided by upstream source.

Repository SourceNeeds Review
Automation

dwg-to-excel

No summary provided by upstream source.

Repository SourceNeeds Review
Automation

daily-progress-report

No summary provided by upstream source.

Repository SourceNeeds Review