# RAG System Builder Skill

## Overview
This skill creates complete RAG (Retrieval-Augmented Generation) systems that combine semantic search with LLM-powered Q&A. Users ask natural-language questions and receive answers grounded in your document collection.
## Quick Start

```python
from sentence_transformers import SentenceTransformer
import anthropic

# Setup
model = SentenceTransformer('all-MiniLM-L6-v2')
client = anthropic.Anthropic()

# Retrieve context (simplified)
query = "What are the safety requirements?"
query_embedding = model.encode(query, normalize_embeddings=True)
# ... search for similar chunks and assemble `context` ...

# Generate answer
response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"}]
)
print(response.content[0].text)
```
## When to Use

- Building AI assistants for technical documentation
- Creating Q&A systems for standards libraries
- Developing chatbots with domain expertise
- Enabling natural language queries over knowledge bases
- Adding AI-powered search to existing document systems
## Architecture

```
User Question
      |
      v
+------------------+
| 1. Embed Query   |  sentence-transformers
+--------+---------+
         v
+------------------+
| 2. Vector Search |  Cosine similarity
+--------+---------+
         v
+------------------+
| 3. Retrieve Top  |  Top-K relevant chunks
+--------+---------+
         v
+------------------+
| 4. Build Prompt  |  Context + Question
+--------+---------+
         v
+------------------+
| 5. LLM Answer    |  Claude/OpenAI
+------------------+
```
## Prerequisites

- Knowledge base with extracted text (see `knowledge-base-builder`)
- Vector embeddings for semantic search (see `semantic-search-setup`)
- API key: `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`
## Implementation

### Step 1: Vector Embeddings Table

```python
import sqlite3
import numpy as np

def setup_embeddings_table(db_path):
    conn = sqlite3.connect(db_path, timeout=30)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS embeddings (
            id INTEGER PRIMARY KEY,
            chunk_id INTEGER UNIQUE,
            embedding BLOB,
            model_name TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (chunk_id) REFERENCES chunks(id)
        )
    ''')
    conn.commit()
    return conn
```
### Step 2: Generate Embeddings

```python
from sentence_transformers import SentenceTransformer
import numpy as np

class EmbeddingGenerator:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        self.dimension = 384  # all-MiniLM-L6-v2 output dimension

    def embed_text(self, text):
        """Generate embedding for text."""
        embedding = self.model.encode(text, normalize_embeddings=True)
        return embedding.astype(np.float32)

    def embed_batch(self, texts, batch_size=100):
        """Generate embeddings for multiple texts."""
        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            normalize_embeddings=True,
            show_progress_bar=True
        )
        return embeddings.astype(np.float32)
```
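To connect Steps 1 and 2, each vector can be serialized to a float32 BLOB and written into the `embeddings` table. A minimal sketch, assuming the `chunks(id, chunk_text)` table produced by `knowledge-base-builder`; `index_chunks` is an illustrative name, not part of the skill's API:

```python
import sqlite3

def index_chunks(db_path, generator):
    """Embed every chunk and store each vector as a float32 BLOB (illustrative helper)."""
    conn = sqlite3.connect(db_path, timeout=30)
    cursor = conn.cursor()
    cursor.execute('SELECT id, chunk_text FROM chunks')
    rows = cursor.fetchall()

    chunk_ids = [row[0] for row in rows]
    embeddings = generator.embed_batch([row[1] for row in rows])

    cursor.executemany(
        'INSERT OR REPLACE INTO embeddings (chunk_id, embedding, model_name) VALUES (?, ?, ?)',
        [(cid, emb.tobytes(), 'all-MiniLM-L6-v2')
         for cid, emb in zip(chunk_ids, embeddings)]
    )
    conn.commit()
    conn.close()
```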
### Step 3: Semantic Search

```python
def semantic_search(db_path, query, model, top_k=5):
    """Find most similar chunks to query."""
    conn = sqlite3.connect(db_path, timeout=30)
    cursor = conn.cursor()

    # Embed query
    query_embedding = model.embed_text(query)

    # Get all embeddings
    cursor.execute('''
        SELECT e.chunk_id, e.embedding, c.chunk_text, d.filename
        FROM embeddings e
        JOIN chunks c ON e.chunk_id = c.id
        JOIN documents d ON c.doc_id = d.id
    ''')

    results = []
    for chunk_id, emb_blob, text, filename in cursor.fetchall():
        embedding = np.frombuffer(emb_blob, dtype=np.float32)
        score = np.dot(query_embedding, embedding)  # Cosine similarity (vectors are normalized)
        results.append({
            'chunk_id': chunk_id,
            'score': float(score),
            'text': text,
            'filename': filename
        })
    conn.close()

    # Sort by similarity
    results.sort(key=lambda x: x['score'], reverse=True)
    return results[:top_k]
```
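A quick way to sanity-check retrieval on its own; the database path and query string here are placeholders:

```python
model = EmbeddingGenerator()
for hit in semantic_search('knowledge_base.db', 'What are the safety requirements?', model, top_k=3):
    print(f"{hit['score']:.3f}  {hit['filename']}")
```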
### Step 4: RAG Query Engine

```python
import anthropic
import openai

class RAGQueryEngine:
    def __init__(self, db_path, embedding_model):
        self.db_path = db_path
        self.model = embedding_model

    def query(self, question, top_k=5, provider='anthropic'):
        """Answer question using RAG."""
        # 1. Retrieve relevant context
        results = semantic_search(self.db_path, question, self.model, top_k)

        # 2. Build context string
        context = "\n\n---\n\n".join([
            f"Source: {r['filename']}\n{r['text']}"
            for r in results
        ])

        # 3. Build prompt
        prompt = f"""Based on the following technical documents, answer the question.
If the answer is not in the documents, say so.

DOCUMENTS: {context}

QUESTION: {question}

ANSWER:"""

        # 4. Get LLM response
        if provider == 'anthropic':
            return self._query_claude(prompt), results
        else:
            return self._query_openai(prompt), results

    def _query_claude(self, prompt):
        client = anthropic.Anthropic()
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content[0].text

    def _query_openai(self, prompt):
        client = openai.OpenAI()
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
```
### Step 5: CLI Interface

```python
#!/usr/bin/env python3
"""RAG Query CLI - Ask questions about your documents."""

import argparse
import os

DB_PATH = 'knowledge_base.db'  # assumption: point this at the knowledge base built earlier

def main():
    parser = argparse.ArgumentParser(description='RAG Q&A System')
    parser.add_argument('question', nargs='?', help='Question to ask')
    parser.add_argument('-i', '--interactive', action='store_true')
    parser.add_argument('-k', '--top-k', type=int, default=5)
    parser.add_argument('--provider', choices=['anthropic', 'openai'], default='anthropic')
    args = parser.parse_args()

    engine = RAGQueryEngine(DB_PATH, EmbeddingGenerator())

    if args.interactive:
        print("RAG Q&A System (type 'quit' to exit)")
        while True:
            question = input("\nQuestion: ").strip()
            if question.lower() == 'quit':
                break
            answer, sources = engine.query(question, args.top_k, args.provider)
            print(f"\nAnswer: {answer}")
            print(f"\nSources: {[s['filename'] for s in sources]}")
    else:
        answer, sources = engine.query(args.question, args.top_k, args.provider)
        print(f"Answer: {answer}")
        print("\nSources:")
        for s in sources:
            print(f"  - {s['filename']} (score: {s['score']:.3f})")

if __name__ == '__main__':
    main()
```
## Prompt Engineering Tips

### System Prompt Template

```python
SYSTEM_PROMPT = """You are a technical expert assistant. Your role is to:
- Answer questions based ONLY on the provided documents
- Cite specific sources when possible
- Acknowledge when information is not available
- Be precise with technical terminology
- Provide practical, actionable answers

If asked about topics not covered in the documents, say:
"I don't have information about that in the available documents."
"""
```
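With the Anthropic SDK the system prompt goes in the top-level `system` parameter rather than in the `messages` list (OpenAI chat completions instead take it as a message with role `"system"`). A minimal sketch, with `context` and `question` assumed from the pipeline above:

```python
client = anthropic.Anthropic()
response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,
    system=SYSTEM_PROMPT,  # system instructions are a top-level parameter for Anthropic
    messages=[{"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}]
)
print(response.content[0].text)
```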
### Multi-Turn Conversations

```python
def query_with_history(self, question, history=None):
    """Support follow-up questions."""
    history = history or []
    context = self.get_relevant_context(question)

    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    # Add conversation history
    for h in history[-4:]:  # Last 4 turns
        messages.append({"role": "user", "content": h['question']})
        messages.append({"role": "assistant", "content": h['answer']})

    # Add current question with context
    messages.append({
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {question}"
    })

    return self.llm.query(messages)
```
## Execution Checklist

- Set up knowledge base with text extraction
- Generate vector embeddings for all chunks
- Configure API keys (`ANTHROPIC_API_KEY` or `OPENAI_API_KEY`)
- Test semantic search independently
- Build and test RAG pipeline end-to-end
- Tune `top_k` parameter for answer quality
- Add source attribution to responses
- Implement error handling for API failures
## Error Handling

### Common Errors

**Error: `anthropic.APIError` (rate limit)**

- Cause: Too many API requests
- Solution: Add exponential backoff retry logic (a sketch follows this list)
**Error: Empty search results**

- Cause: No relevant documents in knowledge base
- Solution: Expand search with lower similarity threshold

**Error: Context too long**

- Cause: Top-k chunks exceed model context window
- Solution: Reduce `top_k` or chunk size

**Error: API key not found**

- Cause: Environment variable not set
- Solution: Export `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`

**Error: Low quality answers**

- Cause: Poor retrieval or insufficient context
- Solution: Tune chunk size, overlap, and `top_k` parameters
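For the rate-limit case, one possible backoff wrapper around the Claude call; the retry count and delays are illustrative, not required by the SDK:

```python
import time
import anthropic

def query_claude_with_retry(client, prompt, max_retries=5):
    """Retry on rate limits with exponential backoff (illustrative)."""
    for attempt in range(max_retries):
        try:
            response = client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=1024,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.content[0].text
        except anthropic.RateLimitError:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # 1s, 2s, 4s, ... between attempts
```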
## Metrics

| Metric | Typical Value |
|---|---|
| Query latency (end-to-end) | 2-5 seconds |
| Retrieval time | <100 ms |
| LLM response time | 1-4 seconds |
| Token usage per query | 500-2000 tokens |
| Answer relevance | 85-95% with good tuning |
## Performance Optimization

**Cache Embeddings** (a fuller sketch follows at the end of this section)

```python
# Load all embeddings into memory at startup
self.embedding_cache = self._load_all_embeddings()
```

**Use FAISS for Large Collections**

```python
import faiss

# Build FAISS index for fast similarity search
index = faiss.IndexFlatIP(dimension)  # Inner product for cosine sim
index.add(embeddings)
```

**Batch Queries**

```python
# Process multiple questions efficiently
questions = ["Q1", "Q2", "Q3"]
query_embeddings = model.embed_batch(questions)
```
## Best Practices

- **Chunk size matters** - 500-1500 chars optimal for context
- **Retrieve enough context** - top_k=5-10 for comprehensive answers
- **Include source attribution** - Always show which documents were used
- **Handle edge cases** - Empty results, API errors, timeouts
- **Monitor token usage** - Track costs and optimize prompts (see the sketch below)
- **Use SQLite timeout** - `timeout=30` for concurrent access
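Both SDKs report token counts on their response objects, so per-query usage can be logged directly; a minimal sketch for the Anthropic response, with `client` and `prompt` assumed from the query engine above:

```python
response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": prompt}]
)
# The Message object carries token counts useful for cost tracking
print(f"input tokens: {response.usage.input_tokens}, "
      f"output tokens: {response.usage.output_tokens}")
```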
## Example Usage

```bash
# Single question
./rag "What are the fatigue design requirements for risers?"

# Interactive mode
./rag -i

# With OpenAI
./rag --provider openai "Explain API 2RD requirements"
```
## Advanced: Hybrid Search (BM25 + Vector)
Combine keyword and semantic search for better results:
```python
import sqlite3
from rank_bm25 import BM25Okapi
import numpy as np

class HybridSearch:
    def __init__(self, db_path, embedding_model):
        self.db_path = db_path
        self.model = embedding_model
        self._build_bm25_index()

    def _build_bm25_index(self):
        """Build BM25 index from chunks."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute('SELECT id, chunk_text FROM chunks')
        self.chunk_ids = []
        tokenized_corpus = []
        for chunk_id, text in cursor.fetchall():
            self.chunk_ids.append(chunk_id)
            tokenized_corpus.append(text.lower().split())
        self.bm25 = BM25Okapi(tokenized_corpus)
        conn.close()

    def search(self, query, top_k=10, alpha=0.5):
        """Hybrid search with alpha weighting.

        alpha=0.0: Pure BM25 (keyword)
        alpha=1.0: Pure vector (semantic)
        alpha=0.5: Balanced hybrid
        """
        # BM25 scores
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        bm25_scores = bm25_scores / (bm25_scores.max() + 1e-6)  # Normalize

        # Vector scores
        vector_results = semantic_search(self.db_path, query, self.model, top_k=len(self.chunk_ids))
        vector_scores = {r['chunk_id']: r['score'] for r in vector_results}

        # Combine scores
        combined = []
        for i, chunk_id in enumerate(self.chunk_ids):
            score = (1 - alpha) * bm25_scores[i] + alpha * vector_scores.get(chunk_id, 0)
            combined.append((chunk_id, score))

        combined.sort(key=lambda x: x[1], reverse=True)
        return combined[:top_k]
```
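A hypothetical usage, reusing the embedding generator and database path from earlier:

```python
hybrid = HybridSearch('knowledge_base.db', EmbeddingGenerator())
for chunk_id, score in hybrid.search("riser fatigue design requirements", top_k=5, alpha=0.5):
    print(chunk_id, round(score, 3))
```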
## Advanced: Reranking
Add a reranking step for improved precision:
```python
from sentence_transformers import CrossEncoder

class Reranker:
    def __init__(self, model_name='cross-encoder/ms-marco-MiniLM-L-6-v2'):
        self.model = CrossEncoder(model_name)

    def rerank(self, query, candidates, top_k=5):
        """Rerank candidates using cross-encoder."""
        pairs = [(query, c['text']) for c in candidates]
        scores = self.model.predict(pairs)
        for i, score in enumerate(scores):
            candidates[i]['rerank_score'] = float(score)
        reranked = sorted(candidates, key=lambda x: x['rerank_score'], reverse=True)
        return reranked[:top_k]


# Usage in RAG pipeline (intended as a method on the query engine)
def query_with_rerank(self, question, initial_k=20, final_k=5):
    # First pass: retrieve more candidates
    candidates = semantic_search(self.db_path, question, self.model, top_k=initial_k)

    # Second pass: rerank for precision
    reranked = self.reranker.rerank(question, candidates, top_k=final_k)
    return reranked
```
## Streaming Responses
For better UX with long answers:
```python
def query_streaming(self, question, top_k=5):
    """Stream RAG response for real-time display."""
    context = self.get_context(question, top_k)
    prompt = self.build_prompt(context, question)

    # Anthropic streaming
    with anthropic.Anthropic().messages.stream(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}]
    ) as stream:
        for text in stream.text_stream:
            yield text
```
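Consuming the generator, for example from the CLI; this assumes `query_streaming` has been added as a method on `RAGQueryEngine`:

```python
for token in engine.query_streaming("What are the safety requirements?"):
    print(token, end="", flush=True)
print()
```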
## Related Skills

- `knowledge-base-builder` - Build the document database first
- `semantic-search-setup` - Generate vector embeddings
- `pdf-text-extractor` - Extract text from PDFs
- `document-rag-pipeline` - Complete end-to-end pipeline
## Dependencies

```bash
pip install sentence-transformers anthropic openai numpy
```

Optional:

- `faiss-cpu` (for large-scale vector search)
- `rank-bm25` (for hybrid search)
## Version History

- **1.2.0** (2026-01-02): Added Quick Start, Execution Checklist, Error Handling, Metrics sections; updated frontmatter with version, category, related_skills
- **1.1.0** (2025-12-30): Added hybrid search (BM25 + vector), reranking, streaming responses
- **1.0.0** (2025-10-15): Initial release with basic RAG implementation