
Embedding Comparison Skill

Evaluate and compare different embedding models on your actual data.

Overview

The default all-MiniLM-L6-v2 model is a good starting point, but may not be optimal for your specific content. This skill helps you:

  • Benchmark different models on your data

  • Compare retrieval quality

  • Make informed model selection decisions

Why Compare Models?

Factor              Impact
Domain vocabulary   Technical jargon may need specialized models
Document length     Some models handle long text better
Query style         Conversational vs keyword queries
Speed requirements  Larger models = better quality but slower
Memory constraints  Some models need significant RAM

Candidate Models

General Purpose

Model              Dimensions  Speed   Quality  Size
all-MiniLM-L6-v2   384         Fast    Good     80MB
all-MiniLM-L12-v2  384         Medium  Better   120MB
all-mpnet-base-v2  768         Slow    Best     420MB

Specialized

Model                      Best For             Dimensions
multi-qa-MiniLM-L6-cos-v1  Question answering   384
msmarco-MiniLM-L6-cos-v5   Search/retrieval     384
paraphrase-MiniLM-L6-v2    Semantic similarity  384

Code-Focused

Model                                                       Best For     Source
krlvi/sentence-t5-base-nlpl-code_search_net                 Code search  HuggingFace
flax-sentence-embeddings/st-codesearch-distilroberta-base   Code + docs  HuggingFace
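
Any of these models can be loaded by name with sentence-transformers, which is how the benchmark below swaps them in. A minimal sketch, assuming the sentence-transformers package is installed and the model name resolves on HuggingFace:

from sentence_transformers import SentenceTransformer

# Load a candidate model by name (downloads on first use)
model = SentenceTransformer("all-MiniLM-L12-v2")

# Encode a document; the embedding width should match the Dimensions column above
embeddings = model.encode(["Qdrant is a vector database for AI applications."])
print(embeddings.shape)  # (1, 384) for the MiniLM family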

Benchmarking Framework

Step 1: Create Test Dataset

#!/usr/bin/env python3
"""Create a test dataset for embedding comparison."""

import json
from typing import Dict, List


def create_test_dataset(
    documents: List[str],
    queries: List[str],
    relevance: Dict[str, List[int]]
) -> Dict:
    """Create a test dataset.

    Args:
        documents: List of documents to search
        queries: List of test queries
        relevance: Dict mapping query index to relevant document indices

    Returns:
        Test dataset dict
    """
    return {
        "documents": documents,
        "queries": queries,
        "relevance": relevance
    }

Example: Create test dataset from your actual content

def create_from_qdrant(collection_name: str, sample_size: int = 50) -> Dict:
    """Create a test dataset from an existing Qdrant collection."""
    from qdrant_client import QdrantClient

    client = QdrantClient(url="http://localhost:6333")

    # Scroll through the collection to get samples
    results = client.scroll(
        collection_name=collection_name,
        limit=sample_size,
        with_payload=True
    )

    documents = [p.payload.get("content", "") for p in results[0]]

    # You'll need to manually create queries and mark relevance;
    # this is the ground truth the benchmark scores against.
    return {
        "documents": documents,
        "queries": [],   # Fill in manually
        "relevance": {}  # Fill in manually
    }

Example test dataset

EXAMPLE_DATASET = {
    "documents": [
        "Python is a high-level programming language known for readability.",
        "FastAPI is a modern web framework for building APIs with Python.",
        "Qdrant is a vector database for AI applications.",
        "Docker containers provide isolated runtime environments.",
        "REST APIs use HTTP methods for client-server communication.",
    ],
    "queries": [
        "How do I build a web API?",
        "What is a vector database?",
        "How do I containerize my application?",
    ],
    "relevance": {
        "0": [1, 4],  # Query 0 is relevant to docs 1 and 4
        "1": [2],     # Query 1 is relevant to doc 2
        "2": [3],     # Query 2 is relevant to doc 3
    }
}

if __name__ == "__main__":
    with open("test_dataset.json", "w") as f:
        json.dump(EXAMPLE_DATASET, f, indent=2)
    print("Created test_dataset.json")
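
Before running the benchmark, it can help to sanity-check that every relevance entry points at a valid query and document index. A small sketch against the test_dataset.json written above:

import json

with open("test_dataset.json") as f:
    ds = json.load(f)

n_docs = len(ds["documents"])
for q_idx, doc_ids in ds["relevance"].items():
    assert int(q_idx) < len(ds["queries"]), f"query index {q_idx} out of range"
    assert all(0 <= d < n_docs for d in doc_ids), f"bad doc index for query {q_idx}"
print("Relevance judgments look consistent.")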

Step 2: Benchmark Script

#!/usr/bin/env python3
"""Benchmark embedding models on test dataset."""

import json
import time
from typing import Dict, List

import numpy as np
from sentence_transformers import SentenceTransformer

Models to compare

MODELS = [
    "all-MiniLM-L6-v2",
    "all-MiniLM-L12-v2",
    "all-mpnet-base-v2",
    "multi-qa-MiniLM-L6-cos-v1",
    "msmarco-MiniLM-L6-cos-v5",
]


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Compute cosine similarity between two vectors."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def compute_metrics(
    model: SentenceTransformer,
    documents: List[str],
    queries: List[str],
    relevance: Dict[str, List[int]],
    k: int = 3
) -> Dict:
    """Compute retrieval metrics for a model.

    Metrics:
    - Precision@k: Fraction of top-k results that are relevant
    - Recall@k: Fraction of relevant docs found in top-k
    - MRR: Mean Reciprocal Rank
    """
    # Encode documents
    doc_embeddings = model.encode(documents)

    precisions = []
    recalls = []
    reciprocal_ranks = []

    for q_idx, query in enumerate(queries):
        q_key = str(q_idx)
        if q_key not in relevance:
            continue

        relevant_docs = set(relevance[q_key])

        # Encode query and compute similarities
        q_embedding = model.encode([query])[0]
        similarities = [
            cosine_similarity(q_embedding, doc_emb)
            for doc_emb in doc_embeddings
        ]

        # Get top-k results
        top_k_indices = np.argsort(similarities)[-k:][::-1]

        # Precision@k
        hits = len(set(top_k_indices) & relevant_docs)
        precisions.append(hits / k)

        # Recall@k
        recalls.append(hits / len(relevant_docs))

        # MRR (reciprocal rank of first relevant result)
        for rank, idx in enumerate(top_k_indices, 1):
            if idx in relevant_docs:
                reciprocal_ranks.append(1 / rank)
                break
        else:
            reciprocal_ranks.append(0)

    return {
        "precision_at_k": np.mean(precisions),
        "recall_at_k": np.mean(recalls),
        "mrr": np.mean(reciprocal_ranks)
    }

def benchmark_model(model_name: str, dataset: Dict) -> Dict:
    """Benchmark a single model."""
    print(f"\nBenchmarking: {model_name}")

    # Load model (time it)
    load_start = time.perf_counter()
    model = SentenceTransformer(model_name)
    load_time = time.perf_counter() - load_start

    # Time encoding
    encode_start = time.perf_counter()
    _ = model.encode(dataset["documents"])
    encode_time = time.perf_counter() - encode_start

    # Compute retrieval metrics
    metrics = compute_metrics(
        model,
        dataset["documents"],
        dataset["queries"],
        dataset["relevance"]
    )

    # Get model info
    test_embedding = model.encode(["test"])[0]

    return {
        "model": model_name,
        "dimensions": len(test_embedding),
        "load_time_s": round(load_time, 2),
        "encode_time_s": round(encode_time, 3),
        "encode_per_doc_ms": round(encode_time / len(dataset["documents"]) * 1000, 2),
        **{k: round(v, 3) for k, v in metrics.items()}
    }

def run_benchmark(dataset_path: str = "test_dataset.json") -> List[Dict]:
    """Run full benchmark."""
    with open(dataset_path) as f:
        dataset = json.load(f)

    print(f"Dataset: {len(dataset['documents'])} docs, {len(dataset['queries'])} queries")

    results = []
    for model_name in MODELS:
        try:
            result = benchmark_model(model_name, dataset)
            results.append(result)
            print(f"  P@3: {result['precision_at_k']:.3f}, MRR: {result['mrr']:.3f}")
        except Exception as e:
            print(f"  Error: {e}")

    return results

def print_results_table(results: List[Dict]):
    """Print results as formatted table."""
    print("\n" + "=" * 80)
    print("BENCHMARK RESULTS")
    print("=" * 80)

    # Header
    print(f"{'Model':<35} {'Dim':>5} {'P@3':>6} {'R@3':>6} {'MRR':>6} {'ms/doc':>8}")
    print("-" * 80)

    # Sort by MRR (or your preferred metric)
    for r in sorted(results, key=lambda x: -x['mrr']):
        print(f"{r['model']:<35} {r['dimensions']:>5} {r['precision_at_k']:>6.3f} "
              f"{r['recall_at_k']:>6.3f} {r['mrr']:>6.3f} {r['encode_per_doc_ms']:>8.2f}")

    print("=" * 80)

if __name__ == "__main__":
    results = run_benchmark()
    print_results_table(results)

    # Save results
    with open("benchmark_results.json", "w") as f:
        json.dump(results, f, indent=2)
    print("\nResults saved to benchmark_results.json")

Decision Framework

When to Use Different Models

all-MiniLM-L6-v2 (default)
├── Fast enough for real-time
├── Good general quality
├── Low memory footprint
└── Use when: Starting out, general content

all-MiniLM-L12-v2
├── Slightly better quality
├── Still reasonably fast
└── Use when: Quality matters more than speed

all-mpnet-base-v2
├── Best quality
├── Significantly slower
├── Higher memory usage
└── Use when: Accuracy is critical, batch processing OK

multi-qa-MiniLM-L6-cos-v1
├── Optimized for Q&A
├── Better with question-form queries
└── Use when: Building a Q&A system, FAQ retrieval

msmarco-MiniLM-L6-cos-v5
├── Optimized for search
├── Better with keyword-style queries
└── Use when: Building a search engine, keyword queries

Quick Selection Guide

Your Content           Recommended Model
General documentation  all-MiniLM-L6-v2
Technical/code docs    msmarco-MiniLM-L6-cos-v5
Q&A / FAQ              multi-qa-MiniLM-L6-cos-v1
High-stakes retrieval  all-mpnet-base-v2
Mixed content          Run the benchmark on your data
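
If you want this guide in code form, a small lookup keeps the default choice explicit; the content-type keys below are illustrative labels, not part of any API:

# Hypothetical mapping from content type to a default model, mirroring the guide above
RECOMMENDED_MODELS = {
    "general_docs": "all-MiniLM-L6-v2",
    "technical_or_code": "msmarco-MiniLM-L6-cos-v5",
    "qa_or_faq": "multi-qa-MiniLM-L6-cos-v1",
    "high_stakes": "all-mpnet-base-v2",
}

def pick_model(content_type: str) -> str:
    # Fall back to the default; for mixed content, run the benchmark instead
    return RECOMMENDED_MODELS.get(content_type, "all-MiniLM-L6-v2")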

Switching Models

After deciding on a model:

1. Update environment

export EMBEDDING_MODEL=all-mpnet-base-v2

2. Re-embed all collections (embeddings aren't portable between models!)

python scripts/reembed_collections.py

3. Rebuild router embeddings

python scripts/rebuild_router.py

Important: Different models produce embeddings of different dimensions. You cannot mix embeddings from different models in the same collection!
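
A quick way to catch a mismatch before writing anything is to compare the new model's embedding width with the collection's configured vector size. A minimal sketch, assuming a local Qdrant with a single unnamed vector per collection ("my_collection" is a placeholder name):

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

client = QdrantClient(url="http://localhost:6333")
model = SentenceTransformer("all-mpnet-base-v2")

new_dim = len(model.encode(["dimension probe"])[0])
info = client.get_collection("my_collection")  # placeholder collection name
current_dim = info.config.params.vectors.size  # single unnamed vector config assumed

if current_dim != new_dim:
    print(f"Mismatch: collection stores {current_dim}-dim vectors, "
          f"model produces {new_dim}-dim. Re-embed before switching.")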

Reembedding Script

#!/usr/bin/env python3
"""Re-embed all collections with a new model."""

import os

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer

NEW_MODEL = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")

def reembed_collection(collection_name: str, model: SentenceTransformer, client: QdrantClient):
    """Re-embed a single collection."""
    # Get existing data
    results = client.scroll(
        collection_name=collection_name,
        limit=10000,
        with_payload=True
    )

    points = results[0]
    if not points:
        print(f"  {collection_name}: empty, skipping")
        return

    # Extract documents
    documents = [p.payload.get("content", "") for p in points]

    # Re-embed
    new_embeddings = model.encode(documents).tolist()
    vector_size = len(new_embeddings[0])

    # Delete and recreate collection
    client.delete_collection(collection_name)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
    )

    # Re-add points
    new_points = [
        PointStruct(
            id=p.id,
            vector=new_embeddings[i],
            payload=p.payload
        )
        for i, p in enumerate(points)
    ]

    client.upsert(collection_name=collection_name, points=new_points)
    print(f"  {collection_name}: re-embedded {len(points)} documents")

def main():
    print(f"Re-embedding with model: {NEW_MODEL}")

    model = SentenceTransformer(NEW_MODEL)
    client = QdrantClient(url=QDRANT_URL)

    collections = client.get_collections().collections
    print(f"Found {len(collections)} collections")

    for coll in collections:
        reembed_collection(coll.name, model, client)

    print("✅ Re-embedding complete!")


if __name__ == "__main__":
    main()
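
After re-embedding, a quick spot check confirms retrieval still returns sensible results. A minimal sketch, assuming payloads store text under "content"; "my_collection" is a placeholder for one of your collections:

import os

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2"))
client = QdrantClient(url=os.getenv("QDRANT_URL", "http://localhost:6333"))

query = "How do I build a web API?"  # use a query whose answer you know
hits = client.search(
    collection_name="my_collection",  # placeholder collection name
    query_vector=model.encode([query])[0].tolist(),
    limit=3,
)
for hit in hits:
    print(round(hit.score, 3), hit.payload.get("content", "")[:80])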

Refinement Notes

Track findings from your benchmarks.

  • Created test dataset from real content

  • Ran benchmark on candidate models

  • Selected optimal model for use case

  • Re-embedded collections

  • Verified retrieval quality improved
