model-routing-strategy

Model Routing Strategy

Dynamically select the optimal model for each task to balance quality, cost, and latency.

When to Use

- Building applications that use multiple LLM providers
- Optimizing costs while maintaining quality
- Needing different model capabilities for different tasks
- Implementing fallback strategies
- A/B testing model performance

Model Comparison Matrix

Capability vs Cost

| Capability | Best Models | Cost Tier |
| --- | --- | --- |
| Complex reasoning | Claude Opus, o1 | $$$ |
| General tasks | Claude Sonnet, GPT-4o | $$ |
| Simple tasks | Claude Haiku, GPT-4o-mini | $ |
| Code generation | Claude Sonnet, GPT-4o | $$ |
| Creative writing | Claude Opus, GPT-4 | $$$ |
| Extraction/Classification | Claude Haiku, GPT-4o-mini | $ |

Latency Comparison

| Model | Typical Latency (TTFB) |
| --- | --- |
| Claude Haiku | 200-400ms |
| Claude Sonnet | 400-800ms |
| Claude Opus | 800-1500ms |
| GPT-4o-mini | 200-400ms |
| GPT-4o | 400-700ms |
| GPT-4 Turbo | 500-1000ms |

Routing Strategies

Strategy 1: Complexity-Based Routing

/** Coarse difficulty bucket used to choose a model tier. */
type Complexity = 'simple' | 'medium' | 'complex';

/** A unit of work to be routed to a model. */
interface Task {
  /** Prompt text for the task. */
  prompt: string;
  /** Caller-declared needs plus optional latency and cost ceilings. */
  requirements: {
    needsReasoning: boolean;
    needsCreativity: boolean;
    needsAccuracy: boolean;
    /** Optional latency ceiling (milliseconds, per the field name). */
    maxLatencyMs?: number;
    /** Optional cost ceiling (US dollars, per the field name). */
    maxCostUSD?: number;
  };
}

/**
 * Prompt phrasings that indicate multi-step or open-ended work.
 * The `s` flag lets `.` cross line breaks so multi-line prompts still match.
 */
const COMPLEX_PATTERNS: readonly RegExp[] = [
  /analyze.*and.*compare/s,
  /explain.*step.*by.*step/s,
  /write.*comprehensive/s,
  /evaluate.*trade.*offs/s,
  /design.*architecture/s,
  /debug.*complex/s,
  /review.*security/s,
];

/** Prompt phrasings that indicate a short, mechanical task. */
const SIMPLE_PATTERNS: readonly RegExp[] = [
  /summarize.*briefly/s,
  /extract.*from/s,
  /classify.*as/s,
  /format.*as.*json/s,
  /translate.*to/s,
  /fix.*typo/s,
];

/**
 * Classifies a task as simple, medium, or complex.
 *
 * The prompt is lower-cased before matching. A task is `complex` when it
 * matches a complex pattern or declares `needsReasoning` / `needsCreativity`;
 * complex indicators take precedence over simple ones. A task is `simple`
 * when it only matches a simple pattern, and `medium` otherwise.
 * `needsAccuracy` is not consulted.
 *
 * @param task - The task to classify.
 * @returns The complexity bucket for the task.
 */
function assessComplexity(task: Task): Complexity {
  const prompt = task.prompt.toLowerCase();
  const { needsReasoning, needsCreativity } = task.requirements;
  const matchesAny = (patterns: readonly RegExp[]): boolean =>
    patterns.some((pattern) => pattern.test(prompt));

  if (needsReasoning || needsCreativity || matchesAny(COMPLEX_PATTERNS)) {
    return 'complex';
  }
  if (matchesAny(SIMPLE_PATTERNS)) {
    return 'simple';
  }
  return 'medium';
}

/**
 * Maps a complexity bucket to a model name.
 *
 * @param complexity - Bucket produced by complexity assessment.
 * @returns The model name assigned to that bucket.
 */
function selectModel(complexity: Complexity): string {
  // Annotated so that a missing or misspelled bucket is a compile error.
  const modelMap: Record<Complexity, string> = {
    simple: 'claude-3-haiku', // $0.25/$1.25 per 1M
    medium: 'claude-3.5-sonnet', // $3/$15 per 1M
    complex: 'claude-3-opus', // $15/$75 per 1M
  };

  return modelMap[complexity];
}

Strategy 2: Cost-Constrained Routing

/** Static description of a model that the router can choose. */
interface ModelOption {
  name: string;
  provider: string;
  /** Cost per 1,000 input tokens (presumably USD — confirm against provider pricing). */
  inputCostPer1K: number;
  /** Cost per 1,000 output tokens (presumably USD — confirm against provider pricing). */
  outputCostPer1K: number;
  qualityScore: number; // 0-1
  avgLatencyMs: number;
}

/** Catalogue of models considered by the cost- and latency-based selectors. */
const models: ModelOption[] = [
  {
    name: 'claude-3-opus',
    provider: 'anthropic',
    inputCostPer1K: 0.015,
    outputCostPer1K: 0.075,
    qualityScore: 0.98,
    avgLatencyMs: 1200,
  },
  {
    name: 'claude-3.5-sonnet',
    provider: 'anthropic',
    inputCostPer1K: 0.003,
    outputCostPer1K: 0.015,
    qualityScore: 0.95,
    avgLatencyMs: 600,
  },
  {
    name: 'claude-3-haiku',
    provider: 'anthropic',
    inputCostPer1K: 0.00025,
    outputCostPer1K: 0.00125,
    qualityScore: 0.85,
    avgLatencyMs: 300,
  },
  {
    name: 'gpt-4o',
    provider: 'openai',
    inputCostPer1K: 0.005,
    outputCostPer1K: 0.015,
    qualityScore: 0.94,
    avgLatencyMs: 550,
  },
  {
    name: 'gpt-4o-mini',
    provider: 'openai',
    inputCostPer1K: 0.00015,
    outputCostPer1K: 0.0006,
    qualityScore: 0.82,
    avgLatencyMs: 300,
  },
];

/**
 * Picks the highest-quality model whose estimated request cost fits the budget.
 *
 * Cost is estimated as
 * `(inputTokens / 1000) * inputCostPer1K + (outputTokens / 1000) * outputCostPer1K`.
 * When several models share the best quality score, the one with the lower
 * estimated cost wins; remaining ties keep catalogue order.
 *
 * @param estimatedInputTokens - Expected number of input tokens.
 * @param estimatedOutputTokens - Expected number of output tokens.
 * @param maxCost - Cost ceiling for the request, in the catalogue's price unit.
 * @param minQuality - Minimum acceptable `qualityScore` (default 0.8).
 * @param candidates - Models to choose from (defaults to the shared `models` catalogue).
 * @returns The chosen model, or `null` when no model satisfies both limits.
 */
function selectWithinBudget(
  estimatedInputTokens: number,
  estimatedOutputTokens: number,
  maxCost: number,
  minQuality: number = 0.8,
  candidates: readonly ModelOption[] = models
): ModelOption | null {
  const estimateCost = (model: ModelOption): number =>
    (estimatedInputTokens / 1000) * model.inputCostPer1K +
    (estimatedOutputTokens / 1000) * model.outputCostPer1K;

  let best: ModelOption | null = null;
  let bestCost = Infinity;

  for (const model of candidates) {
    const cost = estimateCost(model);
    // Negated comparisons so NaN costs or scores are rejected, as `<=` / `>=` would.
    if (!(cost <= maxCost) || !(model.qualityScore >= minQuality)) {
      continue;
    }

    const isBetter =
      best === null ||
      model.qualityScore > best.qualityScore ||
      (model.qualityScore === best.qualityScore && cost < bestCost);

    if (isBetter) {
      best = model;
      bestCost = cost;
    }
  }

  return best;
}

Strategy 3: Latency-Optimized Routing

/**
 * Picks the highest-quality model whose average latency fits the ceiling.
 *
 * When several models share the best quality score, the one with the lower
 * `avgLatencyMs` wins; remaining ties keep catalogue order.
 *
 * @param maxLatencyMs - Latency ceiling compared against `avgLatencyMs`.
 * @param minQuality - Minimum acceptable `qualityScore` (default 0.8).
 * @param candidates - Models to choose from (defaults to the shared `models` catalogue).
 * @returns The chosen model, or `null` when no model satisfies both limits.
 */
function selectForLatency(
  maxLatencyMs: number,
  minQuality: number = 0.8,
  candidates: readonly ModelOption[] = models
): ModelOption | null {
  let best: ModelOption | null = null;

  for (const model of candidates) {
    // Negated comparisons so NaN latencies or scores are rejected.
    if (!(model.avgLatencyMs <= maxLatencyMs) || !(model.qualityScore >= minQuality)) {
      continue;
    }

    const isBetter =
      best === null ||
      model.qualityScore > best.qualityScore ||
      (model.qualityScore === best.qualityScore && model.avgLatencyMs < best.avgLatencyMs);

    if (isBetter) {
      best = model;
    }
  }

  return best;
}

// For real-time applications: only models averaging 500 ms or less qualify.
const realtimeModel = selectForLatency(500, 0.8); // Haiku or GPT-4o-mini

// For batch processing: the latency ceiling is relaxed and the quality floor raised.
const batchModel = selectForLatency(2000, 0.95); // Sonnet or Opus

Strategy 4: Task-Type Routing

/** Task categories that have a configured list of candidate models. */
type TaskType = 'code' | 'creative' | 'analysis' | 'extraction' | 'chat' | 'classification';

/**
 * Ordered candidate models for each task type.
 * The order is significant to callers that select from these lists.
 */
const taskModelMap: Record<TaskType, string[]> = {
  code: ['claude-3.5-sonnet', 'gpt-4o', 'claude-3-opus'],
  creative: ['claude-3-opus', 'gpt-4', 'claude-3.5-sonnet'],
  analysis: ['claude-3-opus', 'o1', 'claude-3.5-sonnet'],
  extraction: ['claude-3-haiku', 'gpt-4o-mini', 'claude-3.5-sonnet'],
  chat: ['claude-3.5-sonnet', 'gpt-4o', 'claude-3-haiku'],
  classification: ['claude-3-haiku', 'gpt-4o-mini', 'claude-3.5-sonnet'],
};

/**
 * Relative cost tier per model (1 = cheapest, 3 = most expensive), following
 * the "Capability vs Cost" matrix in this document. Models missing from the
 * table are treated as tier 3 so they are never assumed to be cheap.
 */
const MODEL_COST_TIER: ReadonlyMap<string, number> = new Map([
  ['claude-3-haiku', 1],
  ['gpt-4o-mini', 1],
  ['claude-3.5-sonnet', 2],
  ['gpt-4o', 2],
  ['claude-3-opus', 3],
  ['gpt-4', 3],
  ['o1', 3],
]);

/**
 * Chooses a model for a task type without exceeding the caller's budget tier.
 *
 * Candidates are read in list order and the first one whose cost tier fits
 * the budget is returned. If none fits, the cheapest candidate is returned
 * (earliest in the list on ties). Indexing the list by budget, as before,
 * returned `claude-3-opus` for a low-budget `code` task.
 *
 * @param taskType - Category of the task.
 * @param budget - Spending tier the caller accepts.
 * @param modelMap - Candidate lists per task type (defaults to `taskModelMap`).
 * @returns The selected model name.
 * @throws Error when the task type has no candidate models.
 */
function selectForTaskType(
  taskType: TaskType,
  budget: 'low' | 'medium' | 'high',
  modelMap: Readonly<Record<TaskType, readonly string[]>> = taskModelMap
): string {
  const candidates = modelMap[taskType] ?? [];
  const first = candidates[0];
  if (first === undefined) {
    throw new Error(`No candidate models configured for task type "${taskType}"`);
  }

  const budgetCeiling = { low: 1, medium: 2, high: 3 }[budget];
  const tierOf = (model: string): number => MODEL_COST_TIER.get(model) ?? 3;

  const affordable = candidates.find((model) => tierOf(model) <= budgetCeiling);
  if (affordable !== undefined) {
    return affordable;
  }

  // Nothing fits the budget: degrade to the cheapest option available.
  return candidates.reduce(
    (cheapest, model) => (tierOf(model) < tierOf(cheapest) ? model : cheapest),
    first
  );
}

Fallback Chains

/** An ordered list of models to try, plus retry settings for the chain. */
interface FallbackChain {
  /** Model attempted first. */
  primary: string;
  /** Models attempted after the primary, in order. */
  fallbacks: string[];
  retryConfig: {
    maxRetries: number;
    backoffMs: number;
  };
}

/** Predefined fallback chains, keyed by the goal they serve. */
const chains: Record<string, FallbackChain> = {
  // Starts with the strongest model and degrades step by step.
  highQuality: {
    primary: 'claude-3-opus',
    fallbacks: ['claude-3.5-sonnet', 'gpt-4o', 'claude-3-haiku'],
    retryConfig: { maxRetries: 3, backoffMs: 1000 },
  },
  // Starts with the cheapest model and escalates only on failure.
  costEffective: {
    primary: 'claude-3-haiku',
    fallbacks: ['gpt-4o-mini', 'claude-3.5-sonnet'],
    retryConfig: { maxRetries: 2, backoffMs: 500 },
  },
};

/**
 * Sends the prompt to the chain's primary model, then to each fallback in
 * order, and returns the first successful response.
 *
 * After a failure, the function waits `backoffMs * (position + 1)` before
 * trying the next model, so the delay grows linearly along the chain. No
 * wait follows the last model.
 *
 * NOTE(review): `retryConfig.maxRetries` is not consulted; each model is
 * attempted exactly once. `callModel` and `sleep` are not defined in this
 * document and are expected to be provided by the surrounding application.
 *
 * @param chain - Models to try and the backoff configuration.
 * @param prompt - Prompt forwarded unchanged to every model.
 * @returns The response from the first model that succeeds.
 * @throws Error when every model fails; the last failure is attached as `cause`.
 */
async function executeWithFallback(
  chain: FallbackChain,
  prompt: string
): Promise<string> {
  const allModels = [chain.primary, ...chain.fallbacks];
  let lastError: unknown;

  for (const [index, model] of allModels.entries()) {
    try {
      return await callModel(model, prompt);
    } catch (error: unknown) {
      lastError = error;

      const isLast = index >= allModels.length - 1;
      if (isLast) {
        console.log(`Model ${model} failed, no fallbacks left`);
        break;
      }

      console.log(`Model ${model} failed, trying fallback...`);
      await sleep(chain.retryConfig.backoffMs * (index + 1));
    }
  }

  // Keep the original message for callers that match on it, but preserve the
  // underlying failure for diagnostics.
  throw Object.assign(new Error('All models in fallback chain failed'), {
    cause: lastError,
  });
}

Dynamic Routing with Learning

/** Running performance statistics tracked for one model. */
interface ModelPerformance {
  model: string;
  successRate: number;
  avgLatency: number;
  avgQuality: number; // From human feedback or automated eval
  totalCalls: number;
}

/**
 * Router that learns from observed outcomes and picks the candidate with the
 * best weighted score.
 */
class AdaptiveRouter {
  private readonly performance = new Map<string, ModelPerformance>();

  /**
   * Folds one observed call into the model's running statistics.
   *
   * Statistics are exponential moving averages with a smoothing factor of
   * 0.1. A model seen for the first time starts from a success rate of 1,
   * the observed latency, and a quality of 0.9.
   *
   * @param model - Model that handled the call.
   * @param success - Whether the call succeeded.
   * @param latencyMs - Observed latency of the call.
   * @param qualityScore - Optional quality rating; quality is left unchanged when omitted.
   */
  recordOutcome(
    model: string,
    success: boolean,
    latencyMs: number,
    qualityScore?: number
  ): void {
    const perf: ModelPerformance = this.performance.get(model) ?? {
      model,
      successRate: 1,
      avgLatency: latencyMs,
      avgQuality: 0.9,
      totalCalls: 0,
    };

    // Exponential moving average
    const alpha = 0.1;
    perf.successRate = alpha * (success ? 1 : 0) + (1 - alpha) * perf.successRate;
    perf.avgLatency = alpha * latencyMs + (1 - alpha) * perf.avgLatency;
    if (qualityScore !== undefined) {
      perf.avgQuality = alpha * qualityScore + (1 - alpha) * perf.avgQuality;
    }
    perf.totalCalls++;

    this.performance.set(model, perf);
  }

  /**
   * Returns the candidate with the highest weighted score.
   *
   * Candidates without recorded outcomes are skipped. If none has data, the
   * first candidate is returned.
   *
   * @param candidates - Model names to choose between; must not be empty.
   * @param weights - Relative importance of quality, latency, and reliability.
   * @returns The best-scoring model name.
   * @throws Error when `candidates` is empty.
   */
  selectBest(
    candidates: string[],
    weights: { quality: number; latency: number; reliability: number }
  ): string {
    const fallback = candidates[0];
    if (fallback === undefined) {
      throw new Error('selectBest requires at least one candidate model');
    }

    let bestScore = -Infinity;
    let bestModel = fallback;

    for (const model of candidates) {
      const perf = this.performance.get(model);
      if (!perf) continue;

      // Clamp to [0, 1]: without it, latencies above 5000 ms produce an
      // unbounded negative term that swamps the quality and reliability terms.
      const latencyScore = Math.min(1, Math.max(0, 1 - perf.avgLatency / 5000));

      const score =
        weights.quality * perf.avgQuality +
        weights.reliability * perf.successRate +
        weights.latency * latencyScore;

      if (score > bestScore) {
        bestScore = score;
        bestModel = model;
      }
    }

    return bestModel;
  }
}

A/B Testing Models

/** Definition of an A/B test that compares models. */
interface ABTest {
  /** Name under which the test is looked up. */
  name: string;
  /** Competing models with their relative traffic weights. */
  variants: Array<{ model: string; weight: number }>;
  /** Names of the metrics collected for this test. */
  metrics: string[];
}

/**
 * Assigns users to model variants deterministically and aggregates the
 * metrics recorded for each variant.
 */
class ModelABTester {
  private readonly tests = new Map<string, ABTest>();
  private readonly results = new Map<string, { model: string; metric: string; value: number }[]>();

  /**
   * Registers a test, or replaces one with the same name, so that
   * `selectVariant` can find it.
   *
   * @param test - Test definition, stored under `test.name`.
   */
  registerTest(test: ABTest): void {
    this.tests.set(test.name, test);
  }

  /**
   * Chooses the variant for a user. The same `userId` always receives the
   * same variant for an unchanged test definition.
   *
   * Weights are relative: the user's hash is scaled by the total weight, so
   * the weights do not need to sum to 1.
   *
   * @param testName - Name of a registered test.
   * @param userId - Stable identifier of the user.
   * @returns The model assigned to the user.
   * @throws Error when the test is unknown or has no positive total weight.
   */
  selectVariant(testName: string, userId: string): string {
    const test = this.tests.get(testName);
    if (!test) throw new Error(`Test ${testName} not found`);

    const totalWeight = test.variants.reduce((sum, variant) => sum + variant.weight, 0);
    const lastVariant = test.variants[test.variants.length - 1];
    if (lastVariant === undefined || !(totalWeight > 0)) {
      throw new Error(`Test ${testName} has no variants with a positive total weight`);
    }

    // Deterministic selection based on userId
    const threshold = this.hashUserId(userId) * totalWeight;
    let cumWeight = 0;

    for (const variant of test.variants) {
      cumWeight += variant.weight;
      if (threshold < cumWeight) {
        return variant.model;
      }
    }

    // Only reachable through floating-point rounding at the top of the range.
    return lastVariant.model;
  }

  /**
   * Stores one metric observation for a model within a test.
   *
   * @param testName - Test the observation belongs to.
   * @param model - Model that produced the observation.
   * @param metric - Metric name.
   * @param value - Observed value.
   */
  recordMetric(testName: string, model: string, metric: string, value: number): void {
    const results = this.results.get(testName) ?? [];
    results.push({ model, metric, value });
    this.results.set(testName, results);
  }

  /**
   * Aggregates recorded observations by model and metric.
   *
   * @param testName - Test whose observations should be summarized.
   * @returns For each model and metric, the mean value and the number of
   *   observations. Empty when nothing has been recorded.
   */
  getResults(testName: string): Record<string, Record<string, { avg: number; count: number }>> {
    const samples = this.results.get(testName) ?? [];

    // Accumulate sums in Maps first so averages are computed once per metric.
    const totals = new Map<string, Map<string, { sum: number; count: number }>>();
    for (const { model, metric, value } of samples) {
      let perMetric = totals.get(model);
      if (!perMetric) {
        perMetric = new Map();
        totals.set(model, perMetric);
      }
      const running = perMetric.get(metric) ?? { sum: 0, count: 0 };
      running.sum += value;
      running.count += 1;
      perMetric.set(metric, running);
    }

    const summary: Record<string, Record<string, { avg: number; count: number }>> = {};
    for (const [model, perMetric] of totals) {
      const metrics: Record<string, { avg: number; count: number }> = {};
      for (const [metric, { sum, count }] of perMetric) {
        metrics[metric] = { avg: sum / count, count };
      }
      summary[model] = metrics;
    }
    return summary;
  }

  /**
   * Maps a user id to a number in [0, 1) using the 32-bit FNV-1a hash.
   *
   * @param userId - Identifier to hash.
   * @returns A deterministic value in the half-open range [0, 1).
   */
  private hashUserId(userId: string): number {
    let hash = 0x811c9dc5;
    for (let i = 0; i < userId.length; i++) {
      hash ^= userId.charCodeAt(i);
      hash = Math.imul(hash, 0x01000193);
    }
    return (hash >>> 0) / 0x100000000;
  }
}

Best Practices

1. Start simple: complexity-based routing covers most cases.
2. Monitor quality: a cheaper model is not better if quality drops.
3. Use fallbacks: always have a backup model.
4. Test routing logic: unit test your router.
5. Log decisions: record why each model was chosen.
6. Review regularly: model capabilities and pricing change.
7. Consider latency: user experience matters.

model-routing-strategy

Safety Notice

Copy this and send it to your AI assistant to learn

Source Transparency

Related Skills

graphrag-patterns

agentic-rag

production-rag-checklist

rag-evaluation