agent-cost-budgeting

Agent Cost Budgeting

Control and optimize token usage across AI agent systems.

When to Use

Preventing runaway API costs
Implementing per-task cost limits
Optimizing multi-agent token usage
Tracking and reporting AI spending
Building cost-aware agent behaviors

Cost Model

/**
 * Pricing for a single model, used to convert token counts into USD.
 * All rates are expressed in US dollars per 1,000 tokens.
 */
interface CostModel {
  /** Vendor serving the model (e.g. 'anthropic', 'openai'). */
  provider: string;
  /** Provider-side model identifier. */
  model: string;
  /** $ per 1000 input tokens. */
  inputCostPer1K: number;
  /** $ per 1000 output tokens. */
  outputCostPer1K: number;
  /** $ per 1000 cached tokens (if supported). */
  cacheCostPer1K?: number;
}

/**
 * Price table keyed by a short model alias.
 * Each entry carries the provider, the provider-side model id and the
 * USD rates per 1,000 input/output tokens.
 */
const COST_MODELS: Record<string, CostModel> = {
  // Anthropic
  'claude-3-opus': {
    provider: 'anthropic',
    model: 'claude-3-opus-20240229',
    inputCostPer1K: 0.015,
    outputCostPer1K: 0.075
  },
  'claude-3-sonnet': {
    provider: 'anthropic',
    model: 'claude-3-sonnet-20240229',
    inputCostPer1K: 0.003,
    outputCostPer1K: 0.015
  },
  'claude-3-haiku': {
    provider: 'anthropic',
    model: 'claude-3-haiku-20240307',
    inputCostPer1K: 0.00025,
    outputCostPer1K: 0.00125
  },

  // OpenAI
  'gpt-4-turbo': {
    provider: 'openai',
    model: 'gpt-4-turbo',
    inputCostPer1K: 0.01,
    outputCostPer1K: 0.03
  },
  'gpt-4o': {
    provider: 'openai',
    model: 'gpt-4o',
    inputCostPer1K: 0.005,
    outputCostPer1K: 0.015
  },
  'gpt-4o-mini': {
    provider: 'openai',
    model: 'gpt-4o-mini',
    inputCostPer1K: 0.00015,
    outputCostPer1K: 0.0006
  }
};

/**
 * Converts the token usage of one call into a USD cost.
 *
 * @param model Key into `COST_MODELS` (e.g. 'gpt-4o').
 * @param inputTokens Number of input (prompt) tokens.
 * @param outputTokens Number of output (completion) tokens.
 * @returns Cost in USD, using the per-1,000-token rates of the model.
 * @throws Error if `model` has no entry in `COST_MODELS`.
 */
function calculateCost(
  model: string,
  inputTokens: number,
  outputTokens: number
): number {
  // Own-property check so inherited keys such as 'constructor' or 'toString'
  // are rejected instead of producing a NaN cost.
  const costModel = Object.prototype.hasOwnProperty.call(COST_MODELS, model)
    ? COST_MODELS[model]
    : undefined;
  if (!costModel) throw new Error(`Unknown model: ${model}`);

  return (
    (inputTokens / 1000) * costModel.inputCostPer1K +
    (outputTokens / 1000) * costModel.outputCostPer1K
  );
}

Budget Management

/** A spending limit in USD together with its current consumption. */
interface Budget {
  id: string;
  name: string;
  /** Maximum spend, in USD, for one period. */
  limitUSD: number;
  /** Amount already spent, in USD, in the current period. */
  spentUSD: number;
  /** How often the budget starts over. */
  period: 'task' | 'hourly' | 'daily' | 'monthly';
  /** When the current period ends; absent means the budget never resets. */
  resetAt?: Date;
  /** Utilization fractions that trigger alerts, e.g., [0.5, 0.8, 0.95]. */
  alertThresholds: number[];
  /** Stop vs warn at limit. */
  hardLimit: boolean;
}

/**
 * Tracks spend against named budgets and answers "may I spend this much?".
 *
 * Budgets are registered with {@link BudgetManager.setBudget}. A budget whose
 * `resetAt` has passed is rolled over (spend reset to 0 and the next reset
 * scheduled) before it is checked or charged.
 */
class BudgetManager {
  private budgets = new Map<string, Budget>();

  /**
   * @param onAlert Optional hook invoked with the budget and the thresholds a
   *   projected spend would cross. When omitted, alerts go to `console.warn`.
   */
  constructor(
    private readonly onAlert?: (
      budget: Budget,
      thresholds: number[]
    ) => void | Promise<void>
  ) {}

  /** Registers a budget under its `id`, replacing any budget with the same id. */
  setBudget(budget: Budget): void {
    this.budgets.set(budget.id, budget);
  }

  /**
   * Checks whether `estimatedCost` fits in the budget, without recording it.
   *
   * Sends alerts for every threshold that the estimated spend would newly
   * cross. Unknown budget ids are treated as unrestricted.
   *
   * @returns `allowed: false` only when the cost exceeds what remains and the
   *   budget has `hardLimit` set; soft budgets return a warning instead.
   */
  async checkBudget(budgetId: string, estimatedCost: number): Promise<BudgetCheck> {
    const budget = this.budgets.get(budgetId);
    if (!budget) return { allowed: true };

    this.rollOverIfDue(budget);

    const remaining = budget.limitUSD - budget.spentUSD;
    const utilizationBefore = this.utilization(budget.spentUSD, budget.limitUSD);
    const utilizationAfter = this.utilization(
      budget.spentUSD + estimatedCost,
      budget.limitUSD
    );

    // Check thresholds: only those below the line now and at/above it afterwards.
    const crossedThresholds = budget.alertThresholds.filter(
      t => utilizationBefore < t && utilizationAfter >= t
    );

    if (crossedThresholds.length > 0) {
      await this.sendAlerts(budget, crossedThresholds);
    }

    // Check limit
    if (estimatedCost > remaining) {
      if (budget.hardLimit) {
        return {
          allowed: false,
          reason: `Budget exceeded: ${remaining.toFixed(4)} USD remaining`,
          remaining
        };
      }
      return {
        allowed: true,
        warning: `Budget will be exceeded`,
        remaining
      };
    }

    return { allowed: true, remaining };
  }

  /**
   * Adds `cost` to the budget's spend. Unknown budget ids are ignored.
   * If the period has elapsed, the spend starts fresh with this cost.
   */
  async recordSpend(budgetId: string, cost: number): Promise<void> {
    const budget = this.budgets.get(budgetId);
    if (!budget) return;

    this.rollOverIfDue(budget);
    budget.spentUSD += cost;
  }

  /** Fraction of the limit used; a non-positive limit counts as exhausted by any spend. */
  private utilization(spentUSD: number, limitUSD: number): number {
    if (limitUSD > 0) return spentUSD / limitUSD;
    return spentUSD > 0 ? Infinity : 0;
  }

  /** Resets spend and schedules the next reset when the current period has ended. */
  private rollOverIfDue(budget: Budget, now: Date = new Date()): void {
    if (budget.resetAt && now >= budget.resetAt) {
      budget.spentUSD = 0;
      budget.resetAt = this.calculateNextReset(budget.period, now);
    }
  }

  /**
   * Computes the end of the period that starts at `from`.
   * 'task' budgets have no recurring period, so they get no next reset.
   */
  private calculateNextReset(
    period: Budget['period'],
    from: Date = new Date()
  ): Date | undefined {
    switch (period) {
      case 'hourly':
        return new Date(from.getTime() + 60 * 60 * 1000);
      case 'daily':
        return new Date(from.getTime() + 24 * 60 * 60 * 1000);
      case 'monthly': {
        // Clamp the day so that e.g. Jan 31 rolls to the last day of February
        // instead of overflowing into March.
        const next = new Date(from.getTime());
        const day = next.getDate();
        next.setDate(1);
        next.setMonth(next.getMonth() + 1);
        const daysInMonth = new Date(
          next.getFullYear(),
          next.getMonth() + 1,
          0
        ).getDate();
        next.setDate(Math.min(day, daysInMonth));
        return next;
      }
      default:
        return undefined;
    }
  }

  /** Delivers threshold alerts through the `onAlert` hook, or `console.warn` without one. */
  private async sendAlerts(budget: Budget, thresholds: number[]): Promise<void> {
    if (this.onAlert) {
      await this.onAlert(budget, thresholds);
      return;
    }
    const levels = thresholds.map(t => `${Math.round(t * 100)}%`).join(', ');
    console.warn(
      `Budget "${budget.name}" is projected to cross alert threshold(s): ${levels}`
    );
  }
}

Token Estimation

/**
 * Rough estimation before API call.
 *
 * @param text Text whose token count is wanted.
 * @returns Approximate token count: the character length divided by 4, rounded up.
 */
function estimateTokens(text: string): number {
  // Rough heuristic: ~4 characters per token for English
  return Math.ceil(text.length / 4);
}

// More accurate estimation using tiktoken (for OpenAI)
import { encoding_for_model } from 'tiktoken';

/**
 * Counts tokens by actually encoding the text with the model's tokenizer.
 *
 * @param text Text to tokenize.
 * @param model Model name passed to `encoding_for_model`.
 * @returns Number of tokens the encoder produced for `text`.
 */
function countTokensAccurate(text: string, model: string): number {
  const enc = encoding_for_model(model);
  try {
    return enc.encode(text).length;
  } finally {
    // Release the encoder even if encode() throws.
    enc.free();
  }
}

/**
 * Estimate cost before execution.
 *
 * Input tokens are estimated from the combined system prompt and user
 * message; output tokens are supplied by the caller.
 *
 * @param model Model key understood by `calculateCost`.
 * @param systemPrompt System prompt that will be sent.
 * @param userMessage User message that will be sent.
 * @param expectedOutputTokens Caller's guess at the response length in tokens.
 * @returns Estimated cost as computed by `calculateCost`.
 */
function estimateCallCost(
  model: string,
  systemPrompt: string,
  userMessage: string,
  expectedOutputTokens: number
): number {
  const inputTokens = estimateTokens(systemPrompt + userMessage);
  return calculateCost(model, inputTokens, expectedOutputTokens);
}

Cost-Aware Agent

/**
 * Agent that estimates the cost of a task up front, picks a model that suits
 * the remaining budget, and refuses (or downgrades) work it cannot afford.
 *
 * NOTE(review): `llm`, `estimateTaskCost`, `assessComplexity`,
 * `findCheapestViableModel` and `recordUsage` are used below but are not
 * declared in this class as shown; they must be supplied before the class is
 * usable.
 */
class CostAwareAgent {
  private budget: Budget;
  private spent = 0;

  /** @param budgetUSD Hard spending limit, in USD, for this agent's task. */
  constructor(budgetUSD: number) {
    this.budget = {
      id: 'agent-budget',
      name: 'Agent Task Budget',
      limitUSD: budgetUSD,
      spentUSD: 0,
      period: 'task',
      alertThresholds: [0.5, 0.8],
      hardLimit: true
    };
  }

  /**
   * Runs `task` if its estimated cost fits the remaining budget; otherwise
   * delegates to the over-budget handling.
   */
  async execute(task: string): Promise<Result> {
    // Estimate cost
    const estimate = this.estimateTaskCost(task);

    if (estimate > this.remaining) {
      return this.handleBudgetExceeded(task, estimate);
    }

    // Choose model based on budget
    const model = this.selectModelForBudget(task);

    return this.runWithModel(task, model);
  }

  /** Sends the task to the LLM with the given model, tracking usage via `recordUsage`. */
  private async runWithModel(task: string, model: string): Promise<Result> {
    // Execute with tracking
    const result = await this.llm.complete({
      model,
      messages: [{ role: 'user', content: task }],
      onUsage: (usage) => this.recordUsage(usage, model)
    });

    return result;
  }

  /** Picks a model key from the remaining budget and the assessed task complexity. */
  private selectModelForBudget(task: string): string {
    const complexity = this.assessComplexity(task);
    const remaining = this.remaining;

    // Use cheaper models when budget is tight
    if (remaining < 0.10) {
      return 'gpt-4o-mini'; // Cheapest
    }

    if (remaining < 0.50 || complexity === 'low') {
      return 'claude-3-haiku';
    }

    if (remaining < 2.00 || complexity === 'medium') {
      return 'claude-3-sonnet';
    }

    return 'claude-3-opus'; // Full power when budget allows
  }

  /**
   * Handles a task whose estimate exceeds the remaining budget.
   *
   * If a cheaper viable model exists the task is run directly on it. Calling
   * `execute()` again here would repeat the same failing estimate and recurse
   * without end. Otherwise a failure result with budget details is returned.
   */
  private async handleBudgetExceeded(task: string, estimate: number): Promise<Result> {
    // Options:
    // 1. Simplify the task
    // 2. Use cheaper model
    // 3. Return partial result
    // 4. Request budget increase

    const cheaperModel = this.findCheapestViableModel(task);
    if (cheaperModel) {
      return this.runWithModel(task, cheaperModel);
    }

    return {
      success: false,
      error: 'Budget exceeded',
      partialResult: null,
      budgetInfo: {
        remaining: this.remaining,
        estimated: estimate
      }
    };
  }

  /** USD still available: the budget limit minus what this agent has spent. */
  get remaining(): number {
    return this.budget.limitUSD - this.spent;
  }
}

Multi-Agent Cost Distribution

/** One agent's slice of a shared budget and how much of it has been used. */
interface AgentCostAllocation {
  /** Identifier of the agent this slice belongs to. */
  agentId: string;
  /** USD currently allocated to the agent. */
  allocatedUSD: number;
  /** USD the agent has spent so far. */
  spentUSD: number;
  /** Priority level of the agent. */
  priority: 'low' | 'medium' | 'high';
}

/**
 * Splits one total budget across several agents by priority and can shift
 * unused allocation from under-spending agents to over-spending ones.
 */
class MultiAgentBudgetManager {
  private totalBudget: number;
  private allocations = new Map<string, AgentCostAllocation>();

  /**
   * @param totalBudget Total USD to distribute. Defaults to 0 so existing
   *   zero-argument construction keeps working.
   */
  constructor(totalBudget: number = 0) {
    this.totalBudget = totalBudget;
  }

  /**
   * Gives each agent a share of the total budget proportional to its
   * priority weight (high 3, medium 2, low 1). Spend counters start at 0.
   *
   * @throws Error if any agent has a priority other than 'high', 'medium' or
   *   'low'; nothing is allocated in that case.
   */
  allocateBudget(agents: { id: string; priority: string }[]): void {
    // Priority weights
    const weights: Record<string, number> = { high: 3, medium: 2, low: 1 };

    const weightOf = (priority: string): number => {
      const weight = Object.prototype.hasOwnProperty.call(weights, priority)
        ? weights[priority]
        : undefined;
      if (weight === undefined) {
        throw new Error(`Unknown priority: ${priority}`);
      }
      return weight;
    };

    // Validates every priority before any allocation is written.
    const totalWeight = agents.reduce((sum, a) => sum + weightOf(a.priority), 0);

    for (const agent of agents) {
      const share = (weightOf(agent.priority) / totalWeight) * this.totalBudget;
      this.allocations.set(agent.id, {
        agentId: agent.id,
        allocatedUSD: share,
        spentUSD: 0,
        // Safe: weightOf() has already rejected anything outside the union.
        priority: agent.priority as AgentCostAllocation['priority']
      });
    }
  }

  /**
   * Adds `cost` to an agent's spend.
   * @throws Error if the agent has no allocation.
   */
  recordSpend(agentId: string, cost: number): void {
    const allocation = this.allocations.get(agentId);
    if (!allocation) {
      throw new Error(`No allocation for agent: ${agentId}`);
    }
    allocation.spentUSD += cost;
  }

  /** Returns a copy of the agent's allocation, or undefined if it has none. */
  getAllocation(agentId: string): AgentCostAllocation | undefined {
    const allocation = this.allocations.get(agentId);
    return allocation ? { ...allocation } : undefined;
  }

  /**
   * Reallocate from under-spending to over-spending agents.
   *
   * Under-spenders have used less than 50% of their allocation and may give
   * up only the part below that 50% line. Over-spenders have used more than
   * 80% and receive at most the amount by which their spend exceeds 80% of
   * their allocation at the start of the rebalance.
   */
  rebalance(): void {
    const all = Array.from(this.allocations.values());
    const underSpenders = all.filter(a => a.spentUSD < a.allocatedUSD * 0.5);
    const overSpenders = all.filter(a => a.spentUSD > a.allocatedUSD * 0.8);

    for (const over of overSpenders) {
      let needed = over.spentUSD - over.allocatedUSD * 0.8;

      for (const under of underSpenders) {
        // Stop once satisfied, so the shortfall is not taken again from each donor.
        if (needed <= 0) break;

        const available = under.allocatedUSD * 0.5 - under.spentUSD;
        const transfer = Math.min(needed, available);

        if (transfer > 0) {
          under.allocatedUSD -= transfer;
          over.allocatedUSD += transfer;
          needed -= transfer;
        }
      }
    }
  }
}

Cost Optimization Strategies

Prompt Caching

/**
 * Anthropic prompt caching.
 *
 * Tags a leading system message with an ephemeral `cache_control` marker and
 * sends the conversation through `anthropic.messages.create`.
 *
 * NOTE(review): Anthropic's Messages API is documented as taking the system
 * prompt via a top-level `system` parameter, attaching `cache_control` to
 * content blocks, and requiring `max_tokens` — confirm this request shape
 * against the SDK version in use.
 *
 * @param messages Conversation to send; only a system message at index 0 is tagged.
 * @returns Whatever `anthropic.messages.create` resolves to.
 */
async function callWithCaching(messages: Message[]): Promise<Response> {
  // Mark system prompt for caching
  const cachedMessages = messages.map((m, i) => {
    if (i === 0 && m.role === 'system') {
      return { ...m, cache_control: { type: 'ephemeral' } };
    }
    return m;
  });

  return anthropic.messages.create({
    model: 'claude-3-sonnet-20240229',
    messages: cachedMessages
  });
}

Model Routing

/**
 * Picks a model for `task` from the tier matching its complexity.
 *
 * Candidates in a tier are tried in listed order and the first whose
 * estimated cost fits `budget` wins. If none fits, the candidate with the
 * lowest estimated cost is returned.
 *
 * NOTE(review): the tiers are keyed 'simple' | 'medium' | 'complex' —
 * confirm `assessComplexity` returns these exact labels.
 *
 * @param task Task text, used as the user message for the cost estimate.
 * @param budget Maximum acceptable estimated cost in USD.
 * @returns A model key from the selected tier.
 * @throws Error if the assessed complexity has no tier.
 */
function routeToOptimalModel(task: string, budget: number): string {
  const complexity = assessComplexity(task);

  // Complexity vs cost matrix
  const modelMatrix: Record<string, readonly string[]> = {
    simple: ['gpt-4o-mini', 'claude-3-haiku'],
    medium: ['claude-3-sonnet', 'gpt-4o'],
    complex: ['claude-3-opus', 'gpt-4-turbo']
  };

  const candidates = modelMatrix[complexity];
  if (!candidates || candidates.length === 0) {
    throw new Error(`No models configured for complexity: ${String(complexity)}`);
  }

  // estimateCallCost takes (model, systemPrompt, userMessage, expectedOutputTokens);
  // there is no system prompt here, and 500 output tokens are assumed.
  const priced = candidates.map(model => ({
    model,
    cost: estimateCallCost(model, '', task, 500)
  }));

  // First listed candidate that fits the budget
  for (const { model, cost } of priced) {
    if (cost <= budget) {
      return model;
    }
  }

  // Fallback to cheapest
  return priced.reduce((cheapest, candidate) =>
    candidate.cost < cheapest.cost ? candidate : cheapest
  ).model;
}

Context Compression

/**
 * Shrinks `context` to roughly `targetTokens` by asking a cheap model to
 * summarize it. Context already within the target is returned unchanged
 * without any LLM call.
 *
 * @param context Text to compress.
 * @param targetTokens Desired upper bound, compared against `estimateTokens(context)`.
 * @returns The original context, or the result of `llm.complete`
 *   (presumably the summary text — confirm against the `llm` client).
 */
async function compressContext(
  context: string,
  targetTokens: number
): Promise<string> {
  const currentTokens = estimateTokens(context);

  if (currentTokens <= targetTokens) {
    return context;
  }

  // Use cheap model to summarize
  const summary = await llm.complete({
    model: 'gpt-4o-mini',
    messages: [{
      role: 'user',
      content: `Summarize this context in under ${targetTokens} tokens, preserving key information:\n\n${context}`
    }]
  });

  return summary;
}

Reporting

/** Aggregated spending for a reporting window, broken down several ways. */
interface CostReport {
  /** Start and end of the reporting window. */
  period: {
    start: Date;
    end: Date;
  };
  /** Total spend over the window. */
  totalSpent: number;
  /** Per-model call count, token count and cost, keyed by string. */
  byModel: Record<
    string,
    {
      calls: number;
      tokens: number;
      cost: number;
    }
  >;
  /** Per-agent call count and cost, keyed by string. */
  byAgent: Record<
    string,
    {
      calls: number;
      cost: number;
    }
  >;
  /** Per-task cost and success flag, keyed by string. */
  byTask: Record<
    string,
    {
      cost: number;
      success: boolean;
    }
  >;
  /** Derived trend figures. */
  trends: {
    dailyAverage: number;
    projectedMonthly: number;
    topCostDrivers: string[];
  };
}

/**
 * Builds a {@link CostReport} from raw usage records.
 *
 * The aggregation itself is left out of this snippet. Until it is written the
 * function fails loudly rather than returning `undefined` in place of a report.
 *
 * @param usageData Usage records to aggregate.
 * @throws Error always, until the aggregation logic is implemented.
 */
function generateCostReport(usageData: UsageRecord[]): CostReport {
  // ... aggregation logic
  throw new Error(
    `generateCostReport: aggregation logic not implemented (${usageData.length} usage records received)`
  );
}

Best Practices

Set budgets early - Don't wait for the bill
Monitor in real-time - Not after the fact
Use tiered models - Right-size for the task
Cache aggressively - Reuse where possible
Compress context - Fewer tokens = lower cost
Alert early - 80% threshold, not 100%
Track by task - Know what's expensive
Review regularly - Optimize based on data

agent-cost-budgeting

Safety Notice

Copy this and send it to your AI assistant to learn

Source Transparency

Related Skills

graphrag-patterns

agentic-rag

production-rag-checklist

rag-evaluation