Agent Cost Budgeting
Control and optimize token usage across AI agent systems.
When to Use
-
Preventing runaway API costs
-
Implementing per-task cost limits
-
Optimizing multi-agent token usage
-
Tracking and reporting AI spending
-
Building cost-aware agent behaviors
Cost Model
interface CostModel { provider: string; model: string; inputCostPer1K: number; // $ per 1000 input tokens outputCostPer1K: number; // $ per 1000 output tokens cacheCostPer1K?: number; // $ per 1000 cached tokens (if supported) }
const COST_MODELS: Record<string, CostModel> = { 'claude-3-opus': { provider: 'anthropic', model: 'claude-3-opus-20240229', inputCostPer1K: 0.015, outputCostPer1K: 0.075 }, 'claude-3-sonnet': { provider: 'anthropic', model: 'claude-3-sonnet-20240229', inputCostPer1K: 0.003, outputCostPer1K: 0.015 }, 'claude-3-haiku': { provider: 'anthropic', model: 'claude-3-haiku-20240307', inputCostPer1K: 0.00025, outputCostPer1K: 0.00125 }, 'gpt-4-turbo': { provider: 'openai', model: 'gpt-4-turbo', inputCostPer1K: 0.01, outputCostPer1K: 0.03 }, 'gpt-4o': { provider: 'openai', model: 'gpt-4o', inputCostPer1K: 0.005, outputCostPer1K: 0.015 }, 'gpt-4o-mini': { provider: 'openai', model: 'gpt-4o-mini', inputCostPer1K: 0.00015, outputCostPer1K: 0.0006 } };
function calculateCost(
model: string,
inputTokens: number,
outputTokens: number
): number {
const costModel = COST_MODELS[model];
if (!costModel) throw new Error(Unknown model: ${model});
return ( (inputTokens / 1000) * costModel.inputCostPer1K + (outputTokens / 1000) * costModel.outputCostPer1K ); }
Budget Management
interface Budget { id: string; name: string; limitUSD: number; spentUSD: number; period: 'task' | 'hourly' | 'daily' | 'monthly'; resetAt?: Date; alertThresholds: number[]; // e.g., [0.5, 0.8, 0.95] hardLimit: boolean; // Stop vs warn at limit }
class BudgetManager { private budgets = new Map<string, Budget>();
async checkBudget(budgetId: string, estimatedCost: number): Promise<BudgetCheck> { const budget = this.budgets.get(budgetId); if (!budget) return { allowed: true };
const remaining = budget.limitUSD - budget.spentUSD;
const newTotal = budget.spentUSD + estimatedCost;
const utilizationAfter = newTotal / budget.limitUSD;
// Check thresholds
const crossedThresholds = budget.alertThresholds.filter(
t => budget.spentUSD / budget.limitUSD < t && utilizationAfter >= t
);
if (crossedThresholds.length > 0) {
await this.sendAlerts(budget, crossedThresholds);
}
// Check limit
if (estimatedCost > remaining) {
if (budget.hardLimit) {
return {
allowed: false,
reason: `Budget exceeded: ${remaining.toFixed(4)} USD remaining`,
remaining
};
} else {
return {
allowed: true,
warning: `Budget will be exceeded`,
remaining
};
}
}
return { allowed: true, remaining };
}
async recordSpend(budgetId: string, cost: number): Promise<void> { const budget = this.budgets.get(budgetId); if (!budget) return;
budget.spentUSD += cost;
// Check if period should reset
if (budget.resetAt && new Date() >= budget.resetAt) {
budget.spentUSD = cost; // Start fresh with current spend
budget.resetAt = this.calculateNextReset(budget.period);
}
} }
Token Estimation
// Rough estimation before API call function estimateTokens(text: string): number { // Rough heuristic: ~4 characters per token for English return Math.ceil(text.length / 4); }
// More accurate estimation using tiktoken (for OpenAI) import { encoding_for_model } from 'tiktoken';
function countTokensAccurate(text: string, model: string): number { const enc = encoding_for_model(model); const tokens = enc.encode(text); enc.free(); return tokens.length; }
// Estimate cost before execution function estimateCallCost( model: string, systemPrompt: string, userMessage: string, expectedOutputTokens: number ): number { const inputTokens = estimateTokens(systemPrompt + userMessage); return calculateCost(model, inputTokens, expectedOutputTokens); }
Cost-Aware Agent
class CostAwareAgent { private budget: Budget; private spent = 0;
constructor(budgetUSD: number) { this.budget = { id: 'agent-budget', name: 'Agent Task Budget', limitUSD: budgetUSD, spentUSD: 0, period: 'task', alertThresholds: [0.5, 0.8], hardLimit: true }; }
async execute(task: string): Promise<Result> { // Estimate cost const estimate = this.estimateTaskCost(task);
if (estimate > this.remaining) {
return this.handleBudgetExceeded(task, estimate);
}
// Choose model based on budget
const model = this.selectModelForBudget(task);
// Execute with tracking
const result = await this.llm.complete({
model,
messages: [{ role: 'user', content: task }],
onUsage: (usage) => this.recordUsage(usage, model)
});
return result;
}
private selectModelForBudget(task: string): string { const complexity = this.assessComplexity(task); const remaining = this.budget.limitUSD - this.spent;
// Use cheaper models when budget is tight
if (remaining < 0.10) {
return 'gpt-4o-mini'; // Cheapest
}
if (remaining < 0.50 || complexity === 'low') {
return 'claude-3-haiku';
}
if (remaining < 2.00 || complexity === 'medium') {
return 'claude-3-sonnet';
}
return 'claude-3-opus'; // Full power when budget allows
}
private handleBudgetExceeded(task: string, estimate: number): Result { // Options: // 1. Simplify the task // 2. Use cheaper model // 3. Return partial result // 4. Request budget increase
const cheaperModel = this.findCheapestViableModel(task);
if (cheaperModel) {
return this.execute(task); // Retry with cheaper model
}
return {
success: false,
error: 'Budget exceeded',
partialResult: null,
budgetInfo: {
remaining: this.remaining,
estimated: estimate
}
};
}
get remaining(): number { return this.budget.limitUSD - this.spent; } }
Multi-Agent Cost Distribution
interface AgentCostAllocation { agentId: string; allocatedUSD: number; spentUSD: number; priority: 'low' | 'medium' | 'high'; }
class MultiAgentBudgetManager { private totalBudget: number; private allocations = new Map<string, AgentCostAllocation>();
allocateBudget(agents: { id: string; priority: string }[]): void { // Priority weights const weights = { high: 3, medium: 2, low: 1 };
const totalWeight = agents.reduce(
(sum, a) => sum + weights[a.priority], 0
);
for (const agent of agents) {
const share = (weights[agent.priority] / totalWeight) * this.totalBudget;
this.allocations.set(agent.id, {
agentId: agent.id,
allocatedUSD: share,
spentUSD: 0,
priority: agent.priority as any
});
}
}
// Reallocate from under-spending to over-spending agents rebalance(): void { const underSpenders = Array.from(this.allocations.values()) .filter(a => a.spentUSD < a.allocatedUSD * 0.5);
const overSpenders = Array.from(this.allocations.values())
.filter(a => a.spentUSD > a.allocatedUSD * 0.8);
for (const over of overSpenders) {
const needed = over.spentUSD - over.allocatedUSD * 0.8;
for (const under of underSpenders) {
const available = under.allocatedUSD * 0.5 - under.spentUSD;
const transfer = Math.min(needed, available);
if (transfer > 0) {
under.allocatedUSD -= transfer;
over.allocatedUSD += transfer;
}
}
}
} }
Cost Optimization Strategies
- Prompt Caching
// Anthropic prompt caching async function callWithCaching(messages: Message[]): Promise<Response> { // Mark system prompt for caching const cachedMessages = messages.map((m, i) => { if (i === 0 && m.role === 'system') { return { ...m, cache_control: { type: 'ephemeral' } }; } return m; });
return anthropic.messages.create({ model: 'claude-3-sonnet-20240229', messages: cachedMessages }); }
- Model Routing
function routeToOptimalModel(task: string, budget: number): string { const complexity = assessComplexity(task);
// Complexity vs cost matrix const modelMatrix = { simple: ['gpt-4o-mini', 'claude-3-haiku'], medium: ['claude-3-sonnet', 'gpt-4o'], complex: ['claude-3-opus', 'gpt-4-turbo'] };
const candidates = modelMatrix[complexity];
// Find cheapest that fits budget for (const model of candidates) { const estimatedCost = estimateCallCost(model, task, 500); if (estimatedCost <= budget) { return model; } }
return candidates[candidates.length - 1]; // Fallback to cheapest }
- Context Compression
async function compressContext( context: string, targetTokens: number ): Promise<string> { const currentTokens = estimateTokens(context);
if (currentTokens <= targetTokens) { return context; }
// Use cheap model to summarize
const summary = await llm.complete({
model: 'gpt-4o-mini',
messages: [{
role: 'user',
content: Summarize this context in under ${targetTokens} tokens, preserving key information:\n\n${context}
}]
});
return summary; }
Reporting
interface CostReport { period: { start: Date; end: Date }; totalSpent: number; byModel: Record<string, { calls: number; tokens: number; cost: number }>; byAgent: Record<string, { calls: number; cost: number }>; byTask: Record<string, { cost: number; success: boolean }>; trends: { dailyAverage: number; projectedMonthly: number; topCostDrivers: string[]; }; }
function generateCostReport(usageData: UsageRecord[]): CostReport { // ... aggregation logic }
Best Practices
-
Set budgets early - Don't wait for the bill
-
Monitor in real-time - Not after the fact
-
Use tiered models - Right-size for the task
-
Cache aggressively - Reuse where possible
-
Compress context - Less tokens = less cost
-
Alert early - 80% threshold, not 100%
-
Track by task - Know what's expensive
-
Review regularly - Optimize based on data