Cost Control & Budgeting
AI API costs are unpredictable and grow fast. One engineer experimenting with a 200K-token prompt can spend $50 in an afternoon. This lesson builds the cost tracking, budget enforcement, and optimization layer that prevents surprise bills and gives teams visibility into exactly what they are spending.
Per-Request Cost Calculation
The gateway calculates cost for every request as soon as the response arrives. This requires a pricing table that you keep updated as providers change prices:
from dataclasses import dataclass
from datetime import datetime, timezone
@dataclass
class ModelPricing:
    """Pricing per 1M tokens for a specific model.

    All rates are USD per one million tokens. A ``cached_input_per_1m``
    of 0.0 means no cached-input discount is configured for the entry.
    """
    provider: str  # Provider name, e.g. "openai", "anthropic", "google"
    model: str  # Provider's full model identifier (may include a date snapshot)
    input_per_1m: float # USD per 1M input tokens
    output_per_1m: float # USD per 1M output tokens
    cached_input_per_1m: float = 0.0 # Discounted cached input (Anthropic)
    last_updated: str = "2026-01-15"  # Date these prices were last verified
# Keep this table updated - check provider pricing pages monthly
# Keys are short gateway-facing aliases; ModelPricing.model carries the
# provider's full identifier. Only the Anthropic entries set a
# cached-input discount; the rest fall back to the 0.0 default.
PRICING = {
    "gpt-4o": ModelPricing("openai", "gpt-4o", 2.50, 10.00),
    "gpt-4o-mini": ModelPricing("openai", "gpt-4o-mini", 0.15, 0.60),
    "gpt-4.1": ModelPricing("openai", "gpt-4.1", 2.00, 8.00),
    "gpt-4.1-mini": ModelPricing("openai", "gpt-4.1-mini", 0.40, 1.60),
    "claude-sonnet-4": ModelPricing("anthropic", "claude-sonnet-4-20250514", 3.00, 15.00,
                                    cached_input_per_1m=0.30),
    "claude-haiku": ModelPricing("anthropic", "claude-haiku-4-20250414", 0.80, 4.00,
                                 cached_input_per_1m=0.08),
    "claude-opus-4": ModelPricing("anthropic", "claude-opus-4-20250514", 15.00, 75.00,
                                  cached_input_per_1m=1.50),
    "gemini-2.5-pro": ModelPricing("google", "gemini-2.5-pro", 1.25, 5.00),
    "gemini-2.5-flash": ModelPricing("google", "gemini-2.5-flash", 0.15, 0.60),
}
def calculate_request_cost(
model: str,
input_tokens: int,
output_tokens: int,
cached_input_tokens: int = 0
) -> dict:
"""Calculate exact USD cost for a completed request."""
pricing = PRICING.get(model)
if not pricing:
raise ValueError(f"Unknown model '{model}'. Update PRICING table.")
uncached_input = input_tokens - cached_input_tokens
input_cost = (uncached_input / 1_000_000) * pricing.input_per_1m
cached_cost = (cached_input_tokens / 1_000_000) * pricing.cached_input_per_1m
output_cost = (output_tokens / 1_000_000) * pricing.output_per_1m
total = input_cost + cached_cost + output_cost
return {
"input_cost": round(input_cost, 6),
"cached_input_cost": round(cached_cost, 6),
"output_cost": round(output_cost, 6),
"total_cost": round(total, 6),
"model": model,
"input_tokens": input_tokens,
"output_tokens": output_tokens,
}
# Example costs to illustrate scale:
_EXAMPLE_REQUESTS = [
    ("gpt-4o", 1000, 500),            # {'total_cost': 0.0075, ...} -- $0.0075 per short chat
    ("gpt-4o", 100000, 4000),         # {'total_cost': 0.29, ...} -- $0.29 for long document + response
    ("claude-opus-4", 100000, 4000),  # {'total_cost': 1.80, ...} -- $1.80 for same request with Opus
]
for _model, _in_toks, _out_toks in _EXAMPLE_REQUESTS:
    print(calculate_request_cost(_model, input_tokens=_in_toks, output_tokens=_out_toks))
Budget Enforcement
Set spending limits per team with both warning thresholds and hard cutoffs. Use Redis for real-time tracking and PostgreSQL for historical analytics:
import redis.asyncio as aioredis
from dataclasses import dataclass
from typing import Optional
@dataclass
class TeamBudget:
    """Per-team spending limits and alert routing for budget enforcement."""
    team_id: str
    daily_limit_usd: float # Hard limit - requests blocked above this
    daily_warning_usd: float # Alert threshold (typically 80% of limit)
    monthly_limit_usd: float  # Monthly hard limit
    monthly_warning_usd: float  # Monthly alert threshold (typically 80% of limit)
    alert_slack_channel: str  # e.g. "#eng-ai-costs"
    alert_email: str  # Address notified alongside the Slack channel
    allow_overage: bool = False # Let customer-facing apps exceed limits
class BudgetEnforcer:
    """Real-time budget tracking and enforcement backed by Redis.

    check_budget() is called before each LLM request with an estimated
    cost; record_spend() is called afterwards with the actual cost.
    Spend counters are keyed per team, per UTC day and per UTC month.

    NOTE(review): check-then-record is not atomic, so concurrent
    requests can slightly overshoot a limit; acceptable for budget
    alerting, not for hard billing guarantees.
    """

    def __init__(self, redis_url: str):
        self.redis = aioredis.from_url(redis_url)

    @staticmethod
    def _spend_keys(team_id: str) -> tuple[str, str]:
        """Return the (daily, monthly) Redis counter keys for right now (UTC)."""
        now = datetime.now(timezone.utc)
        return (
            f"spend:{team_id}:{now.strftime('%Y-%m-%d')}",
            f"spend:{team_id}:{now.strftime('%Y-%m')}",
        )

    async def check_budget(
        self, team_id: str, budget: TeamBudget, estimated_cost: float
    ) -> dict:
        """Check if team has budget remaining. Called before LLM request.

        Returns {"allowed": False, "reason": ..., "spent": ..., "limit": ...}
        when a hard limit would be exceeded and overage is not allowed;
        otherwise {"allowed": True} plus remaining daily/monthly headroom.
        """
        daily_key, monthly_key = self._spend_keys(team_id)

        # Read both counters in a single round trip.
        pipe = self.redis.pipeline()
        pipe.get(daily_key)
        pipe.get(monthly_key)
        daily_raw, monthly_raw = await pipe.execute()
        daily_spend = float(daily_raw or 0)
        monthly_spend = float(monthly_raw or 0)

        # Hard limits. Teams with allow_overage=True (customer-facing apps)
        # are never blocked; they fall through to the warning alerts below.
        if daily_spend + estimated_cost > budget.daily_limit_usd:
            if not budget.allow_overage:
                await self._alert(budget, "BLOCKED",
                    f"Daily limit ${budget.daily_limit_usd:.2f} reached. "
                    f"Spent: ${daily_spend:.2f}")
                return {"allowed": False, "reason": "daily_budget_exceeded",
                        "spent": daily_spend, "limit": budget.daily_limit_usd}
        if monthly_spend + estimated_cost > budget.monthly_limit_usd:
            if not budget.allow_overage:
                await self._alert(budget, "BLOCKED",
                    f"Monthly limit ${budget.monthly_limit_usd:.2f} reached. "
                    f"Spent: ${monthly_spend:.2f}")
                return {"allowed": False, "reason": "monthly_budget_exceeded",
                        "spent": monthly_spend, "limit": budget.monthly_limit_usd}

        # Warning thresholds (typically 80% of the hard limit).
        # Fix: monthly_warning_usd was configured in TeamBudget but never
        # checked here, so monthly warnings were silently never sent.
        if daily_spend + estimated_cost > budget.daily_warning_usd:
            await self._alert(budget, "WARNING",
                f"Approaching daily limit: ${daily_spend:.2f}/${budget.daily_limit_usd:.2f}")
        if monthly_spend + estimated_cost > budget.monthly_warning_usd:
            await self._alert(budget, "WARNING",
                f"Approaching monthly limit: ${monthly_spend:.2f}/${budget.monthly_limit_usd:.2f}")

        return {
            "allowed": True,
            "daily_remaining": budget.daily_limit_usd - daily_spend,
            "monthly_remaining": budget.monthly_limit_usd - monthly_spend,
        }

    async def record_spend(self, team_id: str, cost_usd: float):
        """Record actual spend after request completes."""
        daily_key, monthly_key = self._spend_keys(team_id)
        pipe = self.redis.pipeline()
        pipe.incrbyfloat(daily_key, cost_usd)
        pipe.incrbyfloat(monthly_key, cost_usd)
        # TTLs slightly exceed the period so dashboards can still read a
        # just-closed day/month before the counter expires.
        pipe.expire(daily_key, 172800)  # 2 days
        pipe.expire(monthly_key, 2764800)  # 32 days
        await pipe.execute()

    async def _alert(self, budget: TeamBudget, level: str, message: str):
        """Send alert to Slack and email, max once per team/level per hour."""
        hour_bucket = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H')
        alert_key = f"alert:{budget.team_id}:{level}:{hour_bucket}"
        # SET NX returns truthy only for the first caller in this hour
        # bucket; later callers see the key already set and stay silent.
        won_dedup = await self.redis.set(alert_key, "1", nx=True, ex=3600)
        if not won_dedup:
            return  # Already sent this hour
        # In production: use Slack SDK, SendGrid, PagerDuty, etc.
        print(f"[{level}] {budget.team_id}: {message}")
# Team budget configuration
# Warning thresholds below are set at 80% of the corresponding hard limit.
BUDGETS = {
    # Internal team: hard-blocked at the limit (allow_overage defaults to False).
    "engineering": TeamBudget(
        team_id="engineering",
        daily_limit_usd=200, daily_warning_usd=160,
        monthly_limit_usd=4000, monthly_warning_usd=3200,
        alert_slack_channel="#eng-ai-costs",
        alert_email="eng-lead@company.com"
    ),
    # Customer-facing team: alerted but never blocked (see allow_overage).
    "customer-support": TeamBudget(
        team_id="customer-support",
        daily_limit_usd=300, daily_warning_usd=240,
        monthly_limit_usd=6000, monthly_warning_usd=4800,
        alert_slack_channel="#support-costs",
        alert_email="support-lead@company.com",
        allow_overage=True # Never block customer-facing chatbot
    ),
}
Set allow_overage=True for customer-facing teams. A chatbot going offline because of a budget limit costs more in customer trust than the API overage does. Use alerts to catch overages quickly and adjust budgets proactively.

Token Optimization at Gateway Level
The gateway can reduce costs without any changes to application code by optimizing requests before forwarding them:
class GatewayOptimizer:
    """Reduce token usage transparently at the gateway layer.

    optimize() mutates and returns the request body (an OpenAI-style
    chat-completions dict) and records every change in
    body["_optimizations"] for logging. NOTE(review): "_optimizations"
    is injected into the outgoing body — confirm the forwarding layer
    strips it before the request reaches the provider.
    """

    def optimize(self, body: dict, team_config: dict) -> dict:
        """Apply cost optimizations to a request; returns the mutated body."""
        messages = body.get("messages", [])
        optimizations_applied = []

        # 1. Trim conversation history, preserving a leading system prompt.
        max_history = team_config.get("max_history_turns", 20)
        if len(messages) > max_history + 1:  # +1 for system prompt
            # .get() instead of ["role"] so a malformed first message
            # cannot raise KeyError.
            system = [messages[0]] if messages[0].get("role") == "system" else []
            trimmed = system + messages[-max_history:]
            body["messages"] = trimmed
            optimizations_applied.append(
                f"trimmed_history:{len(messages)}->{len(trimmed)}")

        # 2. Set max_tokens cap if not specified.
        if "max_tokens" not in body:
            default_cap = team_config.get("default_max_tokens", 4096)
            body["max_tokens"] = default_cap
            # Fix: log the cap actually applied; previously this always
            # logged the literal 4096 even when the team default differed.
            optimizations_applied.append(f"set_max_tokens:{default_cap}")

        # 3. Prevent accidental expensive models.
        model = body.get("model", "")
        blocked_models = team_config.get("blocked_models", [])
        model_overrides = team_config.get("model_overrides", {})
        if model in blocked_models:
            body["model"] = model_overrides.get(model, "gpt-4o-mini")
            optimizations_applied.append(f"model_override:{model}->{body['model']}")

        # 4. Auto-downgrade for simple tasks (opt-in per team).
        # Fix: re-read the model from the body so a step-3 override is not
        # re-downgraded (or logged under its stale pre-override name).
        current_model = body.get("model", "")
        if team_config.get("auto_downgrade", False):
            if self._is_simple_request(messages) and current_model in ("gpt-4o", "claude-sonnet-4"):
                body["model"] = "gpt-4o-mini"
                optimizations_applied.append(f"auto_downgrade:{current_model}->gpt-4o-mini")

        body["_optimizations"] = optimizations_applied
        return body

    def _is_simple_request(self, messages: list) -> bool:
        """Heuristic: short prompt with no complex instructions."""
        if not messages:
            return True
        last = messages[-1].get("content", "")
        if isinstance(last, list):
            return False  # Multimodal content blocks = not simple
        return len(last) < 200 and len(messages) <= 3
Spending Dashboard Queries
These SQL queries power the spending dashboard. Run against your request_costs table:
-- 1. Daily spend by team (last 30 days)
-- One row per team per day: USD spend, request volume, total tokens.
-- NOTE(review): DATE(created_at) groups by the server timezone; confirm
-- created_at is stored in UTC so days match the gateway's Redis counters.
SELECT team_id, DATE(created_at) as day,
       SUM(total_cost_usd) as daily_spend,
       COUNT(*) as requests,
       SUM(input_tokens + output_tokens) as total_tokens
FROM request_costs
WHERE created_at > CURRENT_DATE - INTERVAL '30 days'
GROUP BY team_id, DATE(created_at)
ORDER BY day DESC, daily_spend DESC;
-- 2. Cost per model (which models cost the most?)
-- Total, mean, and p95 per-request cost for the last week.
-- The ::numeric casts are needed so ROUND(value, digits) applies to
-- float columns (PostgreSQL's two-argument ROUND takes numeric).
SELECT model,
       COUNT(*) as requests,
       ROUND(SUM(total_cost_usd)::numeric, 2) as total_cost,
       ROUND(AVG(total_cost_usd)::numeric, 4) as avg_cost,
       ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_cost_usd)::numeric, 4) as p95_cost
FROM request_costs
WHERE created_at > CURRENT_DATE - INTERVAL '7 days'
GROUP BY model
ORDER BY total_cost DESC;
-- 3. Top spending users (find runaway scripts)
-- Top 20 spenders over the last 24h-ish window (since yesterday's date
-- boundary); MAX highlights a single runaway request.
SELECT user_id, team_id,
       COUNT(*) as requests,
       ROUND(SUM(total_cost_usd)::numeric, 2) as total_spend,
       MAX(total_cost_usd) as most_expensive_request
FROM request_costs
WHERE created_at > CURRENT_DATE - INTERVAL '1 day'
GROUP BY user_id, team_id
ORDER BY total_spend DESC
LIMIT 20;
-- 4. Savings from caching
-- Daily cache hit rate and the spend avoided by serving cached responses.
-- Assumes request_costs carries was_cached (boolean) and
-- estimated_uncached_cost (what the request WOULD have cost uncached),
-- both populated by the gateway's cache layer -- TODO confirm schema.
SELECT DATE(created_at) as day,
       COUNT(*) FILTER (WHERE was_cached) as cache_hits,
       COUNT(*) FILTER (WHERE NOT was_cached) as api_calls,
       ROUND(100.0 * COUNT(*) FILTER (WHERE was_cached) / COUNT(*)::numeric, 1) as hit_rate,
       ROUND(SUM(CASE WHEN was_cached THEN estimated_uncached_cost ELSE 0 END)::numeric, 2) as money_saved
FROM request_costs
WHERE created_at > CURRENT_DATE - INTERVAL '7 days'
GROUP BY DATE(created_at)
ORDER BY day DESC;
-- 5. Spending anomaly detection
-- Flags teams whose spend so far today exceeds 2x their trailing
-- 30-day average daily spend.
WITH daily_baseline AS (
    -- Average daily spend per team over the previous 30 full days.
    -- Fix: use a half-open timestamp range [today-30d, today). The old
    -- BETWEEN ... AND CURRENT_DATE - INTERVAL '1 day' upper bound was
    -- midnight at the START of yesterday, which silently excluded
    -- essentially all of yesterday's rows from the baseline.
    SELECT team_id, AVG(daily_spend) as avg_daily
    FROM (
        SELECT team_id, DATE(created_at) as day, SUM(total_cost_usd) as daily_spend
        FROM request_costs
        WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
          AND created_at < CURRENT_DATE
        GROUP BY team_id, DATE(created_at)
    ) t GROUP BY team_id
)
SELECT rc.team_id,
       ROUND(SUM(rc.total_cost_usd)::numeric, 2) as today_spend,
       ROUND(db.avg_daily::numeric, 2) as avg_daily,
       -- NULLIF guards against division by zero for teams with no baseline spend.
       ROUND((SUM(rc.total_cost_usd) / NULLIF(db.avg_daily, 0))::numeric, 1) as ratio
FROM request_costs rc
JOIN daily_baseline db ON rc.team_id = db.team_id
WHERE DATE(rc.created_at) = CURRENT_DATE
GROUP BY rc.team_id, db.avg_daily
HAVING SUM(rc.total_cost_usd) > db.avg_daily * 2
ORDER BY ratio DESC;
Chargeback Models
As AI spending grows, organizations need to attribute costs to the teams that generate them:
Direct Chargeback
Each team pays exactly what they use. Simple and fair, but discourages experimentation. Best for organizations where each department has its own P&L.
Shared Pool with Visibility
Central AI budget funds everything; teams get dashboards but no bill. Encourages adoption and experimentation. Best for startups and companies under $10K/month AI spend.
Free Tier + Overage
Each team gets a free allocation (e.g., $500/month). Usage above that is charged to their department. Balances experimentation with cost accountability.
Key Takeaways
- Calculate cost for every request using a pricing table updated monthly. Prices change frequently across providers.
- Use Redis for real-time budget enforcement (low latency) and PostgreSQL for historical analytics (complex queries).
- Set warning alerts at 80% of the budget limit. Set allow_overage=True for customer-facing applications.
- Gateway-level optimizations (history trimming, max_tokens capping, model downgrading) can reduce costs 20-40% without code changes.
- Start with shared pool budgeting, move to free-tier-plus-overage at $10K/month, and direct chargeback at $50K/month.
What Is Next
In the next lesson, we will build the security and compliance layer — API key management with automatic rotation, PII filtering to prevent data leaks, audit logging for SOC2, and data residency routing for GDPR and HIPAA.
Lilly Tech Systems