The single biggest cost optimization for LLM-powered systems is to stop using Opus 4.6 for everything. A model router analyzes each request and routes it to the cheapest model that can handle it with acceptable quality. This lesson implements one from scratch.
Router Architecture
             ┌──────────────┐
             │   Incoming   │
             │   Request    │
             └──────┬───────┘
                    │
             ┌──────▼───────┐
             │  Classifier  │
             │   (Haiku)    │
             └──────┬───────┘
                    │
       ┌────────────┼────────────┐
       │            │            │
┌──────▼──────┐  ┌──▼────┐ ┌─────▼─────┐
│  Opus 4.6   │  │Sonnet │ │   Haiku   │
│   $5/$25    │  │$3/$15 │ │$0.25/$1.25│
│   Complex   │  │Medium │ │  Simple   │
└─────────────┘  └───────┘ └───────────┘
Task Classifier
Use Haiku (the cheapest model) to classify incoming requests:
from anthropic import Anthropic
from dataclasses import dataclass
from enum import Enum
import json
class ModelTier(Enum):
    """The three model tiers a request can be routed to.

    Each member's value is the model identifier string that is passed as
    the ``model`` argument of the messages API call.

    NOTE(review): the identifier strings below are reproduced unchanged;
    verify them against the provider's current model list before use.
    """

    # Cheapest tier; also used to run the classification call itself.
    HAIKU = "claude-haiku-4-5-20241022"
    # Middle tier.
    SONNET = "claude-sonnet-4-5-20241022"
    # Most expensive tier; the baseline for cost comparisons.
    OPUS = "claude-opus-4-6-20260205"
@dataclass
class RoutingDecision:
    """Outcome of deciding which model tier should serve a request."""

    # Tier (and therefore model) selected for the request.
    tier: ModelTier
    # How sure the decision is; manual overrides use 1.0.
    confidence: float
    # Short human-readable justification for the chosen tier.
    reasoning: str
    # Approximate cost relative to the Opus tier (1.0 = Opus baseline).
    estimated_cost_ratio: float
class TaskClassifier:
    """Classify tasks to determine optimal model tier.

    The classifier sends a truncated summary of the request to the Haiku
    model together with ``CLASSIFICATION_PROMPT`` and turns the JSON reply
    into a ``RoutingDecision``.  Malformed or incomplete replies never
    raise: they produce a Sonnet decision with low (or zero) confidence so
    that a caller applying a confidence threshold can fall back to its own
    default tier.
    """

    CLASSIFICATION_PROMPT = """Classify this task into one of three tiers.
TIER 1 (HAIKU - simple):
- Text formatting, extraction, classification
- Simple Q&A with clear answers
- Data transformation, JSON parsing
- Translation of straightforward text
TIER 2 (SONNET - moderate):
- Code generation for standard patterns
- Document summarization
- Bug fixing with clear error messages
- Content creation with moderate creativity
TIER 3 (OPUS - complex):
- Architecture design and review
- Security audits and vulnerability analysis
- Novel algorithm design
- Multi-step reasoning across large contexts
- Tasks requiring maximum accuracy
- Research synthesis from multiple sources
Task to classify:
{task}
Respond with JSON only:
{{"tier": 1|2|3, "confidence": 0.0-1.0, "reasoning": "..."}}"""

    def __init__(self):
        self.client = Anthropic()

    @staticmethod
    def _tier_table() -> dict:
        """Map the numeric tier (1-3) to ``(ModelTier, cost ratio vs. Opus)``."""
        return {
            1: (ModelTier.HAIKU, 0.05),
            2: (ModelTier.SONNET, 0.6),
            3: (ModelTier.OPUS, 1.0),
        }

    @staticmethod
    def _parse_json_object(text: str):
        """Return the JSON object embedded in *text*, or ``None``.

        Models frequently wrap JSON in code fences or add a sentence around
        it, so only the span from the first ``{`` to the last ``}`` is
        parsed instead of the raw reply.
        """
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end <= start:
            return None
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return None
        return parsed if isinstance(parsed, dict) else None

    def classify(self, prompt: str, system: str = "",
                 override_tier: int = None) -> RoutingDecision:
        """Classify a task and return a routing decision.

        Args:
            prompt: The user request to classify (only the first 500
                characters are sent to the classifier).
            system: Optional system prompt (only the first 200 characters
                are sent to the classifier).
            override_tier: When truthy (1, 2 or 3), skip classification and
                return that tier with confidence 1.0.

        Returns:
            The routing decision.  If the classifier reply cannot be
            parsed, the decision is Sonnet with confidence 0.0.

        Raises:
            KeyError: If ``override_tier`` is truthy but not 1, 2 or 3.
        """
        tier_table = self._tier_table()
        fallback = (ModelTier.SONNET, 0.6)

        if override_tier:
            tier, cost_ratio = tier_table[override_tier]
            return RoutingDecision(
                tier=tier,
                confidence=1.0,
                reasoning="Manual override",
                estimated_cost_ratio=cost_ratio,
            )

        # Use Haiku for classification (cheapest possible)
        task_summary = f"System: {system[:200]}\nUser: {prompt[:500]}"
        response = self.client.messages.create(
            model=ModelTier.HAIKU.value,
            max_tokens=256,
            messages=[{
                "role": "user",
                "content": self.CLASSIFICATION_PROMPT.format(
                    task=task_summary
                ),
            }],
        )

        # A reply without any text block is treated like an unparseable one.
        text = next(
            (b.text for b in response.content if b.type == "text"), ""
        )
        result = self._parse_json_object(text)
        if result is None:
            return RoutingDecision(
                tier=fallback[0],
                confidence=0.0,
                reasoning="Classifier returned unparseable output",
                estimated_cost_ratio=fallback[1],
            )

        # Accept "2" or 2.0 as well as 2; anything else uses the fallback.
        try:
            tier_key = int(result.get("tier"))
        except (TypeError, ValueError):
            tier_key = None
        tier, cost_ratio = tier_table.get(tier_key, fallback)

        # Missing or non-numeric confidence counts as "not confident".
        try:
            confidence = float(result.get("confidence", 0.0))
        except (TypeError, ValueError):
            confidence = 0.0
        confidence = min(1.0, max(0.0, confidence))

        return RoutingDecision(
            tier=tier,
            confidence=confidence,
            reasoning=str(result.get("reasoning", "")),
            estimated_cost_ratio=cost_ratio,
        )
Full Model Router
import time
@dataclass
class RouterMetrics:
    """Running totals collected by the router across requests."""

    # Number of requests counted so far.
    total_requests: int = 0
    # Per-tier request counts; filled in by __post_init__ when left as None
    # so that every instance gets its own dict.
    requests_by_tier: dict = None
    # Accumulated cost of the requests.
    total_cost: float = 0.0
    # Accumulated savings vs. sending everything to Opus.
    cost_saved: float = 0.0

    def __post_init__(self):
        """Create the default per-tier counters when none were supplied."""
        if self.requests_by_tier is not None:
            return
        self.requests_by_tier = dict.fromkeys(("haiku", "sonnet", "opus"), 0)
class ModelRouter:
    """Production model router with cost tracking.

    Each call to ``route`` classifies the request, picks a model tier,
    executes the request against that tier and records what it cost
    compared with sending the same token counts to Opus.

    Only the routed request is measured: the tokens spent on the
    classification call are not included in the cost or latency figures.
    """

    # Cost per 1M tokens for each tier
    COSTS = {
        ModelTier.HAIKU: {"input": 0.25, "output": 1.25},
        ModelTier.SONNET: {"input": 3.0, "output": 15.0},
        ModelTier.OPUS: {"input": 5.0, "output": 25.0},
    }

    def __init__(self, default_tier: ModelTier = ModelTier.SONNET,
                 confidence_threshold: float = 0.7):
        """Create a router.

        Args:
            default_tier: Tier used when the classifier's confidence is
                below ``confidence_threshold``.
            confidence_threshold: Minimum classifier confidence required to
                accept the classifier's own tier choice.
        """
        self.client = Anthropic()
        self.classifier = TaskClassifier()
        self.default_tier = default_tier
        self.confidence_threshold = confidence_threshold
        self.metrics = RouterMetrics()

    def route(self, prompt: str, system: str = "",
              max_tokens: int = 4096,
              force_tier: int = None,
              thinking: dict = None) -> dict:
        """Route a request to the optimal model.

        Args:
            prompt: User message to send.
            system: Optional system prompt.
            max_tokens: ``max_tokens`` passed to the routed request.
            force_tier: Optional manual tier (1, 2 or 3) that bypasses
                classification.
            thinking: Optional thinking configuration; when ``None`` a
                per-tier default is used.  Never sent to the Haiku tier.

        Returns:
            A dict with the response ``text`` (empty string if the response
            has no text block), the ``model`` and ``tier`` used, the
            ``routing_reason`` and ``confidence``, the ``cost``, the
            ``cost_if_opus``, the ``savings`` and the ``latency_ms`` of the
            routed call.
        """
        # Classify the task
        decision = self.classifier.classify(
            prompt, system, override_tier=force_tier
        )

        # If classifier is uncertain, use default tier
        if decision.confidence < self.confidence_threshold:
            decision = RoutingDecision(
                tier=self.default_tier,
                confidence=decision.confidence,
                reasoning=f"Low confidence ({decision.confidence:.2f}), "
                          f"using default tier",
                # Derived from the configured default tier rather than
                # assuming the default is always Sonnet.
                estimated_cost_ratio=self._cost_ratio(self.default_tier),
            )

        # Configure thinking based on tier
        if thinking is None:
            thinking_config = self._default_thinking(decision.tier)
        else:
            thinking_config = thinking

        # Execute the request
        kwargs = {
            "model": decision.tier.value,
            "max_tokens": max_tokens,
            "messages": [{"role": "user", "content": prompt}],
        }
        if system:
            kwargs["system"] = system
        if thinking_config and decision.tier != ModelTier.HAIKU:
            kwargs["thinking"] = thinking_config

        start = time.monotonic()
        response = self.client.messages.create(**kwargs)
        latency = int((time.monotonic() - start) * 1000)

        # Calculate costs: same token usage priced at the chosen tier and
        # at the Opus tier.
        actual_cost = self._calculate_cost(decision.tier, response.usage)
        opus_cost = self._calculate_cost(ModelTier.OPUS, response.usage)

        # Update metrics
        tier_name = decision.tier.name.lower()
        self.metrics.total_requests += 1
        self.metrics.requests_by_tier[tier_name] = (
            self.metrics.requests_by_tier.get(tier_name, 0) + 1
        )
        self.metrics.total_cost += actual_cost
        self.metrics.cost_saved += (opus_cost - actual_cost)

        # Extract response text; a response without a text block yields ""
        # instead of raising StopIteration after the request was billed.
        text = next(
            (b.text for b in response.content if b.type == "text"), ""
        )

        return {
            "text": text,
            "model": decision.tier.value,
            "tier": tier_name,
            "routing_reason": decision.reasoning,
            "confidence": decision.confidence,
            "cost": actual_cost,
            "cost_if_opus": opus_cost,
            "savings": opus_cost - actual_cost,
            "latency_ms": latency,
        }

    def _cost_ratio(self, tier: ModelTier) -> float:
        """Return the tier's input-token price relative to Opus (Opus = 1.0)."""
        return self.COSTS[tier]["input"] / self.COSTS[ModelTier.OPUS]["input"]

    def _default_thinking(self, tier: ModelTier) -> dict:
        """Return the default thinking configuration for *tier*.

        Returns ``None`` for Haiku, which runs without thinking.

        NOTE(review): confirm that the target API accepts these payloads
        for the configured models -- in particular the "effort" key and
        its "standard" value.
        """
        if tier == ModelTier.OPUS:
            return {"type": "adaptive"}
        elif tier == ModelTier.SONNET:
            return {"type": "adaptive", "effort": "standard"}
        return None  # Haiku: no thinking

    def _calculate_cost(self, tier: ModelTier, usage) -> float:
        """Price ``usage.input_tokens``/``usage.output_tokens`` at *tier*.

        Rates in ``COSTS`` are per one million tokens; the result is
        rounded to 6 decimal places.
        """
        rates = self.COSTS[tier]
        input_cost = usage.input_tokens * rates["input"] / 1_000_000
        output_cost = usage.output_tokens * rates["output"] / 1_000_000
        return round(input_cost + output_cost, 6)

    def report(self) -> str:
        """Generate a cost savings report."""
        m = self.metrics
        # total_cost + cost_saved is what the same traffic would have cost
        # on Opus.
        all_opus_cost = m.total_cost + m.cost_saved
        savings_pct = (
            m.cost_saved / all_opus_cost * 100 if all_opus_cost > 0 else 0
        )

        lines = [
            "Model Router Report",
            "=" * 40,
            f"Total requests: {m.total_requests}",
            "Distribution:",
        ]
        for tier, count in m.requests_by_tier.items():
            pct = count / m.total_requests * 100 if m.total_requests else 0
            lines.append(f" {tier:8}: {count:5} ({pct:.1f}%)")
        lines.extend([
            "",
            f"Total cost: ${m.total_cost:.2f}",
            f"If all Opus: ${all_opus_cost:.2f}",
            f"Saved: ${m.cost_saved:.2f} ({savings_pct:.1f}%)",
        ])
        return "\n".join(lines)
Usage Example
# Router that uses Sonnet whenever the classifier's confidence is below 0.7.
router = ModelRouter(default_tier=ModelTier.SONNET, confidence_threshold=0.7)

# Simple task → routes to Haiku
simple_prompt = 'Format this JSON: {"name": "test"}'
result = router.route(simple_prompt)
print(f"Routed to: {result['tier']}")  # → haiku

# Medium task → routes to Sonnet
medium_prompt = "Write a Python function to merge two sorted lists efficiently"
result = router.route(medium_prompt)
print(f"Routed to: {result['tier']}")  # → sonnet

# Complex task → routes to Opus
complex_prompt = (
    "Review this distributed system architecture for single points "
    "of failure, race conditions, and data consistency issues..."
)
result = router.route(
    complex_prompt,
    system="You are a principal systems architect.",
)
print(f"Routed to: {result['tier']}")  # → opus

# Force a specific tier when you know best
security_prompt = "Analyze this security vulnerability..."
result = router.route(security_prompt, force_tier=3)  # Always use Opus for security

# Check savings
print(router.report())
Router Quality Monitoring
Track whether routing decisions are correct:
class RouterQualityMonitor:
    """Monitor whether routing decisions produce acceptable quality.

    Evaluations are kept in memory in ``self.evaluations`` in the order
    they were recorded.
    """

    def __init__(self):
        self.evaluations: list[dict] = []

    def evaluate(self, routing_result: dict,
                 user_satisfaction: float) -> None:
        """Record a quality evaluation.

        Args:
            routing_result: A routing result containing at least the keys
                ``"tier"``, ``"confidence"`` and ``"cost"``.
            user_satisfaction: Satisfaction score for that response.

        Raises:
            KeyError: If ``routing_result`` lacks one of the required keys.
        """
        self.evaluations.append({
            "tier": routing_result["tier"],
            "confidence": routing_result["confidence"],
            "satisfaction": user_satisfaction,
            "cost": routing_result["cost"],
        })

    def tier_quality_report(self,
                            satisfaction_threshold: float = 0.7) -> dict:
        """Report quality metrics by tier.

        Args:
            satisfaction_threshold: Evaluations with a satisfaction score
                strictly below this value are counted in
                ``"below_threshold"``.

        Returns:
            A dict keyed by tier name.  Each value holds
            ``"avg_satisfaction"``, ``"below_threshold"``, ``"total"`` and
            ``"avg_cost"`` for that tier.  Tiers without evaluations are
            omitted, so the result is empty when nothing was recorded.
        """
        by_tier = {}
        for e in self.evaluations:
            by_tier.setdefault(e["tier"], []).append(e)

        report = {}
        for tier, evals in by_tier.items():
            # Every group has at least one entry, so the divisions are safe.
            total = len(evals)
            satisfactions = [e["satisfaction"] for e in evals]
            report[tier] = {
                "avg_satisfaction": sum(satisfactions) / total,
                "below_threshold": sum(
                    1 for s in satisfactions if s < satisfaction_threshold
                ),
                "total": total,
                "avg_cost": sum(e["cost"] for e in evals) / total,
            }
        return report
The model router is the foundation of cost optimization. In the next lesson, you will learn to measure and present the ROI of your AI investment to justify the costs to leadership.