Lesson 18 of 46 ~25 min
Course progress
0%

Building a Production Model Router

Build a production model router that automatically selects Opus, Sonnet, or Haiku based on task classification — full Python implementation with cost tracking.

The single biggest cost optimization for LLM-powered systems is to stop using Opus 4.6 for everything. A model router analyzes each request and routes it to the cheapest model that can handle it with acceptable quality. This lesson implements one from scratch.

Router Architecture

                    ┌──────────────┐
                    │  Incoming    │
                    │  Request     │
                    └──────┬───────┘

                    ┌──────▼───────┐
                    │  Classifier  │
                    │  (Haiku)     │
                    └──────┬───────┘

              ┌────────────┼────────────┐
              │            │            │
       ┌──────▼──────┐ ┌──▼────┐ ┌─────▼─────┐
       │  Opus 4.6   │ │Sonnet │ │  Haiku    │
       │  $5/$25     │ │$3/$15 │ │$0.25/$1.25│
       │  Complex    │ │Medium │ │  Simple   │
       └─────────────┘ └───────┘ └───────────┘

Task Classifier

Use Haiku (the cheapest model) to classify incoming requests:

from anthropic import Anthropic
from dataclasses import dataclass
from enum import Enum
import json

class ModelTier(Enum):
    """Model tiers the router can choose from.

    Each member's value is the model identifier string that is passed as the
    ``model`` argument of the API call.
    """

    # NOTE(review): confirm these identifiers against the provider's current
    # model list before deploying; they are sent to the API verbatim.
    HAIKU = "claude-haiku-4-5-20241022"
    SONNET = "claude-sonnet-4-5-20241022"
    OPUS = "claude-opus-4-6-20260205"

@dataclass
class RoutingDecision:
    """Outcome of classifying one request: which tier to use and why."""

    # Model tier the request should be sent to.
    tier: ModelTier
    # Classifier confidence; 1.0 is used for manual overrides.
    confidence: float
    # Short human-readable explanation of the choice.
    reasoning: str
    estimated_cost_ratio: float  # 1.0 = Opus baseline

class TaskClassifier:
    """Classify tasks to determine optimal model tier.

    The classification itself is delegated to the Haiku model: the task is
    embedded in ``CLASSIFICATION_PROMPT`` and the model is asked to answer
    with a small JSON object (``tier``, ``confidence``, ``reasoning``).
    """

    CLASSIFICATION_PROMPT = """Classify this task into one of three tiers.

TIER 1 (HAIKU - simple):
- Text formatting, extraction, classification
- Simple Q&A with clear answers
- Data transformation, JSON parsing
- Translation of straightforward text

TIER 2 (SONNET - moderate):
- Code generation for standard patterns
- Document summarization
- Bug fixing with clear error messages
- Content creation with moderate creativity

TIER 3 (OPUS - complex):
- Architecture design and review
- Security audits and vulnerability analysis
- Novel algorithm design
- Multi-step reasoning across large contexts
- Tasks requiring maximum accuracy
- Research synthesis from multiple sources

Task to classify:
{task}

Respond with JSON only:
{{"tier": 1|2|3, "confidence": 0.0-1.0, "reasoning": "..."}}"""

    def __init__(self):
        """Create the API client used for classification calls."""
        self.client = Anthropic()

    @staticmethod
    def _tier_table() -> dict:
        """Return the tier number -> (model tier, cost ratio vs. Opus) table."""
        return {
            1: (ModelTier.HAIKU, 0.05),
            2: (ModelTier.SONNET, 0.6),
            3: (ModelTier.OPUS, 1.0),
        }

    @staticmethod
    def _parse_reply(text: str) -> dict:
        """Extract the JSON object from the classifier's reply.

        Only the span from the first ``{`` to the last ``}`` is parsed, so a
        reply wrapped in a Markdown code fence or surrounded by a sentence of
        prose is still accepted.

        Raises:
            ValueError: if no JSON object can be decoded from ``text``
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("classifier reply contains no JSON object")
        parsed = json.loads(text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("classifier reply is not a JSON object")
        return parsed

    def classify(self, prompt: str, system: str = "",
                 override_tier: int = None) -> RoutingDecision:
        """Classify a task and return a routing decision.

        Args:
            prompt: The user prompt to classify. Only the first 500
                characters are shown to the classifier.
            system: Optional system prompt. Only the first 200 characters
                are shown to the classifier.
            override_tier: When truthy (1, 2 or 3), skip classification and
                return that tier with confidence 1.0. ``None`` and ``0``
                both mean "no override".

        Returns:
            The routing decision. If the classifier's reply cannot be used
            (no text, malformed JSON, missing or non-numeric fields) the
            Sonnet tier is returned with confidence 0.0, so a misbehaving
            classifier degrades routing instead of crashing the caller.

        Raises:
            KeyError: if ``override_tier`` is truthy but not 1, 2 or 3.
        """
        tiers = self._tier_table()
        # The middle tier absorbs anything unexpected.
        fallback = tiers[2]

        if override_tier:
            tier, cost_ratio = tiers[override_tier]
            return RoutingDecision(
                tier=tier,
                confidence=1.0,
                reasoning="Manual override",
                estimated_cost_ratio=cost_ratio,
            )

        # Use Haiku for classification (cheapest possible); truncate the
        # inputs so the classification call itself stays small.
        task_summary = f"System: {system[:200]}\nUser: {prompt[:500]}"

        response = self.client.messages.create(
            model=ModelTier.HAIKU.value,
            max_tokens=256,
            messages=[{
                "role": "user",
                "content": self.CLASSIFICATION_PROMPT.format(
                    task=task_summary
                )
            }]
        )

        # First text block of the reply; empty string when there is none.
        text = next(
            (b.text for b in response.content if b.type == "text"), ""
        )

        try:
            result = self._parse_reply(text)
            # Unknown tier numbers fall back to Sonnet; an unhashable value
            # raises TypeError and is handled below.
            tier, cost_ratio = tiers.get(result["tier"], fallback)
            confidence = float(result["confidence"])
        except (KeyError, TypeError, ValueError) as exc:
            tier, cost_ratio = fallback
            return RoutingDecision(
                tier=tier,
                confidence=0.0,
                reasoning=(
                    f"Unusable classifier reply ({type(exc).__name__}); "
                    f"defaulting to Sonnet"
                ),
                estimated_cost_ratio=cost_ratio,
            )

        # Keep confidence inside [0, 1]. The first test is written as a
        # negation so that NaN (which fails every comparison) becomes 0.0.
        if not confidence >= 0.0:
            confidence = 0.0
        elif confidence > 1.0:
            confidence = 1.0

        return RoutingDecision(
            tier=tier,
            confidence=confidence,
            reasoning=str(result.get("reasoning", "")),
            estimated_cost_ratio=cost_ratio,
        )

Full Model Router

import time

@dataclass
class RouterMetrics:
    """Running totals collected by the router across requests.

    ``requests_by_tier`` defaults to ``None`` and is replaced with a fresh
    zeroed counter dict per instance, so instances never share one dict.
    """

    total_requests: int = 0
    requests_by_tier: dict = None
    total_cost: float = 0.0
    cost_saved: float = 0.0  # vs. sending everything to Opus

    def __post_init__(self):
        """Install zeroed per-tier counters unless the caller supplied a dict."""
        if self.requests_by_tier is not None:
            return
        self.requests_by_tier = dict.fromkeys(("haiku", "sonnet", "opus"), 0)

class ModelRouter:
    """Production model router with cost tracking.

    Each call to :meth:`route` classifies the request, sends it to the
    selected model, and records the actual cost alongside what the same
    token usage would have cost at Opus rates.

    NOTE: the classifier's own API call is not part of any cost figure here;
    the classifier does not report its token usage, so reported savings are
    slightly optimistic.
    """

    # Cost per 1M tokens for each tier
    COSTS = {
        ModelTier.HAIKU:  {"input": 0.25, "output": 1.25},
        ModelTier.SONNET: {"input": 3.0,  "output": 15.0},
        ModelTier.OPUS:   {"input": 5.0,  "output": 25.0},
    }

    # Approximate cost of each tier relative to Opus (1.0 = Opus baseline).
    _COST_RATIOS = {
        ModelTier.HAIKU: 0.05,
        ModelTier.SONNET: 0.6,
        ModelTier.OPUS: 1.0,
    }

    def __init__(self, default_tier: ModelTier = ModelTier.SONNET,
                 confidence_threshold: float = 0.7):
        """Set up the API client, the classifier and empty metrics.

        Args:
            default_tier: Tier used when the classifier's confidence is
                below ``confidence_threshold``.
            confidence_threshold: Minimum classifier confidence required to
                accept the classifier's tier.
        """
        self.client = Anthropic()
        self.classifier = TaskClassifier()
        self.default_tier = default_tier
        self.confidence_threshold = confidence_threshold
        self.metrics = RouterMetrics()

    def route(self, prompt: str, system: str = "",
              max_tokens: int = 4096,
              force_tier: int = None,
              thinking: dict = None) -> dict:
        """Route a request to the optimal model.

        Args:
            prompt: User prompt, sent as a single user message.
            system: Optional system prompt; omitted from the API call when
                empty.
            max_tokens: ``max_tokens`` passed to the API call.
            force_tier: Tier number passed to the classifier as
                ``override_tier``; a truthy value bypasses classification
                and is never replaced by the low-confidence fallback.
            thinking: Explicit thinking configuration. When ``None`` a
                per-tier default is used. Never sent for the Haiku tier.

        Returns:
            A dict with the keys ``text``, ``model``, ``tier``,
            ``routing_reason``, ``confidence``, ``cost``, ``cost_if_opus``,
            ``savings`` and ``latency_ms``. ``text`` is the first text block
            of the response, or ``""`` if the response has no text block.
        """
        # Classify the task
        decision = self.classifier.classify(
            prompt, system, override_tier=force_tier
        )

        # If the classifier is uncertain, use the default tier. A forced tier
        # is an explicit caller decision and must win regardless of how the
        # threshold is configured.
        if not force_tier and decision.confidence < self.confidence_threshold:
            decision = RoutingDecision(
                tier=self.default_tier,
                confidence=decision.confidence,
                reasoning=f"Low confidence ({decision.confidence:.2f}), "
                         f"using default tier",
                # Report the ratio of the tier actually used, not a fixed
                # Sonnet figure.
                estimated_cost_ratio=self._COST_RATIOS.get(
                    self.default_tier, 0.6
                ),
            )

        # Configure thinking based on tier unless the caller supplied one
        if thinking is None:
            thinking_config = self._default_thinking(decision.tier)
        else:
            thinking_config = thinking

        kwargs = {
            "model": decision.tier.value,
            "max_tokens": max_tokens,
            "messages": [{"role": "user", "content": prompt}],
        }
        if system:
            kwargs["system"] = system
        if thinking_config and decision.tier != ModelTier.HAIKU:
            kwargs["thinking"] = thinking_config

        # Execute the request; monotonic clock so wall-clock adjustments
        # cannot distort the latency figure.
        start = time.monotonic()
        response = self.client.messages.create(**kwargs)
        latency = int((time.monotonic() - start) * 1000)

        # Calculate costs: actual, and the same usage priced at Opus rates
        actual_cost = self._calculate_cost(
            decision.tier, response.usage
        )
        opus_cost = self._calculate_cost(
            ModelTier.OPUS, response.usage
        )

        # Update metrics
        self.metrics.total_requests += 1
        tier_name = decision.tier.name.lower()
        self.metrics.requests_by_tier[tier_name] = (
            self.metrics.requests_by_tier.get(tier_name, 0) + 1
        )
        self.metrics.total_cost += actual_cost
        self.metrics.cost_saved += (opus_cost - actual_cost)

        # Extract response text. The default avoids a StopIteration escaping
        # (after metrics were already updated) when no text block is present.
        text = next(
            (b.text for b in response.content if b.type == "text"), ""
        )

        return {
            "text": text,
            "model": decision.tier.value,
            "tier": tier_name,
            "routing_reason": decision.reasoning,
            "confidence": decision.confidence,
            "cost": actual_cost,
            "cost_if_opus": opus_cost,
            "savings": opus_cost - actual_cost,
            "latency_ms": latency,
        }

    def _default_thinking(self, tier: ModelTier) -> dict:
        """Return the default thinking configuration for ``tier``.

        Returns ``None`` for tiers that should run without thinking.
        """
        # NOTE(review): these payloads are sent to the API verbatim as the
        # ``thinking`` argument; confirm the accepted fields and values for
        # each model against the API reference.
        if tier == ModelTier.OPUS:
            return {"type": "adaptive"}
        elif tier == ModelTier.SONNET:
            return {"type": "adaptive", "effort": "standard"}
        return None  # Haiku: no thinking

    def _calculate_cost(self, tier: ModelTier, usage) -> float:
        """Price ``usage`` at ``tier``'s per-million-token rates.

        Args:
            tier: Tier whose rates from ``COSTS`` are applied.
            usage: Object exposing ``input_tokens`` and ``output_tokens``.

        Returns:
            The cost rounded to 6 decimal places.
        """
        rates = self.COSTS[tier]
        input_cost = usage.input_tokens * rates["input"] / 1_000_000
        output_cost = usage.output_tokens * rates["output"] / 1_000_000
        return round(input_cost + output_cost, 6)

    def report(self) -> str:
        """Generate a cost savings report.

        Returns:
            A multi-line string with the request distribution per tier, the
            total cost, the all-Opus baseline and the amount saved.
        """
        m = self.metrics
        # What the recorded traffic would have cost if everything ran on Opus.
        opus_baseline = m.total_cost + m.cost_saved
        savings_pct = (
            m.cost_saved / opus_baseline * 100 if opus_baseline > 0 else 0
        )

        lines = [
            "Model Router Report",
            "=" * 40,
            f"Total requests: {m.total_requests}",
            "Distribution:",
        ]
        for tier, count in m.requests_by_tier.items():
            pct = count / m.total_requests * 100 if m.total_requests else 0
            lines.append(f"  {tier:8}: {count:5} ({pct:.1f}%)")

        lines.extend([
            "",
            f"Total cost:    ${m.total_cost:.2f}",
            f"If all Opus:   ${opus_baseline:.2f}",
            f"Saved:         ${m.cost_saved:.2f} ({savings_pct:.1f}%)",
        ])

        return "\n".join(lines)

Usage Example

# Build one router; Sonnet is the fallback whenever the classifier's
# confidence is below 0.7.
router = ModelRouter(
    confidence_threshold=0.7,
    default_tier=ModelTier.SONNET,
)

# (prompt, extra keyword arguments) pairs, ordered from simplest to hardest.
demo_requests = [
    # Simple task → routes to Haiku
    ("Format this JSON: {\"name\": \"test\"}", {}),
    # Medium task → routes to Sonnet
    ("Write a Python function to merge two sorted lists efficiently", {}),
    # Complex task → routes to Opus
    (
        "Review this distributed system architecture for single points "
        "of failure, race conditions, and data consistency issues...",
        {"system": "You are a principal systems architect."},
    ),
]

for demo_prompt, demo_kwargs in demo_requests:
    result = router.route(demo_prompt, **demo_kwargs)
    print(f"Routed to: {result['tier']}")  # → haiku, sonnet, opus in turn

# Force a specific tier when you know best
# (tier 3: always use Opus for security work)
result = router.route("Analyze this security vulnerability...", force_tier=3)

# Check savings
print(router.report())

Router Quality Monitoring

Track whether routing decisions are correct:

class RouterQualityMonitor:
    """Monitor whether routing decisions produce acceptable quality.

    Evaluations are kept in memory in the order they were recorded and are
    aggregated per tier on demand.
    """

    def __init__(self):
        """Start with an empty evaluation log."""
        self.evaluations: list[dict] = []

    def evaluate(self, routing_result: dict,
                 user_satisfaction: float) -> None:
        """Record a quality evaluation.

        Args:
            routing_result: A routing result dict; its ``tier``,
                ``confidence`` and ``cost`` entries are copied.
            user_satisfaction: Satisfaction score for that result.

        Raises:
            KeyError: if ``routing_result`` lacks one of the copied keys.
        """
        self.evaluations.append({
            "tier": routing_result["tier"],
            "confidence": routing_result["confidence"],
            "satisfaction": user_satisfaction,
            "cost": routing_result["cost"],
        })

    def tier_quality_report(self,
                            satisfaction_threshold: float = 0.7) -> dict:
        """Report quality metrics by tier.

        Args:
            satisfaction_threshold: Evaluations with a satisfaction score
                strictly below this value are counted as
                ``below_threshold``. Defaults to 0.7.

        Returns:
            A dict keyed by tier name. Each value holds
            ``avg_satisfaction``, ``below_threshold``, ``total`` and
            ``avg_cost`` for that tier. Tiers without evaluations are
            absent; with no evaluations at all the result is ``{}``.
        """
        # Group evaluations by tier, preserving first-seen tier order.
        by_tier = {}
        for entry in self.evaluations:
            by_tier.setdefault(entry["tier"], []).append(entry)

        report = {}
        for tier, evals in by_tier.items():
            # Every group has at least one entry, so the divisions are safe.
            total = len(evals)
            satisfactions = [e["satisfaction"] for e in evals]
            report[tier] = {
                "avg_satisfaction": sum(satisfactions) / total,
                "below_threshold": sum(
                    1 for s in satisfactions if s < satisfaction_threshold
                ),
                "total": total,
                "avg_cost": sum(e["cost"] for e in evals) / total,
            }

        return report

The model router is the foundation of cost optimization. In the next lesson, you will learn to measure and present the ROI of your AI investment to justify the costs to leadership.