Lesson 6 of 46 ~25 min
Course progress
0%

Opus 4.6 Pricing Deep Dive

Master the complete Opus 4.6 cost model — standard vs. premium context pricing, thinking token costs, caching discounts, and competitive comparison.

Opus 4.6 pricing is not a single number. It is a multi-dimensional cost model with standard rates, premium context surcharges, thinking token overhead, and caching discounts. Understanding every dimension is essential for accurate budgeting and cost optimization.

The Complete Pricing Model

Standard Pricing (≤200K Context)

| Token Type | Cost per 1M Tokens |
|---|---|
| Input tokens | $5.00 |
| Output tokens | $25.00 |
| Thinking tokens | $5.00 (billed as input) |
| Cached input (read) | $0.50 (90% discount) |
| Cached input (write) | $6.25 |

Premium Pricing (>200K Context)

When a request's context exceeds 200K tokens, every token in that request is billed at premium rates:

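| Token Type | Standard | Premium | Multiplier |
|---|---|---|---|
| Input | $5.00 | $10.00 | 2× |
| Output | $25.00 | $50.00 | 2× |
| Thinking | $5.00 | $10.00 | 2× |
| Cache read | $0.50 | $1.00 | 2× |
| Cache write | $6.25 | $12.50 | 2× |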

Model Family Comparison

                Input    Output   Context    Best For
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Opus 4.6       $5.00    $25.00   1M tokens  Hardest problems
Sonnet 4.5     $3.00    $15.00   200K       Everyday work
Haiku          $0.25    $1.25    200K       High-volume simple tasks
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
GPT-5.2        $5.00    $20.00   200K       General purpose
Gemini 3 Pro   $3.50    $10.50   1M         Long context alternative

Cost Calculator

from dataclasses import dataclass

@dataclass
class CostEstimate:
    """Dollar cost breakdown for a single Opus 4.6 API call.

    Every amount is in US dollars. ``OpusCostCalculator.estimate`` rounds
    each field to six decimal places before building the instance.
    """

    input_cost: float     # uncached input tokens
    output_cost: float    # generated output tokens
    thinking_cost: float  # thinking tokens
    cache_cost: float     # cache reads plus cache writes
    total: float          # sum of the four components above
    pricing_tier: str  # "standard" or "premium"


class OpusCostCalculator:
    """Calculate exact costs for Opus 4.6 API calls.

    Rates are dollars per one million tokens. A call whose context exceeds
    ``PREMIUM_THRESHOLD`` tokens is billed entirely at the ``PREMIUM``
    rates; otherwise the ``STANDARD`` rates apply.

    NOTE(review): the rate tables mirror this lesson's pricing tables.
    Confirm them against the provider's current published pricing before
    relying on the numbers for budgeting.
    """

    STANDARD = {
        "input": 5.0,
        "output": 25.0,
        "thinking": 5.0,
        "cache_read": 0.5,
        "cache_write": 6.25,
    }
    PREMIUM = {
        "input": 10.0,
        "output": 50.0,
        "thinking": 10.0,
        "cache_read": 1.0,
        "cache_write": 12.5,
    }
    PREMIUM_THRESHOLD = 200_000

    def estimate(self, input_tokens: int, output_tokens: int,
                 thinking_tokens: int = 0,
                 cached_tokens: int = 0,
                 cache_write_tokens: int = 0) -> CostEstimate:
        """Estimate cost for a single API call.

        Args:
            input_tokens: Uncached input tokens.
            output_tokens: Generated output tokens.
            thinking_tokens: Thinking tokens used by the call.
            cached_tokens: Input tokens served from the prompt cache.
            cache_write_tokens: Input tokens written to the prompt cache.

        Returns:
            A ``CostEstimate`` with each component rounded to six decimals.

        Raises:
            ValueError: If any token count is negative.
        """
        counts = {
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "thinking_tokens": thinking_tokens,
            "cached_tokens": cached_tokens,
            "cache_write_tokens": cache_write_tokens,
        }
        for label, count in counts.items():
            if count < 0:
                raise ValueError(f"{label} must be non-negative, got {count}")

        # Determine pricing tier. Cache writes are part of the prompt just
        # like cache reads, so they must count toward the context size;
        # leaving them out under-bills large cache-priming requests.
        context_tokens = (
            input_tokens + thinking_tokens + cached_tokens + cache_write_tokens
        )
        is_premium = context_tokens > self.PREMIUM_THRESHOLD
        rates = self.PREMIUM if is_premium else self.STANDARD

        input_cost = input_tokens * rates["input"] / 1_000_000
        output_cost = output_tokens * rates["output"] / 1_000_000
        thinking_cost = thinking_tokens * rates["thinking"] / 1_000_000
        cache_cost = (
            cached_tokens * rates["cache_read"] / 1_000_000
            + cache_write_tokens * rates["cache_write"] / 1_000_000
        )

        return CostEstimate(
            input_cost=round(input_cost, 6),
            output_cost=round(output_cost, 6),
            thinking_cost=round(thinking_cost, 6),
            cache_cost=round(cache_cost, 6),
            # Round the unrounded sum so per-field rounding does not compound.
            total=round(
                input_cost + output_cost + thinking_cost + cache_cost, 6
            ),
            pricing_tier="premium" if is_premium else "standard",
        )

    def monthly_projection(self, daily_calls: int,
                           avg_input: int, avg_output: int,
                           avg_thinking: int = 0,
                           working_days: int = 22, *,
                           avg_cached: int = 0,
                           avg_cache_write: int = 0) -> dict:
        """Project monthly costs.

        Args:
            daily_calls: Number of API calls per working day.
            avg_input: Average uncached input tokens per call.
            avg_output: Average output tokens per call.
            avg_thinking: Average thinking tokens per call.
            working_days: Working days per month.
            avg_cached: Average cache-read tokens per call (keyword-only).
            avg_cache_write: Average cache-write tokens per call
                (keyword-only).

        Returns:
            A dict with ``per_call``, ``daily``, ``monthly`` and ``annual``
            dollar totals, the ``pricing_tier`` of the average call, and a
            monthly ``breakdown`` by ``input``, ``output``, ``thinking``
            and ``cache``.
        """
        per_call = self.estimate(
            avg_input, avg_output, avg_thinking,
            cached_tokens=avg_cached,
            cache_write_tokens=avg_cache_write,
        )
        monthly_calls = daily_calls * working_days

        return {
            "per_call": per_call.total,
            "daily": per_call.total * daily_calls,
            "monthly": per_call.total * monthly_calls,
            "annual": per_call.total * monthly_calls * 12,
            "pricing_tier": per_call.pricing_tier,
            "breakdown": {
                "input": per_call.input_cost * monthly_calls,
                "output": per_call.output_cost * monthly_calls,
                "thinking": per_call.thinking_cost * monthly_calls,
                "cache": per_call.cache_cost * monthly_calls,
            }
        }

# Usage
calc = OpusCostCalculator()

# Typical code review call
review_cost = calc.estimate(
    input_tokens=15_000,   # Code + system prompt
    output_tokens=3_000,   # Review comments
    thinking_tokens=8_000, # Deep analysis
)
print(f"Code review cost: ${review_cost.total:.4f}")
# → Code review cost: $0.1900
#   (input $0.075 + output $0.075 + thinking $0.040, standard tier)

# Large context analysis (premium pricing)
large_context = calc.estimate(
    input_tokens=500_000,  # Full codebase
    output_tokens=5_000,   # Analysis report
    thinking_tokens=50_000,# Maximum thinking
)
print(f"Full codebase analysis: ${large_context.total:.4f}")
print(f"Pricing tier: {large_context.pricing_tier}")
# → Full codebase analysis: $5.7500
#   (input $5.00 + output $0.25 + thinking $0.50 at premium rates)
# → Pricing tier: premium

Thinking Token Economics

Thinking tokens are the hidden cost driver. A single “maximum” effort call can consume 200K thinking tokens:

# Illustrative thinking-token budgets, ordered from cheapest to most
# expensive. Each "thinking_cost" string is the budget priced at the
# standard $5.00 per 1M tokens rate.
THINKING_COST_EXAMPLES = dict(
    no_thinking=dict(
        thinking_tokens=0,
        thinking_cost="$0.00",
        use_case="Simple formatting, data extraction",
    ),
    quick_thinking=dict(
        thinking_tokens=1_500,
        thinking_cost="$0.0075",
        use_case="Classification, simple Q&A",
    ),
    standard_thinking=dict(
        thinking_tokens=8_000,
        thinking_cost="$0.04",
        use_case="Code generation, debugging",
    ),
    deep_thinking=dict(
        thinking_tokens=40_000,
        thinking_cost="$0.20",
        use_case="Architecture review, security audit",
    ),
    maximum_thinking=dict(
        thinking_tokens=200_000,
        thinking_cost="$1.00",
        use_case="Novel research, exhaustive analysis",
    ),
)

Caching Strategies for Cost Reduction

Prompt caching can reduce input costs by up to 90%:

class CacheOptimizedClient:
    """Client that maximizes prompt caching for cost savings.

    Each request places the stable system prompt and reference context in
    cacheable system blocks and sends only the changing question as the
    user message. Hit/miss counts and estimated dollar savings accumulate
    in ``cache_stats``.
    """

    # Standard-tier dollars per 1M tokens, used to value each cache hit.
    INPUT_RATE = 5.0
    CACHE_READ_RATE = 0.5

    def __init__(self):
        """Create the API client and zeroed cache statistics."""
        from anthropic import Anthropic
        self.client = Anthropic()
        self.cache_stats = {"hits": 0, "misses": 0, "savings": 0.0}

    def query_with_stable_prefix(self, system_prompt: str,
                                  context: str,
                                  question: str) -> str:
        """Structure the request to maximize cache hits.

        Args:
            system_prompt: Stable system instructions (marked cacheable).
            context: Reference material reused across questions (marked
                cacheable).
            question: The per-call user question (not cached).

        Returns:
            The text of the first text block in the response, or an empty
            string when the response contains no text block.
        """
        # Cache-friendly structure:
        # 1. System prompt (stable — cached)
        # 2. Context documents (stable across questions — cached)
        # 3. User question (changes each time — not cached)

        response = self.client.messages.create(
            model="claude-opus-4-6-20260205",
            max_tokens=4096,
            system=[
                {
                    "type": "text",
                    "text": system_prompt,
                    "cache_control": {"type": "ephemeral"}
                },
                {
                    "type": "text",
                    "text": f"Reference context:\n{context}",
                    "cache_control": {"type": "ephemeral"}
                }
            ],
            messages=[{"role": "user", "content": question}]
        )

        # Track cache performance. The usage attribute may be absent or
        # None, and comparing None with an int raises TypeError, so both
        # cases are normalized to zero.
        usage = response.usage
        cached = getattr(usage, 'cache_read_input_tokens', 0) or 0
        if cached > 0:
            self.cache_stats["hits"] += 1
            # Savings: difference between full price and cache price
            savings = (
                cached * (self.INPUT_RATE - self.CACHE_READ_RATE) / 1_000_000
            )
            self.cache_stats["savings"] += savings
        else:
            self.cache_stats["misses"] += 1

        # Supply a default so a response without any text block yields ""
        # rather than leaking StopIteration to the caller.
        return next(
            (b.text for b in response.content if b.type == "text"), ""
        )

    def report_savings(self) -> str:
        """Return a two-line summary of cache hit rate and total savings."""
        total = self.cache_stats["hits"] + self.cache_stats["misses"]
        hit_rate = (self.cache_stats["hits"] / total * 100
                    if total > 0 else 0)
        return (f"Cache hit rate: {hit_rate:.1f}%\n"
                f"Total savings: ${self.cache_stats['savings']:.2f}")

Real-World Cost Scenarios

calc = OpusCostCalculator()

# One projection per usage profile. Each tuple is
# (label, calls per day, avg input, avg output, avg thinking tokens).
scenarios = {
    label: calc.monthly_projection(
        daily_calls=calls,
        avg_input=tokens_in,
        avg_output=tokens_out,
        avg_thinking=tokens_thinking,
    )
    for label, calls, tokens_in, tokens_out, tokens_thinking in (
        ("Solo developer (light)", 20, 5_000, 2_000, 3_000),
        ("Solo developer (heavy)", 100, 10_000, 3_000, 8_000),
        ("Small team (5 devs)", 500, 8_000, 2_500, 5_000),
        ("Enterprise (50 users)", 5000, 12_000, 3_000, 6_000),
    )
}

# Print each profile's monthly and annual spend, followed by a blank line.
for name, projection in scenarios.items():
    print(
        f"{name}:",
        f"  Monthly: ${projection['monthly']:.2f}",
        f"  Annual:  ${projection['annual']:.2f}",
        "",
        sep="\n",
    )

The most effective cost optimization is not about squeezing tokens — it is about using the right model for each task. In the next lesson, you will build a model router that automatically selects Opus, Sonnet, or Haiku based on task requirements.