Opus 4.6 pricing is not a single number. It is a multi-dimensional cost model with standard rates, premium context surcharges, thinking token overhead, and caching discounts. Understanding every dimension is essential for accurate budgeting and cost optimization.
The Complete Pricing Model
Standard Pricing (≤200K Context)
| Token Type | Cost per 1M Tokens |
|---|---|
| Input tokens | $5.00 |
| Output tokens | $25.00 |
| Thinking tokens | $5.00 (billed as input) |
| Cached input (read) | $0.50 (90% discount) |
| Cached input (write) | $6.25 |
Premium Pricing (>200K Context)
When a request's input exceeds 200K tokens, every token in that request (input, output, thinking, and cache reads and writes) is billed at premium rates:
| Token Type | Standard | Premium | Multiplier |
|---|---|---|---|
| Input | $5.00 | $10.00 | 2× |
| Output | $25.00 | $50.00 | 2× |
| Thinking | $5.00 | $10.00 | 2× |
| Cache read | $0.50 | $1.00 | 2× |
| Cache write | $6.25 | $12.50 | 2× |
Model Family Comparison
| Model | Input (per 1M tokens) | Output (per 1M tokens) | Context | Best For |
|---|---|---|---|---|
| Opus 4.6 | $5.00 | $25.00 | 1M tokens | Hardest problems |
| Sonnet 4.5 | $3.00 | $15.00 | 200K | Everyday work |
| Haiku | $0.25 | $1.25 | 200K | High-volume simple tasks |
| GPT-5.2 | $5.00 | $20.00 | 200K | General purpose |
| Gemini 3 Pro | $3.50 | $10.50 | 1M | Long context alternative |
Cost Calculator
from dataclasses import dataclass
@dataclass
class CostEstimate:
    """Cost breakdown for a single API call.

    All monetary fields are plain floats in the same currency unit as the
    rate tables used to compute them (dollars in this file).
    """

    # Cost of the uncached input tokens.
    input_cost: float
    # Cost of the generated output tokens.
    output_cost: float
    # Cost of the thinking tokens.
    thinking_cost: float
    # Combined cost of cache reads and cache writes.
    cache_cost: float
    # Overall cost of the call (the sum of the components above when the
    # instance is produced by OpusCostCalculator.estimate).
    total: float
    # Which rate table was applied: "standard" or "premium".
    pricing_tier: str
class OpusCostCalculator:
    """Calculate costs for Opus 4.6 API calls from per-million-token rates.

    Two rate tables are kept: ``STANDARD`` and ``PREMIUM``. A call is priced
    entirely from one table; which one depends on whether its prompt-side
    token count exceeds ``PREMIUM_THRESHOLD``.
    """

    # Rates per one million tokens when the prompt is within the threshold.
    STANDARD = {
        "input": 5.0,
        "output": 25.0,
        "thinking": 5.0,
        "cache_read": 0.5,
        "cache_write": 6.25,
    }
    # Rates per one million tokens once the prompt exceeds the threshold.
    PREMIUM = {
        "input": 10.0,
        "output": 50.0,
        "thinking": 10.0,
        "cache_read": 1.0,
        "cache_write": 12.5,
    }
    # Prompt size (in tokens) above which PREMIUM rates apply.
    PREMIUM_THRESHOLD = 200_000
    # The rate tables are quoted per this many tokens.
    _TOKENS_PER_RATE_UNIT = 1_000_000

    def estimate(self, input_tokens: int, output_tokens: int,
                 thinking_tokens: int = 0,
                 cached_tokens: int = 0,
                 cache_write_tokens: int = 0) -> "CostEstimate":
        """Estimate cost for a single API call.

        Args:
            input_tokens: Uncached input tokens.
            output_tokens: Generated output tokens.
            thinking_tokens: Thinking tokens, priced at the "thinking" rate.
            cached_tokens: Input tokens read from the cache.
            cache_write_tokens: Input tokens written to the cache.

        Returns:
            A CostEstimate whose component costs are each rounded to six
            decimal places; ``total`` is the rounded sum of the unrounded
            components.

        Raises:
            ValueError: If any token count is negative.
        """
        token_counts = {
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "thinking_tokens": thinking_tokens,
            "cached_tokens": cached_tokens,
            "cache_write_tokens": cache_write_tokens,
        }
        for name, count in token_counts.items():
            if count < 0:
                raise ValueError(f"{name} must be non-negative, got {count}")

        # Determine pricing tier. Cache writes are part of the prompt just
        # like cache reads, so both count toward the threshold.
        # NOTE(review): thinking tokens are counted here because this model
        # prices them at the input rate — confirm against the provider's
        # current pricing page.
        prompt_tokens = (
            input_tokens + thinking_tokens + cached_tokens + cache_write_tokens
        )
        is_premium = prompt_tokens > self.PREMIUM_THRESHOLD
        rates = self.PREMIUM if is_premium else self.STANDARD
        unit = self._TOKENS_PER_RATE_UNIT

        input_cost = input_tokens * rates["input"] / unit
        output_cost = output_tokens * rates["output"] / unit
        thinking_cost = thinking_tokens * rates["thinking"] / unit
        cache_cost = (
            cached_tokens * rates["cache_read"] / unit
            + cache_write_tokens * rates["cache_write"] / unit
        )

        return CostEstimate(
            input_cost=round(input_cost, 6),
            output_cost=round(output_cost, 6),
            thinking_cost=round(thinking_cost, 6),
            cache_cost=round(cache_cost, 6),
            total=round(
                input_cost + output_cost + thinking_cost + cache_cost, 6
            ),
            pricing_tier="premium" if is_premium else "standard",
        )

    def monthly_projection(self, daily_calls: int,
                           avg_input: int, avg_output: int,
                           avg_thinking: int = 0,
                           working_days: int = 22,
                           avg_cached: int = 0,
                           avg_cache_write: int = 0) -> dict:
        """Project monthly costs from an average call profile.

        Args:
            daily_calls: Number of API calls per working day.
            avg_input: Average uncached input tokens per call.
            avg_output: Average output tokens per call.
            avg_thinking: Average thinking tokens per call.
            working_days: Working days per month.
            avg_cached: Average tokens read from the cache per call.
            avg_cache_write: Average tokens written to the cache per call.

        Returns:
            A dict with the per-call, daily, monthly and annual totals, the
            pricing tier of the average call, and a ``breakdown`` of the
            monthly cost by component. ``annual`` is twelve times
            ``monthly``.

        Raises:
            ValueError: If any average token count is negative.
        """
        per_call = self.estimate(
            avg_input, avg_output, avg_thinking, avg_cached, avg_cache_write
        )
        monthly_calls = daily_calls * working_days
        return {
            "per_call": per_call.total,
            "daily": per_call.total * daily_calls,
            "monthly": per_call.total * monthly_calls,
            "annual": per_call.total * monthly_calls * 12,
            "pricing_tier": per_call.pricing_tier,
            "breakdown": {
                "input": per_call.input_cost * monthly_calls,
                "output": per_call.output_cost * monthly_calls,
                "thinking": per_call.thinking_cost * monthly_calls,
                "cache": per_call.cache_cost * monthly_calls,
            },
        }
# Usage: price two representative calls with the calculator.
calc = OpusCostCalculator()

# Typical code review call (standard tier: well under the 200K threshold).
review_cost = calc.estimate(
    input_tokens=15_000,    # Code + system prompt
    output_tokens=3_000,    # Review comments
    thinking_tokens=8_000,  # Deep analysis
)
print(f"Code review cost: ${review_cost.total:.4f}")
# → $0.1900  ($0.075 input + $0.075 output + $0.04 thinking)

# Large context analysis (premium pricing: prompt exceeds 200K tokens).
large_context = calc.estimate(
    input_tokens=500_000,     # Full codebase
    output_tokens=5_000,      # Analysis report
    thinking_tokens=50_000,   # Maximum thinking
)
print(f"Full codebase analysis: ${large_context.total:.4f}")
# → $5.7500  ($5.00 input + $0.25 output + $0.50 thinking at premium rates)
print(f"Pricing tier: {large_context.pricing_tier}")
# → premium
Thinking Token Economics
Thinking tokens are the hidden cost driver. A single “maximum” effort call can consume 200K thinking tokens:
# Illustrative thinking budgets, from none to maximum effort. Each row is
# (label, thinking tokens, displayed cost, typical use case); the displayed
# costs correspond to $5.00 per 1M thinking tokens.
_THINKING_TIERS = (
    ("no_thinking", 0, "$0.00", "Simple formatting, data extraction"),
    ("quick_thinking", 1_500, "$0.0075", "Classification, simple Q&A"),
    ("standard_thinking", 8_000, "$0.04", "Code generation, debugging"),
    ("deep_thinking", 40_000, "$0.20", "Architecture review, security audit"),
    ("maximum_thinking", 200_000, "$1.00", "Novel research, exhaustive analysis"),
)

# Same information keyed by label, in the order listed above.
THINKING_COST_EXAMPLES = {
    label: {
        "thinking_tokens": tokens,
        "thinking_cost": cost,
        "use_case": use_case,
    }
    for label, tokens, cost, use_case in _THINKING_TIERS
}
Caching Strategies for Cost Reduction
Prompt caching can reduce input costs by up to 90%:
class CacheOptimizedClient:
    """Client that maximizes prompt caching for cost savings.

    Requests are laid out so that the stable parts (system prompt and
    reference context) come first and are marked cacheable, while the
    question, which changes on every call, comes last. Cache hits, misses
    and estimated savings are accumulated in ``cache_stats``.
    """

    # Model and output limit used for every request.
    MODEL = "claude-opus-4-6-20260205"
    MAX_TOKENS = 4096
    # Rates per 1M tokens used to estimate savings from cache reads.
    # NOTE(review): these are the standard-tier rates; savings for requests
    # billed at premium rates would be understated — confirm if that matters.
    INPUT_RATE = 5.0
    CACHE_READ_RATE = 0.5

    def __init__(self):
        # Imported here rather than at module level, so the SDK is only
        # required when a client is actually instantiated.
        from anthropic import Anthropic
        self.client = Anthropic()
        self.cache_stats = {"hits": 0, "misses": 0, "savings": 0.0}

    def query_with_stable_prefix(self, system_prompt: str,
                                 context: str,
                                 question: str) -> str:
        """Send a question with a cache-friendly request structure.

        Args:
            system_prompt: Stable system instructions (marked cacheable).
            context: Reference material reused across questions (marked
                cacheable).
            question: The user question for this call (not cached).

        Returns:
            The text of the first text block in the response, or an empty
            string if the response contains no text block.
        """
        # Cache-friendly structure:
        # 1. System prompt (stable — cached)
        # 2. Context documents (stable across questions — cached)
        # 3. User question (changes each time — not cached)
        response = self.client.messages.create(
            model=self.MODEL,
            max_tokens=self.MAX_TOKENS,
            system=[
                {
                    "type": "text",
                    "text": system_prompt,
                    "cache_control": {"type": "ephemeral"},
                },
                {
                    "type": "text",
                    "text": f"Reference context:\n{context}",
                    "cache_control": {"type": "ephemeral"},
                },
            ],
            messages=[{"role": "user", "content": question}],
        )

        # Track cache performance. The attribute may be missing or None,
        # so coerce both cases to 0 before comparing.
        usage = response.usage
        cached = getattr(usage, "cache_read_input_tokens", 0) or 0
        if cached > 0:
            self.cache_stats["hits"] += 1
            # Savings: difference between full price and cache price
            savings = (
                cached * (self.INPUT_RATE - self.CACHE_READ_RATE) / 1_000_000
            )
            self.cache_stats["savings"] += savings
        else:
            self.cache_stats["misses"] += 1

        # Skip non-text blocks (e.g. thinking); fall back to "" rather than
        # leaking StopIteration when there is no text block at all.
        return next(
            (block.text for block in response.content if block.type == "text"),
            "",
        )

    def report_savings(self) -> str:
        """Return a two-line summary of the cache hit rate and total savings."""
        total = self.cache_stats["hits"] + self.cache_stats["misses"]
        hit_rate = (self.cache_stats["hits"] / total * 100
                    if total > 0 else 0)
        return (f"Cache hit rate: {hit_rate:.1f}%\n"
                f"Total savings: ${self.cache_stats['savings']:.2f}")
Real-World Cost Scenarios
# Project monthly and annual spend for four representative usage profiles.
calc = OpusCostCalculator()

scenarios = {
    "Solo developer (light)": calc.monthly_projection(
        daily_calls=20, avg_input=5_000,
        avg_output=2_000, avg_thinking=3_000,
    ),
    "Solo developer (heavy)": calc.monthly_projection(
        daily_calls=100, avg_input=10_000,
        avg_output=3_000, avg_thinking=8_000,
    ),
    "Small team (5 devs)": calc.monthly_projection(
        daily_calls=500, avg_input=8_000,
        avg_output=2_500, avg_thinking=5_000,
    ),
    "Enterprise (50 users)": calc.monthly_projection(
        daily_calls=5_000, avg_input=12_000,
        avg_output=3_000, avg_thinking=6_000,
    ),
}

# Print one block per scenario, separated by a blank line.
for name, projection in scenarios.items():
    print(f"{name}:")
    print(f" Monthly: ${projection['monthly']:.2f}")
    print(f" Annual: ${projection['annual']:.2f}")
    print()
The most effective cost optimization is not about squeezing tokens — it is about using the right model for each task. In the next lesson, you will build a model router that automatically selects Opus, Sonnet, or Haiku based on task requirements.