Thinking tokens are billed at the same rate as input tokens ($5/1M for Opus 4.6). Without budget controls, a single “maximum” effort request can consume 200K thinking tokens — costing $1.00 just for the thinking phase.
Setting Budget Caps
# Hard cap on thinking tokens.
# The thinking settings are named separately so the cap is easy to spot and
# reuse across requests.
# NOTE(review): budget_tokens (30000) is larger than max_tokens (4096) below.
# Anthropic's extended-thinking docs require budget_tokens < max_tokens for
# "enabled" thinking — confirm whether the same limit applies in this mode.
thinking_config = {
    "type": "adaptive",
    "effort": "deep",
    "budget_tokens": 30000,  # Hard cap at 30K thinking tokens
}

response = client.messages.create(
    model="claude-opus-4-6-20260205",
    max_tokens=4096,
    thinking=thinking_config,
    messages=[{"role": "user", "content": prompt}],
)
Budget Templates by Task
# Per-task thinking presets: task name -> {"effort": ..., "budget_tokens": ...}.
# Built from a compact (task, effort, token cap) table so each preset reads as
# one row; the resulting mapping and its key order match the literal form.
THINKING_BUDGETS = {
    task: {"effort": effort, "budget_tokens": token_cap}
    for task, effort, token_cap in (
        # Quick tasks — minimal thinking
        ("formatting", "none", 0),
        ("classification", "quick", 2000),
        ("simple_qa", "quick", 1500),
        # Standard tasks — moderate thinking
        ("code_generation", "standard", 8000),
        ("code_review", "standard", 10000),
        ("documentation", "standard", 5000),
        ("debugging", "standard", 15000),
        # Deep tasks — significant thinking
        ("architecture_review", "deep", 40000),
        ("bug_root_cause", "deep", 30000),
        ("data_migration_plan", "deep", 35000),
        # Maximum tasks — exhaustive thinking
        ("security_audit", "maximum", 100000),
        ("novel_algorithm", "maximum", 150000),
        ("compliance_review", "maximum", 80000),
    )
}
Dynamic Budget Adjustment
Start conservative, then adjust after each response: double the budget when the model uses more than 90% of it, and halve it (down to a 2,000-token floor) when less than 30% is used:
class DynamicBudget:
    """Thinking-token budget that adapts to how much of it each response used.

    After every response, call :meth:`adjust`. The budget doubles (capped at
    ``max_budget``) when more than 90% of it was used, and halves (floored at
    ``MIN_BUDGET``) when less than 30% was used. Every call appends one entry
    to ``history``.

    Attributes:
        budget: Current thinking-token budget.
        max_budget: Upper bound the budget is never raised above.
        history: One dict per ``adjust`` call with keys ``budget`` (budget
            after the adjustment), ``budget_before`` (budget the ratio was
            measured against), ``used`` (thinking tokens) and ``ratio``.
    """

    # Characters-per-token ratio used only when the response does not report
    # a thinking-token count.
    # NOTE(review): 4 is a rough heuristic, not a tokenizer guarantee — tune it.
    CHARS_PER_TOKEN = 4

    # The budget is never halved below this many tokens.
    MIN_BUDGET = 2000

    def __init__(self, initial_budget: int = 10000, max_budget: int = 100000):
        """Start at ``initial_budget`` tokens and never exceed ``max_budget``."""
        self.budget = initial_budget
        self.max_budget = max_budget
        self.history: list[dict] = []

    def get_config(self) -> dict:
        """Return the ``thinking`` request parameter for the current budget."""
        return {
            "type": "adaptive",
            "budget_tokens": self.budget,
        }

    def _thinking_tokens_used(self, response) -> int:
        """Return the thinking tokens a response consumed, estimating if needed."""
        # Prefer an explicit token count: the budget is expressed in tokens, so
        # comparing it against a character count would overstate usage.
        # NOTE(review): ``usage.thinking_tokens`` is the field
        # ThinkingCostMonitor reads; confirm your SDK actually populates it.
        usage = getattr(response, "usage", None)
        reported = getattr(usage, "thinking_tokens", None)
        if reported is not None:
            return reported

        # Fallback: convert the visible thinking text from characters to an
        # approximate token count.
        thinking_chars = sum(
            len(getattr(block, "thinking", "") or "")
            for block in response.content
            if block.type == "thinking"
        )
        return thinking_chars // self.CHARS_PER_TOKEN

    def adjust(self, response) -> None:
        """Adjust budget based on response quality signals.

        Args:
            response: A message response exposing ``content`` (an iterable of
                blocks with a ``type`` attribute) and, optionally,
                ``usage.thinking_tokens``.
        """
        thinking_used = self._thinking_tokens_used(response)
        budget_before = self.budget

        # If model used >90% of budget, it might need more
        usage_ratio = thinking_used / budget_before if budget_before > 0 else 0

        if usage_ratio > 0.9 and self.budget < self.max_budget:
            self.budget = min(self.budget * 2, self.max_budget)
            print(f"⬆️ Budget increased to {self.budget:,} tokens")
        elif usage_ratio < 0.3 and self.budget > self.MIN_BUDGET:
            self.budget = max(self.budget // 2, self.MIN_BUDGET)
            print(f"⬇️ Budget decreased to {self.budget:,} tokens")

        # "ratio" is relative to budget_before; "budget" is the value that
        # applies to the next request.
        self.history.append({
            "budget": self.budget,
            "budget_before": budget_before,
            "used": thinking_used,
            "ratio": usage_ratio,
        })
Cost Monitoring Dashboard
class ThinkingCostMonitor:
    """Accumulate thinking-token usage and cost, printing alerts past thresholds.

    Attributes:
        daily_thinking_tokens: Thinking tokens recorded since the last reset.
        daily_thinking_cost: Dollar cost recorded since the last reset.
    """

    def __init__(
        self,
        cost_per_million: float = 5.0,
        warn_threshold: float = 20.0,
        alert_threshold: float = 50.0,
    ):
        """Create a monitor with zeroed counters.

        Args:
            cost_per_million: Dollars charged per one million thinking tokens.
                NOTE(review): the 5.0 default is the rate this lesson quotes;
                confirm it against current pricing for your model (thinking
                tokens may be billed at the output-token rate).
            warn_threshold: Accumulated cost above which a warning is printed.
            alert_threshold: Accumulated cost above which an alert is printed
                instead of the warning.
        """
        self.cost_per_million = cost_per_million
        self.warn_threshold = warn_threshold
        self.alert_threshold = alert_threshold
        self.daily_thinking_tokens = 0
        self.daily_thinking_cost = 0.0

    def record(self, response) -> None:
        """Add one response's thinking tokens and cost to the running totals.

        A response with no ``usage``, no ``thinking_tokens`` field, or a
        ``None`` value counts as zero thinking tokens.
        """
        usage = getattr(response, "usage", None)
        # `or 0` covers a field that is present but None, which would
        # otherwise fail in the multiplication below.
        thinking_tokens = getattr(usage, "thinking_tokens", 0) or 0
        cost = thinking_tokens * self.cost_per_million / 1_000_000

        self.daily_thinking_tokens += thinking_tokens
        self.daily_thinking_cost += cost

        # Checked on every call, so a message is printed for each recorded
        # response while the total stays above a threshold.
        if self.daily_thinking_cost > self.alert_threshold:
            print(f"🚨 Daily thinking cost: ${self.daily_thinking_cost:.2f}")
        elif self.daily_thinking_cost > self.warn_threshold:
            print(f"⚠️ Daily thinking cost: ${self.daily_thinking_cost:.2f}")

    def reset(self) -> None:
        """Zero the counters; call at the start of each accounting day."""
        self.daily_thinking_tokens = 0
        self.daily_thinking_cost = 0.0

    def report(self) -> str:
        """Return a two-line summary of the accumulated tokens and cost."""
        return (f"Thinking tokens today: {self.daily_thinking_tokens:,}\n"
                f"Thinking cost today: ${self.daily_thinking_cost:.2f}")
In the next lesson, we explore when adaptive thinking fails and how to handle those cases.