With a 1M token context window, you might think RAG is dead. It is not — but the decision of when to use RAG versus loading everything into context has fundamentally changed. This lesson covers the new calculus and practical implementation patterns.
RAG vs. Full Context: The Decision Framework
┌─────────────────────────────────────────────────────┐
│ Total Knowledge Base │
├─────────────────────────────────────────────────────┤
│ < 200K tokens → Load everything (standard) │
│ 200K - 800K tokens → Load everything (premium $) │
│ 800K - 1M tokens → Selective loading + RAG │
│ > 1M tokens → RAG required │
└─────────────────────────────────────────────────────┘
| Factor | Full Context | RAG |
|---|---|---|
| Knowledge base < 200K tokens | ✅ Preferred — cheapest, simplest | Overengineered |
| Knowledge base > 1M tokens | ❌ Impossible | ✅ Required |
| Query needs broad context | ✅ Better — sees everything | May miss connections |
| Query is narrowly targeted | Wasteful — paying for unused context | ✅ Retrieves only what is needed |
| Latency-sensitive | Slower for large contexts | ✅ Faster retrieval + smaller prompt |
| Cost-sensitive | Premium pricing above 200K | ✅ Standard pricing with smaller prompts |
| Accuracy requirement | ✅ Higher — no retrieval errors | Risk of retrieving wrong chunks |
Basic RAG Implementation
from anthropic import Anthropic
import numpy as np
class SimpleRAG:
    """In-memory RAG helper that answers questions with Opus 4.6.

    Documents are split into overlapping word-based chunks, every chunk is
    embedded, and a query is answered from the chunks whose embeddings are
    most similar (cosine similarity) to the question embedding.
    """

    def __init__(self, embedding_model: str = "text-embedding-3-large"):
        """Create an empty store and an Anthropic client.

        Args:
            embedding_model: Name of the embedding model. It is only stored
                on the instance; the placeholder ``_embed`` does not read it.
        """
        self.client = Anthropic()
        self.documents: list[dict] = []
        self.embeddings: list[np.ndarray] = []
        self.embedding_model = embedding_model

    def add_document(self, content: str, metadata: dict = None,
                     chunk_size: int = 1000, overlap: int = 200):
        """Chunk, embed, and store a document.

        Args:
            content: Full document text.
            metadata: Optional dict attached to every chunk; its ``"source"``
                key, when present, is shown in the retrieved context.
            chunk_size: Maximum number of whitespace-separated words per chunk.
            overlap: Number of words shared between consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not smaller than ``chunk_size``.
        """
        chunks = self._chunk_text(content, chunk_size, overlap)
        total = len(chunks)
        for position, chunk in enumerate(chunks):
            vector = self._embed(chunk)
            self.documents.append({
                "content": chunk,
                "chunk_index": position,
                "total_chunks": total,
                "metadata": metadata or {},
            })
            # documents[i] and embeddings[i] always describe the same chunk.
            self.embeddings.append(vector)

    def query(self, question: str, top_k: int = 10,
              min_similarity: float = 0.7) -> str:
        """Retrieve relevant chunks and generate an answer.

        Args:
            question: The user's question.
            top_k: Maximum number of chunks placed in the prompt.
            min_similarity: Chunks scoring below this cosine similarity are
                discarded.

        Returns:
            The first text block of the model response, or a fixed
            "no relevant documents" message when no chunk passes the
            similarity threshold (no API call is made in that case).
        """
        query_embedding = self._embed(question)

        # Rank every stored chunk by cosine similarity, best first.
        similarities = [
            self._cosine_similarity(query_embedding, emb)
            for emb in self.embeddings
        ]
        ranked = sorted(
            enumerate(similarities), key=lambda pair: pair[1], reverse=True
        )

        # Apply the similarity floor, then keep at most top_k chunks.
        relevant = [
            (idx, sim) for idx, sim in ranked if sim >= min_similarity
        ][:top_k]
        if not relevant:
            return "No relevant documents found for this query."

        context = self._build_context(relevant)
        system_prompt = (
            "Answer questions based solely on the provided context.\n"
            "If the context doesn't contain enough information, "
            "say so explicitly.\n"
            "Cite the source document and chunk for each claim."
        )
        response = self.client.messages.create(
            model="claude-opus-4-6-20260205",
            max_tokens=4096,
            thinking={"type": "adaptive"},
            system=system_prompt,
            messages=[{
                "role": "user",
                "content": f"Context:\n{context}\nQuestion: {question}",
            }],
        )
        # The response may contain non-text blocks; return the first text one.
        return next(b.text for b in response.content if b.type == "text")

    def _chunk_text(self, text: str, size: int, overlap: int) -> list[str]:
        """Split text into chunks of up to ``size`` words.

        Consecutive chunks share ``overlap`` words. Chunking stops as soon as
        a chunk reaches the end of the text, so no trailing chunk consisting
        only of already-covered overlap words is produced.

        Raises:
            ValueError: If ``size`` is not positive or ``overlap >= size``;
                either would keep the window from ever advancing.
        """
        if size <= 0:
            raise ValueError("size must be a positive number of words")
        if overlap >= size:
            raise ValueError("overlap must be smaller than size")

        words = text.split()
        step = size - overlap
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + size]))
            if start + size >= len(words):
                # This chunk already contains the final word.
                break
        return chunks

    def _build_context(self, relevant: list[tuple]) -> str:
        """Format ``(document index, similarity)`` pairs into a prompt context.

        Each section starts with a header naming the source, the 1-based
        chunk position, and the similarity rounded to two decimals.
        """
        sections = []
        for idx, sim in relevant:
            doc = self.documents[idx]
            source = doc.get("metadata", {}).get("source", "unknown")
            header = (
                f"[Source: {source}, Chunk {doc['chunk_index']+1}/"
                f"{doc['total_chunks']}, Relevance: {sim:.2f}]"
            )
            sections.append(f"{header}\n{doc['content']}")
        return "\n\n---\n\n".join(sections)

    def _embed(self, text: str) -> np.ndarray:
        """Return an embedding vector for ``text``.

        Placeholder: returns a random 1536-dimensional vector that ignores
        ``text``. Replace with a call to a real embedding provider (OpenAI,
        Cohere, or a local model) before relying on retrieval results.
        """
        return np.random.randn(1536)  # Replace with actual embedding call

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine similarity of ``a`` and ``b``.

        Returns 0.0 when either vector has zero norm, instead of NaN, so the
        result can always be sorted and compared against a threshold.
        """
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)
Hybrid Approach: RAG + Full Context
The most powerful pattern for medium-sized knowledge bases (200K–1M tokens) combines both strategies:
class HybridRAG:
    """Combine always-loaded core documents with RAG over an extended store.

    Core documents are sent with every query. Extended documents are embedded
    when added, and only the most similar ones are sent with a given query.
    """

    def __init__(self):
        """Create empty core and extended stores and an Anthropic client."""
        self.client = Anthropic()
        self.core_docs: list[str] = []  # Always loaded (< 200K total)
        self.extended_docs: list[dict] = []  # RAG-retrieved on demand
        self.embeddings: list[np.ndarray] = []

    def add_core_document(self, content: str):
        """Add a document to the always-loaded core context."""
        self.core_docs.append(content)

    def add_extended_document(self, content: str, metadata: dict = None):
        """Embed a whole document and add it to the retrievable store.

        Args:
            content: Document text; it is embedded and stored unchunked.
            metadata: Optional dict stored next to the content.
        """
        vector = self._embed(content)
        self.extended_docs.append({
            "content": content,
            "metadata": metadata or {},
        })
        # extended_docs[i] and embeddings[i] always describe the same document.
        self.embeddings.append(vector)

    def query(self, question: str, top_k: int = 5) -> str:
        """Answer a question from core context plus retrieved documents.

        Args:
            question: The user's question.
            top_k: Maximum number of extended documents to retrieve.

        Returns:
            The first text block of the model response.
        """
        separator = "\n\n---\n\n"

        # Core documents are always included in full.
        core_context = separator.join(self.core_docs)

        # Rank extended documents by similarity and keep the best top_k.
        query_embedding = self._embed(question)
        similarities = [
            self._cosine_similarity(query_embedding, emb)
            for emb in self.embeddings
        ]
        ranked = sorted(
            enumerate(similarities), key=lambda pair: pair[1], reverse=True
        )[:top_k]
        extended_context = separator.join(
            self.extended_docs[idx]["content"] for idx, _ in ranked
        )

        # Rough estimate at ~4 characters per token; summing the lengths
        # avoids building a second copy of the whole context.
        estimated_tokens = (len(core_context) + len(extended_context)) // 4
        if estimated_tokens > 200_000:
            print(f"⚠️ Using premium context pricing "
                  f"(~{estimated_tokens:,} tokens)")

        system_prompt = (
            "You have access to two types of context:\n"
            "1. CORE CONTEXT: Always-available reference material\n"
            "2. RETRIEVED CONTEXT: Dynamically retrieved based on the "
            "question\n"
            "Prioritize core context for foundational answers, "
            "supplement with\n"
            "retrieved context for specific details."
        )
        user_prompt = (
            f"## Core Context\n{core_context}\n"
            f"## Retrieved Context\n{extended_context}\n"
            f"## Question\n{question}"
        )
        response = self.client.messages.create(
            model="claude-opus-4-6-20260205",
            max_tokens=4096,
            thinking={"type": "adaptive"},
            system=system_prompt,
            messages=[{"role": "user", "content": user_prompt}],
        )
        # The response may contain non-text blocks; return the first text one.
        return next(b.text for b in response.content if b.type == "text")

    def _embed(self, text: str) -> np.ndarray:
        """Return an embedding vector for ``text``.

        Placeholder: returns a random 1536-dimensional vector that ignores
        ``text``. Replace with a real embedding call.
        """
        return np.random.randn(1536)  # Replace with actual embedding

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine similarity of ``a`` and ``b``.

        Returns 0.0 when either vector has zero norm, instead of NaN, so the
        ranking sort stays well-defined.
        """
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)
Chunking Strategies for Opus 4.6
With a 1M token window, your chunking strategy should differ from the one you would use with models that have smaller context windows:
# Named chunking presets. Each preset gives a chunk size, the overlap between
# consecutive chunks, and a short rationale for when to use it.
CHUNKING_STRATEGIES = dict(
    # Small chunks for 4K-8K context models.
    small_context_model=dict(
        chunk_size=500,
        overlap=50,
        rationale="Must fit many chunks in limited space",
    ),
    # Larger chunks preserve more context.
    opus_46_standard=dict(
        chunk_size=2000,
        overlap=200,
        rationale="1M context allows bigger chunks, better coherence",
    ),
    # Document-level chunks for dense material.
    opus_46_document=dict(
        chunk_size=5000,
        overlap=500,
        rationale="Entire sections/chapters as single chunks",
    ),
    # No chunking: chunk_size is None and the entire document is loaded.
    opus_46_full_doc=dict(
        chunk_size=None,
        overlap=0,
        rationale="Document fits in context, no chunking needed",
    ),
)
def select_chunk_strategy(doc_tokens: int, total_docs: int) -> dict:
    """Pick a chunking preset from the estimated corpus size.

    The corpus size is estimated as ``doc_tokens * total_docs``. The first
    tier whose exclusive upper bound exceeds that estimate is chosen; corpora
    of 5,000,000 tokens or more use the ``"small_context_model"`` preset.

    Args:
        doc_tokens: Token count per document.
        total_docs: Number of documents in the corpus.

    Returns:
        The matching entry of ``CHUNKING_STRATEGIES``.
    """
    corpus_tokens = doc_tokens * total_docs

    # (exclusive upper bound in tokens, preset name), smallest tier first.
    tiers = (
        (200_000, "opus_46_full_doc"),
        (800_000, "opus_46_document"),
        (5_000_000, "opus_46_standard"),
    )
    for upper_bound, preset_name in tiers:
        if corpus_tokens < upper_bound:
            return CHUNKING_STRATEGIES[preset_name]
    return CHUNKING_STRATEGIES["small_context_model"]
RAG Quality Evaluation
class RAGEvaluator:
    """Evaluate RAG retrieval quality by asking the model to grade it."""

    def __init__(self):
        """Create the Anthropic client used for grading requests."""
        self.client = Anthropic()

    def evaluate_retrieval(self, question: str, retrieved_chunks: list[str],
                           expected_answer: str) -> dict:
        """Score retrieval quality for a test case.

        Args:
            question: The question the retrieval was run for.
            retrieved_chunks: Chunk texts returned by the retriever.
            expected_answer: Information a correct answer should contain.

        Returns:
            The JSON value parsed from the model's reply. The prompt asks for
            the keys ``"relevance"``, ``"coverage"``, ``"noise"`` and
            ``"missing"``, but the reply is not validated against them.

        Raises:
            json.JSONDecodeError: If no JSON can be parsed from the reply.
        """
        # Only the first 200 characters of each chunk are shown to the grader.
        chunk_listing = "\n".join(
            f"[{i}] {c[:200]}..." for i, c in enumerate(retrieved_chunks)
        )
        prompt = "\n".join([
            "Evaluate this RAG retrieval:",
            f"Question: {question}",
            f"Expected answer contains: {expected_answer}",
            "Retrieved chunks:",
            chunk_listing,
            "Score (JSON):",
            '- "relevance": 0.0-1.0 (are retrieved chunks relevant?)',
            '- "coverage": 0.0-1.0 (do chunks contain enough info to answer?)',
            '- "noise": 0.0-1.0 (how much irrelevant content was retrieved?)',
            '- "missing": list of information needed but not retrieved',
        ])
        response = self.client.messages.create(
            model="claude-opus-4-6-20260205",
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        )
        text = next(b.text for b in response.content if b.type == "text")
        return self._parse_scores(text)

    @staticmethod
    def _parse_scores(text: str) -> dict:
        """Parse the JSON payload out of a model reply.

        The reply is parsed as-is first. If that fails, the span from the
        first ``{`` to the last ``}`` is parsed instead, which tolerates
        markdown code fences and prose around the object.

        Raises:
            json.JSONDecodeError: If neither attempt yields valid JSON.
        """
        import json

        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start = text.find("{")
            end = text.rfind("}")
            if start == -1 or end <= start:
                # Nothing that looks like an object; surface the original error.
                raise
            return json.loads(text[start:end + 1])
When to Skip RAG Entirely
With Opus 4.6’s 1M token window, many traditional RAG use cases are better served by direct context loading:
- Company documentation (< 200K tokens): Load it all. No embedding pipeline needed.
- Codebase Q&A (< 500K tokens): Load the full codebase. RAG over code is error-prone.
- Meeting transcripts: Load all transcripts for the relevant time period directly.
- Small knowledge bases: If it fits in standard context, skip the complexity.
RAG remains essential for: large-scale document stores (millions of documents), real-time data integration, multi-tenant systems where each user has different knowledge bases, and cost optimization when most of the knowledge base is irrelevant to any given query.
In the next lesson, you will learn how to deploy Opus 4.6 across cloud providers — AWS Bedrock, Google Vertex AI, and Microsoft Foundry.