Context Window Limits
| Model | Context Window (tokens) | Output Limit (tokens) |
|---|---|---|
| GPT-4o | 128,000 | 16,384 |
| GPT-4o-mini | 128,000 | 16,384 |
| Claude 3.5 Sonnet | 200,000 | 8,192 |
| Gemini 1.5 Pro | 2,000,000 | 8,192 |
| Llama 3.3 70B | 128,000 | 4,096 |
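Because the window covers input and output together, it helps to budget both sides before a call. A minimal sketch of such a check (the `MODEL_LIMITS` dict simply mirrors the table above; `fits_context` is an illustrative helper, not part of any SDK):

```python
# Illustrative limits mirroring the table above (values in tokens)
MODEL_LIMITS = {
    "gpt-4o":            {"context": 128_000,   "output": 16_384},
    "gpt-4o-mini":       {"context": 128_000,   "output": 16_384},
    "claude-3.5-sonnet": {"context": 200_000,   "output": 8_192},
    "gemini-1.5-pro":    {"context": 2_000_000, "output": 8_192},
    "llama-3.3-70b":     {"context": 128_000,   "output": 4_096},
}

def fits_context(model: str, prompt_tokens: int, max_output_tokens: int) -> bool:
    """Check that prompt plus requested output stays inside the model's window."""
    limits = MODEL_LIMITS[model]
    return (
        prompt_tokens + max_output_tokens <= limits["context"]
        and max_output_tokens <= limits["output"]
    )
```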
Token Counting
Using tiktoken
```python
import tiktoken
from typing import List, Dict


class TokenCounter:
    """Accurate token counting for OpenAI models"""

    ENCODINGS = {
        "gpt-4o": "o200k_base",
        "gpt-4o-mini": "o200k_base",
        "gpt-4-turbo": "cl100k_base",
        "gpt-3.5-turbo": "cl100k_base",
        "text-embedding-3-small": "cl100k_base",
        "text-embedding-3-large": "cl100k_base",
    }

    def __init__(self, model: str = "gpt-4o"):
        encoding_name = self.ENCODINGS.get(model, "o200k_base")
        self.encoding = tiktoken.get_encoding(encoding_name)
        self.model = model

    def count(self, text: str) -> int:
        """Count tokens in text"""
        return len(self.encoding.encode(text))

    def count_messages(self, messages: List[Dict[str, str]]) -> int:
        """Approximate token count for chat messages, including per-message overhead"""
        tokens = 0
        for message in messages:
            # Approximate per-message overhead: <|im_start|>{role}\n{content}<|im_end|>\n
            tokens += 4
            for key, value in message.items():
                tokens += self.count(str(value))
        tokens += 2  # Priming for the assistant response
        return tokens

    def truncate_to_limit(
        self,
        text: str,
        max_tokens: int,
        from_end: bool = False
    ) -> str:
        """Truncate text to a token limit, keeping the start (or the end if from_end=True)"""
        tokens = self.encoding.encode(text)
        if len(tokens) <= max_tokens:
            return text
        truncated = tokens[-max_tokens:] if from_end else tokens[:max_tokens]
        return self.encoding.decode(truncated)

    def split_by_tokens(
        self,
        text: str,
        chunk_size: int,
        overlap: int = 0
    ) -> List[str]:
        """Split text into chunks by token count, with optional overlap"""
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")
        tokens = self.encoding.encode(text)
        chunks = []
        start = 0
        while start < len(tokens):
            end = start + chunk_size
            chunks.append(self.encoding.decode(tokens[start:end]))
            if end >= len(tokens):
                break  # Last chunk reached; avoid a trailing chunk of pure overlap
            start = end - overlap
        return chunks


# Usage
counter = TokenCounter("gpt-4o")

text = "Your long document here..."
token_count = counter.count(text)
print(f"Token count: {token_count}")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
]
message_tokens = counter.count_messages(messages)
print(f"Message tokens: {message_tokens}")
```
Context Compression
LLMLingua Compression
```python
from typing import Optional

from llmlingua import PromptCompressor


class ContextCompressor:
    """Compress context while preserving meaning"""

    def __init__(
        self,
        model_name: str = "microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
        target_ratio: float = 0.5
    ):
        self.compressor = PromptCompressor(
            model_name=model_name,
            use_llmlingua2=True
        )
        self.target_ratio = target_ratio

    def compress(
        self,
        context: str,
        question: Optional[str] = None,
        rate: Optional[float] = None
    ) -> dict:
        """Compress context text"""
        result = self.compressor.compress_prompt(
            context,
            instruction=question or "",
            question=question or "",
            rate=rate or self.target_ratio,
            condition_compare=True,
            condition_in_question="after"
        )
        return {
            "compressed": result["compressed_prompt"],
            "original_tokens": result["origin_tokens"],
            "compressed_tokens": result["compressed_tokens"],
            "ratio": result["ratio"]
        }


# Usage
compressor = ContextCompressor(target_ratio=0.3)

long_context = """
Machine learning is a subset of artificial intelligence that enables
systems to learn and improve from experience without being explicitly
programmed. It focuses on developing algorithms that can access data
and use it to learn for themselves...
"""

result = compressor.compress(
    context=long_context,
    question="What is machine learning?"
)
print(f"Compression ratio: {result['ratio']}")
print(f"Original: {result['original_tokens']} tokens")
print(f"Compressed: {result['compressed_tokens']} tokens")
```
Extractive Compression
```python
from openai import OpenAI

client = OpenAI()


class ExtractiveCompressor:
    """Extract relevant sentences for compression"""

    def __init__(self, target_sentences: int = 5):
        self.target_sentences = target_sentences

    def compress(
        self,
        context: str,
        query: str
    ) -> str:
        """Extract the most relevant sentences"""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": (
                        f"Extract the {self.target_sentences} most relevant sentences "
                        "from the context that help answer the query. "
                        "Return only the extracted sentences, one per line."
                    )
                },
                {
                    "role": "user",
                    "content": f"Query: {query}\n\nContext:\n{context}"
                }
            ],
            temperature=0
        )
        return response.choices[0].message.content


# Usage
long_document = "Your long document here..."  # Placeholder for your own text

extractor = ExtractiveCompressor(target_sentences=3)
compressed = extractor.compress(
    context=long_document,
    query="What are the key benefits?"
)
```
Sliding Window Strategies
```python
from dataclasses import dataclass
from typing import Any, Callable, List, Optional


@dataclass
class WindowConfig:
    max_tokens: int = 4000
    overlap_tokens: int = 200
    preserve_system: bool = True
    preserve_recent: int = 5  # Always keep the last N messages


class SlidingWindowManager:
    """Manage a conversation with a sliding window"""

    def __init__(self, config: Optional[WindowConfig] = None):
        self.config = config or WindowConfig()
        self.counter = TokenCounter()
        self.messages: List[dict] = []
        self.system_message: Optional[dict] = None

    def add_message(self, role: str, content: str):
        """Add a message and apply the window if needed"""
        message = {"role": role, "content": content}
        if role == "system":
            self.system_message = message
        else:
            self.messages.append(message)
        self._apply_window()

    def _apply_window(self):
        """Trim messages to fit the window"""
        if not self.messages:
            return
        # Calculate the current token count
        total_tokens = self.counter.count_messages(self._get_all_messages())
        if total_tokens <= self.config.max_tokens:
            return
        # Keep the system message and the most recent messages
        preserved = self.messages[-self.config.preserve_recent:]
        trimmable = self.messages[:-self.config.preserve_recent]
        # Remove the oldest messages until within the limit
        while trimmable and total_tokens > self.config.max_tokens:
            trimmable.pop(0)
            self.messages = trimmable + preserved
            total_tokens = self.counter.count_messages(self._get_all_messages())

    def _get_all_messages(self) -> List[dict]:
        messages = []
        if self.system_message:
            messages.append(self.system_message)
        messages.extend(self.messages)
        return messages

    def get_messages(self) -> List[dict]:
        return self._get_all_messages()

    def get_token_count(self) -> int:
        return self.counter.count_messages(self._get_all_messages())


# Chunked processing for long documents
class ChunkedProcessor:
    """Process long documents in chunks with overlap"""

    def __init__(
        self,
        max_chunk_tokens: int = 4000,
        overlap_tokens: int = 200
    ):
        self.max_chunk_tokens = max_chunk_tokens
        self.overlap_tokens = overlap_tokens
        self.counter = TokenCounter()

    def process_document(
        self,
        document: str,
        process_fn: Callable[..., Any],
        aggregate_fn: Optional[Callable[[List[Any]], Any]] = None
    ) -> Any:
        """Process a document in chunks"""
        chunks = self.counter.split_by_tokens(
            document,
            self.max_chunk_tokens,
            self.overlap_tokens
        )
        results = []
        for i, chunk in enumerate(chunks):
            result = process_fn(chunk, chunk_index=i, total_chunks=len(chunks))
            results.append(result)
        if aggregate_fn:
            return aggregate_fn(results)
        return results
```
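A brief usage sketch for both classes (the `summarize_chunk` callback is a stand-in for whatever per-chunk processing you run):

```python
# Sliding window over a conversation
window = SlidingWindowManager(WindowConfig(max_tokens=4000, preserve_recent=5))
window.add_message("system", "You are a helpful assistant.")
window.add_message("user", "Summarize our discussion so far.")
print(f"Window tokens: {window.get_token_count()}")

# Chunked processing of a long document
def summarize_chunk(chunk: str, chunk_index: int, total_chunks: int) -> str:
    # Placeholder: call your model of choice on each chunk here
    return f"[summary of chunk {chunk_index + 1}/{total_chunks}]"

processor = ChunkedProcessor(max_chunk_tokens=4000, overlap_tokens=200)
combined = processor.process_document(
    "Your long document here...",
    process_fn=summarize_chunk,
    aggregate_fn=lambda results: "\n".join(results)
)
```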
Summarization Strategies
Hierarchical Summarization
```python
from typing import Optional

from openai import OpenAI

client = OpenAI()


class HierarchicalSummarizer:
    """Summarize long documents hierarchically"""

    def __init__(
        self,
        chunk_size: int = 4000,
        summary_ratio: float = 0.3
    ):
        self.chunk_size = chunk_size
        self.summary_ratio = summary_ratio
        self.counter = TokenCounter()

    def _summarize_chunk(self, chunk: str, max_tokens: int) -> str:
        """Summarize a single chunk"""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Summarize the following text concisely while preserving key information."
                },
                {"role": "user", "content": chunk}
            ],
            max_tokens=max_tokens,
            temperature=0.3
        )
        return response.choices[0].message.content

    def summarize(self, document: str) -> str:
        """Hierarchically summarize a document"""
        doc_tokens = self.counter.count(document)
        # If short enough, summarize directly
        if doc_tokens <= self.chunk_size:
            target_tokens = max(int(doc_tokens * self.summary_ratio), 100)
            return self._summarize_chunk(document, target_tokens)
        # Split into chunks and summarize each one
        chunks = self.counter.split_by_tokens(document, self.chunk_size)
        summaries = []
        for chunk in chunks:
            chunk_tokens = self.counter.count(chunk)
            target_tokens = int(chunk_tokens * self.summary_ratio)
            summary = self._summarize_chunk(chunk, max(target_tokens, 100))
            summaries.append(summary)
        # Combine summaries
        combined = "\n\n".join(summaries)
        # Recursively summarize if still too long
        if self.counter.count(combined) > self.chunk_size:
            return self.summarize(combined)
        return combined


# Map-reduce summarization
class MapReduceSummarizer:
    """Map-reduce style summarization"""

    def __init__(self, chunk_size: int = 4000):
        self.chunk_size = chunk_size
        self.counter = TokenCounter()

    def summarize(
        self,
        document: str,
        focus_query: Optional[str] = None
    ) -> str:
        """Summarize with an optional focus"""
        chunks = self.counter.split_by_tokens(document, self.chunk_size)
        # Map: extract key points from each chunk
        key_points = []
        for chunk in chunks:
            system = "Extract key points from this text."
            if focus_query:
                system += f" Focus on information related to: {focus_query}"
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": chunk}
                ],
                temperature=0.3
            )
            key_points.append(response.choices[0].message.content)
        # Reduce: combine the key points into one summary
        combined_points = "\n\n".join(key_points)
        reduce_prompt = "Synthesize these key points into a coherent summary."
        if focus_query:
            reduce_prompt += f" Focus on: {focus_query}"
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": reduce_prompt},
                {"role": "user", "content": combined_points}
            ],
            temperature=0.3
        )
        return response.choices[0].message.content
```
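Usage is the same for both summarizers (`long_document` is a placeholder for your own text):

```python
long_document = "Your long document here..."

summarizer = HierarchicalSummarizer(chunk_size=4000, summary_ratio=0.3)
summary = summarizer.summarize(long_document)

mr_summarizer = MapReduceSummarizer(chunk_size=4000)
focused_summary = mr_summarizer.summarize(
    long_document,
    focus_query="key findings and recommendations"
)
```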
Conversation Memory Management
```python
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional

from openai import OpenAI

client = OpenAI()


@dataclass
class ConversationTurn:
    role: str
    content: str
    timestamp: datetime
    token_count: int
    summary: Optional[str] = None


class ConversationMemory:
    """Manage long conversations with summarization"""

    def __init__(
        self,
        max_tokens: int = 4000,
        summary_threshold: int = 2000,
        keep_recent: int = 4
    ):
        self.max_tokens = max_tokens
        self.summary_threshold = summary_threshold
        self.keep_recent = keep_recent
        self.counter = TokenCounter()
        self.turns: List[ConversationTurn] = []
        self.running_summary: str = ""
        self.system_message: Optional[str] = None

    def set_system(self, content: str):
        self.system_message = content

    def add_turn(self, role: str, content: str):
        """Add a conversation turn"""
        turn = ConversationTurn(
            role=role,
            content=content,
            timestamp=datetime.now(),
            token_count=self.counter.count(content)
        )
        self.turns.append(turn)
        # Check whether summarization is needed
        self._maybe_summarize()

    def _maybe_summarize(self):
        """Summarize older turns once the conversation passes the threshold"""
        total = self._calculate_total_tokens()
        # Compact early (at summary_threshold) so we stay well under max_tokens
        if total <= self.summary_threshold:
            return
        # Summarize the older turns
        to_summarize = self.turns[:-self.keep_recent]
        if not to_summarize:
            return
        # Create the summary and fold it into the running summary
        summary_text = self._summarize_turns(to_summarize)
        self.running_summary = self._merge_summaries(
            self.running_summary,
            summary_text
        )
        self.turns = self.turns[-self.keep_recent:]

    def _calculate_total_tokens(self) -> int:
        total = 0
        if self.system_message:
            total += self.counter.count(self.system_message)
        if self.running_summary:
            total += self.counter.count(self.running_summary)
        for turn in self.turns:
            total += turn.token_count
        return total

    def _summarize_turns(self, turns: List[ConversationTurn]) -> str:
        """Summarize a list of turns"""
        conversation = "\n".join(
            f"{t.role}: {t.content}" for t in turns
        )
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Summarize this conversation, preserving key information, decisions, and context needed for continuity."
                },
                {"role": "user", "content": conversation}
            ],
            temperature=0.3
        )
        return response.choices[0].message.content

    def _merge_summaries(self, old: str, new: str) -> str:
        """Merge the old and new summaries"""
        if not old:
            return new
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Merge these two conversation summaries into one coherent summary."
                },
                {"role": "user", "content": f"Previous summary:\n{old}\n\nNew summary:\n{new}"}
            ],
            temperature=0.3
        )
        return response.choices[0].message.content

    def get_messages(self) -> List[dict]:
        """Get messages for an API call"""
        messages = []
        if self.system_message:
            messages.append({
                "role": "system",
                "content": self.system_message
            })
        if self.running_summary:
            messages.append({
                "role": "system",
                "content": f"Previous conversation summary: {self.running_summary}"
            })
        for turn in self.turns:
            messages.append({
                "role": turn.role,
                "content": turn.content
            })
        return messages
```
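In use, the memory sits between your application and the chat API: record each turn, then send `get_messages()` as the conversation. A minimal sketch:

```python
memory = ConversationMemory(max_tokens=4000, keep_recent=4)
memory.set_system("You are a helpful assistant.")

memory.add_turn("user", "Let's plan the Q3 roadmap.")
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=memory.get_messages()
)
reply = response.choices[0].message.content
memory.add_turn("assistant", reply)
```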
Dynamic Context Selection
```python
from dataclasses import dataclass
from typing import Callable, List, Optional


@dataclass
class ContextItem:
    content: str
    relevance: float
    token_count: int
    source: str


class DynamicContextManager:
    """Select relevant context within a token budget"""

    def __init__(self, max_tokens: int = 4000):
        self.max_tokens = max_tokens
        self.counter = TokenCounter()

    def select_context(
        self,
        query: str,
        items: List[ContextItem],
        reserve_tokens: int = 500  # Reserved for the query and response
    ) -> List[ContextItem]:
        """Greedily select the highest-relevance items that fit the budget"""
        available_tokens = self.max_tokens - reserve_tokens
        # Sort by relevance (scores assumed precomputed, e.g. by a retriever)
        sorted_items = sorted(items, key=lambda x: x.relevance, reverse=True)
        selected = []
        used_tokens = 0
        for item in sorted_items:
            if used_tokens + item.token_count <= available_tokens:
                selected.append(item)
                used_tokens += item.token_count
        return selected

    def build_context(
        self,
        query: str,
        items: List[ContextItem],
        format_fn: Optional[Callable[[List[ContextItem]], str]] = None
    ) -> str:
        """Build a context string from the selected items"""
        selected = self.select_context(query, items)
        if format_fn:
            return format_fn(selected)
        # Default formatting
        context_parts = [
            f"[Source: {item.source}]\n{item.content}"
            for item in selected
        ]
        return "\n\n---\n\n".join(context_parts)
```
Token Usage Summary
| Strategy | Use Case | Token Savings |
|---|---|---|
| Sliding Window | Long conversations | 50-70% |
| Summarization | Document processing | 60-80% |
| Compression | Context reduction | 30-70% |
| Dynamic Selection | RAG context | Variable |
| Chunked Processing | Long documents | N/A (enables inputs beyond the window) |
What's Next
LLM Testing
Learn testing strategies for LLM applications