Managing context windows effectively is critical for LLM applications that handle long documents, conversations, or complex queries.

Context Window Limits

Model                    Context Window (tokens)    Max Output (tokens)
------------------------------------------------------------------------
GPT-4o                   128,000                    16,384
GPT-4o-mini              128,000                    16,384
Claude 3.5 Sonnet        200,000                    8,192
Gemini 1.5 Pro           2,000,000                  8,192
Llama 3.3 70B            128,000                    4,096
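
These limits matter when budgeting a request: the prompt and the requested output together must fit inside the context window. Below is a minimal sketch of such a check with the table's limits hard-coded; MODEL_LIMITS and fits_budget are illustrative names, not part of any SDK.

# Limits from the table above, in tokens. MODEL_LIMITS and fits_budget
# are illustrative helpers, not part of any SDK.
MODEL_LIMITS = {
    "gpt-4o":            {"context": 128_000,   "output": 16_384},
    "gpt-4o-mini":       {"context": 128_000,   "output": 16_384},
    "claude-3-5-sonnet": {"context": 200_000,   "output": 8_192},
    "gemini-1.5-pro":    {"context": 2_000_000, "output": 8_192},
    "llama-3.3-70b":     {"context": 128_000,   "output": 4_096},
}

def fits_budget(model: str, prompt_tokens: int, max_output_tokens: int) -> bool:
    """Check that the prompt plus requested output fit the model's limits."""
    limits = MODEL_LIMITS[model]
    return (
        prompt_tokens + max_output_tokens <= limits["context"]
        and max_output_tokens <= limits["output"]
    )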

Token Counting

Using tiktoken

import tiktoken
from typing import List, Dict

class TokenCounter:
    """Accurate token counting for OpenAI models"""
    
    ENCODINGS = {
        "gpt-4o": "o200k_base",
        "gpt-4o-mini": "o200k_base",
        "gpt-4-turbo": "cl100k_base",
        "gpt-3.5-turbo": "cl100k_base",
        "text-embedding-3-small": "cl100k_base",
        "text-embedding-3-large": "cl100k_base"
    }
    
    def __init__(self, model: str = "gpt-4o"):
        encoding_name = self.ENCODINGS.get(model, "o200k_base")
        self.encoding = tiktoken.get_encoding(encoding_name)
        self.model = model
    
    def count(self, text: str) -> int:
        """Count tokens in text"""
        return len(self.encoding.encode(text))
    
    def count_messages(self, messages: List[Dict[str, str]]) -> int:
        """Approximate token count for chat messages, including overhead.

        Per-message and reply-priming overheads follow OpenAI's
        token-counting cookbook; exact values vary by model, so treat
        the result as an estimate.
        """
        tokens = 0
        
        for message in messages:
            tokens += 3  # Per-message overhead (role/content framing)
            for key, value in message.items():
                tokens += self.count(str(value))
                if key == "name":
                    tokens += 1  # The optional name field costs one extra token
        
        tokens += 3  # Every reply is primed with <|start|>assistant<|message|>
        
        return tokens
    
    def truncate_to_limit(
        self,
        text: str,
        max_tokens: int,
        from_end: bool = False
    ) -> str:
        """Truncate text to token limit"""
        tokens = self.encoding.encode(text)
        
        if len(tokens) <= max_tokens:
            return text
        
        if from_end:
            truncated = tokens[-max_tokens:]
        else:
            truncated = tokens[:max_tokens]
        
        return self.encoding.decode(truncated)
    
    def split_by_tokens(
        self,
        text: str,
        chunk_size: int,
        overlap: int = 0
    ) -> List[str]:
        """Split text into chunks by token count"""
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")
        
        tokens = self.encoding.encode(text)
        chunks = []
        
        start = 0
        while start < len(tokens):
            end = start + chunk_size
            chunks.append(self.encoding.decode(tokens[start:end]))
            if end >= len(tokens):
                break  # Stop here to avoid a trailing overlap-only chunk
            start = end - overlap
        
        return chunks

# Usage
counter = TokenCounter("gpt-4o")

text = "Your long document here..."
token_count = counter.count(text)
print(f"Token count: {token_count}")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
]
message_tokens = counter.count_messages(messages)
print(f"Message tokens: {message_tokens}")

Context Compression

LLMLingua Compression

from llmlingua import PromptCompressor
from typing import Optional

class ContextCompressor:
    """Compress context while preserving meaning"""
    
    def __init__(
        self,
        model_name: str = "microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
        target_ratio: float = 0.5
    ):
        self.compressor = PromptCompressor(
            model_name=model_name,
            use_llmlingua2=True
        )
        self.target_ratio = target_ratio
    
    def compress(
        self,
        context: str,
        question: Optional[str] = None,
        rate: Optional[float] = None
    ) -> dict:
        """Compress context text"""
        
        result = self.compressor.compress_prompt(
            context,
            instruction=question or "",
            question=question or "",
            rate=rate or self.target_ratio
        )
        
        original = result["origin_tokens"]
        compressed = result["compressed_tokens"]
        
        return {
            "compressed": result["compressed_prompt"],
            "original_tokens": original,
            "compressed_tokens": compressed,
            # Compute the ratio ourselves: llmlingua reports "ratio"
            # as a formatted string (e.g. "3.0x"), not a float.
            "ratio": compressed / original if original else 0.0
        }

# Usage
compressor = ContextCompressor(target_ratio=0.3)

long_context = """
Machine learning is a subset of artificial intelligence that enables 
systems to learn and improve from experience without being explicitly 
programmed. It focuses on developing algorithms that can access data 
and use it to learn for themselves...
"""

result = compressor.compress(
    context=long_context,
    question="What is machine learning?"
)

print(f"Compression ratio: {result['ratio']:.2%}")
print(f"Original: {result['original_tokens']} tokens")
print(f"Compressed: {result['compressed_tokens']} tokens")

Extractive Compression

from openai import OpenAI
from typing import List

client = OpenAI()

class ExtractiveCompressor:
    """Extract relevant sentences for compression"""
    
    def __init__(self, target_sentences: int = 5):
        self.target_sentences = target_sentences
    
    def compress(
        self,
        context: str,
        query: str
    ) -> str:
        """Extract most relevant sentences"""
        
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": f"""Extract the {self.target_sentences} most relevant sentences from the context that help answer the query. Return only the extracted sentences, one per line."""
                },
                {
                    "role": "user",
                    "content": f"Query: {query}\n\nContext:\n{context}"
                }
            ],
            temperature=0
        )
        
        return response.choices[0].message.content

# Usage
extractor = ExtractiveCompressor(target_sentences=3)
compressed = extractor.compress(
    context=long_context,  # the sample context defined above
    query="What are the key benefits?"
)
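
You can verify the savings with the TokenCounter defined earlier:

counter = TokenCounter("gpt-4o-mini")
before = counter.count(long_context)
after = counter.count(compressed)
print(f"Reduced {before} -> {after} tokens ({after / before:.0%} of original)")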

Sliding Window Strategies

from typing import Callable, List, Optional
from dataclasses import dataclass

@dataclass
class WindowConfig:
    max_tokens: int = 4000
    overlap_tokens: int = 200
    preserve_system: bool = True
    preserve_recent: int = 5  # Always keep last N messages

class SlidingWindowManager:
    """Manage conversation with sliding window"""
    
    def __init__(self, config: Optional[WindowConfig] = None):
        self.config = config or WindowConfig()
        self.counter = TokenCounter()
        self.messages: List[dict] = []
        self.system_message: Optional[dict] = None
    
    def add_message(self, role: str, content: str):
        """Add message and apply window if needed"""
        message = {"role": role, "content": content}
        
        if role == "system":
            self.system_message = message
        else:
            self.messages.append(message)
        
        self._apply_window()
    
    def _apply_window(self):
        """Trim messages to fit window"""
        if not self.messages:
            return
        
        # Calculate current token count
        all_messages = self._get_all_messages()
        total_tokens = self.counter.count_messages(all_messages)
        
        if total_tokens <= self.config.max_tokens:
            return
        
        # Keep system message and recent messages
        preserved = self.messages[-self.config.preserve_recent:]
        trimmable = self.messages[:-self.config.preserve_recent]
        
        # Remove oldest messages until within limit
        while trimmable and total_tokens > self.config.max_tokens:
            trimmable.pop(0)
            self.messages = trimmable + preserved
            all_messages = self._get_all_messages()
            total_tokens = self.counter.count_messages(all_messages)
    
    def _get_all_messages(self) -> List[dict]:
        messages = []
        if self.system_message:
            messages.append(self.system_message)
        messages.extend(self.messages)
        return messages
    
    def get_messages(self) -> List[dict]:
        return self._get_all_messages()
    
    def get_token_count(self) -> int:
        return self.counter.count_messages(self._get_all_messages())

# Chunked processing for long documents
class ChunkedProcessor:
    """Process long documents in chunks with overlap"""
    
    def __init__(
        self,
        max_chunk_tokens: int = 4000,
        overlap_tokens: int = 200
    ):
        self.max_chunk_tokens = max_chunk_tokens
        self.overlap_tokens = overlap_tokens
        self.counter = TokenCounter()
    
    def process_document(
        self,
        document: str,
        process_fn: Callable,
        aggregate_fn: Optional[Callable] = None
    ) -> List:
        """Process document in chunks"""
        
        chunks = self.counter.split_by_tokens(
            document,
            self.max_chunk_tokens,
            self.overlap_tokens
        )
        
        results = []
        for i, chunk in enumerate(chunks):
            result = process_fn(chunk, chunk_index=i, total_chunks=len(chunks))
            results.append(result)
        
        if aggregate_fn:
            return aggregate_fn(results)
        
        return results
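
Neither class includes a usage snippet above, so here is a minimal sketch; the summarize_chunk stub is a hypothetical stand-in for a real LLM call.

# Usage (sketch)
manager = SlidingWindowManager(WindowConfig(max_tokens=4000, preserve_recent=5))
manager.add_message("system", "You are a helpful assistant.")
manager.add_message("user", "Let's review the design document.")
print(f"Window tokens: {manager.get_token_count()}")

def summarize_chunk(chunk: str, chunk_index: int, total_chunks: int) -> str:
    # Hypothetical stub; a real implementation would call an LLM here
    return f"[summary of chunk {chunk_index + 1}/{total_chunks}]"

processor = ChunkedProcessor(max_chunk_tokens=4000, overlap_tokens=200)
summaries = processor.process_document("Your long document here...", summarize_chunk)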

Summarization Strategies

Hierarchical Summarization

from openai import OpenAI
from typing import List, Optional

client = OpenAI()

class HierarchicalSummarizer:
    """Summarize long documents hierarchically"""
    
    def __init__(
        self,
        chunk_size: int = 4000,
        summary_ratio: float = 0.3
    ):
        self.chunk_size = chunk_size
        self.summary_ratio = summary_ratio
        self.counter = TokenCounter()
    
    def _summarize_chunk(self, chunk: str, max_tokens: int) -> str:
        """Summarize a single chunk"""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Summarize the following text concisely while preserving key information."
                },
                {"role": "user", "content": chunk}
            ],
            max_tokens=max_tokens,
            temperature=0.3
        )
        return response.choices[0].message.content
    
    def summarize(self, document: str) -> str:
        """Hierarchically summarize document"""
        
        doc_tokens = self.counter.count(document)
        
        # If short enough, summarize directly
        if doc_tokens <= self.chunk_size:
            target_tokens = int(doc_tokens * self.summary_ratio)
            return self._summarize_chunk(document, target_tokens)
        
        # Split into chunks
        chunks = self.counter.split_by_tokens(document, self.chunk_size)
        
        # Summarize each chunk
        summaries = []
        for chunk in chunks:
            chunk_tokens = self.counter.count(chunk)
            target_tokens = int(chunk_tokens * self.summary_ratio)
            summary = self._summarize_chunk(chunk, max(target_tokens, 100))
            summaries.append(summary)
        
        # Combine summaries
        combined = "\n\n".join(summaries)
        
        # Recursively summarize if still too long
        if self.counter.count(combined) > self.chunk_size:
            return self.summarize(combined)
        
        return combined

# Map-Reduce Summarization
class MapReduceSummarizer:
    """Map-reduce style summarization"""
    
    def __init__(self, chunk_size: int = 4000):
        self.chunk_size = chunk_size
        self.counter = TokenCounter()
    
    def summarize(
        self,
        document: str,
        focus_query: Optional[str] = None
    ) -> str:
        """Summarize with optional focus"""
        
        chunks = self.counter.split_by_tokens(document, self.chunk_size)
        
        # Map: Extract key points from each chunk
        key_points = []
        for chunk in chunks:
            system = "Extract key points from this text."
            if focus_query:
                system += f" Focus on information related to: {focus_query}"
            
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": chunk}
                ],
                temperature=0.3
            )
            key_points.append(response.choices[0].message.content)
        
        # Reduce: Combine key points
        combined_points = "\n\n".join(key_points)
        
        reduce_prompt = "Synthesize these key points into a coherent summary."
        if focus_query:
            reduce_prompt += f" Focus on: {focus_query}"
        
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": reduce_prompt},
                {"role": "user", "content": combined_points}
            ],
            temperature=0.3
        )
        
        return response.choices[0].message.content
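
Both summarizers are invoked the same way; a short usage sketch follows (the input file name is hypothetical).

# Usage (sketch)
with open("report.txt") as f:  # hypothetical input file
    document = f.read()

summarizer = HierarchicalSummarizer(chunk_size=4000, summary_ratio=0.3)
general_summary = summarizer.summarize(document)

focused_summary = MapReduceSummarizer(chunk_size=4000).summarize(
    document,
    focus_query="financial performance"
)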

Conversation Memory Management

from typing import List, Optional
from dataclasses import dataclass
from datetime import datetime

@dataclass
class ConversationTurn:
    role: str
    content: str
    timestamp: datetime
    token_count: int
    summary: Optional[str] = None

class ConversationMemory:
    """Manage long conversations with summarization"""
    
    def __init__(
        self,
        max_tokens: int = 4000,
        summary_threshold: int = 2000,
        keep_recent: int = 4
    ):
        self.max_tokens = max_tokens
        self.summary_threshold = summary_threshold
        self.keep_recent = keep_recent
        self.counter = TokenCounter()
        
        self.turns: List[ConversationTurn] = []
        self.running_summary: str = ""
        self.system_message: Optional[str] = None
    
    def set_system(self, content: str):
        self.system_message = content
    
    def add_turn(self, role: str, content: str):
        """Add a conversation turn"""
        turn = ConversationTurn(
            role=role,
            content=content,
            timestamp=datetime.now(),
            token_count=self.counter.count(content)
        )
        self.turns.append(turn)
        
        # Check if summarization needed
        self._maybe_summarize()
    
    def _maybe_summarize(self):
        """Summarize old turns once past the threshold"""
        total = self._calculate_total_tokens()
        
        # Summarize proactively once past summary_threshold, leaving
        # headroom before max_tokens is reached.
        if total <= self.summary_threshold:
            return
        
        # Summarize older turns
        to_summarize = self.turns[:-self.keep_recent]
        
        if not to_summarize:
            return
        
        # Create summary
        summary_text = self._summarize_turns(to_summarize)
        
        # Update state
        self.running_summary = self._merge_summaries(
            self.running_summary,
            summary_text
        )
        self.turns = self.turns[-self.keep_recent:]
    
    def _calculate_total_tokens(self) -> int:
        total = 0
        if self.system_message:
            total += self.counter.count(self.system_message)
        if self.running_summary:
            total += self.counter.count(self.running_summary)
        for turn in self.turns:
            total += turn.token_count
        return total
    
    def _summarize_turns(self, turns: List[ConversationTurn]) -> str:
        """Summarize a list of turns"""
        conversation = "\n".join([
            f"{t.role}: {t.content}"
            for t in turns
        ])
        
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Summarize this conversation, preserving key information, decisions, and context needed for continuity."
                },
                {"role": "user", "content": conversation}
            ],
            temperature=0.3
        )
        
        return response.choices[0].message.content
    
    def _merge_summaries(self, old: str, new: str) -> str:
        """Merge old and new summaries"""
        if not old:
            return new
        
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Merge these two conversation summaries into one coherent summary."
                },
                {"role": "user", "content": f"Previous summary:\n{old}\n\nNew summary:\n{new}"}
            ],
            temperature=0.3
        )
        
        return response.choices[0].message.content
    
    def get_messages(self) -> List[dict]:
        """Get messages for API call"""
        messages = []
        
        if self.system_message:
            messages.append({
                "role": "system",
                "content": self.system_message
            })
        
        if self.running_summary:
            messages.append({
                "role": "system",
                "content": f"Previous conversation summary: {self.running_summary}"
            })
        
        for turn in self.turns:
            messages.append({
                "role": turn.role,
                "content": turn.content
            })
        
        return messages
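
A short usage sketch; the assistant reply comes from a normal chat completion call.

# Usage (sketch)
memory = ConversationMemory(max_tokens=4000, keep_recent=4)
memory.set_system("You are a helpful assistant.")
memory.add_turn("user", "What's the capital of France?")

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=memory.get_messages()
)
memory.add_turn("assistant", response.choices[0].message.content)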

Dynamic Context Selection

from typing import Callable, List, Optional
from dataclasses import dataclass

@dataclass
class ContextItem:
    content: str
    relevance: float
    token_count: int
    source: str

class DynamicContextManager:
    """Select relevant context within token budget"""
    
    def __init__(self, max_tokens: int = 4000):
        self.max_tokens = max_tokens
        self.counter = TokenCounter()
    
    def select_context(
        self,
        query: str,
        items: List[ContextItem],
        reserve_tokens: int = 500  # Reserve for query and response
    ) -> List[ContextItem]:
        """Select context items within budget.
        
        The query parameter is accepted for API symmetry; selection
        currently relies on each item's precomputed relevance score.
        """
        
        available_tokens = self.max_tokens - reserve_tokens
        
        # Sort by relevance
        sorted_items = sorted(items, key=lambda x: x.relevance, reverse=True)
        
        selected = []
        used_tokens = 0
        
        for item in sorted_items:
            if used_tokens + item.token_count <= available_tokens:
                selected.append(item)
                used_tokens += item.token_count
        
        return selected
    
    def build_context(
        self,
        query: str,
        items: List[ContextItem],
        format_fn: Optional[Callable[[List[ContextItem]], str]] = None
    ) -> str:
        """Build context string from selected items"""
        
        selected = self.select_context(query, items)
        
        if format_fn:
            return format_fn(selected)
        
        # Default formatting
        context_parts = []
        for item in selected:
            context_parts.append(f"[Source: {item.source}]\n{item.content}")
        
        return "\n\n---\n\n".join(context_parts)

Token Usage Summary

Strategy              Use Case                Token Savings
------------------------------------------------------------
Sliding Window        Long conversations      50-70%
Summarization         Document processing     60-80%
Compression           Context reduction       30-70%
Dynamic Selection     RAG context             Variable
Chunked Processing    Long documents          N/A (enables longer inputs)

What's Next

LLM Testing: learn testing strategies for LLM applications.