December 2025 Update: Complete guide to memory architectures for LLMs, from conversation buffers to persistent vector memory.

Why Memory Matters

Without memory, every LLM interaction starts fresh:
Without Memory                 With Memory
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
User: "I'm John"               User: "I'm John"
AI: "Nice to meet you!"        AI: "Nice to meet you, John!"

User: "What's my name?"        User: "What's my name?"
AI: "I don't know."            AI: "Your name is John."

Memory Architecture Overview

┌─────────────────────────────────────────────────────────────┐
│                      Memory System                          │
├─────────────────────────────────────────────────────────────┤
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────────────┐  │
│  │   Buffer    │  │   Summary   │  │      Vector         │  │
│  │   Memory    │  │   Memory    │  │      Memory         │  │
│  │             │  │             │  │                     │  │
│  │ Last N msgs │  │ Compressed  │  │ Semantic search     │  │
│  │ Full detail │  │ summaries   │  │ over all history    │  │
│  └─────────────┘  └─────────────┘  └─────────────────────┘  │
│         │                │                    │             │
│         └────────────────┴────────────────────┘             │
│                          │                                  │
│                     ┌────▼────┐                             │
│                     │ Context │                             │
│                     │ Builder │                             │
│                     └─────────┘                             │
└─────────────────────────────────────────────────────────────┘
Memory Type   Best For              Limitations
Buffer        Recent context        Limited by context window
Summary       Session overview      Loses detail
Vector        Long-term recall      Retrieval latency
Entity        Facts about things    Structured only

Buffer Memory (Short-Term)

The simplest approach: keep the last N messages.
import time
from dataclasses import dataclass, field
from typing import Optional
from openai import OpenAI

client = OpenAI()

@dataclass
class Message:
    role: str
    content: str
    timestamp: float = field(default_factory=lambda: time.time())

class BufferMemory:
    """Simple buffer memory - keeps last N messages"""
    
    def __init__(self, max_messages: int = 20):
        self.max_messages = max_messages
        self.messages: list[Message] = []
    
    def add(self, role: str, content: str):
        """Add a message to memory"""
        self.messages.append(Message(role=role, content=content))
        
        # Trim if exceeds max
        if len(self.messages) > self.max_messages:
            self.messages = self.messages[-self.max_messages:]
    
    def get_messages(self) -> list[dict]:
        """Get messages for LLM context"""
        return [
            {"role": m.role, "content": m.content}
            for m in self.messages
        ]
    
    def clear(self):
        """Clear all memory"""
        self.messages = []

# Usage
memory = BufferMemory(max_messages=10)

def chat(user_input: str) -> str:
    # Add user message to memory
    memory.add("user", user_input)
    
    # Create messages with history
    messages = [
        {"role": "system", "content": "You are a helpful assistant."}
    ] + memory.get_messages()
    
    # Generate response
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages
    )
    
    assistant_message = response.choices[0].message.content
    
    # Add assistant response to memory
    memory.add("assistant", assistant_message)
    
    return assistant_message
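
Mirroring the exchange from the intro, the buffered history lets the model answer the follow-up (a sketch; the exact wording of the replies will vary):
print(chat("Hi, I'm John."))     # greets John
print(chat("What's my name?"))   # can answer, because the first turn is resent from the buffer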

Token-Based Buffer

Limit by tokens instead of message count:
import tiktoken

class TokenBufferMemory:
    """Buffer memory with token limit"""
    
    def __init__(self, max_tokens: int = 4000, model: str = "gpt-4o"):
        self.max_tokens = max_tokens
        self.encoder = tiktoken.encoding_for_model(model)
        self.messages: list[Message] = []
    
    def _count_tokens(self, text: str) -> int:
        return len(self.encoder.encode(text))
    
    def _total_tokens(self) -> int:
        return sum(
            self._count_tokens(m.content) + 4  # +4 for message overhead
            for m in self.messages
        )
    
    def add(self, role: str, content: str):
        self.messages.append(Message(role=role, content=content))
        
        # Trim oldest messages until under token limit
        while self._total_tokens() > self.max_tokens and len(self.messages) > 1:
            self.messages.pop(0)
    
    def get_messages(self) -> list[dict]:
        return [
            {"role": m.role, "content": m.content}
            for m in self.messages
        ]
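
Usage mirrors BufferMemory; only the trimming policy differs. A brief sketch (the 4,000-token budget is an arbitrary choice):
memory = TokenBufferMemory(max_tokens=4000)
memory.add("user", "Here is the full text of the contract: ...")
memory.add("assistant", "Thanks, I've read it. What would you like to know?")

messages = [
    {"role": "system", "content": "You are a helpful assistant."}
] + memory.get_messages()   # oldest turns are dropped once the token budget is exceeded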

Summary Memory

Compress conversation history into summaries:
class SummaryMemory:
    """Memory that summarizes older messages"""
    
    def __init__(self, buffer_size: int = 10):
        self.buffer_size = buffer_size
        self.messages: list[Message] = []
        self.summary: Optional[str] = None
    
    def add(self, role: str, content: str):
        self.messages.append(Message(role=role, content=content))
        
        # Create summary when buffer is full
        if len(self.messages) > self.buffer_size:
            self._update_summary()
    
    def _update_summary(self):
        """Summarize older messages"""
        # Take oldest messages to summarize
        to_summarize = self.messages[:-self.buffer_size//2]
        
        if not to_summarize:
            return
        
        # Format messages for summarization
        conversation = "\n".join([
            f"{m.role}: {m.content}" for m in to_summarize
        ])
        
        # Create summary
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Summarize this conversation, preserving key facts, decisions, and context."
                },
                {"role": "user", "content": conversation}
            ]
        )
        
        new_summary = response.choices[0].message.content
        
        # Combine with existing summary if present
        if self.summary:
            self.summary = f"{self.summary}\n\nUpdated: {new_summary}"
        else:
            self.summary = new_summary
        
        # Keep only recent messages
        self.messages = self.messages[-self.buffer_size//2:]
    
    def get_context(self) -> list[dict]:
        """Get context for LLM"""
        context = []
        
        # Add summary if exists
        if self.summary:
            context.append({
                "role": "system",
                "content": f"Previous conversation summary:\n{self.summary}"
            })
        
        # Add recent messages
        context.extend([
            {"role": m.role, "content": m.content}
            for m in self.messages
        ])
        
        return context

# Usage
memory = SummaryMemory(buffer_size=10)

def chat_with_summary(user_input: str) -> str:
    memory.add("user", user_input)
    
    messages = [
        {"role": "system", "content": "You are a helpful assistant."}
    ] + memory.get_context()
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages
    )
    
    assistant_message = response.choices[0].message.content
    memory.add("assistant", assistant_message)
    
    return assistant_message

Vector Memory (Long-Term)

Store and retrieve memories semantically:
from openai import OpenAI
import numpy as np
from datetime import datetime
import json

client = OpenAI()

class VectorMemory:
    """Long-term memory using vector embeddings"""
    
    def __init__(self, embedding_model: str = "text-embedding-3-small"):
        self.embedding_model = embedding_model
        self.memories: list[dict] = []
        self.embeddings: list[np.ndarray] = []
    
    def _get_embedding(self, text: str) -> np.ndarray:
        response = client.embeddings.create(
            model=self.embedding_model,
            input=text
        )
        return np.array(response.data[0].embedding)
    
    def add(
        self,
        content: str,
        metadata: dict | None = None
    ):
        """Add a memory"""
        embedding = self._get_embedding(content)
        
        memory = {
            "content": content,
            "timestamp": datetime.now().isoformat(),
            "metadata": metadata or {}
        }
        
        self.memories.append(memory)
        self.embeddings.append(embedding)
    
    def search(
        self,
        query: str,
        top_k: int = 5,
        threshold: float = 0.7
    ) -> list[dict]:
        """Search for relevant memories"""
        if not self.memories:
            return []
        
        query_embedding = self._get_embedding(query)
        
        # Calculate cosine similarities
        similarities = []
        for i, emb in enumerate(self.embeddings):
            similarity = np.dot(query_embedding, emb) / (
                np.linalg.norm(query_embedding) * np.linalg.norm(emb)
            )
            similarities.append((i, similarity))
        
        # Sort by similarity
        similarities.sort(key=lambda x: x[1], reverse=True)
        
        # Return top results above threshold
        results = []
        for idx, score in similarities[:top_k]:
            if score >= threshold:
                memory = self.memories[idx].copy()
                memory["similarity"] = score
                results.append(memory)
        
        return results
    
    def add_conversation(self, role: str, content: str):
        """Add a conversation turn as memory"""
        self.add(
            content=f"{role}: {content}",
            metadata={"type": "conversation", "role": role}
        )
    
    def save(self, path: str):
        """Save memories to file"""
        data = {
            "memories": self.memories,
            "embeddings": [e.tolist() for e in self.embeddings]
        }
        with open(path, "w") as f:
            json.dump(data, f)
    
    def load(self, path: str):
        """Load memories from file"""
        with open(path) as f:
            data = json.load(f)
        self.memories = data["memories"]
        self.embeddings = [np.array(e) for e in data["embeddings"]]

# Usage with LLM
vector_memory = VectorMemory()

def chat_with_long_term_memory(user_input: str) -> str:
    # Search for relevant memories
    relevant_memories = vector_memory.search(user_input, top_k=3)
    
    # Build context from memories
    memory_context = ""
    if relevant_memories:
        memory_context = "Relevant memories:\n" + "\n".join([
            f"- {m['content']} (similarity: {m['similarity']:.2f})"
            for m in relevant_memories
        ])
    
    # Add current message to memory
    vector_memory.add_conversation("user", user_input)
    
    # Generate response
    messages = [
        {
            "role": "system",
            "content": f"You are a helpful assistant with long-term memory.\n\n{memory_context}"
        },
        {"role": "user", "content": user_input}
    ]
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages
    )
    
    assistant_message = response.choices[0].message.content
    
    # Add response to memory
    vector_memory.add_conversation("assistant", assistant_message)
    
    return assistant_message

Entity Memory

Track facts about specific entities:
import time
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class EntityInfo:
    entity_type: str
    attributes: dict = field(default_factory=dict)
    last_updated: float = field(default_factory=lambda: time.time())

class EntityMemory:
    """Memory for tracking entities and their attributes"""
    
    def __init__(self):
        self.entities: dict[str, EntityInfo] = {}
    
    def update_entity(
        self,
        name: str,
        entity_type: str,
        attributes: dict
    ):
        """Update or create entity"""
        if name in self.entities:
            self.entities[name].attributes.update(attributes)
            self.entities[name].last_updated = time.time()
        else:
            self.entities[name] = EntityInfo(
                entity_type=entity_type,
                attributes=attributes
            )
    
    def get_entity(self, name: str) -> Optional[EntityInfo]:
        return self.entities.get(name)
    
    def extract_entities_from_text(self, text: str) -> list[dict]:
        """Use LLM to extract entities from text"""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": """Extract entities and their attributes from the text.
                    Return JSON: [{"name": "...", "type": "person|place|thing", "attributes": {...}}]"""
                },
                {"role": "user", "content": text}
            ],
            response_format={"type": "json_object"}
        )
        
        result = json.loads(response.choices[0].message.content)
        return result.get("entities", [])
    
    def process_and_store(self, text: str):
        """Extract and store entities from text"""
        entities = self.extract_entities_from_text(text)
        
        for entity in entities:
            self.update_entity(
                name=entity["name"],
                entity_type=entity["type"],
                attributes=entity.get("attributes", {})
            )
    
    def get_context(self) -> str:
        """Get entity context for LLM"""
        if not self.entities:
            return ""
        
        context = "Known entities:\n"
        for name, info in self.entities.items():
            attrs = ", ".join(
                f"{k}: {v}" for k, v in info.attributes.items()
            )
            context += f"- {name} ({info.entity_type}): {attrs}\n"
        
        return context

# Usage
entity_memory = EntityMemory()

def chat_with_entity_memory(user_input: str) -> str:
    # Extract entities from user input
    entity_memory.process_and_store(user_input)
    
    # Get entity context
    entity_context = entity_memory.get_context()
    
    messages = [
        {
            "role": "system",
            "content": f"You are a helpful assistant.\n\n{entity_context}"
        },
        {"role": "user", "content": user_input}
    ]
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages
    )
    
    assistant_message = response.choices[0].message.content
    
    # Also extract entities from response
    entity_memory.process_and_store(assistant_message)
    
    return assistant_message

Hybrid Memory System

Combine all memory types for maximum effectiveness:
class HybridMemory:
    """Combines buffer, summary, vector, and entity memory"""
    
    def __init__(
        self,
        buffer_size: int = 10,
        vector_threshold: float = 0.75
    ):
        self.buffer = BufferMemory(max_messages=buffer_size)
        self.summary = SummaryMemory(buffer_size=buffer_size * 2)
        self.vector = VectorMemory()
        self.entity = EntityMemory()
        self.vector_threshold = vector_threshold
    
    def add(self, role: str, content: str):
        """Add message to all memory systems"""
        # Short-term
        self.buffer.add(role, content)
        self.summary.add(role, content)
        
        # Long-term
        self.vector.add_conversation(role, content)
        
        # Entity extraction
        self.entity.process_and_store(content)
    
    def get_context(self, query: str) -> dict:
        """Get comprehensive context for query"""
        
        # Recent messages (short-term)
        recent = self.buffer.get_messages()
        
        # Summary of older conversation
        summary = self.summary.summary
        
        # Relevant long-term memories
        long_term = self.vector.search(query, top_k=3, threshold=self.vector_threshold)
        
        # Entity knowledge
        entities = self.entity.get_context()
        
        return {
            "recent_messages": recent,
            "summary": summary,
            "long_term_memories": long_term,
            "entities": entities
        }
    
    def build_messages(
        self,
        query: str,
        system_prompt: str
    ) -> list[dict]:
        """Build message list for LLM"""
        context = self.get_context(query)
        
        # Build system message with context
        system_content = system_prompt
        
        if context["summary"]:
            system_content += f"\n\nConversation summary:\n{context['summary']}"
        
        if context["long_term_memories"]:
            memories = "\n".join([
                f"- {m['content']}" for m in context["long_term_memories"]
            ])
            system_content += f"\n\nRelevant memories:\n{memories}"
        
        if context["entities"]:
            system_content += f"\n\n{context['entities']}"
        
        messages = [{"role": "system", "content": system_content}]
        messages.extend(context["recent_messages"])
        
        return messages
    
    def save(self, path: str):
        """Persist memory to disk"""
        self.vector.save(f"{path}_vector.json")
        # Add more persistence as needed
    
    def load(self, path: str):
        """Load memory from disk"""
        self.vector.load(f"{path}_vector.json")

# Usage
memory = HybridMemory(buffer_size=10)

def chat_with_hybrid_memory(user_input: str) -> str:
    messages = memory.build_messages(
        query=user_input,
        system_prompt="You are a helpful assistant with perfect memory."
    )
    
    # Add current user message
    messages.append({"role": "user", "content": user_input})
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages
    )
    
    assistant_message = response.choices[0].message.content
    
    # Update memory
    memory.add("user", user_input)
    memory.add("assistant", assistant_message)
    
    return assistant_message
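
The vector store is the costly part to rebuild, since every memory required an embedding call, so persist it between sessions. A sketch using the save/load hooks above (the path is illustrative):
# End of a session: write long-term memories to disk (creates "alice_vector.json")
memory.save("alice")

# A later session: restore them before chatting again
memory = HybridMemory(buffer_size=10)
memory.load("alice")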

Memory with LangChain

LangChain ships built-in memory classes that implement these patterns (newer LangChain releases deprecate them in favor of LangGraph persistence, but they remain a concise illustration):
from langchain_openai import ChatOpenAI
from langchain.memory import (
    ConversationBufferMemory,
    ConversationSummaryMemory,
    ConversationBufferWindowMemory,
    VectorStoreRetrieverMemory
)
from langchain.chains import ConversationChain
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

llm = ChatOpenAI(model="gpt-4o")

# Buffer Memory
buffer_memory = ConversationBufferMemory()

# Window Memory (last k interactions)
window_memory = ConversationBufferWindowMemory(k=5)

# Summary Memory
summary_memory = ConversationSummaryMemory(llm=llm)

# Vector Store Memory
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_texts([""], embeddings)
vector_memory = VectorStoreRetrieverMemory(
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
)

# Use in chain
chain = ConversationChain(
    llm=llm,
    memory=summary_memory,
    verbose=True
)

response = chain.predict(input="Hi, I'm Alice!")
response = chain.predict(input="What's my name?")

Key Takeaways

Choose the Right Memory

Buffer for recent context, summary for long sessions, vector for long-term recall, entity memory for structured facts

Hybrid is Best

Combine memory types for comprehensive context

Persist Important Data

Store long-term memories and entities to disk

Token Management

Memory context competes with the user's prompt for the context window and drives cost; measure it before each call, as in the sketch below
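
A quick pre-flight check catches runaway context growth. This sketch reuses tiktoken against the HybridMemory instance from earlier (the 8,000-token budget is illustrative, and the count ignores per-message overhead):
import tiktoken

def context_tokens(messages: list[dict], model: str = "gpt-4o") -> int:
    """Approximate token count for an assembled message list."""
    encoder = tiktoken.encoding_for_model(model)
    return sum(len(encoder.encode(m["content"])) for m in messages)

messages = memory.build_messages(
    query="What did we decide about the launch?",
    system_prompt="You are a helpful assistant."
)
if context_tokens(messages) > 8000:
    # Over budget: lower top_k on vector search, force a re-summarization, or shrink the buffer
    ...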

What’s Next

Cost Optimization & Token Management

Learn strategies for optimizing LLM costs and managing tokens efficiently