December 2025 Update: Complete guide to memory architectures for LLMs, from conversation buffers to persistent vector memory.
Why Memory Matters
Without memory, every LLM interaction starts fresh:
Without Memory:
User: "I'm John"
AI: "Nice to meet you!"
User: "What's my name?"
AI: "I don't know."

With Memory:
User: "I'm John"
AI: "Nice to meet you, John!"
User: "What's my name?"
AI: "Your name is John."
Memory Architecture Overview
┌──────────────────────────────────────────────────────┐
│                    Memory System                     │
├──────────────────────────────────────────────────────┤
│ ┌─────────────┐ ┌─────────────┐ ┌──────────────────┐ │
│ │   Buffer    │ │   Summary   │ │      Vector      │ │
│ │   Memory    │ │   Memory    │ │      Memory      │ │
│ │             │ │             │ │                  │ │
│ │ Last N msgs │ │ Compressed  │ │ Semantic search  │ │
│ │ Full detail │ │ summaries   │ │ over all history │ │
│ └─────────────┘ └─────────────┘ └──────────────────┘ │
│        │               │                 │           │
│        └───────────────┴─────────────────┘           │
│                        │                             │
│                   ┌────▼────┐                        │
│                   │ Context │                        │
│                   │ Builder │                        │
│                   └─────────┘                        │
└──────────────────────────────────────────────────────┘
| Memory Type | Best For | Limitations |
|---|---|---|
| Buffer | Recent context | Limited by context window |
| Summary | Session overview | Loses detail |
| Vector | Long-term recall | Retrieval latency |
| Entity | Facts about things | Structured only |
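The "Context Builder" in the diagram is simply the code path that merges whichever memory sources you enable into one prompt. The HybridMemory class later in this guide is the full version; a minimal sketch of the idea looks like this:

```python
from typing import Optional

def build_context(
    recent: list[dict],       # buffer memory: last N messages
    summary: Optional[str],   # summary memory: compressed history
    memories: list[str],      # vector memory: semantically retrieved facts
) -> list[dict]:
    """Merge memory sources into one message list for the LLM."""
    system = "You are a helpful assistant."
    if summary:
        system += f"\n\nConversation summary:\n{summary}"
    if memories:
        system += "\n\nRelevant memories:\n" + "\n".join(f"- {m}" for m in memories)
    return [{"role": "system", "content": system}] + recent
```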
Buffer Memory (Short-Term)
The simplest approach: keep the last N messages.
import time
from dataclasses import dataclass, field
from typing import Optional
from openai import OpenAI
client = OpenAI()
@dataclass
class Message:
role: str
content: str
timestamp: float = field(default_factory=lambda: time.time())
class BufferMemory:
"""Simple buffer memory - keeps last N messages"""
def __init__(self, max_messages: int = 20):
self.max_messages = max_messages
self.messages: list[Message] = []
def add(self, role: str, content: str):
"""Add a message to memory"""
self.messages.append(Message(role=role, content=content))
# Trim if exceeds max
if len(self.messages) > self.max_messages:
self.messages = self.messages[-self.max_messages:]
def get_messages(self) -> list[dict]:
"""Get messages for LLM context"""
return [
{"role": m.role, "content": m.content}
for m in self.messages
]
def clear(self):
"""Clear all memory"""
self.messages = []
# Usage
memory = BufferMemory(max_messages=10)
def chat(user_input: str) -> str:
# Add user message to memory
memory.add("user", user_input)
# Create messages with history
messages = [
{"role": "system", "content": "You are a helpful assistant."}
] + memory.get_messages()
# Generate response
response = client.chat.completions.create(
model="gpt-4o",
messages=messages
)
assistant_message = response.choices[0].message.content
# Add assistant response to memory
memory.add("assistant", assistant_message)
return assistant_message
Token-Based Buffer
Limit by tokens instead of message count:
import tiktoken
class TokenBufferMemory:
"""Buffer memory with token limit"""
def __init__(self, max_tokens: int = 4000, model: str = "gpt-4o"):
self.max_tokens = max_tokens
self.encoder = tiktoken.encoding_for_model(model)
self.messages: list[Message] = []
def _count_tokens(self, text: str) -> int:
return len(self.encoder.encode(text))
def _total_tokens(self) -> int:
return sum(
self._count_tokens(m.content) + 4 # +4 for message overhead
for m in self.messages
)
def add(self, role: str, content: str):
self.messages.append(Message(role=role, content=content))
# Trim oldest messages until under token limit
while self._total_tokens() > self.max_tokens and len(self.messages) > 1:
self.messages.pop(0)
def get_messages(self) -> list[dict]:
return [
{"role": m.role, "content": m.content}
for m in self.messages
]
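The token-bounded buffer drops into the same chat loop as BufferMemory; a quick usage sketch (the 1,000-token budget is an arbitrary example):

```python
memory = TokenBufferMemory(max_tokens=1000)

memory.add("user", "I'm planning a trip to Kyoto in April.")
memory.add("assistant", "April is cherry blossom season, so book accommodation early!")

messages = [
    {"role": "system", "content": "You are a helpful assistant."}
] + memory.get_messages()
# Oldest turns are dropped automatically once the buffer exceeds 1,000 tokens.
```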
Summary Memory
Compress conversation history into summaries:
class SummaryMemory:
"""Memory that summarizes older messages"""
def __init__(
self,
buffer_size: int = 10,
summary_interval: int = 5
):
self.buffer_size = buffer_size
self.summary_interval = summary_interval
self.messages: list[Message] = []
self.summary: Optional[str] = None
self.messages_since_summary = 0
def add(self, role: str, content: str):
self.messages.append(Message(role=role, content=content))
self.messages_since_summary += 1
# Create summary when buffer is full
if len(self.messages) > self.buffer_size:
self._update_summary()
def _update_summary(self):
"""Summarize older messages"""
# Take oldest messages to summarize
to_summarize = self.messages[:-self.buffer_size//2]
if not to_summarize:
return
# Format messages for summarization
conversation = "\n".join([
f"{m.role}: {m.content}" for m in to_summarize
])
# Create summary
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": "Summarize this conversation, preserving key facts, decisions, and context."
},
{"role": "user", "content": conversation}
]
)
new_summary = response.choices[0].message.content
# Combine with existing summary if present
if self.summary:
self.summary = f"{self.summary}\n\nUpdated: {new_summary}"
else:
self.summary = new_summary
# Keep only recent messages
self.messages = self.messages[-self.buffer_size//2:]
self.messages_since_summary = 0
def get_context(self) -> list[dict]:
"""Get context for LLM"""
context = []
# Add summary if exists
if self.summary:
context.append({
"role": "system",
"content": f"Previous conversation summary:\n{self.summary}"
})
# Add recent messages
context.extend([
{"role": m.role, "content": m.content}
for m in self.messages
])
return context
# Usage
memory = SummaryMemory(buffer_size=10)
def chat_with_summary(user_input: str) -> str:
memory.add("user", user_input)
messages = [
{"role": "system", "content": "You are a helpful assistant."}
] + memory.get_context()
response = client.chat.completions.create(
model="gpt-4o",
messages=messages
)
assistant_message = response.choices[0].message.content
memory.add("assistant", assistant_message)
return assistant_message
Vector Memory (Long-Term)
Store and retrieve memories semantically:
from openai import OpenAI
import numpy as np
from datetime import datetime
import json
client = OpenAI()
class VectorMemory:
"""Long-term memory using vector embeddings"""
def __init__(self, embedding_model: str = "text-embedding-3-small"):
self.embedding_model = embedding_model
self.memories: list[dict] = []
self.embeddings: list[np.ndarray] = []
def _get_embedding(self, text: str) -> np.ndarray:
response = client.embeddings.create(
model=self.embedding_model,
input=text
)
return np.array(response.data[0].embedding)
def add(
self,
content: str,
metadata: dict = None
):
"""Add a memory"""
embedding = self._get_embedding(content)
memory = {
"content": content,
"timestamp": datetime.now().isoformat(),
"metadata": metadata or {}
}
self.memories.append(memory)
self.embeddings.append(embedding)
def search(
self,
query: str,
top_k: int = 5,
threshold: float = 0.7
) -> list[dict]:
"""Search for relevant memories"""
if not self.memories:
return []
query_embedding = self._get_embedding(query)
# Calculate cosine similarities
similarities = []
for i, emb in enumerate(self.embeddings):
similarity = np.dot(query_embedding, emb) / (
np.linalg.norm(query_embedding) * np.linalg.norm(emb)
)
similarities.append((i, similarity))
# Sort by similarity
similarities.sort(key=lambda x: x[1], reverse=True)
# Return top results above threshold
results = []
for idx, score in similarities[:top_k]:
if score >= threshold:
memory = self.memories[idx].copy()
memory["similarity"] = score
results.append(memory)
return results
def add_conversation(self, role: str, content: str):
"""Add a conversation turn as memory"""
self.add(
content=f"{role}: {content}",
metadata={"type": "conversation", "role": role}
)
def save(self, path: str):
"""Save memories to file"""
data = {
"memories": self.memories,
"embeddings": [e.tolist() for e in self.embeddings]
}
with open(path, "w") as f:
json.dump(data, f)
def load(self, path: str):
"""Load memories from file"""
with open(path) as f:
data = json.load(f)
self.memories = data["memories"]
self.embeddings = [np.array(e) for e in data["embeddings"]]
# Usage with LLM
vector_memory = VectorMemory()
def chat_with_long_term_memory(user_input: str) -> str:
# Search for relevant memories
relevant_memories = vector_memory.search(user_input, top_k=3)
# Build context from memories
memory_context = ""
if relevant_memories:
memory_context = "Relevant memories:\n" + "\n".join([
f"- {m['content']} (similarity: {m['similarity']:.2f})"
for m in relevant_memories
])
# Add current message to memory
vector_memory.add_conversation("user", user_input)
# Generate response
messages = [
{
"role": "system",
"content": f"You are a helpful assistant with long-term memory.\n\n{memory_context}"
},
{"role": "user", "content": user_input}
]
response = client.chat.completions.create(
model="gpt-4o",
messages=messages
)
assistant_message = response.choices[0].message.content
# Add response to memory
vector_memory.add_conversation("assistant", assistant_message)
return assistant_message
Entity Memory
Track facts about specific entities:
import json
import time
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class EntityInfo:
entity_type: str
attributes: dict = field(default_factory=dict)
last_updated: float = field(default_factory=lambda: time.time())
class EntityMemory:
"""Memory for tracking entities and their attributes"""
def __init__(self):
self.entities: dict[str, EntityInfo] = {}
def update_entity(
self,
name: str,
entity_type: str,
attributes: dict
):
"""Update or create entity"""
if name in self.entities:
self.entities[name].attributes.update(attributes)
self.entities[name].last_updated = time.time()
else:
self.entities[name] = EntityInfo(
entity_type=entity_type,
attributes=attributes
)
def get_entity(self, name: str) -> Optional[EntityInfo]:
return self.entities.get(name)
def extract_entities_from_text(self, text: str) -> list[dict]:
"""Use LLM to extract entities from text"""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{
"role": "system",
"content": """Extract entities and their attributes from the text.
Return JSON: {"entities": [{"name": "...", "type": "person|place|thing", "attributes": {...}}]}"""
},
{"role": "user", "content": text}
],
response_format={"type": "json_object"}
)
result = json.loads(response.choices[0].message.content)
return result.get("entities", [])
def process_and_store(self, text: str):
"""Extract and store entities from text"""
entities = self.extract_entities_from_text(text)
for entity in entities:
self.update_entity(
name=entity["name"],
entity_type=entity["type"],
attributes=entity.get("attributes", {})
)
def get_context(self) -> str:
"""Get entity context for LLM"""
if not self.entities:
return ""
context = "Known entities:\n"
for name, info in self.entities.items():
attrs = ", ".join(
f"{k}: {v}" for k, v in info.attributes.items()
)
context += f"- {name} ({info.entity_type}): {attrs}\n"
return context
# Usage
entity_memory = EntityMemory()
def chat_with_entity_memory(user_input: str) -> str:
# Extract entities from user input
entity_memory.process_and_store(user_input)
# Get entity context
entity_context = entity_memory.get_context()
messages = [
{
"role": "system",
"content": f"You are a helpful assistant.\n\n{entity_context}"
},
{"role": "user", "content": user_input}
]
response = client.chat.completions.create(
model="gpt-4o",
messages=messages
)
assistant_message = response.choices[0].message.content
# Also extract entities from response
entity_memory.process_and_store(assistant_message)
return assistant_message
Hybrid Memory System
Combine all memory types for maximum effectiveness:
class HybridMemory:
"""Combines buffer, summary, vector, and entity memory"""
def __init__(
self,
buffer_size: int = 10,
vector_threshold: float = 0.75
):
self.buffer = BufferMemory(max_messages=buffer_size)
self.summary = SummaryMemory(buffer_size=buffer_size * 2)
self.vector = VectorMemory()
self.entity = EntityMemory()
self.vector_threshold = vector_threshold
def add(self, role: str, content: str):
"""Add message to all memory systems"""
# Short-term
self.buffer.add(role, content)
self.summary.add(role, content)
# Long-term
self.vector.add_conversation(role, content)
# Entity extraction
self.entity.process_and_store(content)
def get_context(self, query: str) -> dict:
"""Get comprehensive context for query"""
# Recent messages (short-term)
recent = self.buffer.get_messages()
# Summary of older conversation
summary = self.summary.summary
# Relevant long-term memories
long_term = self.vector.search(query, top_k=3, threshold=self.vector_threshold)
# Entity knowledge
entities = self.entity.get_context()
return {
"recent_messages": recent,
"summary": summary,
"long_term_memories": long_term,
"entities": entities
}
def build_messages(
self,
query: str,
system_prompt: str
) -> list[dict]:
"""Build message list for LLM"""
context = self.get_context(query)
# Build system message with context
system_content = system_prompt
if context["summary"]:
system_content += f"\n\nConversation summary:\n{context['summary']}"
if context["long_term_memories"]:
memories = "\n".join([
f"- {m['content']}" for m in context["long_term_memories"]
])
system_content += f"\n\nRelevant memories:\n{memories}"
if context["entities"]:
system_content += f"\n\n{context['entities']}"
messages = [{"role": "system", "content": system_content}]
messages.extend(context["recent_messages"])
return messages
def save(self, path: str):
"""Persist memory to disk"""
self.vector.save(f"{path}_vector.json")
# Add more persistence as needed
def load(self, path: str):
"""Load memory from disk"""
self.vector.load(f"{path}_vector.json")
# Usage
memory = HybridMemory(buffer_size=10)
def chat_with_hybrid_memory(user_input: str) -> str:
messages = memory.build_messages(
query=user_input,
system_prompt="You are a helpful assistant with perfect memory."
)
# Add current user message
messages.append({"role": "user", "content": user_input})
response = client.chat.completions.create(
model="gpt-4o",
messages=messages
)
assistant_message = response.choices[0].message.content
# Update memory
memory.add("user", user_input)
memory.add("assistant", assistant_message)
return assistant_message
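HybridMemory.save only persists vector memory as written. If you also want entities to survive restarts, a possible extension is sketched below (the save_entities/load_entities helpers are ours, built on the EntityInfo dataclass defined earlier):

```python
import json

def save_entities(entity_memory: EntityMemory, path: str):
    """Serialize entity memory to JSON."""
    data = {
        name: {
            "entity_type": info.entity_type,
            "attributes": info.attributes,
            "last_updated": info.last_updated,
        }
        for name, info in entity_memory.entities.items()
    }
    with open(path, "w") as f:
        json.dump(data, f)

def load_entities(entity_memory: EntityMemory, path: str):
    """Restore entity memory from JSON."""
    with open(path) as f:
        data = json.load(f)
    for name, fields in data.items():
        entity_memory.entities[name] = EntityInfo(
            entity_type=fields["entity_type"],
            attributes=fields["attributes"],
            last_updated=fields["last_updated"],
        )
```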
Memory with LangChain
LangChain provides built-in memory implementations (recent LangChain releases mark these classes as legacy, though they still work as shown):
from langchain_openai import ChatOpenAI
from langchain.memory import (
ConversationBufferMemory,
ConversationSummaryMemory,
ConversationBufferWindowMemory,
VectorStoreRetrieverMemory
)
from langchain.chains import ConversationChain
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
llm = ChatOpenAI(model="gpt-4o")
# Buffer Memory
buffer_memory = ConversationBufferMemory()
# Window Memory (last k interactions)
window_memory = ConversationBufferWindowMemory(k=5)
# Summary Memory
summary_memory = ConversationSummaryMemory(llm=llm)
# Vector Store Memory
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_texts([""], embeddings)
vector_memory = VectorStoreRetrieverMemory(
retriever=vectorstore.as_retriever(search_kwargs={"k": 3})
)
# Use in chain
chain = ConversationChain(
llm=llm,
memory=summary_memory,
verbose=True
)
response = chain.predict(input="Hi, I'm Alice!")
response = chain.predict(input="What's my name?")
Key Takeaways
Choose the Right Memory
Buffer for recent context, summary for sessions, vector for long-term
Hybrid is Best
Combine memory types for comprehensive context
Persist Important Data
Store long-term memories and entities to disk
Token Management
Always monitor how many tokens your memory context adds to each request (see the sketch below)
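One way to enforce that last point is to count the tokens a memory-built prompt will consume before sending it. A minimal sketch with tiktoken (the context_tokens helper and the 6,000-token threshold are illustrative, not part of any library):

```python
import tiktoken

def context_tokens(messages: list[dict], model: str = "gpt-4o") -> int:
    """Approximate token count for a list of chat messages."""
    encoder = tiktoken.encoding_for_model(model)
    # ~4 tokens of per-message overhead, matching the estimate used earlier.
    return sum(len(encoder.encode(m["content"])) + 4 for m in messages)

# `memory` is the HybridMemory instance from the previous section.
messages = memory.build_messages(
    query="Where did we leave off?",
    system_prompt="You are a helpful assistant.",
)
if context_tokens(messages) > 6000:
    print("Memory context is large - consider a tighter buffer or fewer retrieved memories.")
```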
What’s Next
Cost Optimization & Token Management
Learn strategies for optimizing LLM costs and managing tokens efficiently