HyDE: Hypothetical Document Embeddings
Generate a hypothetical answer and search with its embedding instead of the query's; answer-style text typically lies closer to relevant documents in embedding space:
from openai import OpenAI
import numpy as np
class HyDERetriever:
"""Hypothetical Document Embeddings for improved retrieval."""
def __init__(
self,
documents: list[str],
model: str = "gpt-4o-mini",
embedding_model: str = "text-embedding-3-small"
):
self.client = OpenAI()
self.model = model
self.embedding_model = embedding_model
self.documents = documents
self.doc_embeddings = self._embed_documents()
def _embed_documents(self) -> np.ndarray:
"""Embed all documents."""
response = self.client.embeddings.create(
model=self.embedding_model,
input=self.documents
)
return np.array([e.embedding for e in response.data])
def _generate_hypothetical_answer(self, query: str) -> str:
"""Generate a hypothetical answer to the query."""
prompt = f"""Given this question, write a passage that would answer it.
Write as if you're quoting from a document that contains the answer.
Be specific and detailed.
Question: {query}
Hypothetical document passage:"""
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
temperature=0.7
)
return response.choices[0].message.content
def retrieve(
self,
query: str,
top_k: int = 5,
use_hyde: bool = True
) -> list[tuple[str, float]]:
"""Retrieve documents using HyDE or direct query."""
# Generate hypothetical document if using HyDE
if use_hyde:
search_text = self._generate_hypothetical_answer(query)
else:
search_text = query
# Embed search text
response = self.client.embeddings.create(
model=self.embedding_model,
input=[search_text]
)
query_embedding = np.array(response.data[0].embedding)
# Calculate similarities
similarities = np.dot(self.doc_embeddings, query_embedding)
# Get top-k results
top_indices = np.argsort(similarities)[::-1][:top_k]
results = [
(self.documents[i], similarities[i])
for i in top_indices
]
return results
# Usage
documents = [
"The Python programming language was created by Guido van Rossum and first released in 1991.",
"Machine learning is a subset of artificial intelligence focused on building systems that learn from data.",
"The Great Wall of China is over 13,000 miles long and was built over many centuries.",
"Quantum computing uses quantum-mechanical phenomena to perform computation.",
"The human brain contains approximately 86 billion neurons.",
]
retriever = HyDERetriever(documents)
query = "When was Python created and by whom?"
# Compare HyDE vs direct retrieval
print("With HyDE:")
results = retriever.retrieve(query, top_k=2, use_hyde=True)
for doc, score in results:
print(f" [{score:.3f}] {doc[:80]}...")
print("\nWithout HyDE:")
results = retriever.retrieve(query, top_k=2, use_hyde=False)
for doc, score in results:
print(f" [{score:.3f}] {doc[:80]}...")
Multi-Query Retrieval
Generate several rephrasings of the query and merge their results to improve recall:
from openai import OpenAI
import numpy as np
from collections import defaultdict
class MultiQueryRetriever:
"""Generate multiple query variations for improved recall."""
def __init__(
self,
documents: list[str],
model: str = "gpt-4o-mini"
):
self.client = OpenAI()
self.model = model
self.documents = documents
self.doc_embeddings = self._embed_documents()
def _embed_documents(self) -> np.ndarray:
response = self.client.embeddings.create(
model="text-embedding-3-small",
input=self.documents
)
return np.array([e.embedding for e in response.data])
def generate_query_variations(
self,
query: str,
num_variations: int = 3
) -> list[str]:
"""Generate alternative phrasings of the query."""
prompt = f"""Generate {num_variations} different versions of this search query.
Each version should capture the same intent but use different words or perspectives.
Make them diverse to improve search coverage.
Original query: {query}
Return only the queries, one per line, without numbering."""
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}]
)
variations = [
line.strip()
for line in response.choices[0].message.content.split("\n")
if line.strip()
]
# Include original query
return [query] + variations[:num_variations]
def retrieve(
self,
query: str,
top_k: int = 5,
num_variations: int = 3
) -> list[tuple[str, float]]:
"""Retrieve using multiple query variations."""
# Generate variations
queries = self.generate_query_variations(query, num_variations)
# Embed all queries
response = self.client.embeddings.create(
model="text-embedding-3-small",
input=queries
)
query_embeddings = np.array([e.embedding for e in response.data])
# Aggregate scores across all queries
doc_scores = defaultdict(list)
for query_embedding in query_embeddings:
similarities = np.dot(self.doc_embeddings, query_embedding)
for i, score in enumerate(similarities):
doc_scores[i].append(score)
# Use max score for each document (or could use mean)
final_scores = [
(i, max(scores))
for i, scores in doc_scores.items()
]
# Sort and get top-k
final_scores.sort(key=lambda x: x[1], reverse=True)
return [
(self.documents[i], score)
for i, score in final_scores[:top_k]
]
# Usage
documents = [
"Python is a high-level programming language known for its simple syntax.",
"Snake charming is an ancient practice found in parts of Asia and Africa.",
"The python snake is one of the largest snake species in the world.",
"Django and Flask are popular Python web frameworks.",
"Anaconda is both a snake species and a Python distribution.",
]
retriever = MultiQueryRetriever(documents)
query = "Python programming frameworks"
results = retriever.retrieve(query, top_k=3)
print(f"Query: {query}\n")
for doc, score in results:
print(f"[{score:.3f}] {doc}")
Parent Document Retrieval
Match the query against small chunks for precision, then return the larger parent documents so generation has the full surrounding context:
from openai import OpenAI
from dataclasses import dataclass
import numpy as np
import uuid
@dataclass
class Chunk:
"""A chunk with reference to parent document."""
id: str
parent_id: str
text: str
start_pos: int
end_pos: int
@dataclass
class ParentDocument:
"""A parent document with its chunks."""
id: str
text: str
chunks: list[Chunk]
class ParentDocumentRetriever:
"""Retrieve chunks and return parent documents."""
def __init__(
self,
chunk_size: int = 200,
chunk_overlap: int = 50
):
self.client = OpenAI()
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.parents: dict[str, ParentDocument] = {}
self.chunks: list[Chunk] = []
self.chunk_embeddings: np.ndarray | None = None
def add_document(self, text: str) -> str:
"""Add a document and create chunks."""
parent_id = str(uuid.uuid4())
# Create chunks
chunks = self._create_chunks(text, parent_id)
parent = ParentDocument(
id=parent_id,
text=text,
chunks=chunks
)
self.parents[parent_id] = parent
self.chunks.extend(chunks)
# Recompute embeddings
self._update_embeddings()
return parent_id
def _create_chunks(self, text: str, parent_id: str) -> list[Chunk]:
"""Split text into overlapping chunks."""
chunks = []
start = 0
while start < len(text):
end = min(start + self.chunk_size, len(text))
# Find word boundary
if end < len(text):
space_pos = text.rfind(" ", start, end)
if space_pos > start:
end = space_pos
chunks.append(Chunk(
id=str(uuid.uuid4()),
parent_id=parent_id,
text=text[start:end].strip(),
start_pos=start,
end_pos=end
))
start = end - self.chunk_overlap if end < len(text) else end  # stop once the end of the text is reached; otherwise the final chunk repeats forever
return chunks
def _update_embeddings(self):
"""Update embeddings for all chunks."""
if not self.chunks:
return
texts = [c.text for c in self.chunks]
response = self.client.embeddings.create(
model="text-embedding-3-small",
input=texts
)
self.chunk_embeddings = np.array([e.embedding for e in response.data])
def retrieve(
self,
query: str,
top_k: int = 3,
return_parent: bool = True
) -> list[tuple[str, float]]:
"""Retrieve chunks or parent documents."""
if self.chunk_embeddings is None:
return []
# Embed query
response = self.client.embeddings.create(
model="text-embedding-3-small",
input=[query]
)
query_embedding = np.array(response.data[0].embedding)
# Calculate similarities
similarities = np.dot(self.chunk_embeddings, query_embedding)
top_indices = np.argsort(similarities)[::-1][:top_k]
if return_parent:
# Return unique parent documents
seen_parents = set()
results = []
for i in top_indices:
chunk = self.chunks[i]
if chunk.parent_id not in seen_parents:
seen_parents.add(chunk.parent_id)
parent = self.parents[chunk.parent_id]
results.append((parent.text, similarities[i]))
return results
else:
return [
(self.chunks[i].text, similarities[i])
for i in top_indices
]
# Usage
retriever = ParentDocumentRetriever(chunk_size=100, chunk_overlap=20)
# Add documents
doc1 = """
Machine learning is a branch of artificial intelligence that enables systems to learn
from data. It includes supervised learning, unsupervised learning, and reinforcement
learning. Deep learning, a subset of machine learning, uses neural networks with many
layers to learn complex patterns.
"""
doc2 = """
Natural language processing (NLP) combines linguistics and machine learning to enable
computers to understand human language. Key tasks include sentiment analysis, named
entity recognition, and machine translation. Modern NLP relies heavily on transformer
architectures like BERT and GPT.
"""
retriever.add_document(doc1)
retriever.add_document(doc2)
query = "deep learning neural networks"
print("Retrieved chunks:")
for text, score in retriever.retrieve(query, return_parent=False):
print(f"[{score:.3f}] {text[:100]}...")
print("\nRetrieved parent documents:")
for text, score in retriever.retrieve(query, return_parent=True):
print(f"[{score:.3f}] {text[:150]}...")
Query Decomposition
Break complex queries into independent sub-queries, retrieve for each, and synthesize a single answer:
from openai import OpenAI
import json
class QueryDecomposer:
"""Decompose complex queries into simpler sub-queries."""
def __init__(self, model: str = "gpt-4o-mini"):
self.client = OpenAI()
self.model = model
def decompose(self, query: str) -> list[str]:
"""Break query into sub-queries."""
prompt = f"""Analyze this complex query and break it into simpler sub-queries
that can be answered independently.
Query: {query}
Return as JSON: {{"sub_queries": ["query1", "query2", ...]}}
Only decompose if necessary. For simple queries, return the original query."""
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"}
)
data = json.loads(response.choices[0].message.content)
return data.get("sub_queries", [query])
def retrieve_and_synthesize(
self,
query: str,
retriever,
top_k: int = 3
) -> str:
"""Decompose query, retrieve for each, and synthesize answer."""
# Decompose query
sub_queries = self.decompose(query)
# Retrieve for each sub-query
all_context = []
for sub_query in sub_queries:
results = retriever.retrieve(sub_query, top_k=top_k)
context = [doc for doc, score in results]
all_context.extend(context)
# Deduplicate context
unique_context = list(dict.fromkeys(all_context))
# Synthesize answer
context_text = "\n\n".join(unique_context)
synthesis_prompt = f"""Based on the following context, answer the question.
Context:
{context_text}
Question: {query}
Provide a comprehensive answer based on the context."""
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": synthesis_prompt}]
)
return response.choices[0].message.content
# Usage
decomposer = QueryDecomposer()
complex_query = """
What are the main differences between supervised and unsupervised learning,
and how does deep learning relate to each of them?
"""
sub_queries = decomposer.decompose(complex_query)
print("Decomposed queries:")
for i, sq in enumerate(sub_queries, 1):
print(f" {i}. {sq}")
Corrective RAG (CRAG)
Grade each retrieved document for relevance and refine the query when too many results miss the mark:
from openai import OpenAI
import json
import numpy as np
class CorrectiveRAG:
"""RAG with self-correction for improved accuracy."""
def __init__(
self,
documents: list[str],
model: str = "gpt-4o-mini"
):
self.client = OpenAI()
self.model = model
self.documents = documents
self.doc_embeddings = self._embed_documents()
def _embed_documents(self) -> np.ndarray:
response = self.client.embeddings.create(
model="text-embedding-3-small",
input=self.documents
)
return np.array([e.embedding for e in response.data])
def _retrieve(self, query: str, top_k: int) -> list[tuple[str, float]]:
"""Basic retrieval."""
response = self.client.embeddings.create(
model="text-embedding-3-small",
input=[query]
)
query_embedding = np.array(response.data[0].embedding)
similarities = np.dot(self.doc_embeddings, query_embedding)
top_indices = np.argsort(similarities)[::-1][:top_k]
return [(self.documents[i], similarities[i]) for i in top_indices]
def _assess_relevance(
self,
query: str,
document: str
) -> dict:
"""Assess if document is relevant to query."""
prompt = f"""Assess if this document is relevant to the query.
Query: {query}
Document: {document}
Respond with JSON:
{{
"is_relevant": true/false,
"relevance_score": 0.0-1.0,
"reasoning": "brief explanation"
}}"""
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"}
)
return json.loads(response.choices[0].message.content)
def _refine_query(self, query: str, context: str) -> str:
"""Refine query based on initial context."""
prompt = f"""Based on this context, refine the query to get better results.
Original query: {query}
Available context: {context}
If the context is insufficient, create a more specific or alternative query.
Return only the refined query."""
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}]
)
return response.choices[0].message.content.strip()
def retrieve(
self,
query: str,
top_k: int = 5,
relevance_threshold: float = 0.5,
max_iterations: int = 2
) -> list[str]:
"""Retrieve with self-correction."""
current_query = query
all_relevant_docs = []
for iteration in range(max_iterations):
# Retrieve documents
results = self._retrieve(current_query, top_k)
# Assess relevance of each document
relevant_docs = []
irrelevant_count = 0
for doc, score in results:
assessment = self._assess_relevance(query, doc)
if assessment.get("relevance_score", 0.0) >= relevance_threshold:
if doc not in all_relevant_docs:
relevant_docs.append(doc)
all_relevant_docs.append(doc)
else:
irrelevant_count += 1
# If too many irrelevant, refine query
if irrelevant_count > top_k // 2 and iteration < max_iterations - 1:
context = "\n".join(relevant_docs) if relevant_docs else "No relevant context found."
current_query = self._refine_query(query, context)
print(f"Refined query: {current_query}")
else:
break
return all_relevant_docs
def answer(
self,
query: str,
top_k: int = 5
) -> str:
"""Retrieve and generate answer with CRAG."""
relevant_docs = self.retrieve(query, top_k)
if not relevant_docs:
return "I couldn't find relevant information to answer this question."
context = "\n\n".join(relevant_docs)
prompt = f"""Answer the question based on the following context.
If the context doesn't contain enough information, say so.
Context:
{context}
Question: {query}
Answer:"""
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}]
)
return response.choices[0].message.content
# Usage
documents = [
"Python 3.12 was released in October 2023 with improved error messages.",
"The Django web framework is built on Python and follows the MTV pattern.",
"Machine learning models can be trained using scikit-learn in Python.",
"Flask is a lightweight Python web framework for building APIs.",
"Rust is a systems programming language focused on safety and performance.",
]
crag = CorrectiveRAG(documents)
query = "What Python web frameworks are available?"
answer = crag.answer(query)
print(f"Query: {query}")
print(f"Answer: {answer}")
Reciprocal Rank Fusion
Combine rankings from multiple retrieval methods using Reciprocal Rank Fusion (RRF):
import numpy as np
from collections import defaultdict
class RRFRetriever:
"""Combine multiple retrievers using Reciprocal Rank Fusion."""
def __init__(self, retrievers: list, k: int = 60):
self.retrievers = retrievers
self.k = k # RRF parameter
def retrieve(self, query: str, top_k: int = 10) -> list[tuple[str, float]]:
"""Retrieve using RRF to combine results."""
doc_scores = defaultdict(float)
# Get results from each retriever
for retriever in self.retrievers:
results = retriever.retrieve(query)
for rank, (doc, _) in enumerate(results):
# RRF formula: 1 / (k + rank), with rank counted from 1
doc_scores[doc] += 1.0 / (self.k + rank + 1)
# Sort by combined score
sorted_docs = sorted(
doc_scores.items(),
key=lambda x: x[1],
reverse=True
)
return sorted_docs[:top_k]
# Can combine with keyword (BM25) and semantic retrievers
# See semantic-search.mdx for full implementation
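Because the HyDERetriever and MultiQueryRetriever defined earlier both expose a compatible retrieve(query) method, they can be passed straight into RRFRetriever. A brief usage sketch, assuming both classes are in scope and built over the same document list:
# Sketch: fuse HyDE-based and multi-query retrieval with RRF.
documents = [
    "Python is a high-level programming language known for its simple syntax.",
    "Django and Flask are popular Python web frameworks.",
    "The python snake is one of the largest snake species in the world.",
]
fused = RRFRetriever(
    retrievers=[HyDERetriever(documents), MultiQueryRetriever(documents)],
    k=60,
)
for doc, score in fused.retrieve("Python web frameworks", top_k=3):
    print(f"[{score:.4f}] {doc}")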
Advanced RAG Best Practices
- Use HyDE when queries are phrased very differently from the documents they should match
- Multi-query retrieval improves recall for ambiguous queries
- Parent document retrieval preserves context for answers
- Always assess retrieval quality before generation
- Combine multiple techniques for best results
Practice Exercise
Build an advanced RAG system that (a minimal starting skeleton is sketched after the lists below):
- Implements HyDE for query transformation
- Uses multi-query retrieval for improved recall
- Applies parent document retrieval for context
- Includes self-correction with relevance assessment
- Combines methods using reciprocal rank fusion
As you build it, pay attention to:
- Measuring retrieval quality improvements
- Balancing latency vs quality tradeoffs
- Handling edge cases gracefully
- Providing explainable retrieval decisions
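A minimal starting skeleton, assuming the HyDERetriever, MultiQueryRetriever, RRFRetriever, and CorrectiveRAG classes defined above are in scope; the class name, wiring, and method names are illustrative suggestions, not a reference solution.
# Hypothetical skeleton for the exercise; names and composition are suggestions only.
class AdvancedRAGPipeline:
    """Fuses HyDE and multi-query retrieval with RRF and filters with CRAG-style checks."""

    def __init__(self, documents: list[str]):
        self.hyde = HyDERetriever(documents)
        self.multi = MultiQueryRetriever(documents)
        self.fused = RRFRetriever([self.hyde, self.multi])
        self.crag = CorrectiveRAG(documents)

    def retrieve(self, query: str, top_k: int = 5) -> list[tuple[str, float]]:
        # Fuse the two rankings, then keep only documents that pass a relevance check.
        candidates = self.fused.retrieve(query, top_k=top_k * 2)
        kept = []
        for doc, score in candidates:
            assessment = self.crag._assess_relevance(query, doc)
            if assessment.get("is_relevant"):
                kept.append((doc, score))
        return kept[:top_k]

    def answer(self, query: str) -> str:
        # TODO: add parent-document expansion, per-stage latency measurement,
        # explainable retrieval decisions, and a graceful "no relevant context" fallback.
        raise NotImplementedError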