December 2025 Update: Comprehensive guide to embeddings including model selection, dimensionality, fine-tuning, and production patterns.
What Are Embeddings?
Embeddings convert text into dense numerical vectors that capture semantic meaning:
Text Embedding Vector
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"I love pizza" → [0.12, -0.34, 0.87, ..., 0.23]
"Pizza is my favorite" → [0.11, -0.32, 0.85, ..., 0.21] ← Similar!
"I hate broccoli" → [-0.45, 0.12, -0.33, ..., 0.67] ← Different
| Use Case | What Embeddings Enable |
|---|---|
| Semantic Search | Find documents by meaning, not keywords |
| RAG | Retrieve relevant context for LLMs |
| Clustering | Group similar content automatically |
| Recommendations | Find similar items/users |
| Deduplication | Detect near-duplicate content |
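For example, the deduplication row comes down to comparing vectors pairwise. A minimal sketch, assuming some embedding function is passed in as `embed` (a placeholder for any of the embedding helpers shown later in this guide):

```python
import numpy as np

def find_near_duplicates(texts, embed, threshold=0.95):
    """Return pairs of texts whose embeddings are nearly identical."""
    vectors = np.array([embed(t) for t in texts])
    vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)  # unit-length rows
    similarities = vectors @ vectors.T                          # pairwise cosine
    pairs = []
    for i in range(len(texts)):
        for j in range(i + 1, len(texts)):
            if similarities[i, j] >= threshold:
                pairs.append((texts[i], texts[j], float(similarities[i, j])))
    return pairs
```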
Embedding Models Comparison
# Model comparison (December 2024)
EMBEDDING_MODELS = {
    # OpenAI
    "text-embedding-3-small": {
        "dimensions": 1536,
        "max_tokens": 8191,
        "cost_per_1m": 0.02,
        "quality": "good",
        "speed": "fast"
    },
    "text-embedding-3-large": {
        "dimensions": 3072,
        "max_tokens": 8191,
        "cost_per_1m": 0.13,
        "quality": "excellent",
        "speed": "medium"
    },
    # Cohere
    "embed-english-v3.0": {
        "dimensions": 1024,
        "max_tokens": 512,
        "cost_per_1m": 0.10,
        "quality": "excellent",
        "speed": "fast"
    },
    # Open Source (via HuggingFace)
    "BAAI/bge-large-en-v1.5": {
        "dimensions": 1024,
        "max_tokens": 512,
        "cost_per_1m": 0,  # Free if self-hosted
        "quality": "excellent",
        "speed": "varies"
    },
    "sentence-transformers/all-MiniLM-L6-v2": {
        "dimensions": 384,
        "max_tokens": 256,
        "cost_per_1m": 0,
        "quality": "good",
        "speed": "very fast"
    }
}
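To compare the hosted options on raw token cost, a quick back-of-the-envelope helper (a sketch that assumes the EMBEDDING_MODELS dict above; prices change, so treat the numbers as illustrative):

```python
def estimate_embedding_cost(num_tokens: int, model: str) -> float:
    """Approximate dollar cost of embedding num_tokens with a hosted model."""
    return num_tokens / 1_000_000 * EMBEDDING_MODELS[model]["cost_per_1m"]

# 10M tokens with text-embedding-3-small is roughly $0.20
print(estimate_embedding_cost(10_000_000, "text-embedding-3-small"))
```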
Getting Embeddings
OpenAI Embeddings
from openai import OpenAI
import numpy as np

client = OpenAI()

def get_embedding(
    text: str,
    model: str = "text-embedding-3-small"
) -> np.ndarray:
    """Get embedding for a single text"""
    response = client.embeddings.create(
        model=model,
        input=text
    )
    return np.array(response.data[0].embedding)

def get_embeddings_batch(
    texts: list[str],
    model: str = "text-embedding-3-small"
) -> list[np.ndarray]:
    """Get embeddings for multiple texts efficiently"""
    response = client.embeddings.create(
        model=model,
        input=texts
    )
    return [np.array(e.embedding) for e in response.data]

# Usage
embedding = get_embedding("What is machine learning?")
print(f"Dimensions: {len(embedding)}")

# Batch processing
texts = ["Hello world", "How are you?", "Machine learning is cool"]
embeddings = get_embeddings_batch(texts)
Dimensionality Reduction
OpenAI’s text-embedding-3 models support native dimension reduction:
def get_embedding_with_dimensions(
    text: str,
    dimensions: int = 256,
    model: str = "text-embedding-3-small"
) -> np.ndarray:
    """Get embedding with reduced dimensions"""
    response = client.embeddings.create(
        model=model,
        input=text,
        dimensions=dimensions  # 256, 512, 1024, 1536...
    )
    return np.array(response.data[0].embedding)

# Smaller embeddings = faster search, less storage
small_embedding = get_embedding_with_dimensions("Hello", dimensions=256)
print(f"Reduced dimensions: {len(small_embedding)}")
Open Source Embeddings
from sentence_transformers import SentenceTransformer
import numpy as np

class LocalEmbedder:
    """Local embedding using sentence-transformers"""

    def __init__(self, model_name: str = "BAAI/bge-large-en-v1.5"):
        self.model = SentenceTransformer(model_name)

    def embed(self, text: str) -> np.ndarray:
        return self.model.encode(text, normalize_embeddings=True)

    def embed_batch(self, texts: list[str]) -> np.ndarray:
        return self.model.encode(
            texts,
            normalize_embeddings=True,
            batch_size=32,
            show_progress_bar=True
        )

    def embed_with_instruction(
        self,
        text: str,
        instruction: str = "Represent this sentence for retrieval:"
    ) -> np.ndarray:
        """Some models perform better with instructions"""
        return self.model.encode(
            f"{instruction} {text}",
            normalize_embeddings=True
        )

# Usage
embedder = LocalEmbedder()
embedding = embedder.embed("What is artificial intelligence?")
Similarity Metrics
Cosine Similarity
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Compute cosine similarity between two vectors"""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def cosine_similarity_normalized(a: np.ndarray, b: np.ndarray) -> float:
    """For pre-normalized vectors, just dot product"""
    return np.dot(a, b)

# Usage
emb1 = get_embedding("I love pizza")
emb2 = get_embedding("Pizza is my favorite food")
emb3 = get_embedding("The weather is nice today")

print(f"Similar: {cosine_similarity(emb1, emb2):.3f}")    # ~0.85
print(f"Different: {cosine_similarity(emb1, emb3):.3f}")  # ~0.40
Other Metrics
def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
    """L2 distance - lower is more similar"""
    return np.linalg.norm(a - b)

def dot_product(a: np.ndarray, b: np.ndarray) -> float:
    """Dot product - higher is more similar (for normalized vectors)"""
    return np.dot(a, b)

def manhattan_distance(a: np.ndarray, b: np.ndarray) -> float:
    """L1 distance - lower is more similar"""
    return np.sum(np.abs(a - b))

# When to use each:
# - Cosine: General purpose, magnitude-invariant
# - Euclidean: When magnitude matters
# - Dot product: For normalized vectors (fastest)
# - Manhattan: Sparse vectors, high dimensions
Building a Similarity Search Engine
import numpy as np
from typing import List, Tuple
from dataclasses import dataclass
from openai import OpenAI

@dataclass
class Document:
    id: str
    text: str
    embedding: np.ndarray = None
    metadata: dict = None

class VectorSearchEngine:
    """Simple in-memory vector search"""

    def __init__(self, embedding_model: str = "text-embedding-3-small"):
        self.model = embedding_model
        self.documents: List[Document] = []
        self.embeddings: np.ndarray = None
        self.client = OpenAI()

    def add_documents(self, documents: List[Document]):
        """Add documents and compute embeddings"""
        texts = [doc.text for doc in documents]
        # Batch embed
        response = self.client.embeddings.create(
            model=self.model,
            input=texts
        )
        for doc, emb_data in zip(documents, response.data):
            doc.embedding = np.array(emb_data.embedding)
            self.documents.append(doc)
        # Build matrix for fast search
        self._rebuild_index()

    def _rebuild_index(self):
        """Rebuild the embedding matrix"""
        if self.documents:
            self.embeddings = np.vstack([
                doc.embedding for doc in self.documents
            ])

    def search(
        self,
        query: str,
        top_k: int = 5,
        threshold: float = 0.0
    ) -> List[Tuple[Document, float]]:
        """Search for similar documents"""
        # Embed query
        response = self.client.embeddings.create(
            model=self.model,
            input=query
        )
        query_embedding = np.array(response.data[0].embedding)
        # Compute similarities (matrix operation)
        similarities = np.dot(self.embeddings, query_embedding) / (
            np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
        )
        # Get top-k
        top_indices = np.argsort(similarities)[::-1][:top_k]
        results = []
        for idx in top_indices:
            score = similarities[idx]
            if score >= threshold:
                results.append((self.documents[idx], float(score)))
        return results

# Usage
engine = VectorSearchEngine()
documents = [
    Document(id="1", text="Machine learning is a subset of AI"),
    Document(id="2", text="Deep learning uses neural networks"),
    Document(id="3", text="Python is a programming language"),
    Document(id="4", text="Natural language processing handles text"),
]
engine.add_documents(documents)

results = engine.search("What is artificial intelligence?", top_k=3)
for doc, score in results:
    print(f"[{score:.3f}] {doc.text}")
Hybrid Search: Embeddings + Keywords
Combine semantic search with keyword matching:
from rank_bm25 import BM25Okapi
from typing import List, Tuple
import numpy as np

# Reuses Document and VectorSearchEngine from the previous section

class HybridSearchEngine:
    """Combines vector search with BM25 keyword search"""

    def __init__(self, alpha: float = 0.5):
        self.alpha = alpha  # Weight for semantic vs keyword
        self.vector_engine = VectorSearchEngine()
        self.bm25 = None
        self.tokenized_docs = []

    def add_documents(self, documents: List[Document]):
        """Add documents to both indices"""
        # Vector index
        self.vector_engine.add_documents(documents)
        # BM25 index
        self.tokenized_docs = [
            doc.text.lower().split() for doc in documents
        ]
        self.bm25 = BM25Okapi(self.tokenized_docs)

    def search(
        self,
        query: str,
        top_k: int = 5
    ) -> List[Tuple[Document, float]]:
        """Hybrid search combining semantic and keyword scores"""
        # Semantic search
        semantic_results = self.vector_engine.search(query, top_k=top_k * 2)
        # BM25 keyword search
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        # Collect semantic scores by document id
        semantic_scores = {r[0].id: r[1] for r in semantic_results}
        # Normalize BM25 scores to 0-1
        max_bm25 = max(bm25_scores) if max(bm25_scores) > 0 else 1
        normalized_bm25 = {
            self.vector_engine.documents[i].id: score / max_bm25
            for i, score in enumerate(bm25_scores)
        }
        # Combine scores
        combined_scores = {}
        all_doc_ids = set(semantic_scores.keys()) | set(normalized_bm25.keys())
        for doc_id in all_doc_ids:
            semantic = semantic_scores.get(doc_id, 0)
            keyword = normalized_bm25.get(doc_id, 0)
            combined_scores[doc_id] = (
                self.alpha * semantic + (1 - self.alpha) * keyword
            )
        # Sort by combined score
        sorted_ids = sorted(
            combined_scores.keys(),
            key=lambda x: combined_scores[x],
            reverse=True
        )[:top_k]
        # Return documents with scores
        doc_map = {d.id: d for d in self.vector_engine.documents}
        return [
            (doc_map[doc_id], combined_scores[doc_id])
            for doc_id in sorted_ids
        ]

# Usage
hybrid = HybridSearchEngine(alpha=0.7)  # 70% semantic, 30% keyword
hybrid.add_documents(documents)
results = hybrid.search("What is AI and machine learning?")
Embedding Optimization
Batching and Rate Limiting
import asyncio
from openai import AsyncOpenAI
from typing import List
import time
import numpy as np

class OptimizedEmbedder:
    """Efficient batch embedding with rate limiting"""

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        batch_size: int = 100,
        requests_per_minute: int = 3000
    ):
        self.model = model
        self.batch_size = batch_size
        self.min_interval = 60 / requests_per_minute
        self.client = AsyncOpenAI()
        self.last_request_time = 0

    async def _rate_limit(self):
        """Ensure we don't exceed rate limits"""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_interval:
            await asyncio.sleep(self.min_interval - elapsed)
        self.last_request_time = time.time()

    async def embed_batch(self, texts: List[str]) -> List[np.ndarray]:
        """Embed a batch of texts"""
        await self._rate_limit()
        response = await self.client.embeddings.create(
            model=self.model,
            input=texts
        )
        return [np.array(e.embedding) for e in response.data]

    async def embed_all(self, texts: List[str]) -> List[np.ndarray]:
        """Embed all texts with batching"""
        all_embeddings = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            embeddings = await self.embed_batch(batch)
            all_embeddings.extend(embeddings)
            print(f"Processed {min(i + self.batch_size, len(texts))}/{len(texts)}")
        return all_embeddings

# Usage
async def main():
    embedder = OptimizedEmbedder()
    texts = ["Text " + str(i) for i in range(1000)]
    embeddings = await embedder.embed_all(texts)
    print(f"Embedded {len(embeddings)} texts")

asyncio.run(main())
Caching Embeddings
import hashlib
import pickle
from pathlib import Path
import numpy as np
from openai import OpenAI

class CachedEmbedder:
    """Cache embeddings to avoid recomputation"""

    def __init__(self, cache_dir: str = ".embedding_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.client = OpenAI()

    def _cache_key(self, text: str, model: str) -> str:
        content = f"{model}:{text}"
        return hashlib.sha256(content.encode()).hexdigest()

    def _cache_path(self, key: str) -> Path:
        return self.cache_dir / f"{key}.pkl"

    def get_embedding(
        self,
        text: str,
        model: str = "text-embedding-3-small"
    ) -> np.ndarray:
        """Get embedding, using cache if available"""
        key = self._cache_key(text, model)
        cache_path = self._cache_path(key)
        # Check cache
        if cache_path.exists():
            with open(cache_path, "rb") as f:
                return pickle.load(f)
        # Compute embedding
        response = self.client.embeddings.create(
            model=model,
            input=text
        )
        embedding = np.array(response.data[0].embedding)
        # Cache it
        with open(cache_path, "wb") as f:
            pickle.dump(embedding, f)
        return embedding
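The other classes in this guide include a usage snippet, so here is one for the cache as well (a sketch; the second call is served from the on-disk cache rather than the API):

```python
cached = CachedEmbedder()
v1 = cached.get_embedding("What is machine learning?")  # hits the API
v2 = cached.get_embedding("What is machine learning?")  # loaded from .embedding_cache
assert np.array_equal(v1, v2)
```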
Fine-Tuning Embeddings
For domain-specific applications, fine-tune embedding models:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from typing import List, Tuple

def fine_tune_embeddings(
    model_name: str,
    training_data: List[Tuple[str, str, float]],  # (text1, text2, similarity)
    output_path: str,
    epochs: int = 3
):
    """Fine-tune an embedding model on domain data"""
    # Load base model
    model = SentenceTransformer(model_name)
    # Prepare training data
    train_examples = [
        InputExample(texts=[t1, t2], label=sim)
        for t1, t2, sim in training_data
    ]
    train_dataloader = DataLoader(
        train_examples,
        shuffle=True,
        batch_size=16
    )
    # Use cosine similarity loss
    train_loss = losses.CosineSimilarityLoss(model)
    # Train
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=100,
        output_path=output_path
    )
    return model

# Example training data for a medical domain
training_data = [
    ("patient has fever", "elevated temperature", 0.9),
    ("patient has fever", "broken leg", 0.1),
    ("chest pain", "cardiac symptoms", 0.85),
    # ... more examples
]

model = fine_tune_embeddings(
    "sentence-transformers/all-MiniLM-L6-v2",
    training_data,
    "medical-embeddings"
)
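After training, the saved model loads like any other sentence-transformers model. A brief sketch, where "medical-embeddings" is the output_path used above:

```python
tuned = SentenceTransformer("medical-embeddings")
embedding = tuned.encode("patient presents with fever", normalize_embeddings=True)
```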
Key Takeaways
- Choose the Right Model: Balance quality, speed, and cost for your use case
- Normalize Your Vectors: Pre-normalize for faster similarity search
- Hybrid Search Works: Combine semantic + keyword for best results
- Cache Everything: Embeddings for a given text and model don't change, so cache aggressively
What’s Next
AI Streaming
Master streaming responses for real-time AI applications