December 2025 Update: Comprehensive guide to embeddings including model selection, dimensionality, fine-tuning, and production patterns.

What Are Embeddings?

Embeddings convert text into dense numerical vectors that capture semantic meaning:
Text                          Embedding Vector
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
"I love pizza"         →      [0.12, -0.34, 0.87, ..., 0.23]
"Pizza is my favorite" →      [0.11, -0.32, 0.85, ..., 0.21]  ← Similar!
"I hate broccoli"      →      [-0.45, 0.12, -0.33, ..., 0.67] ← Different
Use Case            What Embeddings Enable
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Semantic Search     Find documents by meaning, not keywords
RAG                 Retrieve relevant context for LLMs
Clustering          Group similar content automatically
Recommendations     Find similar items/users
Deduplication       Detect near-duplicate content
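
Here is a minimal sketch of the idea, using the open-source all-MiniLM-L6-v2 model from sentence-transformers (any embedding model would do; the exact scores will differ):

from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
vectors = model.encode(
    ["I love pizza", "Pizza is my favorite", "I hate broccoli"],
    normalize_embeddings=True
)

# For unit-length vectors, cosine similarity is just the dot product
print(np.dot(vectors[0], vectors[1]))  # high score: similar meaning
print(np.dot(vectors[0], vectors[2]))  # lower score: different meaning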

Embedding Models Comparison

# Model comparison (December 2024)
EMBEDDING_MODELS = {
    # OpenAI
    "text-embedding-3-small": {
        "dimensions": 1536,
        "max_tokens": 8191,
        "cost_per_1m": 0.02,
        "quality": "good",
        "speed": "fast"
    },
    "text-embedding-3-large": {
        "dimensions": 3072,
        "max_tokens": 8191,
        "cost_per_1m": 0.13,
        "quality": "excellent",
        "speed": "medium"
    },
    
    # Cohere
    "embed-english-v3.0": {
        "dimensions": 1024,
        "max_tokens": 512,
        "cost_per_1m": 0.10,
        "quality": "excellent",
        "speed": "fast"
    },
    
    # Open Source (via HuggingFace)
    "BAAI/bge-large-en-v1.5": {
        "dimensions": 1024,
        "max_tokens": 512,
        "cost_per_1m": 0,  # Free if self-hosted
        "quality": "excellent",
        "speed": "varies"
    },
    "sentence-transformers/all-MiniLM-L6-v2": {
        "dimensions": 384,
        "max_tokens": 256,
        "cost_per_1m": 0,
        "quality": "good",
        "speed": "very fast"
    }
}
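
If you keep a comparison table like this in code, a small helper can turn it into a model-selection step. The function below is illustrative, not part of any SDK:

def pick_models(max_cost_per_1m: float, min_quality: str = "good") -> list[str]:
    """Return models at or under a cost ceiling that meet a quality bar"""
    quality_rank = {"good": 1, "excellent": 2}
    return [
        name for name, spec in EMBEDDING_MODELS.items()
        if spec["cost_per_1m"] <= max_cost_per_1m
        and quality_rank[spec["quality"]] >= quality_rank[min_quality]
    ]

print(pick_models(0.05))                           # cheapest options
print(pick_models(1.0, min_quality="excellent"))   # quality first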

Getting Embeddings

OpenAI Embeddings

from openai import OpenAI
import numpy as np

client = OpenAI()

def get_embedding(
    text: str,
    model: str = "text-embedding-3-small"
) -> np.ndarray:
    """Get embedding for a single text"""
    response = client.embeddings.create(
        model=model,
        input=text
    )
    return np.array(response.data[0].embedding)

def get_embeddings_batch(
    texts: list[str],
    model: str = "text-embedding-3-small"
) -> list[np.ndarray]:
    """Get embeddings for multiple texts efficiently"""
    response = client.embeddings.create(
        model=model,
        input=texts
    )
    return [np.array(e.embedding) for e in response.data]

# Usage
embedding = get_embedding("What is machine learning?")
print(f"Dimensions: {len(embedding)}")

# Batch processing
texts = ["Hello world", "How are you?", "Machine learning is cool"]
embeddings = get_embeddings_batch(texts)

Dimensionality Reduction

OpenAI’s text-embedding-3 models support native dimension reduction:
def get_embedding_with_dimensions(
    text: str,
    dimensions: int = 256,
    model: str = "text-embedding-3-small"
) -> np.ndarray:
    """Get embedding with reduced dimensions"""
    response = client.embeddings.create(
        model=model,
        input=text,
        dimensions=dimensions  # 256, 512, 1024, 1536...
    )
    return np.array(response.data[0].embedding)

# Smaller embeddings = faster search, less storage
small_embedding = get_embedding_with_dimensions("Hello", dimensions=256)
print(f"Reduced dimensions: {len(small_embedding)}")

Open Source Embeddings

from sentence_transformers import SentenceTransformer
import numpy as np

class LocalEmbedder:
    """Local embedding using sentence-transformers"""
    
    def __init__(self, model_name: str = "BAAI/bge-large-en-v1.5"):
        self.model = SentenceTransformer(model_name)
    
    def embed(self, text: str) -> np.ndarray:
        return self.model.encode(text, normalize_embeddings=True)
    
    def embed_batch(self, texts: list[str]) -> np.ndarray:
        return self.model.encode(
            texts,
            normalize_embeddings=True,
            batch_size=32,
            show_progress_bar=True
        )
    
    def embed_with_instruction(
        self,
        text: str,
        instruction: str = "Represent this sentence for retrieval:"
    ) -> np.ndarray:
        """Some models perform better with instructions"""
        return self.model.encode(
            f"{instruction} {text}",
            normalize_embeddings=True
        )

# Usage
embedder = LocalEmbedder()
embedding = embedder.embed("What is artificial intelligence?")
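
For BGE-style models the instruction is typically applied to the query only, while documents are embedded as-is. A sketch of that asymmetric usage (check the model card for the exact prefix your model expects):

query_vec = embedder.embed_with_instruction(
    "What is artificial intelligence?",
    instruction="Represent this sentence for searching relevant passages:"
)
doc_vec = embedder.embed("AI is the simulation of human intelligence by machines.")
print(float(np.dot(query_vec, doc_vec)))  # vectors are normalized, so this is cosine similarity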

Similarity Metrics

Cosine Similarity

import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Compute cosine similarity between two vectors"""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def cosine_similarity_normalized(a: np.ndarray, b: np.ndarray) -> float:
    """For pre-normalized vectors, just dot product"""
    return np.dot(a, b)

# Usage
emb1 = get_embedding("I love pizza")
emb2 = get_embedding("Pizza is my favorite food")
emb3 = get_embedding("The weather is nice today")

print(f"Similar: {cosine_similarity(emb1, emb2):.3f}")  # ~0.85
print(f"Different: {cosine_similarity(emb1, emb3):.3f}")  # ~0.40

Other Metrics

def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float:
    """L2 distance - lower is more similar"""
    return np.linalg.norm(a - b)

def dot_product(a: np.ndarray, b: np.ndarray) -> float:
    """Dot product - higher is more similar (for normalized vectors)"""
    return np.dot(a, b)

def manhattan_distance(a: np.ndarray, b: np.ndarray) -> float:
    """L1 distance - lower is more similar"""
    return np.sum(np.abs(a - b))

# When to use each:
# - Cosine: General purpose, magnitude-invariant
# - Euclidean: When magnitude matters
# - Dot product: For normalized vectors (fastest)
# - Manhattan: Sparse vectors, high dimensions
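
One practical consequence: for unit-length vectors, cosine similarity, dot product, and Euclidean distance all produce the same ranking, so you can pick whichever your index computes fastest. A quick numeric check with random (purely illustrative) vectors:

a = np.random.rand(768)
a = a / np.linalg.norm(a)
b = np.random.rand(768)
b = b / np.linalg.norm(b)

print(cosine_similarity(a, b))        # same value as the dot product...
print(dot_product(a, b))              # ...because both vectors are unit length
print(euclidean_distance(a, b) ** 2)  # equals 2 - 2 * dot(a, b) for unit vectors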

Building a Similarity Search Engine

import numpy as np
from typing import List, Tuple
from dataclasses import dataclass
from openai import OpenAI

@dataclass
class Document:
    id: str
    text: str
    embedding: np.ndarray = None
    metadata: dict = None

class VectorSearchEngine:
    """Simple in-memory vector search"""
    
    def __init__(self, embedding_model: str = "text-embedding-3-small"):
        self.model = embedding_model
        self.documents: List[Document] = []
        self.embeddings: np.ndarray = None
        self.client = OpenAI()
    
    def add_documents(self, documents: List[Document]):
        """Add documents and compute embeddings"""
        texts = [doc.text for doc in documents]
        
        # Batch embed
        response = self.client.embeddings.create(
            model=self.model,
            input=texts
        )
        
        for doc, emb_data in zip(documents, response.data):
            doc.embedding = np.array(emb_data.embedding)
            self.documents.append(doc)
        
        # Build matrix for fast search
        self._rebuild_index()
    
    def _rebuild_index(self):
        """Rebuild the embedding matrix"""
        if self.documents:
            self.embeddings = np.vstack([
                doc.embedding for doc in self.documents
            ])
    
    def search(
        self,
        query: str,
        top_k: int = 5,
        threshold: float = 0.0
    ) -> List[Tuple[Document, float]]:
        """Search for similar documents"""
        # Embed query
        response = self.client.embeddings.create(
            model=self.model,
            input=query
        )
        query_embedding = np.array(response.data[0].embedding)
        
        # Compute similarities (matrix operation)
        similarities = np.dot(self.embeddings, query_embedding) / (
            np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(query_embedding)
        )
        
        # Get top-k
        top_indices = np.argsort(similarities)[::-1][:top_k]
        
        results = []
        for idx in top_indices:
            score = similarities[idx]
            if score >= threshold:
                results.append((self.documents[idx], float(score)))
        
        return results

# Usage
engine = VectorSearchEngine()

documents = [
    Document(id="1", text="Machine learning is a subset of AI"),
    Document(id="2", text="Deep learning uses neural networks"),
    Document(id="3", text="Python is a programming language"),
    Document(id="4", text="Natural language processing handles text"),
]

engine.add_documents(documents)

results = engine.search("What is artificial intelligence?", top_k=3)
for doc, score in results:
    print(f"[{score:.3f}] {doc.text}")

Hybrid Search: Embeddings + Keywords

Combine semantic search with keyword matching:
from rank_bm25 import BM25Okapi
import numpy as np

class HybridSearchEngine:
    """Combines vector search with BM25 keyword search"""
    
    def __init__(self, alpha: float = 0.5):
        self.alpha = alpha  # Weight for semantic vs keyword
        self.vector_engine = VectorSearchEngine()
        self.bm25 = None
        self.tokenized_docs = []
    
    def add_documents(self, documents: List[Document]):
        """Add documents to both indices"""
        # Vector index
        self.vector_engine.add_documents(documents)
        
        # BM25 index
        self.tokenized_docs = [
            doc.text.lower().split() for doc in documents
        ]
        self.bm25 = BM25Okapi(self.tokenized_docs)
    
    def search(
        self,
        query: str,
        top_k: int = 5
    ) -> List[Tuple[Document, float]]:
        """Hybrid search combining semantic and keyword scores"""
        
        # Semantic search
        semantic_results = self.vector_engine.search(query, top_k=top_k * 2)
        
        # BM25 keyword search
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        
        # Collect semantic scores keyed by document id (cosine, already roughly 0-1)
        semantic_scores = {r[0].id: r[1] for r in semantic_results}
        
        # Normalize BM25 scores to 0-1
        max_bm25 = max(bm25_scores) if max(bm25_scores) > 0 else 1
        normalized_bm25 = {
            self.vector_engine.documents[i].id: score / max_bm25
            for i, score in enumerate(bm25_scores)
        }
        
        # Combine scores
        combined_scores = {}
        all_doc_ids = set(semantic_scores.keys()) | set(normalized_bm25.keys())
        
        for doc_id in all_doc_ids:
            semantic = semantic_scores.get(doc_id, 0)
            keyword = normalized_bm25.get(doc_id, 0)
            combined_scores[doc_id] = (
                self.alpha * semantic + (1 - self.alpha) * keyword
            )
        
        # Sort by combined score
        sorted_ids = sorted(
            combined_scores.keys(),
            key=lambda x: combined_scores[x],
            reverse=True
        )[:top_k]
        
        # Return documents with scores
        doc_map = {d.id: d for d in self.vector_engine.documents}
        return [
            (doc_map[doc_id], combined_scores[doc_id])
            for doc_id in sorted_ids
        ]

# Usage
hybrid = HybridSearchEngine(alpha=0.7)  # 70% semantic, 30% keyword
hybrid.add_documents(documents)
results = hybrid.search("What is AI and machine learning?")
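
Weighted score fusion depends on both score scales being comparable. An alternative that sidesteps normalization entirely is reciprocal rank fusion (RRF), which combines positions rather than raw scores. A sketch (the k=60 constant is the commonly cited default; the helper is not part of the class above):

def reciprocal_rank_fusion(rankings: list[list[str]], k: int = 60) -> dict[str, float]:
    """Fuse several ranked lists of document ids into a single score per id"""
    scores: dict[str, float] = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank + 1)
    return scores

# ids ordered by the semantic retriever and by BM25, respectively
fused = reciprocal_rank_fusion([["2", "1", "4"], ["1", "3", "2"]])
print(sorted(fused, key=fused.get, reverse=True))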

Embedding Optimization

Batching and Rate Limiting

import asyncio
import time
from typing import List

import numpy as np
from openai import AsyncOpenAI

class OptimizedEmbedder:
    """Efficient batch embedding with rate limiting"""
    
    def __init__(
        self,
        model: str = "text-embedding-3-small",
        batch_size: int = 100,
        requests_per_minute: int = 3000
    ):
        self.model = model
        self.batch_size = batch_size
        self.min_interval = 60 / requests_per_minute
        self.client = AsyncOpenAI()
        self.last_request_time = 0
    
    async def _rate_limit(self):
        """Ensure we don't exceed rate limits"""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_interval:
            await asyncio.sleep(self.min_interval - elapsed)
        self.last_request_time = time.time()
    
    async def embed_batch(self, texts: List[str]) -> List[np.ndarray]:
        """Embed a batch of texts"""
        await self._rate_limit()
        
        response = await self.client.embeddings.create(
            model=self.model,
            input=texts
        )
        
        return [np.array(e.embedding) for e in response.data]
    
    async def embed_all(self, texts: List[str]) -> List[np.ndarray]:
        """Embed all texts with batching"""
        all_embeddings = []
        
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            embeddings = await self.embed_batch(batch)
            all_embeddings.extend(embeddings)
            
            print(f"Processed {min(i + self.batch_size, len(texts))}/{len(texts)}")
        
        return all_embeddings

# Usage
async def main():
    embedder = OptimizedEmbedder()
    
    texts = ["Text " + str(i) for i in range(1000)]
    embeddings = await embedder.embed_all(texts)
    print(f"Embedded {len(embeddings)} texts")

asyncio.run(main())
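
Client-side pacing doesn't guarantee the API will never push back, so batches should also be retried on transient errors. A hedged sketch of exponential backoff around embed_batch (the helper is ours; openai.RateLimitError is the 1.x SDK exception, so adjust for the version you use):

import openai

async def embed_batch_with_retry(
    embedder: OptimizedEmbedder,
    texts: List[str],
    max_retries: int = 5
) -> List[np.ndarray]:
    """Retry one batch with exponential backoff when the API rate-limits us"""
    for attempt in range(max_retries):
        try:
            return await embedder.embed_batch(texts)
        except openai.RateLimitError:
            # Back off 1s, 2s, 4s, ... before trying again
            await asyncio.sleep(2 ** attempt)
    raise RuntimeError("Embedding batch failed after retries")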

Caching Embeddings

import hashlib
import pickle
from pathlib import Path

import numpy as np
from openai import OpenAI

class CachedEmbedder:
    """Cache embeddings to avoid recomputation"""
    
    def __init__(self, cache_dir: str = ".embedding_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        self.client = OpenAI()
    
    def _cache_key(self, text: str, model: str) -> str:
        content = f"{model}:{text}"
        return hashlib.sha256(content.encode()).hexdigest()
    
    def _cache_path(self, key: str) -> Path:
        return self.cache_dir / f"{key}.pkl"
    
    def get_embedding(
        self,
        text: str,
        model: str = "text-embedding-3-small"
    ) -> np.ndarray:
        """Get embedding, using cache if available"""
        key = self._cache_key(text, model)
        cache_path = self._cache_path(key)
        
        # Check cache
        if cache_path.exists():
            with open(cache_path, "rb") as f:
                return pickle.load(f)
        
        # Compute embedding
        response = self.client.embeddings.create(
            model=model,
            input=text
        )
        embedding = np.array(response.data[0].embedding)
        
        # Cache it
        with open(cache_path, "wb") as f:
            pickle.dump(embedding, f)
        
        return embedding
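
A quick usage check in the same style as the other examples; the second call should load the vector from disk instead of hitting the API:

# Usage
cached = CachedEmbedder()
e1 = cached.get_embedding("What is machine learning?")  # API call, then cached
e2 = cached.get_embedding("What is machine learning?")  # served from the cache
print(np.array_equal(e1, e2))  # True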

Fine-Tuning Embeddings

For domain-specific applications, fine-tune embedding models:
from typing import List, Tuple

from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

def fine_tune_embeddings(
    model_name: str,
    training_data: List[Tuple[str, str, float]],  # (text1, text2, similarity)
    output_path: str,
    epochs: int = 3
):
    """Fine-tune an embedding model on domain data"""
    
    # Load base model
    model = SentenceTransformer(model_name)
    
    # Prepare training data
    train_examples = [
        InputExample(texts=[t1, t2], label=sim)
        for t1, t2, sim in training_data
    ]
    
    train_dataloader = DataLoader(
        train_examples,
        shuffle=True,
        batch_size=16
    )
    
    # Use cosine similarity loss
    train_loss = losses.CosineSimilarityLoss(model)
    
    # Train
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=100,
        output_path=output_path
    )
    
    return model

# Example training data for a medical domain
training_data = [
    ("patient has fever", "elevated temperature", 0.9),
    ("patient has fever", "broken leg", 0.1),
    ("chest pain", "cardiac symptoms", 0.85),
    # ... more examples
]

model = fine_tune_embeddings(
    "sentence-transformers/all-MiniLM-L6-v2",
    training_data,
    "medical-embeddings"
)
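
Before trusting the fine-tuned model, compare it against the base model on held-out pairs it never saw during training. A sketch using sentence-transformers' EmbeddingSimilarityEvaluator (the evaluation pairs are illustrative, and the exact return format varies across library versions):

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

eval_pairs = [
    ("shortness of breath", "difficulty breathing", 0.9),
    ("shortness of breath", "sprained ankle", 0.1),
    ("high blood pressure", "hypertension", 0.95),
]
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=[a for a, _, _ in eval_pairs],
    sentences2=[b for _, b, _ in eval_pairs],
    scores=[s for _, _, s in eval_pairs],
    name="medical-dev"
)

baseline = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("base model:", evaluator(baseline))
print("fine-tuned:", evaluator(model))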

Key Takeaways

Choose the Right Model

Balance quality, speed, and cost for your use case

Normalize Your Vectors

Pre-normalize for faster similarity search

Hybrid Search Works

Combine semantic + keyword for best results

Cache Everything

For a fixed model and input, embeddings don't change, so cache aggressively

What’s Next

AI Streaming

Master streaming responses for real-time AI applications