Testing LLM applications requires different strategies than testing traditional software, because outputs are non-deterministic and every call depends on an external API.

Testing Challenges

Challenge                     Solution
-------------------------------------------------------------
Non-deterministic outputs     Semantic assertions, temperature=0
API costs during tests        Mocking, caching, model switching
Slow test execution           Parallel tests, smaller models
Output validation             Schema validation, fuzzy matching
Regression testing            Golden datasets, similarity scores
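
Before reaching for heavier tooling, the first challenge can often be handled by asserting on stable properties of the output rather than its exact wording. A minimal sketch (summarize is a hypothetical stand-in for your own LLM-backed function):

def test_summary_properties():
    # `summarize` is a placeholder for your own LLM-backed function
    summary = summarize("Long article text about renewable energy...")

    # Properties that hold even when the wording changes between runs
    assert len(summary) < 500                              # bounded length
    assert "energy" in summary.lower()                     # key term present
    assert not summary.lower().startswith("i'm sorry")     # no refusal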

Mocking LLM Responses

Basic Mocking with pytest

import pytest
from unittest.mock import Mock, patch, AsyncMock
from dataclasses import dataclass

@dataclass
class MockChoice:
    message: Mock

@dataclass
class MockResponse:
    choices: list
    usage: Mock

def create_mock_response(content: str) -> MockResponse:
    """Create a mock OpenAI response"""
    message = Mock()
    message.content = content
    message.tool_calls = None
    
    choice = MockChoice(message=message)
    
    usage = Mock()
    usage.prompt_tokens = 10
    usage.completion_tokens = 20
    usage.total_tokens = 30
    
    return MockResponse(choices=[choice], usage=usage)

# Synchronous mocking
def test_chat_completion():
    with patch("openai.OpenAI") as mock_openai:
        mock_client = Mock()
        mock_openai.return_value = mock_client
        
        mock_client.chat.completions.create.return_value = create_mock_response(
            "Mocked response"
        )
        
        # Your code that uses OpenAI
        from openai import OpenAI
        client = OpenAI()
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": "Hello"}]
        )
        
        assert response.choices[0].message.content == "Mocked response"

# Async mocking (requires the pytest-asyncio plugin)
@pytest.mark.asyncio
async def test_async_chat_completion():
    with patch("openai.AsyncOpenAI") as mock_openai:
        mock_client = AsyncMock()
        mock_openai.return_value = mock_client
        
        mock_client.chat.completions.create.return_value = create_mock_response(
            "Async mocked response"
        )
        
        from openai import AsyncOpenAI
        client = AsyncOpenAI()
        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": "Hello"}]
        )
        
        assert response.choices[0].message.content == "Async mocked response"

LLM Mock Fixture

import pytest
from typing import Dict, List, Optional, Callable
from dataclasses import dataclass

@dataclass
class MockConfig:
    responses: Dict[str, str]  # Query pattern -> response
    default_response: str = "Default mock response"
    call_log: Optional[List] = None

class LLMMocker:
    """Configurable LLM mocker"""
    
    def __init__(self, config: MockConfig):
        self.config = config
        self.calls = []
    
    def get_response(self, messages: List[dict]) -> str:
        """Get mock response based on messages"""
        user_message = next(
            (m["content"] for m in messages if m["role"] == "user"),
            ""
        )
        
        self.calls.append({
            "messages": messages,
            "user_message": user_message
        })
        
        # Check for pattern matches
        for pattern, response in self.config.responses.items():
            if pattern.lower() in user_message.lower():
                return response
        
        return self.config.default_response
    
    def create_mock_response(self, content: str):
        return create_mock_response(content)

@pytest.fixture
def llm_mocker():
    """Pytest fixture for LLM mocking"""
    def _create_mocker(responses: Optional[Dict[str, str]] = None, default: Optional[str] = None):
        config = MockConfig(
            responses=responses or {},
            default_response=default or "Mock response"
        )
        return LLMMocker(config)
    
    return _create_mocker

# Usage in tests
def test_with_llm_mocker(llm_mocker):
    mocker = llm_mocker(
        responses={
            "weather": "The weather is sunny.",
            "time": "It is 3:00 PM."
        },
        default="I don't understand."
    )
    
    with patch("openai.OpenAI") as mock_openai:
        mock_client = Mock()
        mock_openai.return_value = mock_client
        
        def side_effect(**kwargs):
            messages = kwargs.get("messages", [])
            content = mocker.get_response(messages)
            return mocker.create_mock_response(content)
        
        mock_client.chat.completions.create.side_effect = side_effect
        
        # Code under test: with the patch active, OpenAI() returns the mock client
        from openai import OpenAI
        client = OpenAI()
        
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": "What's the weather?"}]
        )
        
        assert "sunny" in response.choices[0].message.content
        assert len(mocker.calls) == 1

Response Caching for Tests

import json
import hashlib
from pathlib import Path
from typing import Optional

class ResponseCache:
    """Cache LLM responses for reproducible tests"""
    
    def __init__(self, cache_dir: str = ".llm_cache"):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
    
    def _get_key(self, model: str, messages: list) -> str:
        """Generate cache key from request"""
        content = json.dumps({
            "model": model,
            "messages": messages
        }, sort_keys=True)
        return hashlib.sha256(content.encode()).hexdigest()
    
    def get(self, model: str, messages: list) -> Optional[dict]:
        """Get cached response"""
        key = self._get_key(model, messages)
        cache_file = self.cache_dir / f"{key}.json"
        
        if cache_file.exists():
            with open(cache_file) as f:
                return json.load(f)
        return None
    
    def set(self, model: str, messages: list, response: dict):
        """Cache response"""
        key = self._get_key(model, messages)
        cache_file = self.cache_dir / f"{key}.json"
        
        with open(cache_file, "w") as f:
            json.dump(response, f)
    
    def clear(self):
        """Clear all cached responses"""
        for f in self.cache_dir.glob("*.json"):
            f.unlink()

class CachedLLMClient:
    """LLM client with response caching"""
    
    def __init__(self, client, cache: ResponseCache = None):
        self.client = client
        self.cache = cache or ResponseCache()
    
    def complete(self, model: str, messages: list, **kwargs) -> dict:
        # Check cache
        cached = self.cache.get(model, messages)
        if cached:
            return cached
        
        # Make actual request
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs
        )
        
        # Convert to dict and cache
        response_dict = {
            "content": response.choices[0].message.content,
            "model": response.model,
            "usage": {
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        }
        
        self.cache.set(model, messages, response_dict)
        
        return response_dict

# Fixture for cached testing
@pytest.fixture
def cached_llm():
    """Provides cached LLM client for tests"""
    from openai import OpenAI
    cache = ResponseCache(".test_llm_cache")
    client = CachedLLMClient(OpenAI(), cache)
    return client
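
A test built on this fixture pays for each unique request once; reruns read from .test_llm_cache and are both free and deterministic. A sketch of how it might be used (the llm marker is defined in the test configuration below):

@pytest.mark.llm  # makes a real API call on the first (uncached) run
def test_capital_question(cached_llm):
    result = cached_llm.complete(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "What is the capital of France?"}],
        temperature=0
    )

    assert "paris" in result["content"].lower()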

Semantic Assertions

from openai import OpenAI
from typing import List

client = OpenAI()

class SemanticAssertions:
    """Semantic comparison for LLM outputs"""
    
    def __init__(self, threshold: float = 0.8):
        self.threshold = threshold
    
    def get_embedding(self, text: str) -> List[float]:
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding
    
    def cosine_similarity(self, a: List[float], b: List[float]) -> float:
        import numpy as np
        a = np.array(a)
        b = np.array(b)
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    
    def assert_similar(self, actual: str, expected: str, threshold: float = None):
        """Assert texts are semantically similar"""
        threshold = threshold or self.threshold
        
        actual_emb = self.get_embedding(actual)
        expected_emb = self.get_embedding(expected)
        
        similarity = self.cosine_similarity(actual_emb, expected_emb)
        
        assert similarity >= threshold, (
            f"Semantic similarity {similarity:.3f} below threshold {threshold}\n"
            f"Actual: {actual[:100]}...\n"
            f"Expected: {expected[:100]}..."
        )
    
    def assert_contains_concept(self, text: str, concept: str, threshold: float = 0.7):
        """Assert text contains a semantic concept"""
        text_emb = self.get_embedding(text)
        concept_emb = self.get_embedding(concept)
        
        similarity = self.cosine_similarity(text_emb, concept_emb)
        
        assert similarity >= threshold, (
            f"Concept '{concept}' not found in text (similarity: {similarity:.3f})"
        )

# Usage
semantic = SemanticAssertions()

def test_response_semantics():
    response = "Machine learning enables computers to learn from data"
    expected = "ML allows systems to improve from experience"
    
    semantic.assert_similar(response, expected, threshold=0.75)
    semantic.assert_contains_concept(response, "artificial intelligence")
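
Embedding-based assertions cost an API call per comparison. For cheap, offline fuzzy matching (the fuzzy-matching row in the table above), the standard library is often enough; a sketch using difflib:

from difflib import SequenceMatcher

def assert_fuzzy_match(actual: str, expected: str, threshold: float = 0.7):
    """Character-level similarity check: no API calls, no embeddings."""
    ratio = SequenceMatcher(None, actual.lower(), expected.lower()).ratio()
    assert ratio >= threshold, (
        f"Fuzzy match ratio {ratio:.2f} below threshold {threshold}\n"
        f"Actual: {actual[:100]}\nExpected: {expected[:100]}"
    )

def test_fuzzy_fallback():
    assert_fuzzy_match(
        "The weather is sunny and warm today.",
        "The weather is sunny and warm.",
        threshold=0.7
    )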

Schema Validation

import pytest
from pydantic import BaseModel, ValidationError
from typing import List
import json

class ProductRecommendation(BaseModel):
    product_name: str
    price: float
    reasoning: str
    confidence: float

class RecommendationResponse(BaseModel):
    recommendations: List[ProductRecommendation]
    summary: str

def test_structured_output():
    """Test that LLM returns valid structured output"""
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "Return product recommendations as JSON"
            },
            {
                "role": "user",
                "content": "Recommend laptops under $1000"
            }
        ],
        response_format={"type": "json_object"}
    )
    
    content = response.choices[0].message.content
    
    # Parse JSON
    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        pytest.fail(f"Invalid JSON: {e}")
    
    # Validate schema
    try:
        validated = RecommendationResponse.model_validate(data)
    except ValidationError as e:
        pytest.fail(f"Schema validation failed: {e}")
    
    # Additional assertions
    assert len(validated.recommendations) > 0
    assert all(r.price < 1000 for r in validated.recommendations)
    assert all(0 <= r.confidence <= 1 for r in validated.recommendations)

Golden Dataset Testing

import json
from dataclasses import dataclass
from typing import List, Callable
from pathlib import Path

@dataclass
class TestCase:
    id: str
    input: str
    expected_output: str
    category: str = "general"
    metadata: dict = None

@dataclass
class TestResult:
    test_case: TestCase
    actual_output: str
    passed: bool
    score: float
    error: str = None

class GoldenDatasetTester:
    """Test against golden dataset"""
    
    def __init__(
        self,
        dataset_path: str,
        evaluator: Callable[[str, str], float]
    ):
        self.dataset = self._load_dataset(dataset_path)
        self.evaluator = evaluator
    
    def _load_dataset(self, path: str) -> List[TestCase]:
        with open(path) as f:
            data = json.load(f)
        
        return [TestCase(**tc) for tc in data["test_cases"]]
    
    def run_test(
        self,
        generate_fn: Callable[[str], str],
        threshold: float = 0.8
    ) -> dict:
        """Run all tests and return results"""
        
        results = []
        passed = 0
        failed = 0
        
        for tc in self.dataset:
            try:
                actual = generate_fn(tc.input)
                score = self.evaluator(actual, tc.expected_output)
                is_passed = score >= threshold
                
                result = TestResult(
                    test_case=tc,
                    actual_output=actual,
                    passed=is_passed,
                    score=score
                )
                
                if is_passed:
                    passed += 1
                else:
                    failed += 1
                    
            except Exception as e:
                result = TestResult(
                    test_case=tc,
                    actual_output="",
                    passed=False,
                    score=0.0,
                    error=str(e)
                )
                failed += 1
            
            results.append(result)
        
        return {
            "total": len(self.dataset),
            "passed": passed,
            "failed": failed,
            "pass_rate": passed / len(self.dataset),
            "results": results
        }

# Example golden dataset (test_cases.json):
"""
{
    "test_cases": [
        {
            "id": "greeting_1",
            "input": "Say hello",
            "expected_output": "Hello! How can I help you today?",
            "category": "greeting"
        },
        {
            "id": "math_1",
            "input": "What is 2+2?",
            "expected_output": "4",
            "category": "math"
        }
    ]
}
"""

# Usage
def similarity_evaluator(actual: str, expected: str) -> float:
    # Use semantic similarity
    semantic = SemanticAssertions()
    actual_emb = semantic.get_embedding(actual)
    expected_emb = semantic.get_embedding(expected)
    return semantic.cosine_similarity(actual_emb, expected_emb)

def generate_response(prompt: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

tester = GoldenDatasetTester(
    "test_cases.json",
    evaluator=similarity_evaluator
)

results = tester.run_test(generate_response, threshold=0.75)
print(f"Pass rate: {results['pass_rate']:.1%}")

Integration Testing

import pytest
from unittest.mock import Mock, patch
from fastapi.testclient import TestClient
from httpx import AsyncClient, ASGITransport

from app import app  # assumed location of the FastAPI app under test

# Test FastAPI LLM endpoints (assumes a `client` TestClient fixture defined in conftest.py)
def test_chat_endpoint(client: TestClient, llm_mocker):
    mocker = llm_mocker(
        responses={"hello": "Hi there!"},
        default="I don't understand"
    )
    
    with patch("app.llm_client") as mock_client:
        # Configure mock
        mock_client.complete.side_effect = lambda **k: {
            "content": mocker.get_response(k["messages"])
        }
        
        response = client.post("/chat", json={
            "message": "hello"
        })
        
        assert response.status_code == 200
        assert "Hi there" in response.json()["response"]

# Test streaming endpoint
@pytest.mark.asyncio
async def test_streaming_endpoint():
    # newer httpx removed the app= shortcut; wrap the ASGI app in a transport instead
    async with AsyncClient(transport=ASGITransport(app=app), base_url="http://test") as client:
        async with client.stream(
            "POST",
            "/chat/stream",
            json={"message": "Hello"}
        ) as response:
            chunks = []
            async for line in response.aiter_lines():
                if line.startswith("data: "):
                    chunks.append(line[6:])
            
            assert len(chunks) > 0

# Test RAG pipeline
def test_rag_pipeline():
    # Setup
    documents = ["Doc 1 content", "Doc 2 content"]
    
    # Mock embedding and completion
    with patch("openai.OpenAI") as mock_openai:
        mock_client = Mock()
        mock_openai.return_value = mock_client
        
        # Mock embeddings
        mock_client.embeddings.create.return_value = Mock(
            data=[Mock(embedding=[0.1] * 1536)]
        )
        
        # Mock completion
        mock_client.chat.completions.create.return_value = create_mock_response(
            "Based on the context, the answer is..."
        )
        
        # Run RAG pipeline (rag_pipeline is the application's pipeline under test,
        # assumed to be imported from your own code)
        result = rag_pipeline.query(
            question="What is in the documents?",
            documents=documents
        )
        
        assert "context" in result.lower() or "answer" in result.lower()

Performance Testing

import time
from statistics import mean, stdev
from typing import Callable, List
from dataclasses import dataclass

@dataclass
class LatencyResult:
    mean_ms: float
    p50_ms: float
    p95_ms: float
    p99_ms: float
    std_dev: float

class LatencyTester:
    """Test LLM response latency"""
    
    def __init__(self, iterations: int = 10):
        self.iterations = iterations
    
    def measure(
        self,
        func: Callable,
        *args,
        **kwargs
    ) -> LatencyResult:
        """Measure function latency"""
        
        latencies = []
        
        for _ in range(self.iterations):
            start = time.perf_counter()
            func(*args, **kwargs)
            end = time.perf_counter()
            latencies.append((end - start) * 1000)  # Convert to ms
        
        latencies.sort()
        
        return LatencyResult(
            mean_ms=mean(latencies),
            p50_ms=latencies[len(latencies) // 2],
            # with small iteration counts, p95/p99 collapse to the slowest sample
            p95_ms=latencies[int(len(latencies) * 0.95)],
            p99_ms=latencies[int(len(latencies) * 0.99)],
            std_dev=stdev(latencies) if len(latencies) > 1 else 0
        )

def test_latency_requirements():
    """Test that latency meets requirements"""
    tester = LatencyTester(iterations=10)
    
    def make_request():
        client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Hi"}],
            max_tokens=50
        )
    
    result = tester.measure(make_request)
    
    # Assert latency requirements
    assert result.p95_ms < 2000, f"P95 latency {result.p95_ms}ms exceeds 2s"
    assert result.mean_ms < 1000, f"Mean latency {result.mean_ms}ms exceeds 1s"
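
Single-request latency says little about behavior under load. A sketch of a concurrency check with the async client (the burst size and time budget are illustrative, not recommendations):

import asyncio
import time
import pytest
from openai import AsyncOpenAI

@pytest.mark.asyncio
@pytest.mark.llm
async def test_concurrent_requests():
    client = AsyncOpenAI()
    concurrency = 5  # illustrative burst size

    async def one_request():
        await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Hi"}],
            max_tokens=20
        )

    start = time.perf_counter()
    await asyncio.gather(*(one_request() for _ in range(concurrency)))
    elapsed = time.perf_counter() - start

    # Concurrent requests should overlap rather than run serially
    assert elapsed < concurrency * 2.0, f"{concurrency} concurrent requests took {elapsed:.1f}s"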

Test Configuration

# conftest.py
import pytest
import os

@pytest.fixture(scope="session")
def llm_client():
    """Provide LLM client for tests"""
    if os.getenv("USE_MOCK_LLM", "true").lower() == "true":
        return MockLLMClient()
    else:
        from openai import OpenAI
        return OpenAI()

@pytest.fixture
def small_model():
    """Use smaller model for faster tests"""
    return "gpt-4o-mini"

@pytest.fixture
def deterministic_settings():
    """Settings for reproducible outputs"""
    return {
        "temperature": 0,
        "seed": 42
    }

# pytest.ini
"""
[pytest]
markers =
    slow: marks tests as slow
    integration: marks tests as integration tests
    llm: marks tests that call real LLM APIs

# the env section requires the pytest-env plugin
env =
    USE_MOCK_LLM=true
    OPENAI_API_KEY=test-key
"""

# Run only fast tests
# pytest -m "not slow"

# Run integration tests
# pytest -m integration

# Run with real LLM
# USE_MOCK_LLM=false pytest -m llm
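
The llm marker can also be enforced automatically, so tests that hit real APIs are skipped unless explicitly enabled. A sketch of a conftest.py hook using the same environment variable as the fixture above:

# conftest.py (continued)
import os
import pytest

def pytest_collection_modifyitems(config, items):
    """Skip tests marked 'llm' unless real API access is enabled."""
    if os.getenv("USE_MOCK_LLM", "true").lower() != "true":
        return  # real LLM access requested: run everything
    skip_llm = pytest.mark.skip(reason="set USE_MOCK_LLM=false to run real LLM tests")
    for item in items:
        if "llm" in item.keywords:
            item.add_marker(skip_llm)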

Test Strategy Summary

Test Type          Speed     Cost       Coverage
-------------------------------------------------------------
Unit (mocked)      Fast      Free       Logic
Cached             Fast      One-time   Regression
Integration        Medium    Low        Endpoints
Golden Dataset     Slow      Medium     Quality
Semantic           Slow      Medium     Meaning

What's Next

Production Logging

Learn structured logging and debugging for LLM applications