Testing Challenges
| Challenge | Solution |
|---|---|
| Non-deterministic outputs | Semantic assertions, temperature=0 |
| API costs during tests | Mocking, caching, model switching |
| Slow test execution | Parallel tests, smaller models |
| Output validation | Schema validation, fuzzy matching |
| Regression testing | Golden datasets, similarity scores |
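Most of these solutions are covered in detail below; fuzzy matching is the simplest, so here is a minimal sketch using Python's standard-library difflib (the 0.8 cutoff and the helper name are illustrative choices, not a fixed convention):

from difflib import SequenceMatcher

def assert_fuzzy_match(actual: str, expected: str, min_ratio: float = 0.8):
    """Assert two strings are textually similar (character-level, not semantic)."""
    ratio = SequenceMatcher(None, actual.lower(), expected.lower()).ratio()
    assert ratio >= min_ratio, (
        f"Fuzzy match ratio {ratio:.2f} below {min_ratio}\n"
        f"Actual: {actual!r}\nExpected: {expected!r}"
    )

# Tolerates small wording differences but not paraphrases;
# for paraphrases, see the semantic assertions later on this page.
assert_fuzzy_match("The answer is 4.", "The answer is 4")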
Mocking LLM Responses
Basic Mocking with pytest
import pytest
from unittest.mock import Mock, patch, AsyncMock
from dataclasses import dataclass
@dataclass
class MockChoice:
message: Mock
@dataclass
class MockResponse:
choices: list
usage: Mock
def create_mock_response(content: str) -> MockResponse:
"""Create a mock OpenAI response"""
message = Mock()
message.content = content
message.tool_calls = None
choice = MockChoice(message=message)
usage = Mock()
usage.prompt_tokens = 10
usage.completion_tokens = 20
usage.total_tokens = 30
return MockResponse(choices=[choice], usage=usage)
# Synchronous mocking
def test_chat_completion():
with patch("openai.OpenAI") as mock_openai:
mock_client = Mock()
mock_openai.return_value = mock_client
mock_client.chat.completions.create.return_value = create_mock_response(
"Mocked response"
)
# Your code that uses OpenAI
from openai import OpenAI
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello"}]
)
assert response.choices[0].message.content == "Mocked response"
# Async mocking
@pytest.mark.asyncio
async def test_async_chat_completion():
with patch("openai.AsyncOpenAI") as mock_openai:
mock_client = AsyncMock()
mock_openai.return_value = mock_client
mock_client.chat.completions.create.return_value = create_mock_response(
"Async mocked response"
)
from openai import AsyncOpenAI
client = AsyncOpenAI()
response = await client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello"}]
)
assert response.choices[0].message.content == "Async mocked response"
LLM Mock Fixture
import pytest
from typing import Dict, List, Optional, Callable
from dataclasses import dataclass
@dataclass
class MockConfig:
    responses: Dict[str, str]  # substring pattern -> canned response
    default_response: str = "Default mock response"
class LLMMocker:
"""Configurable LLM mocker"""
def __init__(self, config: MockConfig):
self.config = config
self.calls = []
def get_response(self, messages: List[dict]) -> str:
"""Get mock response based on messages"""
user_message = next(
(m["content"] for m in messages if m["role"] == "user"),
""
)
self.calls.append({
"messages": messages,
"user_message": user_message
})
# Check for pattern matches
for pattern, response in self.config.responses.items():
if pattern.lower() in user_message.lower():
return response
return self.config.default_response
def create_mock_response(self, content: str):
return create_mock_response(content)
@pytest.fixture
def llm_mocker():
"""Pytest fixture for LLM mocking"""
def _create_mocker(responses: Dict[str, str] = None, default: str = None):
config = MockConfig(
responses=responses or {},
default_response=default or "Mock response"
)
return LLMMocker(config)
return _create_mocker
# Usage in tests
def test_with_llm_mocker(llm_mocker):
mocker = llm_mocker(
responses={
"weather": "The weather is sunny.",
"time": "It is 3:00 PM."
},
default="I don't understand."
)
with patch("openai.OpenAI") as mock_openai:
mock_client = Mock()
mock_openai.return_value = mock_client
def side_effect(**kwargs):
messages = kwargs.get("messages", [])
content = mocker.get_response(messages)
return mocker.create_mock_response(content)
mock_client.chat.completions.create.side_effect = side_effect
        # Your test code: OpenAI() now returns the configured mock client
        from openai import OpenAI
        client = OpenAI()
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "What's the weather?"}]
)
assert "sunny" in response.choices[0].message.content
assert len(mocker.calls) == 1
Response Caching for Tests
import json
import hashlib
import pytest
from pathlib import Path
from typing import Optional
class ResponseCache:
"""Cache LLM responses for reproducible tests"""
def __init__(self, cache_dir: str = ".llm_cache"):
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(exist_ok=True)
def _get_key(self, model: str, messages: list) -> str:
"""Generate cache key from request"""
content = json.dumps({
"model": model,
"messages": messages
}, sort_keys=True)
return hashlib.sha256(content.encode()).hexdigest()
def get(self, model: str, messages: list) -> Optional[dict]:
"""Get cached response"""
key = self._get_key(model, messages)
cache_file = self.cache_dir / f"{key}.json"
if cache_file.exists():
with open(cache_file) as f:
return json.load(f)
return None
def set(self, model: str, messages: list, response: dict):
"""Cache response"""
key = self._get_key(model, messages)
cache_file = self.cache_dir / f"{key}.json"
with open(cache_file, "w") as f:
json.dump(response, f)
def clear(self):
"""Clear all cached responses"""
for f in self.cache_dir.glob("*.json"):
f.unlink()
class CachedLLMClient:
"""LLM client with response caching"""
def __init__(self, client, cache: ResponseCache = None):
self.client = client
self.cache = cache or ResponseCache()
def complete(self, model: str, messages: list, **kwargs) -> dict:
# Check cache
cached = self.cache.get(model, messages)
if cached:
return cached
# Make actual request
response = self.client.chat.completions.create(
model=model,
messages=messages,
**kwargs
)
# Convert to dict and cache
response_dict = {
"content": response.choices[0].message.content,
"model": response.model,
"usage": {
"prompt_tokens": response.usage.prompt_tokens,
"completion_tokens": response.usage.completion_tokens
}
}
self.cache.set(model, messages, response_dict)
return response_dict
# Fixture for cached testing
@pytest.fixture
def cached_llm():
"""Provides cached LLM client for tests"""
from openai import OpenAI
cache = ResponseCache(".test_llm_cache")
client = CachedLLMClient(OpenAI(), cache)
return client
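A test using this fixture might look like the sketch below: the first run calls the API, and later runs replay the response from `.test_llm_cache`. The prompt and assertions are illustrative only.

def test_summary_quality(cached_llm):
    result = cached_llm.complete(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Summarize: testing LLM apps"}],
        temperature=0  # keep the cached response representative
    )
    # Cheap structural checks; semantic checks are covered below
    assert len(result["content"]) > 0
    assert result["usage"]["completion_tokens"] > 0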
Semantic Assertions
from openai import OpenAI
from typing import List
client = OpenAI()
class SemanticAssertions:
"""Semantic comparison for LLM outputs"""
def __init__(self, threshold: float = 0.8):
self.threshold = threshold
def get_embedding(self, text: str) -> List[float]:
response = client.embeddings.create(
model="text-embedding-3-small",
input=text
)
return response.data[0].embedding
def cosine_similarity(self, a: List[float], b: List[float]) -> float:
import numpy as np
a = np.array(a)
b = np.array(b)
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
def assert_similar(self, actual: str, expected: str, threshold: float = None):
"""Assert texts are semantically similar"""
threshold = threshold or self.threshold
actual_emb = self.get_embedding(actual)
expected_emb = self.get_embedding(expected)
similarity = self.cosine_similarity(actual_emb, expected_emb)
assert similarity >= threshold, (
f"Semantic similarity {similarity:.3f} below threshold {threshold}\n"
f"Actual: {actual[:100]}...\n"
f"Expected: {expected[:100]}..."
)
def assert_contains_concept(self, text: str, concept: str, threshold: float = 0.7):
"""Assert text contains a semantic concept"""
text_emb = self.get_embedding(text)
concept_emb = self.get_embedding(concept)
similarity = self.cosine_similarity(text_emb, concept_emb)
assert similarity >= threshold, (
f"Concept '{concept}' not found in text (similarity: {similarity:.3f})"
)
# Usage
semantic = SemanticAssertions()
def test_response_semantics():
response = "Machine learning enables computers to learn from data"
expected = "ML allows systems to improve from experience"
semantic.assert_similar(response, expected, threshold=0.75)
semantic.assert_contains_concept(response, "artificial intelligence")
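If you want semantic checks in CI without embedding API calls, a local embedding model can stand in. This is a sketch assuming the optional sentence-transformers package is installed; all-MiniLM-L6-v2 is one common model choice, not a requirement.

from sentence_transformers import SentenceTransformer, util

class LocalSemanticAssertions:
    """Same idea as above, but embeddings are computed locally."""
    def __init__(self, threshold: float = 0.8):
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.threshold = threshold

    def assert_similar(self, actual: str, expected: str, threshold: float = None):
        threshold = threshold or self.threshold
        embeddings = self.model.encode([actual, expected])
        similarity = util.cos_sim(embeddings[0], embeddings[1]).item()
        assert similarity >= threshold, (
            f"Semantic similarity {similarity:.3f} below threshold {threshold}"
        )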
Schema Validation
import json
import pytest
from typing import List
from pydantic import BaseModel, ValidationError
from openai import OpenAI
client = OpenAI()
class ProductRecommendation(BaseModel):
product_name: str
price: float
reasoning: str
confidence: float
class RecommendationResponse(BaseModel):
recommendations: List[ProductRecommendation]
summary: str
def test_structured_output():
"""Test that LLM returns valid structured output"""
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{
"role": "system",
"content": "Return product recommendations as JSON"
},
{
"role": "user",
"content": "Recommend laptops under $1000"
}
],
response_format={"type": "json_object"}
)
content = response.choices[0].message.content
# Parse JSON
try:
data = json.loads(content)
except json.JSONDecodeError as e:
pytest.fail(f"Invalid JSON: {e}")
# Validate schema
try:
validated = RecommendationResponse.model_validate(data)
except ValidationError as e:
pytest.fail(f"Schema validation failed: {e}")
# Additional assertions
assert len(validated.recommendations) > 0
assert all(r.price < 1000 for r in validated.recommendations)
assert all(0 <= r.confidence <= 1 for r in validated.recommendations)
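The same schema checks also work against mocked responses, which keeps structural regression tests fast and free. A sketch reusing create_mock_response from the mocking section above; the JSON values are illustrative.

def test_structured_output_mocked():
    """Validate the parsing and validation logic without calling the API."""
    mocked_json = json.dumps({
        "recommendations": [
            {"product_name": "Laptop A", "price": 899.0,
             "reasoning": "Good value", "confidence": 0.9}
        ],
        "summary": "One laptop under $1000"
    })
    response = create_mock_response(mocked_json)
    data = json.loads(response.choices[0].message.content)
    validated = RecommendationResponse.model_validate(data)
    assert validated.recommendations[0].price < 1000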
Golden Dataset Testing
import json
from dataclasses import dataclass
from typing import List, Callable
from pathlib import Path
@dataclass
class TestCase:
id: str
input: str
expected_output: str
category: str = "general"
metadata: dict = None
@dataclass
class TestResult:
test_case: TestCase
actual_output: str
passed: bool
score: float
error: str = None
class GoldenDatasetTester:
"""Test against golden dataset"""
def __init__(
self,
dataset_path: str,
evaluator: Callable[[str, str], float]
):
self.dataset = self._load_dataset(dataset_path)
self.evaluator = evaluator
def _load_dataset(self, path: str) -> List[TestCase]:
with open(path) as f:
data = json.load(f)
return [TestCase(**tc) for tc in data["test_cases"]]
def run_test(
self,
generate_fn: Callable[[str], str],
threshold: float = 0.8
) -> dict:
"""Run all tests and return results"""
results = []
passed = 0
failed = 0
for tc in self.dataset:
try:
actual = generate_fn(tc.input)
score = self.evaluator(actual, tc.expected_output)
is_passed = score >= threshold
result = TestResult(
test_case=tc,
actual_output=actual,
passed=is_passed,
score=score
)
if is_passed:
passed += 1
else:
failed += 1
except Exception as e:
result = TestResult(
test_case=tc,
actual_output="",
passed=False,
score=0.0,
error=str(e)
)
failed += 1
results.append(result)
return {
"total": len(self.dataset),
"passed": passed,
"failed": failed,
"pass_rate": passed / len(self.dataset),
"results": results
}
# Example golden dataset (test_cases.json):
"""
{
"test_cases": [
{
"id": "greeting_1",
"input": "Say hello",
"expected_output": "Hello! How can I help you today?",
"category": "greeting"
},
{
"id": "math_1",
"input": "What is 2+2?",
"expected_output": "4",
"category": "math"
}
]
}
"""
# Usage
def similarity_evaluator(actual: str, expected: str) -> float:
# Use semantic similarity
semantic = SemanticAssertions()
actual_emb = semantic.get_embedding(actual)
expected_emb = semantic.get_embedding(expected)
return semantic.cosine_similarity(actual_emb, expected_emb)
def generate_response(prompt: str) -> str:
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}]
)
return response.choices[0].message.content
tester = GoldenDatasetTester(
"test_cases.json",
evaluator=similarity_evaluator
)
results = tester.run_test(generate_response, threshold=0.75)
print(f"Pass rate: {results['pass_rate']:.1%}")
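If you prefer one pytest test per golden case, so each failure is reported individually, the dataset can also be parametrized. A sketch assuming the same test_cases.json layout shown above:

import json
import pytest

with open("test_cases.json") as f:
    GOLDEN_CASES = json.load(f)["test_cases"]

@pytest.mark.parametrize("case", GOLDEN_CASES, ids=lambda c: c["id"])
@pytest.mark.llm  # so these can be excluded when running with mocks (see Test Configuration)
def test_golden_case(case):
    actual = generate_response(case["input"])
    score = similarity_evaluator(actual, case["expected_output"])
    assert score >= 0.75, f"{case['id']}: score {score:.2f} below 0.75"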
Integration Testing
import pytest
from unittest.mock import Mock, patch
from fastapi.testclient import TestClient
from httpx import ASGITransport, AsyncClient
from app import app  # the FastAPI application under test
# Test FastAPI LLM endpoints
def test_chat_endpoint(client: TestClient, llm_mocker):
mocker = llm_mocker(
responses={"hello": "Hi there!"},
default="I don't understand"
)
with patch("app.llm_client") as mock_client:
# Configure mock
mock_client.complete.side_effect = lambda **k: {
"content": mocker.get_response(k["messages"])
}
response = client.post("/chat", json={
"message": "hello"
})
assert response.status_code == 200
assert "Hi there" in response.json()["response"]
# Test streaming endpoint
@pytest.mark.asyncio
async def test_streaming_endpoint():
    async with AsyncClient(
        transport=ASGITransport(app=app),  # newer httpx versions require an explicit ASGITransport
        base_url="http://test"
    ) as client:
async with client.stream(
"POST",
"/chat/stream",
json={"message": "Hello"}
) as response:
chunks = []
async for line in response.aiter_lines():
if line.startswith("data: "):
chunks.append(line[6:])
assert len(chunks) > 0
# Test RAG pipeline
def test_rag_pipeline():
# Setup
documents = ["Doc 1 content", "Doc 2 content"]
# Mock embedding and completion
with patch("openai.OpenAI") as mock_openai:
mock_client = Mock()
mock_openai.return_value = mock_client
# Mock embeddings
mock_client.embeddings.create.return_value = Mock(
data=[Mock(embedding=[0.1] * 1536)]
)
# Mock completion
mock_client.chat.completions.create.return_value = create_mock_response(
"Based on the context, the answer is..."
)
# Run RAG pipeline
result = rag_pipeline.query(
question="What is in the documents?",
documents=documents
)
assert "context" in result.lower() or "answer" in result.lower()
Performance Testing
import time
from statistics import mean, stdev
from typing import Callable, List
from dataclasses import dataclass
@dataclass
class LatencyResult:
mean_ms: float
p50_ms: float
p95_ms: float
p99_ms: float
std_dev: float
class LatencyTester:
"""Test LLM response latency"""
def __init__(self, iterations: int = 10):
self.iterations = iterations
def measure(
self,
func: Callable,
*args,
**kwargs
) -> LatencyResult:
"""Measure function latency"""
latencies = []
for _ in range(self.iterations):
start = time.perf_counter()
func(*args, **kwargs)
end = time.perf_counter()
latencies.append((end - start) * 1000) # Convert to ms
latencies.sort()
return LatencyResult(
mean_ms=mean(latencies),
p50_ms=latencies[len(latencies) // 2],
p95_ms=latencies[int(len(latencies) * 0.95)],
p99_ms=latencies[int(len(latencies) * 0.99)],
std_dev=stdev(latencies) if len(latencies) > 1 else 0
)
def test_latency_requirements():
"""Test that latency meets requirements"""
tester = LatencyTester(iterations=10)
def make_request():
client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": "Hi"}],
max_tokens=50
)
result = tester.measure(make_request)
# Assert latency requirements
assert result.p95_ms < 2000, f"P95 latency {result.p95_ms}ms exceeds 2s"
assert result.mean_ms < 1000, f"Mean latency {result.mean_ms}ms exceeds 1s"
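Sequential timing hides queueing effects; to observe latency under parallel load, an asyncio-based variant can be used. This is a sketch assuming the async OpenAI client; the request count and prompt are illustrative.

import asyncio
import time
from openai import AsyncOpenAI

async def measure_concurrent_latency(n_requests: int = 5) -> list[float]:
    """Fire n requests at once and return per-request latency in ms."""
    client = AsyncOpenAI()

    async def one_request() -> float:
        start = time.perf_counter()
        await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Hi"}],
            max_tokens=50
        )
        return (time.perf_counter() - start) * 1000

    return await asyncio.gather(*[one_request() for _ in range(n_requests)])

# Feed the results into the same percentile math as LatencyTester:
# latencies = asyncio.run(measure_concurrent_latency())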
Test Configuration
# conftest.py
import pytest
import os
@pytest.fixture(scope="session")
def llm_client():
"""Provide LLM client for tests"""
if os.getenv("USE_MOCK_LLM", "true").lower() == "true":
        return MockLLMClient()  # your project's mock client, e.g. wrapping LLMMocker above
else:
from openai import OpenAI
return OpenAI()
@pytest.fixture
def small_model():
"""Use smaller model for faster tests"""
return "gpt-4o-mini"
@pytest.fixture
def deterministic_settings():
"""Settings for reproducible outputs"""
return {
"temperature": 0,
"seed": 42
}
# pytest.ini
"""
[pytest]
markers =
slow: marks tests as slow
integration: marks tests as integration tests
llm: marks tests that call real LLM APIs
env =
USE_MOCK_LLM=true
OPENAI_API_KEY=test-key
"""
# Run only fast tests
# pytest -m "not slow"
# Run integration tests
# pytest -m integration
# Run with real LLM
# USE_MOCK_LLM=false pytest -m llm
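Note that the env section above assumes the pytest-env plugin is installed. Tests that genuinely need the real API can also skip themselves automatically when mocking is enabled; a minimal sketch building on the fixtures above:

import os
import pytest

requires_real_llm = pytest.mark.skipif(
    os.getenv("USE_MOCK_LLM", "true").lower() == "true",
    reason="Real LLM API disabled (set USE_MOCK_LLM=false to enable)"
)

@requires_real_llm
@pytest.mark.llm
def test_real_model_answer(llm_client, deterministic_settings):
    response = llm_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "What is 2+2?"}],
        **deterministic_settings
    )
    assert "4" in response.choices[0].message.content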
Test Strategy Summary
| Test Type | Speed | Cost | Coverage |
|---|---|---|---|
| Unit (mocked) | Fast | Free | Logic |
| Cached | Fast | One-time | Regression |
| Integration | Medium | Low | Endpoints |
| Golden Dataset | Slow | Medium | Quality |
| Semantic | Slow | Medium | Meaning |
What's Next
Production Logging
Learn structured logging and debugging for LLM applications