December 2025 Update: Production strategies for managing prompts, including version control, A/B testing, and prompt observability.
The Prompt Management Problem
Prompts are critical business logic, yet they are often managed poorly:
Common Anti-Patterns        Best Practices
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Hardcoded strings           Versioned prompt registry
No change tracking          Git-like version control
No testing                  Automated evaluation
"It worked yesterday"       Rollback capability
One-size-fits-all           A/B testing
Prompt Registry Pattern
Basic Prompt Registry
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, Dict
import json
import hashlib
import re
@dataclass
class PromptVersion:
version: str
template: str
variables: list[str]
created_at: datetime
created_by: str
description: str = ""
metadata: dict = field(default_factory=dict)
@property
def hash(self) -> str:
return hashlib.sha256(self.template.encode()).hexdigest()[:12]
@dataclass
class Prompt:
name: str
versions: list[PromptVersion] = field(default_factory=list)
active_version: str = "latest"
    def get_version(self, version: Optional[str] = None) -> PromptVersion:
version = version or self.active_version
if version == "latest":
return self.versions[-1]
for v in self.versions:
if v.version == version:
return v
raise ValueError(f"Version {version} not found")
    def render(self, version: Optional[str] = None, **kwargs) -> str:
prompt_version = self.get_version(version)
return prompt_version.template.format(**kwargs)
class PromptRegistry:
"""Central registry for all prompts"""
def __init__(self):
self.prompts: Dict[str, Prompt] = {}
def register(
self,
name: str,
template: str,
version: str,
created_by: str,
description: str = "",
        variables: Optional[list[str]] = None,
        metadata: Optional[dict] = None
) -> PromptVersion:
"""Register a new prompt version"""
        # Extract variables from the template if none were provided,
        # de-duplicating while preserving order
        if variables is None:
            variables = list(dict.fromkeys(re.findall(r'\{(\w+)\}', template)))
prompt_version = PromptVersion(
version=version,
template=template,
variables=variables,
created_at=datetime.now(),
created_by=created_by,
description=description,
metadata=metadata or {}
)
if name not in self.prompts:
self.prompts[name] = Prompt(name=name)
self.prompts[name].versions.append(prompt_version)
return prompt_version
    def get(self, name: str) -> Prompt:
        """Get a prompt by name"""
        if name not in self.prompts:
            raise ValueError(f"Prompt '{name}' not found")
        return self.prompts[name]
def render(
self,
name: str,
        version: Optional[str] = None,
**kwargs
) -> str:
"""Render a prompt with variables"""
prompt = self.get(name)
return prompt.render(version, **kwargs)
def set_active_version(self, name: str, version: str):
"""Set the active version for a prompt"""
prompt = self.get(name)
prompt.get_version(version) # Validate version exists
prompt.active_version = version
def list_prompts(self) -> list[str]:
return list(self.prompts.keys())
def list_versions(self, name: str) -> list[str]:
prompt = self.get(name)
return [v.version for v in prompt.versions]
# Usage
registry = PromptRegistry()
# Register prompts
registry.register(
name="customer_support",
version="1.0.0",
created_by="[email protected]",
template="""You are a helpful customer support agent for {company_name}.
Customer query: {query}
Please provide a helpful response following these guidelines:
- Be friendly and professional
- If you don't know, say so
- Offer to escalate if needed""",
description="Initial customer support prompt"
)
registry.register(
name="customer_support",
version="1.1.0",
created_by="[email protected]",
template="""You are an expert customer support agent for {company_name}.
## Customer Information
- Query: {query}
- Account Status: {account_status}
## Response Guidelines
1. Address the customer by acknowledging their concern
2. Provide accurate, helpful information
3. Offer next steps or alternatives
4. End with a friendly closing
Keep response under 150 words.""",
description="Added account status and improved structure"
)
# Use prompts
prompt = registry.render(
"customer_support",
version="1.1.0",
company_name="TechCorp",
query="My order hasn't arrived",
account_status="Premium"
)
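The registry above is purely in-memory, so every version disappears on restart. As a minimal persistence sketch (the export_registry helper below is an assumption, not part of the API above), the whole registry can be dumped to JSON for backup and audit:
def export_registry(registry: PromptRegistry, path: str = "registry.json") -> None:
    """Serialize all registered prompt versions to JSON (hypothetical helper)."""
    snapshot = {
        name: [
            {
                "version": v.version,
                "template": v.template,
                "variables": v.variables,
                "created_at": v.created_at.isoformat(),
                "created_by": v.created_by,
                "description": v.description,
                "hash": v.hash,  # content hash makes silent template edits detectable
            }
            for v in prompt.versions
        ]
        for name, prompt in registry.prompts.items()
    }
    with open(path, "w") as f:
        json.dump(snapshot, f, indent=2)

export_registry(registry)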
File-Based Prompt Management
Store prompts as files so they live under ordinary version control (git):
prompts/
├── customer_support/
│ ├── prompt.yaml
│ ├── v1.0.0.txt
│ ├── v1.1.0.txt
│ └── tests/
│ └── test_cases.yaml
├── code_review/
│ ├── prompt.yaml
│ └── v1.0.0.txt
└── registry.yaml
import yaml
from pathlib import Path
from typing import Optional
class FilePromptRegistry:
"""Load prompts from file system"""
def __init__(self, prompts_dir: str = "prompts"):
self.prompts_dir = Path(prompts_dir)
self.prompts = {}
self._load_all()
def _load_all(self):
"""Load all prompts from directory"""
for prompt_dir in self.prompts_dir.iterdir():
if prompt_dir.is_dir():
self._load_prompt(prompt_dir)
def _load_prompt(self, prompt_dir: Path):
"""Load a single prompt"""
config_path = prompt_dir / "prompt.yaml"
if not config_path.exists():
return
with open(config_path) as f:
config = yaml.safe_load(f)
name = config["name"]
self.prompts[name] = {
"config": config,
"versions": {}
}
# Load all versions
for version_file in prompt_dir.glob("v*.txt"):
version = version_file.stem # v1.0.0
with open(version_file) as f:
template = f.read()
self.prompts[name]["versions"][version] = template
    def get(
        self,
        name: str,
        version: Optional[str] = None
    ) -> str:
"""Get a prompt template"""
if name not in self.prompts:
raise ValueError(f"Prompt '{name}' not found")
prompt_data = self.prompts[name]
if version is None:
version = prompt_data["config"].get("active_version", "latest")
        if version == "latest":
            # Sort versions numerically ("v1.10.0" > "v1.9.0"), not lexicographically
            versions = sorted(
                prompt_data["versions"],
                key=lambda v: [int(p) for p in v.lstrip("v").split(".")]
            )
            version = versions[-1]
return prompt_data["versions"][version]
def render(
self,
name: str,
        version: Optional[str] = None,
**kwargs
) -> str:
"""Render a prompt with variables"""
template = self.get(name, version)
return template.format(**kwargs)
def reload(self):
"""Reload all prompts from disk"""
self.prompts = {}
self._load_all()
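# Usage sketch (assumes the prompts/ directory tree shown above exists on disk)
file_registry = FilePromptRegistry("prompts")
print(file_registry.render(
    "customer_support",
    company_name="TechCorp",
    query="Where is my order?",
    account_status="Premium"
))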
# prompts/customer_support/prompt.yaml
"""
name: customer_support
description: Customer support system prompt
active_version: v1.1.0
variables:
- company_name
- query
- account_status
tags:
- support
- production
"""
A/B Testing Prompts
Experiment Framework
from dataclasses import dataclass
from typing import Dict, Optional
from datetime import datetime
import hashlib
import json
@dataclass
class Variant:
name: str
prompt_version: str
weight: float = 1.0
@dataclass
class Experiment:
name: str
prompt_name: str
variants: list[Variant]
start_date: datetime
end_date: Optional[datetime] = None
def get_variant(self, user_id: str) -> Variant:
"""Deterministically assign user to variant"""
# Hash user_id for consistent assignment
hash_input = f"{self.name}:{user_id}"
hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
# Normalize to 0-1
normalized = (hash_value % 10000) / 10000
# Assign to variant based on weights
total_weight = sum(v.weight for v in self.variants)
cumulative = 0
for variant in self.variants:
cumulative += variant.weight / total_weight
if normalized < cumulative:
return variant
return self.variants[-1]
class ExperimentManager:
"""Manage A/B testing for prompts"""
def __init__(self, registry: PromptRegistry):
self.registry = registry
self.experiments: Dict[str, Experiment] = {}
self.results: Dict[str, list] = {}
def create_experiment(
self,
name: str,
prompt_name: str,
variants: list[dict]
) -> Experiment:
"""Create a new experiment"""
experiment = Experiment(
name=name,
prompt_name=prompt_name,
variants=[Variant(**v) for v in variants],
start_date=datetime.now()
)
self.experiments[name] = experiment
self.results[name] = []
return experiment
def get_prompt(
self,
experiment_name: str,
user_id: str,
**kwargs
) -> tuple[str, str]:
"""Get prompt for user in experiment"""
experiment = self.experiments[experiment_name]
variant = experiment.get_variant(user_id)
prompt = self.registry.render(
experiment.prompt_name,
version=variant.prompt_version,
**kwargs
)
return prompt, variant.name
def record_result(
self,
experiment_name: str,
user_id: str,
variant_name: str,
metric: str,
value: float
):
"""Record experiment result"""
self.results[experiment_name].append({
"user_id": user_id,
"variant": variant_name,
"metric": metric,
"value": value,
"timestamp": datetime.now()
})
def get_results(self, experiment_name: str) -> dict:
"""Get experiment results by variant"""
results = self.results[experiment_name]
# Group by variant
by_variant = {}
for r in results:
variant = r["variant"]
if variant not in by_variant:
by_variant[variant] = []
by_variant[variant].append(r)
# Calculate stats
stats = {}
for variant, data in by_variant.items():
values = [d["value"] for d in data]
stats[variant] = {
"count": len(values),
"mean": sum(values) / len(values) if values else 0,
"min": min(values) if values else 0,
"max": max(values) if values else 0
}
return stats
# Usage
manager = ExperimentManager(registry)
# Create experiment
manager.create_experiment(
name="support_prompt_v2",
prompt_name="customer_support",
variants=[
{"name": "control", "prompt_version": "1.0.0", "weight": 0.5},
{"name": "treatment", "prompt_version": "1.1.0", "weight": 0.5}
]
)
# Use in production
user_id = "user_123"
prompt, variant = manager.get_prompt(
"support_prompt_v2",
user_id,
company_name="TechCorp",
query="Where is my order?",
account_status="Premium"
)
# After getting feedback
manager.record_result(
"support_prompt_v2",
user_id,
variant,
metric="satisfaction",
value=4.5
)
# Analyze results
results = manager.get_results("support_prompt_v2")
print(json.dumps(results, indent=2))
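The count/mean/min/max summary says which variant looks better, not whether the gap is statistically meaningful. A minimal significance check over the raw recorded results (a sketch of Welch's t statistic using only the stdlib; reading manager.results directly is an assumption about the class above):
import math
from statistics import mean, variance

def welch_t(manager: ExperimentManager, experiment: str, a: str, b: str) -> float:
    """Welch's t statistic between two variants (requires >= 2 samples per variant)."""
    by_variant: Dict[str, list] = {}
    for r in manager.results[experiment]:
        by_variant.setdefault(r["variant"], []).append(r["value"])
    xa, xb = by_variant[a], by_variant[b]
    se = math.sqrt(variance(xa) / len(xa) + variance(xb) / len(xb))
    return (mean(xa) - mean(xb)) / se

# Rough rule of thumb: |t| > 2 with decent sample sizes suggests a real difference
print(welch_t(manager, "support_prompt_v2", "control", "treatment"))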
Prompt Testing Framework
from dataclasses import dataclass
from typing import Callable, Optional
@dataclass
class PromptTestCase:
name: str
inputs: dict
    expected: dict  # Expected output properties (informational; not checked automatically)
    validators: Optional[list[Callable[[str], bool]]] = None
class PromptTester:
"""Test prompts against expected behaviors"""
def __init__(self, registry: PromptRegistry, llm_client):
self.registry = registry
self.client = llm_client
def run_test(
self,
prompt_name: str,
version: str,
test_case: PromptTestCase
) -> dict:
"""Run a single test case"""
# Render prompt
prompt = self.registry.render(
prompt_name,
version=version,
**test_case.inputs
)
# Call LLM
response = self.client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}],
            temperature=0  # Greedy decoding keeps outputs (mostly) repeatable
)
output = response.choices[0].message.content
# Run validators
results = {
"passed": True,
"output": output,
"validations": []
}
for validator in (test_case.validators or []):
try:
passed = validator(output)
results["validations"].append({
"validator": validator.__name__,
"passed": passed
})
if not passed:
results["passed"] = False
except Exception as e:
results["validations"].append({
"validator": validator.__name__,
"passed": False,
"error": str(e)
})
results["passed"] = False
return results
def run_suite(
self,
prompt_name: str,
version: str,
test_cases: list[PromptTestCase]
) -> dict:
"""Run a full test suite"""
results = {
"prompt": prompt_name,
"version": version,
"total": len(test_cases),
"passed": 0,
"failed": 0,
"tests": []
}
for test_case in test_cases:
result = self.run_test(prompt_name, version, test_case)
result["name"] = test_case.name
results["tests"].append(result)
if result["passed"]:
results["passed"] += 1
else:
results["failed"] += 1
return results
# Define validators
def contains_greeting(output: str) -> bool:
"""Check if output contains a greeting"""
greetings = ["hello", "hi", "hey", "greetings"]
return any(g in output.lower() for g in greetings)
def under_word_limit(limit: int) -> Callable[[str], bool]:
"""Check if output is under word limit"""
def validator(output: str) -> bool:
return len(output.split()) <= limit
validator.__name__ = f"under_{limit}_words"
return validator
def is_professional(output: str) -> bool:
"""Check for professional language"""
unprofessional = ["lol", "omg", "wtf", "idk"]
return not any(word in output.lower() for word in unprofessional)
# Create test cases
test_cases = [
PromptTestCase(
name="basic_query",
inputs={
"company_name": "TechCorp",
"query": "What are your business hours?",
"account_status": "Basic"
},
expected={"contains_hours": True},
validators=[contains_greeting, is_professional, under_word_limit(150)]
),
PromptTestCase(
name="complaint_handling",
inputs={
"company_name": "TechCorp",
"query": "I'm very unhappy with your service!",
"account_status": "Premium"
},
expected={"contains_apology": True},
validators=[is_professional, under_word_limit(200)]
)
]
# Run tests
tester = PromptTester(registry, client)
results = tester.run_suite("customer_support", "1.1.0", test_cases)
print(f"Passed: {results['passed']}/{results['total']}")
Prompt Lifecycle Management
from enum import Enum
from datetime import datetime
from typing import Dict
class PromptStatus(str, Enum):
DRAFT = "draft"
TESTING = "testing"
CANARY = "canary"
PRODUCTION = "production"
DEPRECATED = "deprecated"
ARCHIVED = "archived"
class PromptLifecycleManager:
"""Manage prompt lifecycle stages"""
ALLOWED_TRANSITIONS = {
PromptStatus.DRAFT: [PromptStatus.TESTING, PromptStatus.ARCHIVED],
PromptStatus.TESTING: [PromptStatus.DRAFT, PromptStatus.CANARY, PromptStatus.ARCHIVED],
PromptStatus.CANARY: [PromptStatus.TESTING, PromptStatus.PRODUCTION],
PromptStatus.PRODUCTION: [PromptStatus.DEPRECATED],
PromptStatus.DEPRECATED: [PromptStatus.ARCHIVED, PromptStatus.PRODUCTION],
PromptStatus.ARCHIVED: []
}
def __init__(self):
        self.statuses: Dict[str, dict] = {}
self.history: list = []
def set_status(
self,
prompt_name: str,
version: str,
status: PromptStatus,
changed_by: str,
reason: str = ""
):
"""Update prompt status"""
key = f"{prompt_name}:{version}"
current = self.statuses.get(key, {}).get("status", PromptStatus.DRAFT)
# Validate transition
if status not in self.ALLOWED_TRANSITIONS[current]:
raise ValueError(
f"Cannot transition from {current} to {status}"
)
self.statuses[key] = {
"status": status,
"updated_at": datetime.now(),
"updated_by": changed_by
}
self.history.append({
"prompt": prompt_name,
"version": version,
"from_status": current,
"to_status": status,
"changed_by": changed_by,
"reason": reason,
"timestamp": datetime.now()
})
def get_status(self, prompt_name: str, version: str) -> PromptStatus:
key = f"{prompt_name}:{version}"
return self.statuses.get(key, {}).get("status", PromptStatus.DRAFT)
def promote_to_canary(
self,
prompt_name: str,
version: str,
canary_percentage: float = 0.05,
changed_by: str = ""
):
"""Promote to canary with traffic percentage"""
self.set_status(
prompt_name,
version,
PromptStatus.CANARY,
changed_by,
f"Canary at {canary_percentage*100}%"
)
# Return routing config
return {
"prompt": prompt_name,
"version": version,
"percentage": canary_percentage
}
def rollback(
self,
prompt_name: str,
to_version: str,
changed_by: str,
reason: str
):
"""Rollback to a previous version"""
# Deprecate current production
        for key, data in list(self.statuses.items()):  # copy: set_status mutates statuses
if key.startswith(f"{prompt_name}:"):
if data["status"] == PromptStatus.PRODUCTION:
current_version = key.split(":")[1]
self.set_status(
prompt_name,
current_version,
PromptStatus.DEPRECATED,
changed_by,
f"Rollback: {reason}"
)
# Promote target version
self.set_status(
prompt_name,
to_version,
PromptStatus.PRODUCTION,
changed_by,
f"Rollback from previous version"
)
# Usage
lifecycle = PromptLifecycleManager()
# Establish v1.1.0 as the current production version
lifecycle.set_status("customer_support", "1.1.0", PromptStatus.TESTING, "[email protected]")
lifecycle.set_status("customer_support", "1.1.0", PromptStatus.CANARY, "[email protected]")
lifecycle.set_status("customer_support", "1.1.0", PromptStatus.PRODUCTION, "[email protected]")
# Development flow for the next version (unseen versions default to DRAFT,
# and DRAFT -> DRAFT is not an allowed transition, so go straight to TESTING)
lifecycle.set_status("customer_support", "1.2.0", PromptStatus.TESTING, "[email protected]")
# After tests pass
canary_config = lifecycle.promote_to_canary(
"customer_support", "1.2.0",
canary_percentage=0.05,
changed_by="[email protected]"
)
# If issues detected
lifecycle.rollback(
"customer_support",
to_version="1.1.0",
changed_by="[email protected]",
reason="Increased error rate"
)
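Back at the canary step, promote_to_canary returned a routing config, but nothing above consumes it. One way to act on it, a sketch reusing the deterministic hashing idea from the experiment framework (choose_version is a hypothetical helper):
import hashlib

def choose_version(user_id: str, stable_version: str, canary_cfg: dict) -> str:
    """Send a deterministic slice of users to the canary version (sketch)."""
    key = f"{canary_cfg['prompt']}:{user_id}"
    bucket = int(hashlib.md5(key.encode()).hexdigest(), 16) % 10000
    if bucket / 10000 < canary_cfg["percentage"]:
        return canary_cfg["version"]
    return stable_version

version = choose_version("user_123", "1.1.0", canary_config)
# Assumes v1.2.0 has also been registered in the PromptRegistry
prompt = registry.render("customer_support", version=version,
                         company_name="TechCorp", query="Where is my order?",
                         account_status="Premium")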
Key Takeaways
- Version everything: treat prompts like code, with proper version control.
- Test before deploy: automated evaluation catches issues before they reach production.
- A/B test changes: measure impact with controlled experiments.
- Enable rollbacks: quick rollback capability is essential in production.
What’s Next
LLM Orchestration: learn to orchestrate multiple LLM providers with unified APIs.