December 2025 Update: Production strategies for managing prompts, including version control, A/B testing, and prompt observability.

The Prompt Management Problem

Prompts are critical business logic, yet they are often managed poorly:
Common Anti-Patterns              Best Practices
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Hardcoded strings                 Versioned prompt registry
No change tracking                Git-like version control
No testing                        Automated evaluation
"It worked yesterday"             Rollback capability
One-size-fits-all                 A/B testing

Prompt Registry Pattern

Basic Prompt Registry

from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, Dict, Any
import json
import hashlib

@dataclass
class PromptVersion:
    version: str
    template: str
    variables: list[str]
    created_at: datetime
    created_by: str
    description: str = ""
    metadata: dict = field(default_factory=dict)
    
    @property
    def hash(self) -> str:
        return hashlib.sha256(self.template.encode()).hexdigest()[:12]

@dataclass
class Prompt:
    name: str
    versions: list[PromptVersion] = field(default_factory=list)
    active_version: str = "latest"
    
    def get_version(self, version: str = None) -> PromptVersion:
        version = version or self.active_version
        
        if version == "latest":
            return self.versions[-1]
        
        for v in self.versions:
            if v.version == version:
                return v
        
        raise ValueError(f"Version {version} not found")
    
    def render(self, version: str = None, **kwargs) -> str:
        prompt_version = self.get_version(version)
        return prompt_version.template.format(**kwargs)

class PromptRegistry:
    """Central registry for all prompts"""
    
    def __init__(self):
        self.prompts: Dict[str, Prompt] = {}
    
    def register(
        self,
        name: str,
        template: str,
        version: str,
        created_by: str,
        description: str = "",
        variables: list[str] = None,
        metadata: dict = None
    ) -> PromptVersion:
        """Register a new prompt version"""
        
        # Extract variables from template
        if variables is None:
            import re
            variables = re.findall(r'\{(\w+)\}', template)
        
        prompt_version = PromptVersion(
            version=version,
            template=template,
            variables=variables,
            created_at=datetime.now(),
            created_by=created_by,
            description=description,
            metadata=metadata or {}
        )
        
        if name not in self.prompts:
            self.prompts[name] = Prompt(name=name)
        
        self.prompts[name].versions.append(prompt_version)
        
        return prompt_version
    
    def get(self, name: str) -> Prompt:
        """Get a prompt by name"""
        if name not in self.prompts:
            raise ValueError(f"Prompt '{name}' not found")
        return self.prompts[name]
    
    def render(
        self,
        name: str,
        version: str = None,
        **kwargs
    ) -> str:
        """Render a prompt with variables"""
        prompt = self.get(name)
        return prompt.render(version, **kwargs)
    
    def set_active_version(self, name: str, version: str):
        """Set the active version for a prompt"""
        prompt = self.get(name)
        prompt.get_version(version)  # Validate version exists
        prompt.active_version = version
    
    def list_prompts(self) -> list[str]:
        return list(self.prompts.keys())
    
    def list_versions(self, name: str) -> list[str]:
        prompt = self.get(name)
        return [v.version for v in prompt.versions]

# Usage
registry = PromptRegistry()

# Register prompts
registry.register(
    name="customer_support",
    version="1.0.0",
    created_by="[email protected]",
    template="""You are a helpful customer support agent for {company_name}.

Customer query: {query}

Please provide a helpful response following these guidelines:
- Be friendly and professional
- If you don't know, say so
- Offer to escalate if needed""",
    description="Initial customer support prompt"
)

registry.register(
    name="customer_support",
    version="1.1.0",
    created_by="[email protected]",
    template="""You are an expert customer support agent for {company_name}.

## Customer Information
- Query: {query}
- Account Status: {account_status}

## Response Guidelines
1. Address the customer by acknowledging their concern
2. Provide accurate, helpful information
3. Offer next steps or alternatives
4. End with a friendly closing

Keep response under 150 words.""",
    description="Added account status and improved structure"
)

# Use prompts
prompt = registry.render(
    "customer_support",
    version="1.1.0",
    company_name="TechCorp",
    query="My order hasn't arrived",
    account_status="Premium"
)
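
Once a version has been vetted, you can pin it as the active version so call sites no longer need to pass a version explicitly. Continuing the usage above:

# Pin production traffic to the vetted version
registry.set_active_version("customer_support", "1.1.0")

# Callers can now omit the version argument
default_prompt = registry.render(
    "customer_support",
    company_name="TechCorp",
    query="My order hasn't arrived",
    account_status="Premium"
)

# Inspect what is registered
print(registry.list_prompts())                     # ['customer_support']
print(registry.list_versions("customer_support"))  # ['1.0.0', '1.1.0']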

File-Based Prompt Management

Store prompts as files for version control:
prompts/
├── customer_support/
│   ├── prompt.yaml
│   ├── v1.0.0.txt
│   ├── v1.1.0.txt
│   └── tests/
│       └── test_cases.yaml
├── code_review/
│   ├── prompt.yaml
│   └── v1.0.0.txt
└── registry.yaml

import yaml
from pathlib import Path
from typing import Optional

class FilePromptRegistry:
    """Load prompts from file system"""
    
    def __init__(self, prompts_dir: str = "prompts"):
        self.prompts_dir = Path(prompts_dir)
        self.prompts = {}
        self._load_all()
    
    def _load_all(self):
        """Load all prompts from directory"""
        for prompt_dir in self.prompts_dir.iterdir():
            if prompt_dir.is_dir():
                self._load_prompt(prompt_dir)
    
    def _load_prompt(self, prompt_dir: Path):
        """Load a single prompt"""
        config_path = prompt_dir / "prompt.yaml"
        
        if not config_path.exists():
            return
        
        with open(config_path) as f:
            config = yaml.safe_load(f)
        
        name = config["name"]
        self.prompts[name] = {
            "config": config,
            "versions": {}
        }
        
        # Load all versions
        for version_file in prompt_dir.glob("v*.txt"):
            version = version_file.stem  # v1.0.0
            with open(version_file) as f:
                template = f.read()
            
            self.prompts[name]["versions"][version] = template
    
    def get(
        self,
        name: str,
        version: str = None
    ) -> str:
        """Get a prompt template"""
        if name not in self.prompts:
            raise ValueError(f"Prompt '{name}' not found")
        
        prompt_data = self.prompts[name]
        
        if version is None:
            version = prompt_data["config"].get("active_version", "latest")
        
        if version == "latest":
            # Lexicographic sort; use a semver-aware key if versions like
            # v1.10.0 must sort after v1.9.0
            versions = sorted(prompt_data["versions"].keys())
            version = versions[-1]
        
        return prompt_data["versions"][version]
    
    def render(
        self,
        name: str,
        version: str = None,
        **kwargs
    ) -> str:
        """Render a prompt with variables"""
        template = self.get(name, version)
        return template.format(**kwargs)
    
    def reload(self):
        """Reload all prompts from disk"""
        self.prompts = {}
        self._load_all()

# prompts/customer_support/prompt.yaml
"""
name: customer_support
description: Customer support system prompt
active_version: v1.1.0
variables:
  - company_name
  - query
  - account_status
tags:
  - support
  - production
"""

A/B Testing Prompts

Experiment Framework

from dataclasses import dataclass
from typing import Dict, Optional
import hashlib
from datetime import datetime

@dataclass
class Variant:
    name: str
    prompt_version: str
    weight: float = 1.0

@dataclass
class Experiment:
    name: str
    prompt_name: str
    variants: list[Variant]
    start_date: datetime
    end_date: Optional[datetime] = None
    
    def get_variant(self, user_id: str) -> Variant:
        """Deterministically assign user to variant"""
        # Hash user_id for consistent assignment
        hash_input = f"{self.name}:{user_id}"
        hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
        
        # Normalize to 0-1
        normalized = (hash_value % 10000) / 10000
        
        # Assign to variant based on weights
        total_weight = sum(v.weight for v in self.variants)
        cumulative = 0
        
        for variant in self.variants:
            cumulative += variant.weight / total_weight
            if normalized < cumulative:
                return variant
        
        return self.variants[-1]

class ExperimentManager:
    """Manage A/B testing for prompts"""
    
    def __init__(self, registry: PromptRegistry):
        self.registry = registry
        self.experiments: Dict[str, Experiment] = {}
        self.results: Dict[str, list] = {}
    
    def create_experiment(
        self,
        name: str,
        prompt_name: str,
        variants: list[dict]
    ) -> Experiment:
        """Create a new experiment"""
        experiment = Experiment(
            name=name,
            prompt_name=prompt_name,
            variants=[Variant(**v) for v in variants],
            start_date=datetime.now()
        )
        
        self.experiments[name] = experiment
        self.results[name] = []
        
        return experiment
    
    def get_prompt(
        self,
        experiment_name: str,
        user_id: str,
        **kwargs
    ) -> tuple[str, str]:
        """Get prompt for user in experiment"""
        experiment = self.experiments[experiment_name]
        variant = experiment.get_variant(user_id)
        
        prompt = self.registry.render(
            experiment.prompt_name,
            version=variant.prompt_version,
            **kwargs
        )
        
        return prompt, variant.name
    
    def record_result(
        self,
        experiment_name: str,
        user_id: str,
        variant_name: str,
        metric: str,
        value: float
    ):
        """Record experiment result"""
        self.results[experiment_name].append({
            "user_id": user_id,
            "variant": variant_name,
            "metric": metric,
            "value": value,
            "timestamp": datetime.now()
        })
    
    def get_results(self, experiment_name: str) -> dict:
        """Get experiment results by variant"""
        results = self.results[experiment_name]
        
        # Group by variant
        by_variant = {}
        for r in results:
            variant = r["variant"]
            if variant not in by_variant:
                by_variant[variant] = []
            by_variant[variant].append(r)
        
        # Calculate stats
        stats = {}
        for variant, data in by_variant.items():
            values = [d["value"] for d in data]
            stats[variant] = {
                "count": len(values),
                "mean": sum(values) / len(values) if values else 0,
                "min": min(values) if values else 0,
                "max": max(values) if values else 0
            }
        
        return stats

# Usage
manager = ExperimentManager(registry)

# Create experiment
manager.create_experiment(
    name="support_prompt_v2",
    prompt_name="customer_support",
    variants=[
        {"name": "control", "prompt_version": "1.0.0", "weight": 0.5},
        {"name": "treatment", "prompt_version": "1.1.0", "weight": 0.5}
    ]
)

# Use in production
user_id = "user_123"
prompt, variant = manager.get_prompt(
    "support_prompt_v2",
    user_id,
    company_name="TechCorp",
    query="Where is my order?",
    account_status="Premium"
)

# After getting feedback
manager.record_result(
    "support_prompt_v2",
    user_id,
    variant,
    metric="satisfaction",
    value=4.5
)

# Analyze results
results = manager.get_results("support_prompt_v2")
print(json.dumps(results, indent=2))
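
Raw means alone do not establish that one variant actually beats another. Below is a minimal significance-check sketch over the recorded values, assuming SciPy is installed; the compare_variants helper is illustrative and not part of ExperimentManager:

from scipy import stats

def compare_variants(
    manager: ExperimentManager,
    experiment_name: str,
    control: str,
    treatment: str
) -> dict:
    """Welch's t-test on the raw metric values of two variants"""
    records = manager.results[experiment_name]
    a = [r["value"] for r in records if r["variant"] == control]
    b = [r["value"] for r in records if r["variant"] == treatment]
    result = stats.ttest_ind(a, b, equal_var=False)
    return {
        "control_mean": sum(a) / len(a),
        "treatment_mean": sum(b) / len(b),
        "p_value": float(result.pvalue),
    }

# Meaningful only once both variants have collected enough samples:
# compare_variants(manager, "support_prompt_v2", "control", "treatment")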

Prompt Testing Framework

from dataclasses import dataclass
from typing import Callable, Any
import json

@dataclass
class PromptTestCase:
    name: str
    inputs: dict
    expected: dict  # Expected output properties (informational; not asserted by run_test)
    validators: list[Callable[[str], bool]] = None

class PromptTester:
    """Test prompts against expected behaviors"""
    
    def __init__(self, registry: PromptRegistry, llm_client):
        self.registry = registry
        self.client = llm_client
    
    def run_test(
        self,
        prompt_name: str,
        version: str,
        test_case: PromptTestCase
    ) -> dict:
        """Run a single test case"""
        
        # Render prompt
        prompt = self.registry.render(
            prompt_name,
            version=version,
            **test_case.inputs
        )
        
        # Call LLM
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0  # Minimize randomness (not fully deterministic)
        )
        
        output = response.choices[0].message.content
        
        # Run validators
        results = {
            "passed": True,
            "output": output,
            "validations": []
        }
        
        for validator in (test_case.validators or []):
            try:
                passed = validator(output)
                results["validations"].append({
                    "validator": validator.__name__,
                    "passed": passed
                })
                if not passed:
                    results["passed"] = False
            except Exception as e:
                results["validations"].append({
                    "validator": validator.__name__,
                    "passed": False,
                    "error": str(e)
                })
                results["passed"] = False
        
        return results
    
    def run_suite(
        self,
        prompt_name: str,
        version: str,
        test_cases: list[PromptTestCase]
    ) -> dict:
        """Run a full test suite"""
        results = {
            "prompt": prompt_name,
            "version": version,
            "total": len(test_cases),
            "passed": 0,
            "failed": 0,
            "tests": []
        }
        
        for test_case in test_cases:
            result = self.run_test(prompt_name, version, test_case)
            result["name"] = test_case.name
            results["tests"].append(result)
            
            if result["passed"]:
                results["passed"] += 1
            else:
                results["failed"] += 1
        
        return results

# Define validators
def contains_greeting(output: str) -> bool:
    """Check if output contains a greeting (whole-word match)"""
    greetings = {"hello", "hi", "hey", "greetings"}
    words = {word.strip(".,!?") for word in output.lower().split()}
    return bool(greetings & words)

def under_word_limit(limit: int) -> Callable[[str], bool]:
    """Check if output is under word limit"""
    def validator(output: str) -> bool:
        return len(output.split()) <= limit
    validator.__name__ = f"under_{limit}_words"
    return validator

def is_professional(output: str) -> bool:
    """Check for professional language"""
    unprofessional = ["lol", "omg", "wtf", "idk"]
    return not any(word in output.lower() for word in unprofessional)

# Create test cases
test_cases = [
    PromptTestCase(
        name="basic_query",
        inputs={
            "company_name": "TechCorp",
            "query": "What are your business hours?",
            "account_status": "Basic"
        },
        expected={"contains_hours": True},
        validators=[contains_greeting, is_professional, under_word_limit(150)]
    ),
    PromptTestCase(
        name="complaint_handling",
        inputs={
            "company_name": "TechCorp",
            "query": "I'm very unhappy with your service!",
            "account_status": "Premium"
        },
        expected={"contains_apology": True},
        validators=[is_professional, under_word_limit(200)]
    )
]

# Run tests
from openai import OpenAI

client = OpenAI()  # PromptTester expects an OpenAI-compatible chat client
tester = PromptTester(registry, client)
results = tester.run_suite("customer_support", "1.1.0", test_cases)
print(f"Passed: {results['passed']}/{results['total']}")

Prompt Lifecycle Management

from enum import Enum
from datetime import datetime
from typing import Dict

class PromptStatus(str, Enum):
    DRAFT = "draft"
    TESTING = "testing"
    CANARY = "canary"
    PRODUCTION = "production"
    DEPRECATED = "deprecated"
    ARCHIVED = "archived"

class PromptLifecycleManager:
    """Manage prompt lifecycle stages"""
    
    ALLOWED_TRANSITIONS = {
        PromptStatus.DRAFT: [PromptStatus.TESTING, PromptStatus.ARCHIVED],
        PromptStatus.TESTING: [PromptStatus.DRAFT, PromptStatus.CANARY, PromptStatus.ARCHIVED],
        PromptStatus.CANARY: [PromptStatus.TESTING, PromptStatus.PRODUCTION],
        PromptStatus.PRODUCTION: [PromptStatus.DEPRECATED],
        PromptStatus.DEPRECATED: [PromptStatus.ARCHIVED, PromptStatus.PRODUCTION],
        PromptStatus.ARCHIVED: []
    }
    
    def __init__(self):
        self.statuses: Dict[str, dict] = {}
        self.history: list = []
    
    def set_status(
        self,
        prompt_name: str,
        version: str,
        status: PromptStatus,
        changed_by: str,
        reason: str = ""
    ):
        """Update prompt status"""
        key = f"{prompt_name}:{version}"
        current = self.statuses.get(key, {}).get("status", PromptStatus.DRAFT)
        
        # Validate transition
        if status not in self.ALLOWED_TRANSITIONS[current]:
            raise ValueError(
                f"Cannot transition from {current} to {status}"
            )
        
        self.statuses[key] = {
            "status": status,
            "updated_at": datetime.now(),
            "updated_by": changed_by
        }
        
        self.history.append({
            "prompt": prompt_name,
            "version": version,
            "from_status": current,
            "to_status": status,
            "changed_by": changed_by,
            "reason": reason,
            "timestamp": datetime.now()
        })
    
    def get_status(self, prompt_name: str, version: str) -> PromptStatus:
        key = f"{prompt_name}:{version}"
        return self.statuses.get(key, {}).get("status", PromptStatus.DRAFT)
    
    def promote_to_canary(
        self,
        prompt_name: str,
        version: str,
        canary_percentage: float = 0.05,
        changed_by: str = ""
    ):
        """Promote to canary with traffic percentage"""
        self.set_status(
            prompt_name,
            version,
            PromptStatus.CANARY,
            changed_by,
            f"Canary at {canary_percentage*100}%"
        )
        
        # Return routing config
        return {
            "prompt": prompt_name,
            "version": version,
            "percentage": canary_percentage
        }
    
    def rollback(
        self,
        prompt_name: str,
        to_version: str,
        changed_by: str,
        reason: str
    ):
        """Rollback to a previous version"""
        # Deprecate current production
        for key, data in self.statuses.items():
            if key.startswith(f"{prompt_name}:"):
                if data["status"] == PromptStatus.PRODUCTION:
                    current_version = key.split(":")[1]
                    self.set_status(
                        prompt_name,
                        current_version,
                        PromptStatus.DEPRECATED,
                        changed_by,
                        f"Rollback: {reason}"
                    )
        
        # Promote target version
        self.set_status(
            prompt_name,
            to_version,
            PromptStatus.PRODUCTION,
            changed_by,
            f"Rollback from previous version"
        )

# Usage
lifecycle = PromptLifecycleManager()

# Seed the currently deployed version so there is a production target to roll back to
lifecycle.set_status("customer_support", "1.1.0", PromptStatus.TESTING, "[email protected]")
lifecycle.set_status("customer_support", "1.1.0", PromptStatus.CANARY, "[email protected]")
lifecycle.set_status("customer_support", "1.1.0", PromptStatus.PRODUCTION, "[email protected]")

# Development flow for the new version (new versions start in DRAFT by default)
lifecycle.set_status("customer_support", "1.2.0", PromptStatus.TESTING, "[email protected]")

# After tests pass
canary_config = lifecycle.promote_to_canary(
    "customer_support", "1.2.0",
    canary_percentage=0.05,
    changed_by="[email protected]"
)

# If issues are detected, roll back to the previous production version
lifecycle.rollback(
    "customer_support",
    to_version="1.1.0",
    changed_by="[email protected]",
    reason="Increased error rate"
)
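
The routing config returned by promote_to_canary is not consumed anywhere above. Below is a minimal sketch of a router that sends the configured fraction of users to the canary version and everyone else to the registry's active version, reusing the deterministic-hashing idea from Experiment.get_variant; the resolve_version helper is illustrative, not part of the classes above:

import hashlib

def resolve_version(registry: PromptRegistry, canary_config: dict, user_id: str) -> str:
    """Deterministically route a user to the canary version or the active version"""
    prompt_name = canary_config["prompt"]
    bucket = int(
        hashlib.md5(f"{prompt_name}:{user_id}".encode()).hexdigest(), 16
    ) % 10000
    if bucket / 10000 < canary_config["percentage"]:
        return canary_config["version"]
    return registry.get(prompt_name).active_version

# version = resolve_version(registry, canary_config, "user_123")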

Key Takeaways

Version Everything

Treat prompts like code with proper version control

Test Before Deploy

Automated testing catches issues before production

A/B Test Changes

Measure impact with controlled experiments

Enable Rollbacks

Quick rollback capability is essential for production

What’s Next

LLM Orchestration

Learn to orchestrate multiple LLM providers with unified APIs