Skip to main content

Documentation Index

Fetch the complete documentation index at: https://resources.devweekends.com/llms.txt

Use this file to discover all available pages before exploring further.

December 2025 Update: Production strategies for managing prompts including version control, A/B testing, and prompt observability.

The Prompt Management Problem

Here is a scenario that plays out at every company building with LLMs: a developer changes a system prompt to fix one edge case, and it silently breaks three other use cases. Nobody notices until customers complain a week later. By then, nobody remembers what the prompt said before or why it was changed. Sound familiar? Prompts are the new configuration files — except they are more fragile, harder to test, and have a bigger blast radius when they break. They are critical business logic, but often managed poorly:
Common Anti-Patterns              Best Practices
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Hardcoded strings                 Versioned prompt registry
No change tracking                Git-like version control
No testing                        Automated evaluation
"It worked yesterday"             Rollback capability
One-size-fits-all                 A/B testing

Prompt Registry Pattern

A prompt registry is the foundational pattern for managing prompts at scale. Instead of hardcoding prompts as string literals scattered across your codebase, you centralize them in a registry that supports versioning, variable substitution, and metadata tracking. Think of it as a database for prompts — every prompt has a name, a version, an author, and a history of changes.

Basic Prompt Registry

from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, Dict, Any
import json
import hashlib

@dataclass
class PromptVersion:
    """A single version of a prompt template plus its bookkeeping data."""

    # Version label used to look this entry up (e.g. "1.0.0").
    version: str
    # Template text; rendered elsewhere with ``str.format``.
    template: str
    # Names of the placeholders associated with the template.
    variables: list[str]
    # When and by whom this version was created.
    created_at: datetime
    created_by: str
    # Optional human-readable note about this version.
    description: str = ""
    # Free-form extra data; each instance gets its own dict.
    metadata: dict = field(default_factory=dict)

    @property
    def hash(self) -> str:
        """Return the first 12 hex characters of the template's SHA-256 digest."""
        encoded_template = self.template.encode()
        full_digest = hashlib.sha256(encoded_template).hexdigest()
        return full_digest[:12]

@dataclass
class Prompt:
    """A named prompt together with every registered version of its template."""

    name: str
    # Versions in registration order; the last element is what "latest" means.
    versions: list[PromptVersion] = field(default_factory=list)
    # Version used when a caller does not ask for a specific one.
    active_version: str = "latest"

    def get_version(self, version: Optional[str] = None) -> PromptVersion:
        """Return the requested version of this prompt.

        Args:
            version: Version label, or ``"latest"`` for the most recently
                appended version. ``None`` (or an empty string) falls back to
                ``active_version``.

        Raises:
            ValueError: If the prompt has no versions yet, or no version with
                the requested label exists.
        """
        wanted = version or self.active_version

        if wanted == "latest":
            # Guard explicitly: indexing an empty list would surface as an
            # IndexError that says nothing about which prompt is empty.
            if not self.versions:
                raise ValueError(f"Prompt '{self.name}' has no versions")
            return self.versions[-1]

        for candidate in self.versions:
            if candidate.version == wanted:
                return candidate

        raise ValueError(f"Version {wanted} not found")

    def render(self, version: Optional[str] = None, **kwargs) -> str:
        """Fill the template of the selected version with ``kwargs``.

        Substitution uses ``str.format``, so literal braces in a template must
        be doubled (``{{`` / ``}}``), extra keyword arguments are ignored, and
        a placeholder without a matching keyword raises ``KeyError``.

        Raises:
            ValueError: If the version cannot be resolved (see ``get_version``).
            KeyError: If the template references a name missing from ``kwargs``.
        """
        prompt_version = self.get_version(version)
        return prompt_version.template.format(**kwargs)

class PromptRegistry:
    """Central registry for all prompts.

    Prompts live in memory, keyed by name. Each name maps to a ``Prompt`` that
    holds its versions in registration order.
    """

    def __init__(self):
        # prompt name -> Prompt (created on the first register() for that name)
        self.prompts: Dict[str, Prompt] = {}

    def register(
        self,
        name: str,
        template: str,
        version: str,
        created_by: str,
        description: str = "",
        variables: Optional[list[str]] = None,
        metadata: Optional[dict] = None
    ) -> PromptVersion:
        """Register a new prompt version.

        Args:
            name: Prompt name; a new ``Prompt`` is created if it is unknown.
            template: Template text using ``str.format`` placeholders.
            version: Version label; must be unique within the prompt.
            created_by: Author recorded on the version.
            description: Optional note about the change.
            variables: Placeholder names. When omitted they are extracted from
                ``template``.
            metadata: Optional extra data stored with the version.

        Returns:
            The newly created ``PromptVersion``.

        Raises:
            ValueError: If ``version`` is already registered for ``name``.
        """
        existing = self.prompts.get(name)
        # Lookups by version return the first match, so a second entry with
        # the same label could never be fetched by version. Refuse it instead
        # of silently storing an unreachable template.
        if existing is not None and any(v.version == version for v in existing.versions):
            raise ValueError(
                f"Version {version} of prompt '{name}' is already registered"
            )

        # Extract variables from template
        if variables is None:
            import re
            # The lookarounds skip escaped braces ("{{literal}}"), which
            # str.format emits verbatim. dict.fromkeys drops repeats while
            # keeping first-seen order.
            found = re.findall(r'(?<!\{)\{(\w+)\}(?!\})', template)
            variables = list(dict.fromkeys(found))
        else:
            # Copy so later changes to the caller's list do not leak in.
            variables = list(variables)

        prompt_version = PromptVersion(
            version=version,
            template=template,
            variables=variables,
            created_at=datetime.now(),
            created_by=created_by,
            description=description,
            metadata=dict(metadata) if metadata else {}
        )

        if existing is None:
            existing = self.prompts[name] = Prompt(name=name)

        existing.versions.append(prompt_version)

        return prompt_version

    def get(self, name: str, version: str = None) -> Prompt:
        """Get a prompt by name.

        ``version`` is accepted but not used: the whole ``Prompt`` is returned
        and callers select a version through ``Prompt.get_version``.

        Raises:
            ValueError: If no prompt with this name is registered.
        """
        if name not in self.prompts:
            raise ValueError(f"Prompt '{name}' not found")
        return self.prompts[name]

    def render(
        self,
        name: str,
        version: str = None,
        **kwargs
    ) -> str:
        """Render a prompt with variables.

        ``kwargs`` are passed to the template as ``str.format`` keyword
        arguments. Because ``name`` and ``version`` are parameters of this
        method, a template placeholder with either of those names cannot be
        filled through this call.
        """
        prompt = self.get(name)
        return prompt.render(version, **kwargs)

    def set_active_version(self, name: str, version: str):
        """Set the active version for a prompt.

        Raises:
            ValueError: If the prompt or the version does not exist.
        """
        prompt = self.get(name)
        prompt.get_version(version)  # Validate version exists
        prompt.active_version = version

    def list_prompts(self) -> list[str]:
        """Return the names of all registered prompts."""
        return list(self.prompts.keys())

    def list_versions(self, name: str) -> list[str]:
        """Return the version labels of a prompt in registration order."""
        prompt = self.get(name)
        return [v.version for v in prompt.versions]

# Usage
# Build an in-memory registry; it starts empty and is filled by register().
registry = PromptRegistry()

# Register prompts
# Version 1.0.0. No `variables` argument is passed, so register() derives the
# placeholder names from the template text.
registry.register(
    name="customer_support",
    version="1.0.0",
    created_by="john@example.com",
    template="""You are a helpful customer support agent for {company_name}.

Customer query: {query}

Please provide a helpful response following these guidelines:
- Be friendly and professional
- If you don't know, say so
- Offer to escalate if needed""",
    description="Initial customer support prompt"
)

# Version 1.1.0 of the same prompt. It adds the {account_status} placeholder,
# so callers rendering this version must also supply that keyword.
registry.register(
    name="customer_support",
    version="1.1.0",
    created_by="jane@example.com",
    template="""You are an expert customer support agent for {company_name}.

## Customer Information
- Query: {query}
- Account Status: {account_status}

## Response Guidelines
1. Address the customer by acknowledging their concern
2. Provide accurate, helpful information
3. Offer next steps or alternatives
4. End with a friendly closing

Keep response under 150 words.""",
    description="Added account status and improved structure"
)

# Use prompts
# Render an explicit version; the keyword arguments fill the placeholders.
prompt = registry.render(
    "customer_support",
    version="1.1.0",
    company_name="TechCorp",
    query="My order hasn't arrived",
    account_status="Premium"
)

File-Based Prompt Management

For teams that want prompt version control without building a database, file-based management is a pragmatic middle ground. You store prompts as text files in your git repo, organized by name and version. This gives you all the benefits of git — diffs, blame, pull request reviews — for free. The trade-off: you cannot change prompts without a code deployment, which is actually a feature for teams that want deployment gates and review processes. Store prompts as files for version control:
prompts/
├── customer_support/
│   ├── prompt.yaml
│   ├── v1.0.0.txt
│   ├── v1.1.0.txt
│   └── tests/
│       └── test_cases.yaml
├── code_review/
│   ├── prompt.yaml
│   └── v1.0.0.txt
└── registry.yaml
import yaml
from pathlib import Path
from typing import Optional

class FilePromptRegistry:
    """Load prompts from file system.

    Expected layout, one sub-directory per prompt::

        <prompts_dir>/<prompt>/prompt.yaml   # config (name, active_version, ...)
        <prompts_dir>/<prompt>/v*.txt        # one template file per version

    Sub-directories without a ``prompt.yaml`` are ignored. Version labels are
    the template file names without the ``.txt`` suffix (e.g. ``v1.0.0``).
    """

    def __init__(self, prompts_dir: str = "prompts"):
        self.prompts_dir = Path(prompts_dir)
        # prompt name -> {"config": <parsed yaml>, "versions": {label: template}}
        self.prompts = {}
        self._load_all()

    def _load_all(self):
        """Load all prompts from directory"""
        # Sorted so the result is the same on every platform if two
        # directories happen to declare the same prompt name.
        for prompt_dir in sorted(self.prompts_dir.iterdir()):
            if prompt_dir.is_dir():
                self._load_prompt(prompt_dir)

    def _load_prompt(self, prompt_dir: Path):
        """Load a single prompt directory into ``self.prompts``.

        Raises:
            ValueError: If ``prompt.yaml`` does not contain a mapping.
        """
        config_path = prompt_dir / "prompt.yaml"

        if not config_path.exists():
            return

        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(config_path, encoding="utf-8") as f:
            # An empty YAML document parses to None; treat it as "no settings".
            config = yaml.safe_load(f) or {}

        if not isinstance(config, dict):
            raise ValueError(f"{config_path} must contain a YAML mapping")

        # Fall back to the directory name when the config omits "name".
        name = config.get("name", prompt_dir.name)

        # Load all versions
        versions = {
            version_file.stem: version_file.read_text(encoding="utf-8")  # stem: v1.0.0
            for version_file in prompt_dir.glob("v*.txt")
        }

        self.prompts[name] = {
            "config": config,
            "versions": versions
        }

    @staticmethod
    def _version_key(version: str) -> tuple:
        """Build a sort key that orders dotted version labels numerically.

        A leading ``v`` is dropped and each dot-separated piece is compared as
        an integer when it is all digits, so ``v1.10.0`` sorts after
        ``v1.9.0``. Non-numeric pieces sort after numeric ones and compare as
        text. This is numeric-aware ordering, not full semantic versioning.
        """
        key = []
        for piece in version.lstrip("vV").split("."):
            if piece.isascii() and piece.isdigit():
                key.append((0, int(piece), ""))
            else:
                key.append((1, 0, piece))
        return tuple(key)

    def get(
        self,
        name: str,
        version: str = None
    ) -> str:
        """Get a prompt template.

        Args:
            name: Prompt name.
            version: Version label, or ``"latest"`` for the highest version.
                When omitted, the config's ``active_version`` is used, falling
                back to ``"latest"``.

        Raises:
            ValueError: If the prompt is unknown, or ``"latest"`` is requested
                for a prompt that has no version files.
            KeyError: If the requested version label does not exist.
        """
        if name not in self.prompts:
            raise ValueError(f"Prompt '{name}' not found")

        prompt_data = self.prompts[name]
        versions = prompt_data["versions"]

        if version is None:
            # "or" also covers an explicit "active_version: null" in the YAML.
            version = prompt_data["config"].get("active_version") or "latest"

        if version == "latest":
            if not versions:
                raise ValueError(f"Prompt '{name}' has no version files")
            # Plain string sorting would rank "v1.9.0" above "v1.10.0".
            version = max(versions, key=self._version_key)

        try:
            return versions[version]
        except KeyError:
            raise KeyError(
                f"Version '{version}' of prompt '{name}' not found"
            ) from None

    def render(
        self,
        name: str,
        version: str = None,
        **kwargs
    ) -> str:
        """Render a prompt with variables (``str.format`` keyword substitution)."""
        template = self.get(name, version)
        return template.format(**kwargs)

    def reload(self):
        """Reload all prompts from disk.

        If loading fails, the previously loaded prompts are restored and the
        error is re-raised, so the registry never ends up half-loaded.
        """
        previous = self.prompts
        self.prompts = {}
        try:
            self._load_all()
        except Exception:
            self.prompts = previous
            raise

# prompts/customer_support/prompt.yaml
# Example contents of a per-prompt config file, written as a bare string
# literal (a no-op expression) so the YAML can sit inside Python source.
# Of these keys, FilePromptRegistry reads `name` (used as the registry key)
# and `active_version` (default version returned by get()); the remaining
# keys are stored with the config but not read by the code in this file.
"""
name: customer_support
description: Customer support system prompt
active_version: v1.1.0
variables:
  - company_name
  - query
  - account_status
tags:
  - support
  - production
"""

A/B Testing Prompts

A/B testing for prompts answers the question every team eventually asks: “is this new prompt actually better, or does it just look better on the three examples we tried?” Without controlled experiments, prompt changes are driven by vibes rather than data. The framework below uses deterministic user assignment (hash-based) so the same user always sees the same variant — essential for consistent UX and valid statistical comparisons.

Experiment Framework

from dataclasses import dataclass
from typing import Callable, Optional
import random
import hashlib
from datetime import datetime

@dataclass
class Variant:
    """One arm of an experiment: a label bound to a specific prompt version."""

    # Label returned to callers and stored with recorded results.
    name: str
    # Version label of the prompt rendered for users assigned to this variant.
    prompt_version: str
    # Relative traffic share; Experiment.get_variant divides each weight by
    # the sum of all weights, so weights need not add up to 1.
    weight: float = 1.0

@dataclass
class Experiment:
    """An A/B experiment that splits users across variants of one prompt."""

    name: str
    # Name of the prompt whose versions are being compared.
    prompt_name: str
    variants: list[Variant]
    start_date: datetime
    # Stored for bookkeeping only; get_variant() does not check it.
    end_date: Optional[datetime] = None

    def get_variant(self, user_id: str) -> Variant:
        """Deterministically assign user to variant.

        Why hash-based? Random assignment would give the same user different
        variants on different requests, which is confusing UX and invalid
        statistics. Hashing the user_id ensures consistency: user_123 always
        sees variant A, regardless of how many times they visit.

        The experiment name is part of the hash input, so the same user can
        land in different buckets in different experiments.

        Raises:
            ValueError: If there are no variants, a weight is negative, or
                the weights do not add up to a positive number.
        """
        if not self.variants:
            raise ValueError(f"Experiment '{self.name}' has no variants")
        if any(v.weight < 0 for v in self.variants):
            raise ValueError(
                f"Experiment '{self.name}' has a variant with negative weight"
            )

        total_weight = sum(v.weight for v in self.variants)
        if total_weight <= 0:
            # Would otherwise surface as ZeroDivisionError below.
            raise ValueError(
                f"Experiment '{self.name}' needs at least one positive weight"
            )

        hash_input = f"{self.name}:{user_id}"
        # MD5 is only used to spread users over buckets, not for security;
        # saying so keeps it usable on builds that restrict MD5.
        digest = hashlib.md5(hash_input.encode(), usedforsecurity=False).hexdigest()

        # Normalize to 0-1 (10,000 buckets)
        normalized = (int(digest, 16) % 10000) / 10000

        # Assign to variant based on weights
        cumulative = 0.0
        for variant in self.variants:
            cumulative += variant.weight / total_weight
            if normalized < cumulative:
                return variant

        # Float rounding can leave the running sum a hair below 1.0. Fall back
        # to the last variant that is actually meant to receive traffic.
        return next(v for v in reversed(self.variants) if v.weight > 0)

class ExperimentManager:
    """Manage A/B testing for prompts.

    Experiment definitions and recorded results are kept in memory on this
    object only.
    """

    def __init__(self, registry: PromptRegistry):
        self.registry = registry
        # experiment name -> Experiment
        self.experiments: Dict[str, Experiment] = {}
        # experiment name -> list of result records added by record_result()
        self.results: Dict[str, list] = {}

    def create_experiment(
        self,
        name: str,
        prompt_name: str,
        variants: list[dict]
    ) -> Experiment:
        """Create a new experiment.

        Each dict in ``variants`` is passed to ``Variant(**v)``. Creating an
        experiment under an existing name replaces it and clears the results
        recorded for that name.
        """
        experiment = Experiment(
            name=name,
            prompt_name=prompt_name,
            variants=[Variant(**v) for v in variants],
            start_date=datetime.now()
        )

        self.experiments[name] = experiment
        self.results[name] = []

        return experiment

    def get_prompt(
        self,
        experiment_name: str,
        user_id: str,
        **kwargs
    ) -> tuple[str, str]:
        """Get prompt for user in experiment.

        Returns:
            ``(rendered_prompt, variant_name)``. Keep the variant name so the
            outcome can be attributed with ``record_result``.

        Raises:
            KeyError: If the experiment does not exist.
        """
        experiment = self.experiments[experiment_name]
        variant = experiment.get_variant(user_id)

        prompt = self.registry.render(
            experiment.prompt_name,
            version=variant.prompt_version,
            **kwargs
        )

        return prompt, variant.name

    def record_result(
        self,
        experiment_name: str,
        user_id: str,
        variant_name: str,
        metric: str,
        value: float
    ):
        """Record experiment result.

        Raises:
            KeyError: If the experiment does not exist.
        """
        self.results[experiment_name].append({
            "user_id": user_id,
            "variant": variant_name,
            "metric": metric,
            "value": value,
            "timestamp": datetime.now()
        })

    def get_results(self, experiment_name: str, metric: Optional[str] = None) -> dict:
        """Get experiment results by variant.

        Args:
            experiment_name: Experiment to summarize.
            metric: When given, only records for this metric are included.
                Without it every recorded value is pooled, which mixes
                different metrics into the same mean/min/max.

        Returns:
            ``{variant_name: {"count", "mean", "min", "max"}}`` for every
            variant that has at least one matching record.

        Raises:
            KeyError: If the experiment does not exist.
        """
        # Group values by variant
        values_by_variant = {}
        for record in self.results[experiment_name]:
            if metric is not None and record["metric"] != metric:
                continue
            values_by_variant.setdefault(record["variant"], []).append(record["value"])

        # Calculate stats; every group holds at least one value by construction
        return {
            variant: {
                "count": len(values),
                "mean": sum(values) / len(values),
                "min": min(values),
                "max": max(values)
            }
            for variant, values in values_by_variant.items()
        }

# Usage
# Reuses the `registry` populated earlier in this file.
manager = ExperimentManager(registry)

# Create experiment
# Weights are relative shares; 0.5 / 0.5 gives an even split.
manager.create_experiment(
    name="support_prompt_v2",
    prompt_name="customer_support",
    variants=[
        {"name": "control", "prompt_version": "1.0.0", "weight": 0.5},
        {"name": "treatment", "prompt_version": "1.1.0", "weight": 0.5}
    ]
)

# Use in production
# get_prompt() returns (rendered prompt, variant name). The same keyword
# arguments are passed whichever variant is chosen.
user_id = "user_123"
prompt, variant = manager.get_prompt(
    "support_prompt_v2",
    user_id,
    company_name="TechCorp",
    query="Where is my order?",
    account_status="Premium"
)

# After getting feedback
# Attribute the outcome to the variant this user was actually served.
manager.record_result(
    "support_prompt_v2",
    user_id,
    variant,
    metric="satisfaction",
    value=4.5
)

# Analyze results
# Relies on the `json` module imported earlier in this file.
results = manager.get_results("support_prompt_v2")
print(json.dumps(results, indent=2))

Prompt Storage: Database vs. Files vs. Config Service

| Approach | Pros | Cons | Best For |
| --- | --- | --- | --- |
| Git files (text files in repo) | Full version history via git, PR review process, free | Requires deployment to change prompts, no runtime switching | Teams that want deployment gates and review rigor |
| Database (PostgreSQL/Redis) | Runtime changes without deploy, A/B testing, fast rollback | Need to build admin UI, risk of unreviewed changes, migration complexity | Teams that iterate on prompts frequently in production |
| Config service (LaunchDarkly, Flagsmith) | Feature flags for prompts, gradual rollouts, instant rollback | Cost, limited version history, not purpose-built for prompts | Teams already using feature flags for other config |
| Langfuse/LangSmith | Purpose-built, tracing integration, evaluation workflows | Vendor lock-in, cost at scale, data leaves your infra | Teams already using these for observability |
| Hybrid (git + database cache) | Git for source of truth, database for runtime serving | More complex, need sync mechanism | Mature teams wanting both rigor and flexibility |
Decision shortcut: If you are a team of 1-5 and deploy daily, use git files. If you have 5-20 engineers and change prompts weekly, use a database with an admin UI. If you have 20+ engineers and change prompts daily, invest in a dedicated prompt management platform or build an internal one.

Prompt Testing Framework

Prompt testing is the most underinvested area in AI engineering. Teams will spend weeks writing unit tests for their API endpoints but deploy prompt changes to production without any automated testing. The framework below lets you define test cases with expected behaviors (contains certain phrases, stays under word limit, maintains professional tone) and run them automatically before any prompt deployment. Practical tip: start with 5-10 test cases covering your most common and most critical use cases. Add a new test case every time you find a bug in production. Over time, this becomes your regression suite.
from dataclasses import dataclass
from typing import Callable, Any
import json

@dataclass
class PromptTestCase:
    """One test scenario for a prompt: inputs to render with, plus checks on the output."""

    # Label for the test; run_suite copies it into each result.
    name: str
    # Keyword arguments used to render the prompt template.
    inputs: dict
    # Expected properties of output. Descriptive only: PromptTester in this
    # file never reads this field; the checks come from `validators`.
    expected: dict
    # Callables taking the model output and returning pass/fail.
    # None is treated as "no validators" by PromptTester.run_test.
    validators: list[Callable[[str], bool]] = None

class PromptTester:
    """Test prompts against expected behaviors.

    Renders a prompt from the registry, sends it to the LLM client and runs
    each test case's validators over the returned text.
    """

    def __init__(self, registry: PromptRegistry, llm_client, model: str = "gpt-4o"):
        """
        Args:
            registry: Registry used to render prompts.
            llm_client: Client object exposing ``chat.completions.create(...)``.
            model: Model name passed to the client on every call.
        """
        self.registry = registry
        self.client = llm_client
        self.model = model

    def run_test(
        self,
        prompt_name: str,
        version: str,
        test_case: PromptTestCase
    ) -> dict:
        """Run a single test case.

        Returns:
            ``{"passed": bool, "output": <model text>, "validations": [...]}``
            where each validation entry has ``validator``, ``passed`` and, if
            the validator raised, ``error``. The test passes only when every
            validator passes; a test case without validators passes.
        """

        # Render prompt
        prompt = self.registry.render(
            prompt_name,
            version=version,
            **test_case.inputs
        )

        # Call LLM
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0  # Keep runs as repeatable as possible
        )

        output = response.choices[0].message.content

        # Run validators
        results = {
            "passed": True,
            "output": output,
            "validations": []
        }

        for validator in (test_case.validators or []):
            # Not every callable has __name__ (functools.partial objects and
            # instances defining __call__ do not); fall back to repr so that
            # reporting a validator can never fail by itself.
            label = getattr(validator, "__name__", repr(validator))
            entry = {"validator": label}
            try:
                # Coerce so reports always hold a real bool, whatever truthy
                # or falsy object the validator returns.
                entry["passed"] = bool(validator(output))
            except Exception as e:
                # A crashing validator counts as a failed check; the message
                # is kept so the report shows why.
                entry["passed"] = False
                entry["error"] = str(e)

            results["validations"].append(entry)
            if not entry["passed"]:
                results["passed"] = False

        return results

    def run_suite(
        self,
        prompt_name: str,
        version: str,
        test_cases: list[PromptTestCase]
    ) -> dict:
        """Run a full test suite.

        Returns:
            Summary dict with ``prompt``, ``version``, ``total``, ``passed``,
            ``failed`` and ``tests`` (one ``run_test`` result per case, each
            extended with the case ``name``).
        """
        results = {
            "prompt": prompt_name,
            "version": version,
            "total": len(test_cases),
            "passed": 0,
            "failed": 0,
            "tests": []
        }

        for test_case in test_cases:
            result = self.run_test(prompt_name, version, test_case)
            result["name"] = test_case.name
            results["tests"].append(result)

            if result["passed"]:
                results["passed"] += 1
            else:
                results["failed"] += 1

        return results

# Define validators
def contains_greeting(output: str) -> bool:
    """Check if output contains a greeting.

    Greetings are matched case-insensitively as whole words. Plain substring
    matching would find "hi" inside "this" or "shipping" and "hey" inside
    "they", so almost any text would pass.
    """
    import re

    greetings = ["hello", "hi", "hey", "greetings"]
    pattern = r"\b(?:" + "|".join(greetings) + r")\b"
    return re.search(pattern, output, re.IGNORECASE) is not None

def under_word_limit(limit: int) -> Callable[[str], bool]:
    """Build a validator that passes when the output has at most ``limit`` words.

    Words are whitespace-separated tokens. The returned function is renamed
    to ``under_<limit>_words`` so reports show which limit was applied.
    """
    def validator(output: str) -> bool:
        words = output.split()
        return not len(words) > limit

    validator.__name__ = f"under_{limit}_words"
    return validator

def is_professional(output: str) -> bool:
    """Check for professional language.

    Returns False when the output contains one of the listed slang terms as a
    whole word (case-insensitive). Whole-word matching avoids flagging
    ordinary words that merely contain a term, such as "lollipop".
    """
    import re

    unprofessional = ["lol", "omg", "wtf", "idk"]
    pattern = r"\b(?:" + "|".join(unprofessional) + r")\b"
    return re.search(pattern, output, re.IGNORECASE) is None

# Create test cases
# Each case supplies the template inputs plus the validators to run on the
# model output. `expected` is descriptive only: PromptTester never reads it.
test_cases = [
    PromptTestCase(
        name="basic_query",
        inputs={
            "company_name": "TechCorp",
            "query": "What are your business hours?",
            "account_status": "Basic"
        },
        expected={"contains_hours": True},
        validators=[contains_greeting, is_professional, under_word_limit(150)]
    ),
    PromptTestCase(
        name="complaint_handling",
        inputs={
            "company_name": "TechCorp",
            "query": "I'm very unhappy with your service!",
            "account_status": "Premium"
        },
        expected={"contains_apology": True},
        validators=[is_professional, under_word_limit(200)]
    )
]

# Run tests
# NOTE(review): `client` is not defined anywhere in this file. run_test calls
# client.chat.completions.create(...), so presumably an OpenAI-style client
# must be created before this point - confirm and supply your own.
tester = PromptTester(registry, client)
results = tester.run_suite("customer_support", "1.1.0", test_cases)
print(f"Passed: {results['passed']}/{results['total']}")

A/B Testing Edge Cases

Edge case — statistical significance with LLM non-determinism: Unlike traditional A/B tests where the treatment is deterministic (users see button A or button B), LLM outputs vary even within the same prompt version. This means you need larger sample sizes to detect real differences. A rough rule: aim for 200+ observations per variant before drawing conclusions, and use metrics that aggregate well (satisfaction score, task completion rate) rather than individual response quality.

Edge case — prompt interactions with context: A prompt version that wins the A/B test on short queries might lose on long queries. Segment your results by input characteristics (query length, topic, user tier) before declaring a winner. The “winning” prompt might only be better for 60% of your traffic.

Edge case — user experience consistency: If user_123 gets variant A on Monday and you end the experiment on Tuesday, switching them to the winning variant B means their experience changes mid-conversation. For chatbot-style applications, pin users to their variant for the duration of their session or conversation thread, not just per-request.

Prompt Lifecycle Management

Just like code goes through dev, staging, and production, prompts should go through a defined lifecycle: draft, testing, canary, production, deprecated, archived. The lifecycle manager below enforces valid transitions (you cannot jump from draft to production) and maintains an audit trail of who changed what and why. This is not bureaucracy — it is the minimum safety net for a system where a single word change can break your product.
Prompt Lifecycle Transitions (allowed paths):

  DRAFT ──────> TESTING ──────> CANARY ──────> PRODUCTION ──────> DEPRECATED ──────> ARCHIVED
    │              │                │                                    │
    │              ▼                ▼                                    │
    └──────> ARCHIVED         TESTING (rollback)                  PRODUCTION (re-promote)
| Transition | When | Who Should Approve |
| --- | --- | --- |
| Draft -> Testing | Developer is ready for automated eval | Self (author) |
| Testing -> Canary | All test cases pass | Tech lead or prompt owner |
| Canary -> Production | Canary metrics match or exceed current production | On-call or product owner |
| Production -> Deprecated | New version promoted, or critical bug found | Anyone (emergency) / prompt owner (planned) |
| Draft / Testing / Deprecated -> Archived | No longer needed, preserved for history | Prompt owner |
from enum import Enum
from datetime import datetime

class PromptStatus(str, Enum):
    """Lifecycle stages a prompt version can be in.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (``PromptStatus.DRAFT == "draft"``).
    """
    DRAFT = "draft"
    TESTING = "testing"
    CANARY = "canary"
    PRODUCTION = "production"
    DEPRECATED = "deprecated"
    ARCHIVED = "archived"

class PromptLifecycleManager:
    """Manage prompt lifecycle stages.

    Tracks the current status of every ``prompt_name``/``version`` pair, only
    allows the moves listed in ``ALLOWED_TRANSITIONS`` and appends every
    change to ``history`` as an audit trail. A version that has never been
    seen is treated as ``DRAFT``.

    Status records are keyed by ``"<prompt_name>:<version>"``, so prompt
    names containing ``":"`` can be confused with one another.
    """

    # Current status -> statuses it may move to next. ARCHIVED is terminal.
    ALLOWED_TRANSITIONS = {
        PromptStatus.DRAFT: [PromptStatus.TESTING, PromptStatus.ARCHIVED],
        PromptStatus.TESTING: [PromptStatus.DRAFT, PromptStatus.CANARY, PromptStatus.ARCHIVED],
        PromptStatus.CANARY: [PromptStatus.TESTING, PromptStatus.PRODUCTION],
        PromptStatus.PRODUCTION: [PromptStatus.DEPRECATED],
        PromptStatus.DEPRECATED: [PromptStatus.ARCHIVED, PromptStatus.PRODUCTION],
        PromptStatus.ARCHIVED: []
    }

    def __init__(self):
        # "<prompt_name>:<version>" -> {"status", "updated_at", "updated_by"}
        self.statuses: Dict[str, dict] = {}
        # Transition records in chronological order (see set_status).
        self.history: list = []

    def set_status(
        self,
        prompt_name: str,
        version: str,
        status: PromptStatus,
        changed_by: str,
        reason: str = ""
    ):
        """Update prompt status.

        Args:
            prompt_name: Prompt the version belongs to.
            version: Version label.
            status: Target status (a ``PromptStatus`` or its string value).
            changed_by: Who is making the change; stored in the audit trail.
            reason: Optional explanation stored in the audit trail.

        Raises:
            ValueError: If ``status`` is not a valid status, or the move from
                the current status is not in ``ALLOWED_TRANSITIONS``.
        """
        # Accept plain strings such as "testing" and reject unknown values.
        status = PromptStatus(status)
        key = f"{prompt_name}:{version}"
        current = self.get_status(prompt_name, version)

        # Explicitly registering a never-seen version as DRAFT is the start
        # of the lifecycle, not a DRAFT -> DRAFT transition, so allow it.
        is_initial_draft = key not in self.statuses and status == PromptStatus.DRAFT

        # Validate transition
        if not is_initial_draft and status not in self.ALLOWED_TRANSITIONS[current]:
            # Use .value: how an enum member renders inside an f-string
            # differs between Python versions.
            raise ValueError(
                f"Cannot transition from {current.value} to {status.value}"
            )

        now = datetime.now()

        self.statuses[key] = {
            "status": status,
            "updated_at": now,
            "updated_by": changed_by
        }

        self.history.append({
            "prompt": prompt_name,
            "version": version,
            "from_status": current,
            "to_status": status,
            "changed_by": changed_by,
            "reason": reason,
            "timestamp": now
        })

    def get_status(self, prompt_name: str, version: str) -> PromptStatus:
        """Return the current status, or ``DRAFT`` for an untracked version."""
        key = f"{prompt_name}:{version}"
        return self.statuses.get(key, {}).get("status", PromptStatus.DRAFT)

    def promote_to_canary(
        self,
        prompt_name: str,
        version: str,
        canary_percentage: float = 0.05,
        changed_by: str = ""
    ):
        """Promote to canary with traffic percentage.

        Args:
            canary_percentage: Share of traffic as a fraction between 0 and 1
                (0.05 means 5%).

        Returns:
            Routing config dict with ``prompt``, ``version``, ``percentage``.

        Raises:
            ValueError: If ``canary_percentage`` is outside 0..1, or the
                version may not move to ``CANARY`` from its current status.
        """
        if not 0 <= canary_percentage <= 1:
            raise ValueError(
                f"canary_percentage must be a fraction between 0 and 1, got {canary_percentage}"
            )

        self.set_status(
            prompt_name,
            version,
            PromptStatus.CANARY,
            changed_by,
            # :g keeps float noise out of the text (0.05 * 100 would
            # otherwise print as 5.000000000000001).
            f"Canary at {canary_percentage * 100:g}%"
        )

        # Return routing config
        return {
            "prompt": prompt_name,
            "version": version,
            "percentage": canary_percentage
        }

    def rollback(
        self,
        prompt_name: str,
        to_version: str,
        changed_by: str,
        reason: str
    ):
        """Rollback to a previous version.

        Deprecates every other ``PRODUCTION`` version of the prompt and moves
        ``to_version`` to ``PRODUCTION``. If ``to_version`` is already in
        production it is left untouched.

        Raises:
            ValueError: If ``to_version`` may not move to ``PRODUCTION`` from
                its current status. This is checked before anything is
                changed, so a failed rollback never leaves the prompt
                without a production version.
        """
        target_status = self.get_status(prompt_name, to_version)
        target_is_live = target_status == PromptStatus.PRODUCTION

        if (
            not target_is_live
            and PromptStatus.PRODUCTION not in self.ALLOWED_TRANSITIONS[target_status]
        ):
            raise ValueError(
                f"Cannot roll back to {prompt_name}:{to_version}: "
                f"transition from {target_status.value} to production is not allowed"
            )

        # Deprecate current production. Collect the versions first so the
        # dict is not updated while it is being iterated, and slice off the
        # prefix so versions containing ":" stay intact.
        prefix = f"{prompt_name}:"
        live_versions = [
            key[len(prefix):]
            for key, data in self.statuses.items()
            if key.startswith(prefix) and data["status"] == PromptStatus.PRODUCTION
        ]
        for current_version in live_versions:
            if current_version == to_version:
                continue
            self.set_status(
                prompt_name,
                current_version,
                PromptStatus.DEPRECATED,
                changed_by,
                f"Rollback: {reason}"
            )

        # Promote target version
        if not target_is_live:
            self.set_status(
                prompt_name,
                to_version,
                PromptStatus.PRODUCTION,
                changed_by,
                "Rollback from previous version"
            )

# Usage
lifecycle = PromptLifecycleManager()

# Bring the existing version 1.1.0 into production. A version the manager has
# not seen yet counts as DRAFT, so the first explicit move is DRAFT -> TESTING.
lifecycle.set_status("customer_support", "1.1.0", PromptStatus.TESTING, "ops@example.com")
lifecycle.set_status("customer_support", "1.1.0", PromptStatus.CANARY, "ops@example.com")
lifecycle.set_status("customer_support", "1.1.0", PromptStatus.PRODUCTION, "ops@example.com")

# Development flow for the next version (it also starts out as DRAFT)
lifecycle.set_status("customer_support", "1.2.0", PromptStatus.TESTING, "dev@example.com")

# After tests pass
canary_config = lifecycle.promote_to_canary(
    "customer_support", "1.2.0",
    canary_percentage=0.05,
    changed_by="ops@example.com"
)

# Canary looks healthy: retire 1.1.0 and promote 1.2.0
lifecycle.set_status(
    "customer_support", "1.1.0", PromptStatus.DEPRECATED,
    "ops@example.com", "Superseded by 1.2.0"
)
lifecycle.set_status("customer_support", "1.2.0", PromptStatus.PRODUCTION, "ops@example.com")

# If issues detected
# rollback() deprecates 1.2.0 and re-promotes 1.1.0 (DEPRECATED -> PRODUCTION).
lifecycle.rollback(
    "customer_support",
    to_version="1.1.0",
    changed_by="ops@example.com",
    reason="Increased error rate"
)

Key Takeaways

Version Everything

Treat prompts like code with proper version control

Test Before Deploy

Automated testing catches issues before production

A/B Test Changes

Measure impact with controlled experiments

Enable Rollbacks

Quick rollback capability is essential for production

What’s Next

LLM Orchestration

Learn to orchestrate multiple LLM providers with unified APIs