LLMs excel at generating synthetic data for training, testing, and augmentation. This chapter covers patterns for creating high-quality synthetic datasets.

Training Data Generation

Basic Data Generation

from openai import OpenAI
import json
from dataclasses import dataclass


@dataclass
class DataExample:
    """A single training example."""
    input: str
    output: str
    metadata: dict | None = None


class DataGenerator:
    """Generate synthetic training data."""
    
    def __init__(self, model: str = "gpt-4o"):
        self.client = OpenAI()
        self.model = model
    
    def generate_examples(
        self,
        task_description: str,
        num_examples: int = 10,
        seed_examples: list[dict] | None = None
    ) -> list[DataExample]:
        """Generate training examples for a task."""
        seed_text = ""
        if seed_examples:
            seed_text = "Here are some example patterns to follow:\n"
            for ex in seed_examples:
                seed_text += f"Input: {ex['input']}\nOutput: {ex['output']}\n\n"
        
        prompt = f"""Generate {num_examples} diverse training examples for this task:

Task: {task_description}

{seed_text}

Requirements:
- Make examples diverse in content and complexity
- Ensure outputs are accurate and consistent with the task
- Vary the input lengths and styles
- Include edge cases and challenging examples

Return JSON:
{{
    "examples": [
        {{"input": "example input", "output": "correct output"}}
    ]
}}"""
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        
        return [
            DataExample(input=ex["input"], output=ex["output"])
            for ex in data.get("examples", [])
        ]
    
    def generate_classification_data(
        self,
        labels: list[str],
        label_descriptions: dict[str, str],
        examples_per_label: int = 20
    ) -> list[DataExample]:
        """Generate classification training data."""
        all_examples = []
        
        for label in labels:
            description = label_descriptions.get(label, "")
            
            prompt = f"""Generate {examples_per_label} text examples for the classification label "{label}".

Label description: {description}

Requirements:
- Make examples realistic and varied
- Include different lengths and styles
- Ensure each example clearly belongs to this category
- Include some challenging borderline cases

Return JSON:
{{
    "examples": [
        {{"text": "example text"}}
    ]
}}"""
            
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            
            data = json.loads(response.choices[0].message.content)
            
            for ex in data.get("examples", []):
                all_examples.append(DataExample(
                    input=ex["text"],
                    output=label,
                    metadata={"generated": True}
                ))
        
        return all_examples


# Usage
generator = DataGenerator()

# Generate sentiment analysis data
labels = ["positive", "negative", "neutral"]
descriptions = {
    "positive": "Happy, satisfied, enthusiastic customer feedback",
    "negative": "Unhappy, frustrated, disappointed customer feedback",
    "neutral": "Factual, objective statements without strong emotion"
}

data = generator.generate_classification_data(
    labels=labels,
    label_descriptions=descriptions,
    examples_per_label=10
)

print(f"Generated {len(data)} examples")
for ex in data[:3]:
    print(f"  {ex.output}: {ex.input[:50]}...")

Instruction-Following Data

from openai import OpenAI
import json


class InstructionDataGenerator:
    """Generate instruction-following training data."""
    
    def __init__(self, model: str = "gpt-4o"):
        self.client = OpenAI()
        self.model = model
    
    def generate_instructions(
        self,
        domain: str,
        complexity_levels: list[str] | None = None,
        num_per_level: int = 10
    ) -> list[dict]:
        """Generate diverse instruction-response pairs."""
        complexity_levels = complexity_levels or ["simple", "moderate", "complex"]
        all_data = []
        
        for level in complexity_levels:
            prompt = f"""Generate {num_per_level} instruction-response pairs for the domain: {domain}

Complexity level: {level}
- Simple: Single-step tasks, short responses
- Moderate: Multi-step tasks, detailed responses
- Complex: Nuanced tasks requiring reasoning, comprehensive responses

Requirements:
- Make instructions clear and specific
- Responses should be helpful and complete
- Include variety in instruction formats (questions, commands, requests)
- Ensure responses demonstrate the complexity level

Return JSON:
{{
    "pairs": [
        {{
            "instruction": "user instruction",
            "response": "assistant response",
            "reasoning": "optional chain of thought"
        }}
    ]
}}"""
            
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            
            data = json.loads(response.choices[0].message.content)
            
            for pair in data.get("pairs", []):
                all_data.append({
                    "instruction": pair["instruction"],
                    "response": pair["response"],
                    "complexity": level,
                    "domain": domain,
                    "reasoning": pair.get("reasoning")
                })
        
        return all_data
    
    def generate_multi_turn(
        self,
        scenario: str,
        num_turns: int = 5,
        num_conversations: int = 5
    ) -> list[list[dict]]:
        """Generate multi-turn conversation data."""
        conversations = []
        
        for _ in range(num_conversations):
            prompt = f"""Generate a realistic {num_turns}-turn conversation for this scenario:

Scenario: {scenario}

Requirements:
- User asks progressively related questions
- Assistant provides helpful, accurate responses
- Include follow-up questions and clarifications
- Make the conversation flow naturally

Return JSON:
{{
    "conversation": [
        {{"role": "user", "content": "user message"}},
        {{"role": "assistant", "content": "assistant response"}}
    ]
}}"""
            
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            
            data = json.loads(response.choices[0].message.content)
            conversations.append(data.get("conversation", []))
        
        return conversations


# Usage
generator = InstructionDataGenerator()

# Generate coding instruction data
coding_data = generator.generate_instructions(
    domain="Python programming",
    complexity_levels=["simple", "moderate", "complex"],
    num_per_level=5
)

print(f"Generated {len(coding_data)} instruction pairs")

# Generate multi-turn data
conversations = generator.generate_multi_turn(
    scenario="User is learning about machine learning and wants to understand neural networks",
    num_turns=4,
    num_conversations=3
)

print(f"Generated {len(conversations)} conversations")

Data Augmentation

from openai import OpenAI
import json


class DataAugmenter:
    """Augment existing datasets with synthetic variations."""
    
    def __init__(self, model: str = "gpt-4o-mini"):
        self.client = OpenAI()
        self.model = model
    
    def paraphrase(
        self,
        text: str,
        num_variations: int = 3,
        preserve_meaning: bool = True
    ) -> list[str]:
        """Generate paraphrased versions of text."""
        constraint = "Preserve the exact meaning." if preserve_meaning else "Allow slight variations in meaning."
        
        prompt = f"""Generate {num_variations} paraphrased versions of this text.

Original: {text}

Requirements:
- {constraint}
- Use different vocabulary and sentence structures
- Maintain the same tone
- Each version should be distinct

Return JSON: {{"paraphrases": ["version1", "version2", ...]}}"""
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        return data.get("paraphrases", [])
    
    def augment_with_context(
        self,
        examples: list[dict],
        context_variations: list[str]
    ) -> list[dict]:
        """Augment examples with different contexts."""
        augmented = []
        
        for example in examples:
            for context in context_variations:
                prompt = f"""Rewrite this example in a new context.

Original input: {example['input']}
Original output: {example['output']}
New context: {context}

Adapt the example to fit the new context while preserving the task pattern.

Return JSON:
{{
    "input": "adapted input",
    "output": "adapted output"
}}"""
                
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[{"role": "user", "content": prompt}],
                    response_format={"type": "json_object"}
                )
                
                data = json.loads(response.choices[0].message.content)
                augmented.append({
                    "input": data["input"],
                    "output": data["output"],
                    "original_context": example.get("context"),
                    "new_context": context
                })
        
        return augmented
    
    def generate_edge_cases(
        self,
        task_description: str,
        seed_examples: list[dict],
        num_edge_cases: int = 10
    ) -> list[dict]:
        """Generate challenging edge cases."""
        prompt = f"""Generate {num_edge_cases} edge case examples for this task.

Task: {task_description}

Normal examples:
{json.dumps(seed_examples[:3], indent=2)}

Generate challenging edge cases that test:
- Boundary conditions
- Unusual inputs
- Ambiguous cases
- Error handling scenarios
- Edge of distribution examples

Return JSON:
{{
    "edge_cases": [
        {{"input": "edge case input", "output": "correct output", "challenge": "what makes this challenging"}}
    ]
}}"""
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        return data.get("edge_cases", [])
    
    def back_translate(
        self,
        text: str,
        intermediate_language: str = "French"
    ) -> str:
        """Augment by translating to another language and back."""
        # Translate to intermediate language
        translate_prompt = f"Translate to {intermediate_language}: {text}"
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": translate_prompt}]
        )
        
        intermediate = response.choices[0].message.content
        
        # Translate back
        back_prompt = f"Translate to English: {intermediate}"
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": back_prompt}]
        )
        
        return response.choices[0].message.content


# Usage
augmenter = DataAugmenter()

# Paraphrase augmentation
original = "The customer service was excellent and the product exceeded my expectations."
paraphrases = augmenter.paraphrase(original, num_variations=3)

print("Original:", original)
for i, p in enumerate(paraphrases):
    print(f"Variation {i+1}:", p)

# Edge case generation
task = "Sentiment classification of product reviews"
seed = [
    {"input": "Great product!", "output": "positive"},
    {"input": "Terrible quality", "output": "negative"}
]

edge_cases = augmenter.generate_edge_cases(task, seed, num_edge_cases=5)
for case in edge_cases:
    print(f"Edge case: {case['input']} -> {case['output']}")
    print(f"  Challenge: {case['challenge']}")

Quality Filtering

from openai import OpenAI
import json
from dataclasses import dataclass


@dataclass
class QualityScore:
    """Quality assessment for a data example."""
    overall_score: float
    relevance: float
    accuracy: float
    clarity: float
    diversity: float
    issues: list[str]


class DataQualityFilter:
    """Filter synthetic data for quality."""
    
    def __init__(self, model: str = "gpt-4o-mini"):
        self.client = OpenAI()
        self.model = model
    
    def score_example(
        self,
        example: dict,
        task_description: str
    ) -> QualityScore:
        """Score a single example for quality."""
        prompt = f"""Evaluate the quality of this training example.

Task: {task_description}
Input: {example['input']}
Output: {example['output']}

Score each dimension from 0 to 1:
- Relevance: Does it match the task?
- Accuracy: Is the output correct?
- Clarity: Is it clear and unambiguous?
- Diversity: Does it add value beyond basic examples?

Return JSON:
{{
    "relevance": 0.0-1.0,
    "accuracy": 0.0-1.0,
    "clarity": 0.0-1.0,
    "diversity": 0.0-1.0,
    "issues": ["list of any problems found"],
    "reasoning": "brief explanation"
}}"""
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        
        scores = [
            data.get("relevance", 0),
            data.get("accuracy", 0),
            data.get("clarity", 0),
            data.get("diversity", 0)
        ]
        
        return QualityScore(
            overall_score=sum(scores) / len(scores),
            relevance=data.get("relevance", 0),
            accuracy=data.get("accuracy", 0),
            clarity=data.get("clarity", 0),
            diversity=data.get("diversity", 0),
            issues=data.get("issues", [])
        )
    
    def filter_dataset(
        self,
        examples: list[dict],
        task_description: str,
        min_score: float = 0.7
    ) -> tuple[list[dict], list[dict]]:
        """Filter dataset keeping only high-quality examples."""
        accepted = []
        rejected = []
        
        for example in examples:
            score = self.score_example(example, task_description)
            
            example["quality_score"] = score.overall_score
            example["quality_details"] = {
                "relevance": score.relevance,
                "accuracy": score.accuracy,
                "clarity": score.clarity,
                "diversity": score.diversity,
                "issues": score.issues
            }
            
            if score.overall_score >= min_score:
                accepted.append(example)
            else:
                rejected.append(example)
        
        return accepted, rejected
    
    def deduplicate(
        self,
        examples: list[dict],
        similarity_threshold: float = 0.85
    ) -> list[dict]:
        """Remove near-duplicate examples."""
        if len(examples) <= 1:
            return examples
        
        # Use LLM to find similar pairs
        inputs = [ex["input"] for ex in examples]
        
        prompt = f"""Identify groups of very similar or duplicate texts from this list.

Texts:
{json.dumps(dict(enumerate(inputs)), indent=2)}

Group texts that are too similar (more than {similarity_threshold:.0%} semantic overlap).
For each group, identify which index to keep.

Return JSON:
{{
    "duplicate_groups": [
        {{"keep_index": 0, "remove_indices": [1, 2]}}
    ]
}}"""
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        
        # Collect indices to remove
        remove_indices = set()
        for group in data.get("duplicate_groups", []):
            remove_indices.update(group.get("remove_indices", []))
        
        # Filter examples
        return [ex for i, ex in enumerate(examples) if i not in remove_indices]


# Usage
quality_filter = DataQualityFilter()

task = "Extract action items from meeting notes"
examples = [
    {"input": "Let's schedule a follow-up meeting", "output": "Schedule follow-up meeting"},
    {"input": "Bob will send the report by Friday", "output": "Bob: Send report by Friday"},
    {"input": "Nice weather today", "output": "Enjoy weather"},  # Bad example
]

accepted, rejected = quality_filter.filter_dataset(examples, task, min_score=0.6)

print(f"Accepted: {len(accepted)}, Rejected: {len(rejected)}")
for ex in rejected:
    print(f"Rejected: {ex['input']}")
    print(f"  Score: {ex['quality_score']:.2f}")
    print(f"  Issues: {ex['quality_details']['issues']}")

Test Data Generation

from openai import OpenAI
import json


class TestDataGenerator:
    """Generate realistic test fixtures and mock data."""
    
    def __init__(self, model: str = "gpt-4o-mini"):
        self.client = OpenAI()
        self.model = model
    
    def generate_users(self, count: int = 10, schema: dict | None = None) -> list[dict]:
        """Generate realistic user data."""
        schema = schema or {
            "id": "integer",
            "name": "full name",
            "email": "email address",
            "age": "integer 18-80",
            "country": "country name",
            "created_at": "ISO datetime"
        }
        
        prompt = f"""Generate {count} realistic user records.

Schema:
{json.dumps(schema, indent=2)}

Requirements:
- Make names and emails consistent
- Distribute ages realistically
- Include international diversity
- Ensure all emails are unique

Return JSON: {{"users": [...]}}"""
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        return data.get("users", [])
    
    def generate_from_schema(
        self,
        schema: dict,
        count: int = 10,
        constraints: dict | None = None
    ) -> list[dict]:
        """Generate data from arbitrary schema."""
        constraints = constraints or {}
        
        prompt = f"""Generate {count} records matching this schema.

Schema:
{json.dumps(schema, indent=2)}

Constraints:
{json.dumps(constraints, indent=2) if constraints else "None"}

Generate realistic, varied data that follows the schema exactly.

Return JSON: {{"records": [...]}}"""
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        return data.get("records", [])
    
    def generate_api_responses(
        self,
        endpoint_description: str,
        scenarios: list[str],
        include_errors: bool = True
    ) -> list[dict]:
        """Generate mock API responses for testing."""
        prompt = f"""Generate mock API responses for testing.

Endpoint: {endpoint_description}

Scenarios to cover:
{json.dumps(scenarios, indent=2)}

{"Include error responses (400, 404, 500)." if include_errors else ""}

Return JSON:
{{
    "responses": [
        {{
            "scenario": "description",
            "status_code": 200,
            "body": {{}},
            "headers": {{}}
        }}
    ]
}}"""
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        return data.get("responses", [])
    
    def generate_test_documents(
        self,
        document_type: str,
        count: int = 5,
        include_variations: list[str] | None = None
    ) -> list[dict]:
        """Generate test documents of various types."""
        variations = include_variations or ["standard", "edge case", "malformed"]
        
        prompt = f"""Generate {count} {document_type} documents for testing.

Include these variations:
{json.dumps(variations, indent=2)}

Each document should be realistic and complete.
Make them diverse in content and structure.

Return JSON:
{{
    "documents": [
        {{"content": "document text", "variation": "type of variation"}}
    ]
}}"""
        
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        data = json.loads(response.choices[0].message.content)
        return data.get("documents", [])


# Usage
generator = TestDataGenerator()

# Generate user fixtures
users = generator.generate_users(5)
print("Generated users:")
for user in users:
    print(f"  {user['name']} ({user['email']})")

# Generate from custom schema
product_schema = {
    "sku": "string (8 alphanumeric)",
    "name": "product name",
    "price": "float (10-1000)",
    "category": "electronics|clothing|home|sports",
    "in_stock": "boolean",
    "rating": "float (1-5)"
}

products = generator.generate_from_schema(
    product_schema,
    count=5,
    constraints={"at_least_2_out_of_stock": True}
)

print("\nGenerated products:")
for product in products:
    print(f"  {product['sku']}: {product['name']} - ${product.get('price', 0):.2f}")

# Generate API test responses
api_responses = generator.generate_api_responses(
    endpoint_description="GET /api/orders/{order_id}",
    scenarios=["Valid order", "Order not found", "Server error"],
    include_errors=True
)

print("\nAPI test responses:")
for resp in api_responses:
    print(f"  {resp['scenario']}: {resp['status_code']}")

Evaluation Dataset Creation

from openai import OpenAI
import json


class EvaluationDataGenerator:
    """Generate evaluation/benchmark datasets."""
    
    def __init__(self, model: str = "gpt-4o"):
        self.client = OpenAI()
        self.model = model
    
    def create_benchmark(
        self,
        task_description: str,
        difficulty_distribution: dict | None = None,
        total_examples: int = 100
    ) -> dict:
        """Create a balanced benchmark dataset."""
        difficulty_distribution = difficulty_distribution or {
            "easy": 0.3,
            "medium": 0.5,
            "hard": 0.2
        }
        
        all_examples = []
        
        for difficulty, proportion in difficulty_distribution.items():
            count = int(total_examples * proportion)
            
            prompt = f"""Generate {count} evaluation examples for this task.

Task: {task_description}
Difficulty: {difficulty}

Requirements for {difficulty} difficulty:
- Easy: Simple, clear cases with obvious answers
- Medium: Requires some reasoning or has minor ambiguity
- Hard: Complex cases requiring nuanced understanding

Include a brief explanation for why each answer is correct.

Return JSON:
{{
    "examples": [
        {{
            "input": "test input",
            "expected_output": "correct answer",
            "explanation": "why this is correct"
        }}
    ]
}}"""
            
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            
            data = json.loads(response.choices[0].message.content)
            
            for ex in data.get("examples", []):
                ex["difficulty"] = difficulty
                all_examples.append(ex)
        
        return {
            "task": task_description,
            "total_examples": len(all_examples),
            "difficulty_distribution": difficulty_distribution,
            "examples": all_examples
        }
    
    def create_adversarial_set(
        self,
        task_description: str,
        attack_types: list[str] | None = None,
        examples_per_attack: int = 10
    ) -> list[dict]:
        """Generate adversarial test examples."""
        attack_types = attack_types or [
            "distraction",
            "negation",
            "paraphrase",
            "format_variation",
            "edge_case"
        ]
        
        all_adversarial = []
        
        for attack in attack_types:
            prompt = f"""Generate {examples_per_attack} adversarial examples for this task.

Task: {task_description}
Attack type: {attack}

Attack descriptions:
- distraction: Add irrelevant information that might confuse
- negation: Use negations and double negatives
- paraphrase: Unusual phrasing of standard cases
- format_variation: Unusual formatting, punctuation, or structure
- edge_case: Boundary conditions and unusual values

Create examples that are tricky but still have correct answers.

Return JSON:
{{
    "examples": [
        {{
            "input": "adversarial input",
            "expected_output": "correct answer despite adversarial nature",
            "attack_description": "what makes this adversarial"
        }}
    ]
}}"""
            
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            
            data = json.loads(response.choices[0].message.content)
            
            for ex in data.get("examples", []):
                ex["attack_type"] = attack
                all_adversarial.append(ex)
        
        return all_adversarial


# Usage
eval_generator = EvaluationDataGenerator()

# Create benchmark
benchmark = eval_generator.create_benchmark(
    task_description="Named entity recognition - identify person, organization, and location entities",
    difficulty_distribution={"easy": 0.4, "medium": 0.4, "hard": 0.2},
    total_examples=20
)

print(f"Created benchmark with {benchmark['total_examples']} examples")
for diff, prop in benchmark['difficulty_distribution'].items():
    count = sum(1 for ex in benchmark['examples'] if ex['difficulty'] == diff)
    print(f"  {diff}: {count} examples")

# Create adversarial set
adversarial = eval_generator.create_adversarial_set(
    task_description="Sentiment classification",
    attack_types=["negation", "distraction"],
    examples_per_attack=5
)

print(f"\nCreated {len(adversarial)} adversarial examples")
Synthetic Data Guidelines
  • Always validate generated data against task requirements
  • Use seed examples to guide generation style
  • Include difficulty stratification for robust training
  • Filter and deduplicate before use (a minimal pipeline sketch follows this list)
  • Test on held-out real data to verify effectiveness
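
Wiring the earlier pieces together, a minimal end-to-end sketch (the task string and thresholds are illustrative starting points, not tuned values):

# End-to-end: generate, deduplicate, then quality-filter
task = "Summarize customer support tickets in one sentence"
raw = DataGenerator().generate_examples(task, num_examples=20)
records = [{"input": ex.input, "output": ex.output} for ex in raw]

qf = DataQualityFilter()
deduped = qf.deduplicate(records)
accepted, rejected = qf.filter_dataset(deduped, task, min_score=0.7)
print(f"{len(accepted)} kept, {len(rejected)} rejected, "
      f"{len(records) - len(deduped)} near-duplicates removed")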

Practice Exercise

Build a synthetic data pipeline that:
  1. Generates task-specific training examples
  2. Creates augmented variations of existing data
  3. Filters for quality and removes duplicates
  4. Produces balanced evaluation benchmarks
  5. Includes adversarial test cases
Focus on:
  • Diversity in generated examples
  • Accuracy of labels and outputs
  • Quality filtering at each stage
  • Realistic edge case coverage