Training Data Generation
Basic Data Generation
from openai import OpenAI
import json
from dataclasses import dataclass


@dataclass
class DataExample:
    """A single training example."""
    input: str
    output: str
    metadata: dict | None = None


class DataGenerator:
    """Generate synthetic training data."""

    def __init__(self, model: str = "gpt-4o"):
        self.client = OpenAI()
        self.model = model

    def generate_examples(
        self,
        task_description: str,
        num_examples: int = 10,
        seed_examples: list[dict] | None = None
    ) -> list[DataExample]:
        """Generate training examples for a task."""
        seed_text = ""
        if seed_examples:
            seed_text = "Here are some example patterns to follow:\n"
            for ex in seed_examples:
                seed_text += f"Input: {ex['input']}\nOutput: {ex['output']}\n\n"

        prompt = f"""Generate {num_examples} diverse training examples for this task:

Task: {task_description}

{seed_text}
Requirements:
- Make examples diverse in content and complexity
- Ensure outputs are accurate and consistent with the task
- Vary the input lengths and styles
- Include edge cases and challenging examples

Return JSON:
{{
  "examples": [
    {{"input": "example input", "output": "correct output"}}
  ]
}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)
        return [
            DataExample(input=ex["input"], output=ex["output"])
            for ex in data.get("examples", [])
        ]

    def generate_classification_data(
        self,
        labels: list[str],
        label_descriptions: dict[str, str],
        examples_per_label: int = 20
    ) -> list[DataExample]:
        """Generate classification training data."""
        all_examples = []

        for label in labels:
            description = label_descriptions.get(label, "")

            prompt = f"""Generate {examples_per_label} text examples for the classification label "{label}".

Label description: {description}

Requirements:
- Make examples realistic and varied
- Include different lengths and styles
- Ensure each example clearly belongs to this category
- Include some challenging borderline cases

Return JSON:
{{
  "examples": [
    {{"text": "example text"}}
  ]
}}"""

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )

            data = json.loads(response.choices[0].message.content)
            for ex in data.get("examples", []):
                all_examples.append(DataExample(
                    input=ex["text"],
                    output=label,
                    metadata={"generated": True}
                ))

        return all_examples


# Usage
generator = DataGenerator()

# Generate sentiment analysis data
labels = ["positive", "negative", "neutral"]
descriptions = {
    "positive": "Happy, satisfied, enthusiastic customer feedback",
    "negative": "Unhappy, frustrated, disappointed customer feedback",
    "neutral": "Factual, objective statements without strong emotion"
}

data = generator.generate_classification_data(
    labels=labels,
    label_descriptions=descriptions,
    examples_per_label=10
)

print(f"Generated {len(data)} examples")
for ex in data[:3]:
    print(f"  {ex.output}: {ex.input[:50]}...")
Instruction-Following Data
from openai import OpenAI
import json
from typing import Optional


class InstructionDataGenerator:
    """Generate instruction-following training data."""

    def __init__(self, model: str = "gpt-4o"):
        self.client = OpenAI()
        self.model = model

    def generate_instructions(
        self,
        domain: str,
        complexity_levels: Optional[list[str]] = None,
        num_per_level: int = 10
    ) -> list[dict]:
        """Generate diverse instruction-response pairs."""
        complexity_levels = complexity_levels or ["simple", "moderate", "complex"]
        all_data = []

        for level in complexity_levels:
            prompt = f"""Generate {num_per_level} instruction-response pairs for the domain: {domain}

Complexity level: {level}
- Simple: Single-step tasks, short responses
- Moderate: Multi-step tasks, detailed responses
- Complex: Nuanced tasks requiring reasoning, comprehensive responses

Requirements:
- Make instructions clear and specific
- Responses should be helpful and complete
- Include variety in instruction formats (questions, commands, requests)
- Ensure responses demonstrate the complexity level

Return JSON:
{{
  "pairs": [
    {{
      "instruction": "user instruction",
      "response": "assistant response",
      "reasoning": "optional chain of thought"
    }}
  ]
}}"""

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )

            data = json.loads(response.choices[0].message.content)
            for pair in data.get("pairs", []):
                all_data.append({
                    "instruction": pair["instruction"],
                    "response": pair["response"],
                    "complexity": level,
                    "domain": domain,
                    "reasoning": pair.get("reasoning")
                })

        return all_data

    def generate_multi_turn(
        self,
        scenario: str,
        num_turns: int = 5,
        num_conversations: int = 5
    ) -> list[list[dict]]:
        """Generate multi-turn conversation data."""
        conversations = []

        for _ in range(num_conversations):
            prompt = f"""Generate a realistic {num_turns}-turn conversation for this scenario:

Scenario: {scenario}

Requirements:
- User asks progressively related questions
- Assistant provides helpful, accurate responses
- Include follow-up questions and clarifications
- Make the conversation flow naturally

Return JSON:
{{
  "conversation": [
    {{"role": "user", "content": "user message"}},
    {{"role": "assistant", "content": "assistant response"}}
  ]
}}"""

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )

            data = json.loads(response.choices[0].message.content)
            conversations.append(data.get("conversation", []))

        return conversations


# Usage
generator = InstructionDataGenerator()

# Generate coding instruction data
coding_data = generator.generate_instructions(
    domain="Python programming",
    complexity_levels=["simple", "moderate", "complex"],
    num_per_level=5
)
print(f"Generated {len(coding_data)} instruction pairs")

# Generate multi-turn data
conversations = generator.generate_multi_turn(
    scenario="User is learning about machine learning and wants to understand neural networks",
    num_turns=4,
    num_conversations=3
)
print(f"Generated {len(conversations)} conversations")
Data Augmentation
from openai import OpenAI
import json


class DataAugmenter:
    """Augment existing datasets with synthetic variations."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.client = OpenAI()
        self.model = model

    def paraphrase(
        self,
        text: str,
        num_variations: int = 3,
        preserve_meaning: bool = True
    ) -> list[str]:
        """Generate paraphrased versions of text."""
        constraint = "Preserve the exact meaning." if preserve_meaning else "Allow slight variations in meaning."

        prompt = f"""Generate {num_variations} paraphrased versions of this text.

Original: {text}

Requirements:
- {constraint}
- Use different vocabulary and sentence structures
- Maintain the same tone
- Each version should be distinct

Return JSON: {{"paraphrases": ["version1", "version2", ...]}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)
        return data.get("paraphrases", [])

    def augment_with_context(
        self,
        examples: list[dict],
        context_variations: list[str]
    ) -> list[dict]:
        """Augment examples with different contexts."""
        augmented = []

        for example in examples:
            for context in context_variations:
                prompt = f"""Rewrite this example in a new context.

Original input: {example['input']}
Original output: {example['output']}

New context: {context}

Adapt the example to fit the new context while preserving the task pattern.

Return JSON:
{{
  "input": "adapted input",
  "output": "adapted output"
}}"""

                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[{"role": "user", "content": prompt}],
                    response_format={"type": "json_object"}
                )

                data = json.loads(response.choices[0].message.content)
                augmented.append({
                    "input": data["input"],
                    "output": data["output"],
                    "original_context": example.get("context"),
                    "new_context": context
                })

        return augmented

    def generate_edge_cases(
        self,
        task_description: str,
        seed_examples: list[dict],
        num_edge_cases: int = 10
    ) -> list[dict]:
        """Generate challenging edge cases."""
        prompt = f"""Generate {num_edge_cases} edge case examples for this task.

Task: {task_description}

Normal examples:
{json.dumps(seed_examples[:3], indent=2)}

Generate challenging edge cases that test:
- Boundary conditions
- Unusual inputs
- Ambiguous cases
- Error handling scenarios
- Edge of distribution examples

Return JSON:
{{
  "edge_cases": [
    {{"input": "edge case input", "output": "correct output", "challenge": "what makes this challenging"}}
  ]
}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)
        return data.get("edge_cases", [])

    def back_translate(
        self,
        text: str,
        intermediate_language: str = "French"
    ) -> str:
        """Augment by translating to another language and back."""
        # Translate to the intermediate language
        translate_prompt = f"Translate to {intermediate_language}: {text}"
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": translate_prompt}]
        )
        intermediate = response.choices[0].message.content

        # Translate back to English
        back_prompt = f"Translate to English: {intermediate}"
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": back_prompt}]
        )
        return response.choices[0].message.content


# Usage
augmenter = DataAugmenter()

# Paraphrase augmentation
original = "The customer service was excellent and the product exceeded my expectations."
paraphrases = augmenter.paraphrase(original, num_variations=3)
print("Original:", original)
for i, p in enumerate(paraphrases):
    print(f"Variation {i+1}:", p)

# Edge case generation
task = "Sentiment classification of product reviews"
seed = [
    {"input": "Great product!", "output": "positive"},
    {"input": "Terrible quality", "output": "negative"}
]
edge_cases = augmenter.generate_edge_cases(task, seed, num_edge_cases=5)
for case in edge_cases:
    print(f"Edge case: {case['input']} -> {case['output']}")
    print(f"  Challenge: {case['challenge']}")
Quality Filtering
from openai import OpenAI
import json
from dataclasses import dataclass


@dataclass
class QualityScore:
    """Quality assessment for a data example."""
    overall_score: float
    relevance: float
    accuracy: float
    clarity: float
    diversity: float
    issues: list[str]


class DataQualityFilter:
    """Filter synthetic data for quality."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.client = OpenAI()
        self.model = model

    def score_example(
        self,
        example: dict,
        task_description: str
    ) -> QualityScore:
        """Score a single example for quality."""
        prompt = f"""Evaluate the quality of this training example.

Task: {task_description}

Input: {example['input']}
Output: {example['output']}

Score each dimension from 0 to 1:
- Relevance: Does it match the task?
- Accuracy: Is the output correct?
- Clarity: Is it clear and unambiguous?
- Diversity: Does it add value beyond basic examples?

Return JSON:
{{
  "relevance": 0.0-1.0,
  "accuracy": 0.0-1.0,
  "clarity": 0.0-1.0,
  "diversity": 0.0-1.0,
  "issues": ["list of any problems found"],
  "reasoning": "brief explanation"
}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)
        scores = [
            data.get("relevance", 0),
            data.get("accuracy", 0),
            data.get("clarity", 0),
            data.get("diversity", 0)
        ]

        return QualityScore(
            overall_score=sum(scores) / len(scores),
            relevance=data.get("relevance", 0),
            accuracy=data.get("accuracy", 0),
            clarity=data.get("clarity", 0),
            diversity=data.get("diversity", 0),
            issues=data.get("issues", [])
        )

    def filter_dataset(
        self,
        examples: list[dict],
        task_description: str,
        min_score: float = 0.7
    ) -> tuple[list[dict], list[dict]]:
        """Filter a dataset, keeping only high-quality examples."""
        accepted = []
        rejected = []

        for example in examples:
            score = self.score_example(example, task_description)
            example["quality_score"] = score.overall_score
            example["quality_details"] = {
                "relevance": score.relevance,
                "accuracy": score.accuracy,
                "clarity": score.clarity,
                "diversity": score.diversity,
                "issues": score.issues
            }

            if score.overall_score >= min_score:
                accepted.append(example)
            else:
                rejected.append(example)

        return accepted, rejected

    def deduplicate(
        self,
        examples: list[dict],
        similarity_threshold: float = 0.85
    ) -> list[dict]:
        """Remove near-duplicate examples."""
        if len(examples) <= 1:
            return examples

        # Use the LLM to find similar pairs
        inputs = [ex["input"] for ex in examples]

        prompt = f"""Identify groups of very similar or duplicate texts from this list.

Texts:
{json.dumps(dict(enumerate(inputs)), indent=2)}

Group texts that are too similar (>{int(similarity_threshold * 100)}% semantic overlap).
For each group, identify which index to keep.

Return JSON:
{{
  "duplicate_groups": [
    {{"keep_index": 0, "remove_indices": [1, 2]}}
  ]
}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)

        # Collect indices to remove
        remove_indices = set()
        for group in data.get("duplicate_groups", []):
            remove_indices.update(group.get("remove_indices", []))

        # Filter examples
        return [ex for i, ex in enumerate(examples) if i not in remove_indices]


# Usage
quality_filter = DataQualityFilter()

task = "Extract action items from meeting notes"
examples = [
    {"input": "Let's schedule a follow-up meeting", "output": "Schedule follow-up meeting"},
    {"input": "Bob will send the report by Friday", "output": "Bob: Send report by Friday"},
    {"input": "Nice weather today", "output": "Enjoy weather"},  # Bad example
]

accepted, rejected = quality_filter.filter_dataset(examples, task, min_score=0.6)
print(f"Accepted: {len(accepted)}, Rejected: {len(rejected)}")
for ex in rejected:
    print(f"Rejected: {ex['input']}")
    print(f"  Score: {ex['quality_score']:.2f}")
    print(f"  Issues: {ex['quality_details']['issues']}")
Test Data Generation
from openai import OpenAI
import json


class TestDataGenerator:
    """Generate realistic test fixtures and mock data."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.client = OpenAI()
        self.model = model

    def generate_users(self, count: int = 10, schema: dict | None = None) -> list[dict]:
        """Generate realistic user data."""
        schema = schema or {
            "id": "integer",
            "name": "full name",
            "email": "email address",
            "age": "integer 18-80",
            "country": "country name",
            "created_at": "ISO datetime"
        }

        prompt = f"""Generate {count} realistic user records.

Schema:
{json.dumps(schema, indent=2)}

Requirements:
- Make names and emails consistent
- Distribute ages realistically
- Include international diversity
- Ensure all emails are unique

Return JSON: {{"users": [...]}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)
        return data.get("users", [])

    def generate_from_schema(
        self,
        schema: dict,
        count: int = 10,
        constraints: dict | None = None
    ) -> list[dict]:
        """Generate data from an arbitrary schema."""
        constraints = constraints or {}

        prompt = f"""Generate {count} records matching this schema.

Schema:
{json.dumps(schema, indent=2)}

Constraints:
{json.dumps(constraints, indent=2) if constraints else "None"}

Generate realistic, varied data that follows the schema exactly.

Return JSON: {{"records": [...]}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)
        return data.get("records", [])

    def generate_api_responses(
        self,
        endpoint_description: str,
        scenarios: list[str],
        include_errors: bool = True
    ) -> list[dict]:
        """Generate mock API responses for testing."""
        prompt = f"""Generate mock API responses for testing.

Endpoint: {endpoint_description}

Scenarios to cover:
{json.dumps(scenarios, indent=2)}

{"Include error responses (400, 404, 500)." if include_errors else ""}

Return JSON:
{{
  "responses": [
    {{
      "scenario": "description",
      "status_code": 200,
      "body": {{}},
      "headers": {{}}
    }}
  ]
}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)
        return data.get("responses", [])

    def generate_test_documents(
        self,
        document_type: str,
        count: int = 5,
        include_variations: list[str] | None = None
    ) -> list[dict]:
        """Generate test documents of various types."""
        variations = include_variations or ["standard", "edge case", "malformed"]

        prompt = f"""Generate {count} {document_type} documents for testing.

Include these variations:
{json.dumps(variations, indent=2)}

Each document should be realistic and complete.
Make them diverse in content and structure.

Return JSON:
{{
  "documents": [
    {{"content": "document text", "variation": "type of variation"}}
  ]
}}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)
        return data.get("documents", [])


# Usage
generator = TestDataGenerator()

# Generate user fixtures
users = generator.generate_users(5)
print("Generated users:")
for user in users:
    print(f"  {user['name']} ({user['email']})")

# Generate from a custom schema
product_schema = {
    "sku": "string (8 alphanumeric)",
    "name": "product name",
    "price": "float (10-1000)",
    "category": "electronics|clothing|home|sports",
    "in_stock": "boolean",
    "rating": "float (1-5)"
}

products = generator.generate_from_schema(
    product_schema,
    count=5,
    constraints={"at_least_2_out_of_stock": True}
)
print("\nGenerated products:")
for product in products:
    print(f"  {product['sku']}: {product['name']} - ${product.get('price', 0):.2f}")

# Generate API test responses
api_responses = generator.generate_api_responses(
    endpoint_description="GET /api/orders/{order_id}",
    scenarios=["Valid order", "Order not found", "Server error"],
    include_errors=True
)
print("\nAPI test responses:")
for resp in api_responses:
    print(f"  {resp['scenario']}: {resp['status_code']}")
Evaluation Dataset Creation
from openai import OpenAI
import json


class EvaluationDataGenerator:
    """Generate evaluation/benchmark datasets."""

    def __init__(self, model: str = "gpt-4o"):
        self.client = OpenAI()
        self.model = model

    def create_benchmark(
        self,
        task_description: str,
        difficulty_distribution: dict | None = None,
        total_examples: int = 100
    ) -> dict:
        """Create a balanced benchmark dataset."""
        difficulty_distribution = difficulty_distribution or {
            "easy": 0.3,
            "medium": 0.5,
            "hard": 0.2
        }

        all_examples = []
        for difficulty, proportion in difficulty_distribution.items():
            count = int(total_examples * proportion)

            prompt = f"""Generate {count} evaluation examples for this task.

Task: {task_description}
Difficulty: {difficulty}

Requirements for {difficulty} difficulty:
- Easy: Simple, clear cases with obvious answers
- Medium: Requires some reasoning or has minor ambiguity
- Hard: Complex cases requiring nuanced understanding

Include a brief explanation for why each answer is correct.

Return JSON:
{{
  "examples": [
    {{
      "input": "test input",
      "expected_output": "correct answer",
      "explanation": "why this is correct"
    }}
  ]
}}"""

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )

            data = json.loads(response.choices[0].message.content)
            for ex in data.get("examples", []):
                ex["difficulty"] = difficulty
                all_examples.append(ex)

        return {
            "task": task_description,
            "total_examples": len(all_examples),
            "difficulty_distribution": difficulty_distribution,
            "examples": all_examples
        }

    def create_adversarial_set(
        self,
        task_description: str,
        attack_types: list[str] | None = None,
        examples_per_attack: int = 10
    ) -> list[dict]:
        """Generate adversarial test examples."""
        attack_types = attack_types or [
            "distraction",
            "negation",
            "paraphrase",
            "format_variation",
            "edge_case"
        ]

        all_adversarial = []
        for attack in attack_types:
            prompt = f"""Generate {examples_per_attack} adversarial examples for this task.

Task: {task_description}
Attack type: {attack}

Attack descriptions:
- distraction: Add irrelevant information that might confuse
- negation: Use negations and double negatives
- paraphrase: Unusual phrasing of standard cases
- format_variation: Unusual formatting, punctuation, or structure
- edge_case: Boundary conditions and unusual values

Create examples that are tricky but still have correct answers.

Return JSON:
{{
  "examples": [
    {{
      "input": "adversarial input",
      "expected_output": "correct answer despite adversarial nature",
      "attack_description": "what makes this adversarial"
    }}
  ]
}}"""

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )

            data = json.loads(response.choices[0].message.content)
            for ex in data.get("examples", []):
                ex["attack_type"] = attack
                all_adversarial.append(ex)

        return all_adversarial


# Usage
eval_generator = EvaluationDataGenerator()

# Create a benchmark
benchmark = eval_generator.create_benchmark(
    task_description="Named entity recognition - identify person, organization, and location entities",
    difficulty_distribution={"easy": 0.4, "medium": 0.4, "hard": 0.2},
    total_examples=20
)
print(f"Created benchmark with {benchmark['total_examples']} examples")
for diff in benchmark['difficulty_distribution']:
    count = sum(1 for ex in benchmark['examples'] if ex['difficulty'] == diff)
    print(f"  {diff}: {count} examples")

# Create an adversarial set
adversarial = eval_generator.create_adversarial_set(
    task_description="Sentiment classification",
    attack_types=["negation", "distraction"],
    examples_per_attack=5
)
print(f"\nCreated {len(adversarial)} adversarial examples")
Synthetic Data Guidelines
- Always validate generated data against task requirements
- Use seed examples to guide generation style
- Include difficulty stratification for robust training
- Filter and deduplicate before use
- Test on held-out real data to verify effectiveness
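These guidelines chain into a single pass: generate, score, filter, deduplicate, then hold out real data for the final check. A sketch combining the classes defined above follows; it assumes DataGenerator and DataQualityFilter are in scope, and the task and thresholds are illustrative.

generator = DataGenerator()
quality_filter = DataQualityFilter()

task = "Summarize a customer email in one sentence"
raw = generator.generate_examples(task, num_examples=20)
raw_dicts = [{"input": ex.input, "output": ex.output} for ex in raw]

# Score and filter, then remove near-duplicates from the survivors
accepted, rejected = quality_filter.filter_dataset(raw_dicts, task, min_score=0.7)
clean = quality_filter.deduplicate(accepted)
print(f"{len(raw_dicts)} generated -> {len(accepted)} accepted -> {len(clean)} after dedup")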
Practice Exercise
Build a synthetic data pipeline that:
- Generates task-specific training examples
- Creates augmented variations of existing data
- Filters for quality and removes duplicates
- Produces balanced evaluation benchmarks
- Includes adversarial test cases
As you build, check for:
- Diversity in generated examples
- Accuracy of labels and outputs
- Quality filtering at each stage
- Realistic edge case coverage