# Instructor for Validated Outputs
Instructor provides type-safe structured outputs using Pydantic.

## Basic Usage
```python
import instructor
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import Optional

# Patch the OpenAI client so responses are parsed into Pydantic models
client = instructor.from_openai(OpenAI())

class UserInfo(BaseModel):
    """Structured user information."""
    name: str = Field(description="Full name of the user")
    age: int = Field(ge=0, le=150, description="Age in years")
    email: Optional[str] = Field(None, description="Email address")
    occupation: str = Field(description="Current occupation")

def extract_user_info(text: str) -> UserInfo:
    """Extract structured user info from text."""
    return client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=UserInfo,
        messages=[
            {
                "role": "user",
                "content": f"Extract user information from: {text}"
            }
        ]
    )

# Usage
text = """
Hi, I'm Sarah Johnson. I'm 28 years old and work as a software
engineer at a tech startup. You can reach me at [email protected]
"""

user = extract_user_info(text)
print(f"Name: {user.name}")
print(f"Age: {user.age}")
print(f"Email: {user.email}")
print(f"Occupation: {user.occupation}")
```
## Complex Nested Structures

```python
import instructor
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import Optional
from enum import Enum

client = instructor.from_openai(OpenAI())

class Priority(str, Enum):
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"

class SubTask(BaseModel):
    """A subtask within a task."""
    title: str
    estimated_hours: float = Field(ge=0)
    completed: bool = False

class Task(BaseModel):
    """A structured task with subtasks."""
    title: str
    description: str
    priority: Priority
    assignee: Optional[str] = None
    subtasks: list[SubTask] = Field(default_factory=list)
    tags: list[str] = Field(default_factory=list)

class ProjectPlan(BaseModel):
    """Complete project plan."""
    project_name: str
    objective: str
    tasks: list[Task]
    total_estimated_hours: float = Field(ge=0)

def create_project_plan(description: str) -> ProjectPlan:
    """Generate a structured project plan."""
    return client.chat.completions.create(
        model="gpt-4o",
        response_model=ProjectPlan,
        messages=[
            {
                "role": "system",
                "content": "You are a project planning assistant. Create detailed, actionable project plans."
            },
            {
                "role": "user",
                "content": f"Create a project plan for: {description}"
            }
        ]
    )

# Usage
plan = create_project_plan(
    "Build a REST API for a todo application with user authentication"
)

print(f"Project: {plan.project_name}")
print(f"Objective: {plan.objective}")
print(f"Total Hours: {plan.total_estimated_hours}")
for task in plan.tasks:
    print(f"\nTask: {task.title} [{task.priority.value}]")
    for subtask in task.subtasks:
        print(f"  - {subtask.title} ({subtask.estimated_hours}h)")
```
## Retry Logic with Validation

````python
import instructor
from openai import OpenAI
from pydantic import BaseModel, Field, field_validator
from tenacity import retry, stop_after_attempt

client = instructor.from_openai(OpenAI())

class CodeReview(BaseModel):
    """Structured code review result."""
    summary: str = Field(min_length=10, max_length=500)
    issues: list[str] = Field(min_length=1)
    score: int = Field(ge=1, le=10)
    suggested_improvements: list[str]

    @field_validator("issues")
    @classmethod
    def validate_issues(cls, v):
        if not v:
            raise ValueError("At least one issue must be identified")
        return v

    @field_validator("score")
    @classmethod
    def validate_score(cls, v):
        if v < 1 or v > 10:
            raise ValueError("Score must be between 1 and 10")
        return v

@retry(stop=stop_after_attempt(3))
def review_code(code: str) -> CodeReview:
    """Review code with automatic retries on validation failure."""
    return client.chat.completions.create(
        model="gpt-4o",
        response_model=CodeReview,
        max_retries=3,  # Instructor re-asks the model with the validation error
        messages=[
            {
                "role": "system",
                "content": "You are a senior code reviewer. Provide detailed, constructive feedback."
            },
            {
                "role": "user",
                "content": f"Review this code:\n\n```python\n{code}\n```"
            }
        ]
    )

# Usage
code = """
def get_user(id):
    query = f"SELECT * FROM users WHERE id = {id}"
    return db.execute(query)
"""

review = review_code(code)
print(f"Score: {review.score}/10")
print(f"Summary: {review.summary}")
print(f"Issues: {review.issues}")
```` 
## Custom Validation Strategies

### Regex-Based Extraction

```python
import re
from dataclasses import dataclass

@dataclass
class ExtractedData:
    """Data extracted using regex patterns."""
    emails: list[str]
    phone_numbers: list[str]
    urls: list[str]
    dates: list[str]

class RegexExtractor:
    """Extract structured data using regex patterns."""

    PATTERNS = {
        "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        "phone": r'\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
        "url": r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*)',
        "date": r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
    }

    def extract(self, text: str) -> ExtractedData:
        """Extract all structured data from text."""
        return ExtractedData(
            emails=re.findall(self.PATTERNS["email"], text),
            phone_numbers=re.findall(self.PATTERNS["phone"], text),
            urls=re.findall(self.PATTERNS["url"], text),
            dates=re.findall(self.PATTERNS["date"], text, re.IGNORECASE)
        )

    def extract_pattern(self, text: str, pattern_name: str) -> list[str]:
        """Extract a specific pattern from text."""
        pattern = self.PATTERNS.get(pattern_name)
        if not pattern:
            raise ValueError(f"Unknown pattern: {pattern_name}")
        return re.findall(pattern, text)

# Usage
extractor = RegexExtractor()

llm_output = """
Contact us at [email protected] or [email protected].
Call 555-123-4567 or +1 (800) 555-0199.
Visit https://www.example.com for more info.
Meeting scheduled for 12/25/2024.
"""

data = extractor.extract(llm_output)
print(f"Emails: {data.emails}")
print(f"Phones: {data.phone_numbers}")
print(f"URLs: {data.urls}")
print(f"Dates: {data.dates}")
```
### JSON Extraction and Validation

````python
import json
import re
from typing import Optional, Type, TypeVar
from pydantic import BaseModel, ValidationError

T = TypeVar("T", bound=BaseModel)

class JSONExtractor:
    """Extract and validate JSON from LLM outputs."""

    @staticmethod
    def extract_json(text: str) -> Optional[dict]:
        """Extract JSON from text, handling various formats."""
        # Try direct parse first
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        # Try to find JSON in code blocks
        code_block = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
        if code_block:
            try:
                return json.loads(code_block.group(1).strip())
            except json.JSONDecodeError:
                pass

        # Try to find a raw JSON object/array
        json_match = re.search(r'(\{[\s\S]*\}|\[[\s\S]*\])', text)
        if json_match:
            try:
                return json.loads(json_match.group(1))
            except json.JSONDecodeError:
                pass

        return None

    @classmethod
    def extract_and_validate(
        cls,
        text: str,
        model: Type[T]
    ) -> tuple[Optional[T], list[str]]:
        """Extract JSON and validate it against a Pydantic model."""
        errors = []
        data = cls.extract_json(text)
        if data is None:
            errors.append("Could not extract JSON from response")
            return None, errors
        try:
            validated = model.model_validate(data)
            return validated, []
        except ValidationError as e:
            for error in e.errors():
                field = ".".join(str(loc) for loc in error["loc"])
                errors.append(f"{field}: {error['msg']}")
            return None, errors

# Usage
class ProductInfo(BaseModel):
    name: str
    price: float
    in_stock: bool
    categories: list[str]

llm_output = """
Here's the product information:
```json
{
    "name": "Wireless Headphones",
    "price": 99.99,
    "in_stock": true,
    "categories": ["electronics", "audio"]
}
```
"""

product, errors = JSONExtractor.extract_and_validate(llm_output, ProductInfo)
if product:
    print(f"Product: {product.name} (${product.price})")
else:
    print(f"Validation errors: {errors}")
```` 
### Multi-Step Validation Pipeline
```python
from dataclasses import dataclass
from typing import Callable, Any

@dataclass
class ValidationResult:
    """Result of a validation step."""
    valid: bool
    data: Any
    errors: list[str]

class ValidationPipeline:
    """Multi-step validation pipeline for LLM outputs."""

    def __init__(self):
        self.steps: list[tuple[str, Callable]] = []

    def add_step(
        self,
        name: str,
        validator: Callable[[Any], ValidationResult]
    ) -> "ValidationPipeline":
        """Add a validation step."""
        self.steps.append((name, validator))
        return self

    def validate(self, data: Any) -> ValidationResult:
        """Run all validation steps, passing transformed data along."""
        current_data = data
        all_errors = []
        for name, validator in self.steps:
            result = validator(current_data)
            if not result.valid:
                all_errors.extend([f"[{name}] {e}" for e in result.errors])
                return ValidationResult(
                    valid=False,
                    data=current_data,
                    errors=all_errors
                )
            current_data = result.data
        return ValidationResult(
            valid=True,
            data=current_data,
            errors=[]
        )

# Validation functions
def validate_not_empty(data: str) -> ValidationResult:
    if not data or not data.strip():
        return ValidationResult(False, data, ["Response is empty"])
    return ValidationResult(True, data.strip(), [])

def validate_json_format(data: str) -> ValidationResult:
    extracted = JSONExtractor.extract_json(data)
    if extracted is None:
        return ValidationResult(False, data, ["Invalid JSON format"])
    return ValidationResult(True, extracted, [])

def validate_required_fields(required: list[str]):
    def validator(data: dict) -> ValidationResult:
        missing = [f for f in required if f not in data]
        if missing:
            return ValidationResult(
                False, data, [f"Missing required field: {f}" for f in missing]
            )
        return ValidationResult(True, data, [])
    return validator

def validate_field_types(type_map: dict):
    def validator(data: dict) -> ValidationResult:
        errors = []
        for field, expected_type in type_map.items():
            if field in data and not isinstance(data[field], expected_type):
                # expected_type may be a single type or a tuple of types
                type_name = (
                    expected_type.__name__
                    if isinstance(expected_type, type)
                    else " or ".join(t.__name__ for t in expected_type)
                )
                errors.append(f"Field '{field}' should be {type_name}")
        if errors:
            return ValidationResult(False, data, errors)
        return ValidationResult(True, data, [])
    return validator

# Usage
pipeline = ValidationPipeline()
pipeline.add_step("not_empty", validate_not_empty)
pipeline.add_step("json_format", validate_json_format)
pipeline.add_step("required_fields", validate_required_fields(["name", "value"]))
pipeline.add_step("field_types", validate_field_types({"name": str, "value": (int, float)}))

# Test with LLM output
llm_output = '{"name": "temperature", "value": 72.5}'
result = pipeline.validate(llm_output)

if result.valid:
    print(f"Valid data: {result.data}")
else:
    print(f"Validation errors: {result.errors}")
```
### LLM-Based Validation

Use an LLM to validate another LLM's output.

```python
from openai import OpenAI
import json

class LLMValidator:
    """Use an LLM to validate outputs."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.client = OpenAI()
        self.model = model

    def validate_factual(
        self,
        claim: str,
        context: str = ""
    ) -> dict:
        """Check if a claim is factually accurate."""
        prompt = f"""Evaluate the factual accuracy of this claim:

Claim: {claim}
{f"Context: {context}" if context else ""}

Respond with JSON:
{{
    "is_accurate": true/false,
    "confidence": 0.0-1.0,
    "reasoning": "explanation",
    "corrections": ["list of corrections if inaccurate"]
}}"""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    def validate_consistency(
        self,
        statements: list[str]
    ) -> dict:
        """Check if statements are consistent with each other."""
        prompt = f"""Check these statements for logical consistency:

Statements:
{chr(10).join(f"{i+1}. {s}" for i, s in enumerate(statements))}

Respond with JSON:
{{
    "is_consistent": true/false,
    "contradictions": [
        {{"statement_1": index, "statement_2": index, "explanation": "..."}}
    ],
    "overall_assessment": "summary"
}}"""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    def validate_format(
        self,
        output: str,
        expected_format: str
    ) -> dict:
        """Validate that output matches the expected format."""
        prompt = f"""Check if this output matches the expected format:

Output:
{output}

Expected format:
{expected_format}

Respond with JSON:
{{
    "matches_format": true/false,
    "issues": ["list of format issues"],
    "suggested_fix": "corrected version if needed"
}}"""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    def validate_safety(
        self,
        content: str
    ) -> dict:
        """Check content for safety issues."""
        prompt = f"""Analyze this content for safety issues:

Content:
{content}

Check for:
1. Harmful or dangerous instructions
2. Personal information exposure
3. Inappropriate content
4. Potential misuse

Respond with JSON:
{{
    "is_safe": true/false,
    "issues": [
        {{"type": "category", "severity": "low/medium/high", "description": "..."}}
    ],
    "recommendation": "action to take"
}}"""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

# Usage
validator = LLMValidator()

# Check factual accuracy
result = validator.validate_factual(
    "Python was created by Guido van Rossum in 1989."
)
print(f"Accurate: {result['is_accurate']}")
print(f"Confidence: {result['confidence']}")

# Check consistency
statements = [
    "The meeting is at 3 PM.",
    "Everyone should arrive by 2:30 PM.",
    "The meeting was rescheduled to 4 PM."
]
result = validator.validate_consistency(statements)
print(f"Consistent: {result['is_consistent']}")
```
### Fallback Parsing Strategies

````python
import json
import re
from typing import Any, Optional, Callable
from dataclasses import dataclass

@dataclass
class ParseResult:
    """Result of a parsing attempt."""
    success: bool
    data: Any
    method: str
    error: Optional[str] = None

class FallbackParser:
    """Try multiple parsing strategies with fallbacks."""

    def __init__(self):
        self.parsers: list[tuple[str, Callable]] = []

    def add_parser(
        self,
        name: str,
        parser: Callable[[str], Any]
    ) -> "FallbackParser":
        """Add a parser to the fallback chain."""
        self.parsers.append((name, parser))
        return self

    def parse(self, text: str) -> ParseResult:
        """Try parsers in order until one succeeds."""
        for name, parser in self.parsers:
            try:
                result = parser(text)
                return ParseResult(
                    success=True,
                    data=result,
                    method=name
                )
            except Exception:
                continue
        return ParseResult(
            success=False,
            data=None,
            method="none",
            error="All parsers failed"
        )

# Parser functions
def parse_direct_json(text: str) -> dict:
    return json.loads(text)

def parse_code_block_json(text: str) -> dict:
    match = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
    if match:
        return json.loads(match.group(1))
    raise ValueError("No code block found")

def parse_embedded_json(text: str) -> dict:
    match = re.search(r'(\{[\s\S]*\})', text)
    if match:
        return json.loads(match.group(1))
    raise ValueError("No JSON object found")

def parse_key_value(text: str) -> dict:
    """Parse key: value format."""
    result = {}
    for line in text.split("\n"):
        match = re.match(r'^\s*["\']?(\w+)["\']?\s*[:=]\s*(.+)$', line)
        if match:
            key = match.group(1)
            value = match.group(2).strip().strip('"\'')
            # Try to convert to an appropriate type
            try:
                value = json.loads(value)
            except json.JSONDecodeError:
                pass
            result[key] = value
    if not result:
        raise ValueError("No key-value pairs found")
    return result

def parse_with_llm(client, model: str = "gpt-4o-mini"):
    """Create an LLM-based parser as a last resort."""
    def parser(text: str) -> dict:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "Extract structured data as JSON from the given text."
                },
                {"role": "user", "content": text}
            ],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)
    return parser

# Usage
from openai import OpenAI

client = OpenAI()

parser = FallbackParser()
parser.add_parser("direct_json", parse_direct_json)
parser.add_parser("code_block", parse_code_block_json)
parser.add_parser("embedded", parse_embedded_json)
parser.add_parser("key_value", parse_key_value)
parser.add_parser("llm", parse_with_llm(client))

# Test with various formats
outputs = [
    '{"name": "test", "value": 42}',
    'Here is the data:\n```json\n{"name": "test"}\n```',
    'name: test\nvalue: 42',
    'The result is name=test and value=42',
]

for output in outputs:
    result = parser.parse(output)
    print(f"Method: {result.method}, Data: {result.data}")
```` 
## Validation Best Practices

- Always validate LLM outputs before using them
- Use Pydantic for type-safe structured extraction
- Implement fallback strategies for robustness
- Consider LLM-based validation for complex checks
- Log validation failures for debugging and improvement (see the sketch below)
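As a concrete starting point for the last practice, a minimal logging wrapper built on the pipeline above (the logger name and log fields are illustrative):

```python
import logging

logger = logging.getLogger("llm.validation")

def validate_and_log(raw_output: str) -> ValidationResult:
    """Run the pipeline and record failures for later analysis."""
    result = pipeline.validate(raw_output)
    if not result.valid:
        # Log the errors plus a truncated sample of the offending output
        logger.warning(
            "validation failed: errors=%s sample=%r",
            result.errors,
            raw_output[:200],
        )
    return result
```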
## Practice Exercise

Build a validation system that:

- Extracts structured data from free-form LLM responses
- Validates against Pydantic schemas with custom validators
- Implements multiple fallback parsing strategies
- Uses LLM-based validation for complex checks
- Provides detailed error messages for failures
- Handles edge cases and malformed outputs
- Optimizes performance for high-volume validation
- Logs validation failures comprehensively
- Applies automatic retry and correction strategies