December 2025 Update: Covers OpenAI fine-tuning, LoRA/QLoRA with Hugging Face, when to fine-tune vs. prompt, and cost analysis.
When to Fine-Tune
Fine-tuning is not your first option. Consider this decision tree:

```
┌─────────────────────────────────────────────────────────────┐
│                   Do you need fine-tuning?                   │
└──────────────────────────────┬──────────────────────────────┘
                               │
                     ┌─────────▼─────────┐
                     │ Try better prompts│
                     │ first?            │
                     └─────────┬─────────┘
                               │ Still not working
                     ┌─────────▼─────────┐
                     │ Try few-shot      │
                     │ examples?         │
                     └─────────┬─────────┘
                               │ Still not working
                     ┌─────────▼─────────┐
                     │ Try RAG for       │
                     │ knowledge?        │
                     └─────────┬─────────┘
                               │ Still not working
                     ┌─────────▼─────────┐
                     │ NOW consider      │
                     │ fine-tuning       │
                     └───────────────────┘
```
Fine-Tune When
| Use Case | Why Fine-Tuning Helps |
|---|---|
| Specific format | Consistent JSON structure, code style (sketch below) |
| Domain language | Medical, legal, technical jargon |
| Brand voice | Consistent tone across all outputs |
| Latency | Smaller fine-tuned model beats large model |
| Cost | Reduce tokens by baking knowledge in |
| Classification | High-accuracy categorization |
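
For example, the "specific format" row: a handful of training pairs that always answer in the same strict shape teach the model the format, so the production prompt no longer has to spell it out. A hypothetical pair (the triage bot and JSON schema are invented for illustration):

```python
# Hypothetical training pair that bakes in a strict JSON reply format
format_example = {
    "messages": [
        {"role": "system", "content": "You are a ticket triage bot. Reply only with JSON."},
        {"role": "user", "content": "The app crashes whenever I upload a photo."},
        {"role": "assistant", "content": '{"category": "bug", "severity": "high", "component": "uploads"}'}
    ]
}
```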
Don’t Fine-Tune When
| Use Case | Better Alternative |
|---|---|
| Add new knowledge | Use RAG instead |
| One-off tasks | Better prompts |
| Rapidly changing info | RAG with fresh data |
| General improvement | Use a bigger model |
OpenAI Fine-Tuning
Preparing Training Data
```python
import json

def create_training_example(system: str, user: str, assistant: str) -> dict:
    """Create a single training example in OpenAI chat format."""
    return {
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant}
        ]
    }

def prepare_dataset(examples: list[dict], output_path: str):
    """Write the dataset to disk in JSONL format (one example per line)."""
    with open(output_path, 'w') as f:
        for ex in examples:
            f.write(json.dumps(ex) + '\n')

# Example: customer support fine-tuning
SYSTEM_PROMPT = "You are a helpful customer support agent for TechCorp."

training_data = [
    create_training_example(
        SYSTEM_PROMPT,
        "How do I reset my password?",
        "I'd be happy to help you reset your password! Here's how:\n\n1. Go to techcorp.com/reset\n2. Enter your email address\n3. Click the link in the email we send you\n4. Create a new password\n\nIf you don't receive the email within 5 minutes, check your spam folder. Need more help?"
    ),
    create_training_example(
        SYSTEM_PROMPT,
        "Your product sucks!",
        "I'm sorry to hear you're frustrated. I'd really like to help make things right. Could you tell me more about what's not working for you? Once I understand the issue, I can either help fix it or connect you with someone who can."
    ),
    # Add 50-100+ examples for good results
]

prepare_dataset(training_data, "training_data.jsonl")
```
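
OpenAI fine-tuning also accepts an optional validation file, which gives you a held-out loss curve during training. A minimal split on top of `prepare_dataset` (the 90/10 ratio is a common default, not a requirement):

```python
import random

# Shuffle, then hold out 10% of examples for validation
random.seed(42)
random.shuffle(training_data)
split = int(len(training_data) * 0.9)

prepare_dataset(training_data[:split], "training_data.jsonl")
prepare_dataset(training_data[split:], "validation_data.jsonl")
# Pass validation_file= alongside training_file= when creating the job
```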
Validating Data
```python
import json
import tiktoken

def validate_training_file(path: str, model: str = "gpt-4o-mini"):
    """Validate training data before upload."""
    enc = tiktoken.encoding_for_model(model)
    errors = []
    warnings = []
    total_tokens = 0
    num_examples = 0

    with open(path) as f:
        for i, line in enumerate(f, 1):
            num_examples = i
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                errors.append(f"Line {i}: Invalid JSON")
                continue

            # Check structure
            if "messages" not in example:
                errors.append(f"Line {i}: Missing 'messages' key")
                continue
            messages = example["messages"]

            # Check roles
            roles = [m.get("role") for m in messages]
            if "assistant" not in roles:
                errors.append(f"Line {i}: No assistant message")
            if "user" not in roles:
                warnings.append(f"Line {i}: No user message")

            # Count tokens
            text = " ".join(m.get("content", "") for m in messages)
            tokens = len(enc.encode(text))
            total_tokens += tokens
            if tokens > 16000:
                warnings.append(f"Line {i}: {tokens} tokens (may be truncated)")

    # Summary
    print(f"Examples: {num_examples}")
    print(f"Total tokens: {total_tokens:,}")
    # gpt-4o-mini training is $3.00 per 1M tokens (see the pricing table below)
    print(f"Estimated training cost: ${total_tokens * 3.00 / 1_000_000:.2f} per epoch")
    print(f"Errors: {len(errors)}")
    print(f"Warnings: {len(warnings)}")
    for e in errors[:10]:
        print(f"  ❌ {e}")
    for w in warnings[:10]:
        print(f"  ⚠️ {w}")
    return len(errors) == 0

# Validate
validate_training_file("training_data.jsonl")
```
Running Fine-Tuning
```python
import time
from openai import OpenAI

client = OpenAI()

# Step 1: Upload training file
with open("training_data.jsonl", "rb") as f:
    training_file = client.files.create(
        file=f,
        purpose="fine-tune"
    )
print(f"Uploaded file: {training_file.id}")

# Step 2: Create fine-tuning job
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-4o-mini-2024-07-18",  # Base model
    hyperparameters={
        "n_epochs": 3,  # 2-4 is usually good
        "batch_size": "auto",
        "learning_rate_multiplier": "auto"
    },
    suffix="techcorp-support-v1"  # Appended to the fine-tuned model's name
)
print(f"Job created: {job.id}")

# Step 3: Monitor progress
while True:
    job = client.fine_tuning.jobs.retrieve(job.id)
    print(f"Status: {job.status}")
    if job.status in ["succeeded", "failed", "cancelled"]:
        break
    time.sleep(60)

# Step 4: Use the fine-tuned model
if job.status == "succeeded":
    print(f"Fine-tuned model: {job.fine_tuned_model}")
    response = client.chat.completions.create(
        model=job.fine_tuned_model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "I can't log in"}
        ]
    )
    print(response.choices[0].message.content)
```
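
If a job fails or quality looks off, the job's event stream contains status messages and per-step training loss. A quick way to inspect it with the same `client`:

```python
# Fetch the most recent events for the fine-tuning job
events = client.fine_tuning.jobs.list_events(
    fine_tuning_job_id=job.id,
    limit=20
)
for event in events.data:
    print(f"{event.created_at}: {event.message}")
```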
Local Fine-Tuning with LoRA
Why LoRA?
LoRA (Low-Rank Adaptation) lets you fine-tune large models on consumer hardware:

| Method | VRAM Needed | Training Time | Quality |
|---|---|---|---|
| Full fine-tune | 80GB+ | Days | Best |
| LoRA | 16-24GB | Hours | Great |
| QLoRA | 8-12GB | Hours | Good |
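
Before renting a GPU, a back-of-envelope VRAM estimate helps pick a row from this table. The sketch below covers model weights only, with a rough overhead factor; real usage also depends on sequence length, batch size, and optimizer state:

```python
def estimate_vram_gb(params_billion: float, bits_per_weight: int) -> float:
    """Very rough VRAM estimate: weight memory plus ~20% overhead for
    activations, adapters, and CUDA context. A heuristic, not a guarantee."""
    weight_gb = params_billion * bits_per_weight / 8  # billions of params -> GB
    return weight_gb * 1.2

print(f"7B at 4-bit (QLoRA): ~{estimate_vram_gb(7, 4):.1f} GB")   # ~4.2 GB
print(f"7B at 16-bit (LoRA): ~{estimate_vram_gb(7, 16):.1f} GB")  # ~16.8 GB
```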
Setup
```bash
pip install transformers datasets peft accelerate bitsandbytes trl
```
QLoRA Fine-Tuning
```python
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# Model configuration
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
OUTPUT_DIR = "./fine-tuned-model"

# QLoRA configuration (4-bit quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank - higher = more capacity, more memory
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Usually <1% of params

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load your dataset
dataset = load_dataset("json", data_files="training_data.jsonl")

def format_prompt(example):
    """Flatten chat messages into Mistral's [INST] format.
    Mistral-Instruct has no system role, so the system prompt
    is folded into the first user turn."""
    messages = example["messages"]
    system = ""
    text = "<s>"
    for msg in messages:
        if msg["role"] == "system":
            system = msg["content"] + "\n\n"
        elif msg["role"] == "user":
            text += f"[INST] {system}{msg['content']} [/INST]"
            system = ""
        else:
            text += f" {msg['content']}</s>"
    return {"text": text}

dataset = dataset.map(format_prompt)

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    optim="paged_adamw_8bit",
)

# Train (this signature matches trl<=0.8; newer releases move
# max_seq_length and dataset_text_field into SFTConfig)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    args=training_args,
    max_seq_length=2048,
    dataset_text_field="text",
)
trainer.train()

# Save (writes only the LoRA adapter weights, not the full model)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
```
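
Because only the adapter is saved, the artifact is small, typically tens of megabytes for r=16 on a 7B model versus ~14 GB for full fp16 weights. A quick check of what was written:

```python
from pathlib import Path

# List the saved adapter files and their sizes
for path in sorted(Path(OUTPUT_DIR).glob("*")):
    print(f"{path.name}: {path.stat().st_size / 1_000_000:.1f} MB")
```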
Inference with LoRA Model
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# MODEL_NAME and OUTPUT_DIR as defined in the training script.
# Merging requires full-precision weights, so the base is loaded
# in fp16 here rather than with the 4-bit bnb_config used for training.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)

# Merge the adapter into the base weights for faster inference (optional)
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

# Generate
def generate(prompt: str, max_tokens: int = 500) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
```
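
Inference prompts should match the training template, otherwise the adapter sees a distribution it never trained on. A usage example with the `[INST]` format from `format_prompt` above (the tokenizer adds the leading `<s>` automatically):

```python
prompt = (
    "[INST] You are a helpful customer support agent for TechCorp.\n\n"
    "I can't log in [/INST]"
)
print(generate(prompt))
```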
Fine-Tuning Best Practices
Data Quality > Quantity
```python
# ❌ Bad: Low-quality, inconsistent examples
bad_examples = [
    {"user": "help", "assistant": "ok"},
    {"user": "????", "assistant": "I don't understand"},
]

# ✅ Good: High-quality, consistent format
good_examples = [
    {
        "user": "How do I export my data?",
        "assistant": "I'd be happy to help you export your data! Here's how:\n\n**For CSV export:**\n1. Go to Settings > Data\n2. Click 'Export'\n3. Select 'CSV' format\n4. Choose the date range\n5. Click 'Download'\n\nThe file will be emailed to you within 5 minutes. Let me know if you need help with anything else!"
    }
]
```
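
A cheap way to enforce a quality bar mechanically is to filter and deduplicate before training. A sketch with arbitrary thresholds, worth tuning for your data:

```python
def filter_examples(examples: list[dict]) -> list[dict]:
    """Drop trivially short replies and exact duplicate prompts."""
    seen_prompts = set()
    kept = []
    for ex in examples:
        prompt = ex["user"].strip().lower()
        if len(ex["assistant"].strip()) < 40:  # terse answers teach terseness
            continue
        if prompt in seen_prompts:  # duplicates add no signal
            continue
        seen_prompts.add(prompt)
        kept.append(ex)
    return kept

print(f"Kept {len(filter_examples(good_examples))} of {len(good_examples)} examples")
```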
Evaluation After Fine-Tuning
```python
# Assumes the OpenAI `client` from earlier and an llm_judge(response, expected) -> float
# scoring helper; a minimal sketch of one follows below.
def evaluate_fine_tuned_model(
    model_id: str,
    eval_dataset: list[dict],
    baseline_model: str = "gpt-4o-mini"
) -> dict:
    """Compare fine-tuned model to baseline."""
    results = {"fine_tuned": [], "baseline": []}
    for example in eval_dataset:
        # Fine-tuned response
        ft_response = client.chat.completions.create(
            model=model_id,
            messages=example["messages"][:-1]  # Exclude the reference assistant reply
        ).choices[0].message.content
        # Baseline response
        bl_response = client.chat.completions.create(
            model=baseline_model,
            messages=example["messages"][:-1]
        ).choices[0].message.content
        # Score both against the reference answer
        expected = example["messages"][-1]["content"]
        results["fine_tuned"].append(llm_judge(ft_response, expected))
        results["baseline"].append(llm_judge(bl_response, expected))
    n = len(eval_dataset)
    return {
        "fine_tuned_avg": sum(results["fine_tuned"]) / n,
        "baseline_avg": sum(results["baseline"]) / n,
        "improvement": (sum(results["fine_tuned"]) - sum(results["baseline"])) / n
    }
```
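
The `llm_judge` helper is assumed above but never defined in this guide. A minimal LLM-as-judge sketch (the judge model, prompt, and 1-5 scale are all assumptions to adapt):

```python
def llm_judge(response: str, expected: str) -> float:
    """Score a response against a reference answer, normalized to 0-1."""
    judge = client.chat.completions.create(
        model="gpt-4o",  # assumed judge model
        messages=[{
            "role": "user",
            "content": (
                "Rate how well the response matches the reference answer in "
                "content and tone, from 1 (poor) to 5 (excellent). "
                "Reply with only the number.\n\n"
                f"Reference:\n{expected}\n\nResponse:\n{response}"
            )
        }],
        temperature=0,
    )
    return int(judge.choices[0].message.content.strip()) / 5
```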
Cost Comparison
```python
def calculate_fine_tuning_roi(
    training_tokens: int,
    daily_inference_tokens: int,
    days: int = 30
) -> dict:
    """Calculate if fine-tuning is worth it."""
    # OpenAI pricing per 1M tokens (as of Dec 2025)
    prices = {
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-4o-mini-ft": {"input": 0.30, "output": 1.20, "training": 3.00},
        "gpt-4o": {"input": 2.50, "output": 10.00},
    }

    # Training cost (one-time)
    training_cost = (training_tokens / 1_000_000) * prices["gpt-4o-mini-ft"]["training"]

    # Inference cost comparison; assume a 50/50 input/output token split
    input_tokens = daily_inference_tokens * 0.5
    output_tokens = daily_inference_tokens * 0.5

    # Scenario 1: Base model with long prompts
    base_daily = (
        (input_tokens / 1_000_000) * prices["gpt-4o-mini"]["input"] +
        (output_tokens / 1_000_000) * prices["gpt-4o-mini"]["output"]
    )

    # Scenario 2: Fine-tuned model with shorter prompts (assume 40% fewer input tokens)
    ft_input = input_tokens * 0.6
    ft_output = output_tokens  # Output length is usually similar
    ft_daily = (
        (ft_input / 1_000_000) * prices["gpt-4o-mini-ft"]["input"] +
        (ft_output / 1_000_000) * prices["gpt-4o-mini-ft"]["output"]
    )

    total_base = base_daily * days
    total_ft = training_cost + (ft_daily * days)

    return {
        "training_cost": f"${training_cost:.2f}",
        "base_model_monthly": f"${total_base:.2f}",
        "fine_tuned_monthly": f"${total_ft:.2f}",
        "savings": f"${total_base - total_ft:.2f}",
        "break_even_days": training_cost / (base_daily - ft_daily) if base_daily > ft_daily else "Never"
    }

# Example
print(calculate_fine_tuning_roi(
    training_tokens=500_000,           # 500K training tokens
    daily_inference_tokens=1_000_000,  # 1M tokens/day
    days=30
))
```
Key Takeaways
Exhaust Alternatives First
Try prompts, few-shot, and RAG before fine-tuning. It’s often unnecessary.
Quality Over Quantity
50 perfect examples beat 500 mediocre ones. Curate carefully.
LoRA for Local
Use QLoRA to fine-tune 7B+ models on consumer GPUs.
Always Evaluate
Compare fine-tuned model to baseline. Measure the improvement.
What’s Next
Evaluation & Testing
Learn how to properly evaluate your fine-tuned models