December 2025 Update: Covers OpenAI fine-tuning, LoRA/QLoRA with Hugging Face, when to fine-tune vs. prompt, and cost analysis.

When to Fine-Tune

Fine-tuning is not your first option. Consider this decision tree:
┌─────────────────────────────────────────────────────────────┐
│               Do you need fine-tuning?                       │
└─────────────────────────────────────────────────────────────┘

                              │
                    ┌─────────▼─────────┐
                    │ Try better prompts│
                    │   first?          │
                    └─────────┬─────────┘
                              │ Still not working
                    ┌─────────▼─────────┐
                    │ Try few-shot      │
                    │ examples?         │
                    └─────────┬─────────┘
                              │ Still not working
                    ┌─────────▼─────────┐
                    │ Try RAG for       │
                    │ knowledge?        │
                    └─────────┬─────────┘
                              │ Still not working
                    ┌─────────▼─────────┐
                    │ NOW consider      │
                    │ fine-tuning       │
                    └───────────────────┘

Fine-Tune When

Use Case           Why Fine-Tuning Helps
Specific format    Consistent JSON structure, code style
Domain language    Medical, legal, technical jargon
Brand voice        Consistent tone across all outputs
Latency            Smaller fine-tuned model beats large model
Cost               Reduce tokens by baking knowledge in
Classification     High-accuracy categorization

Don’t Fine-Tune When

Use Case               Better Alternative
Add new knowledge      Use RAG instead
One-off tasks          Better prompts
Rapidly changing info  RAG with fresh data
General improvement    Use a bigger model

OpenAI Fine-Tuning

Preparing Training Data

import json
from pathlib import Path

def create_training_example(system: str, user: str, assistant: str) -> dict:
    """Create a single training example in OpenAI format"""
    return {
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant}
        ]
    }

def prepare_dataset(examples: list[dict], output_path: str):
    """Prepare dataset in JSONL format"""
    with open(output_path, 'w') as f:
        for ex in examples:
            f.write(json.dumps(ex) + '\n')

# Example: Customer support fine-tuning
SYSTEM_PROMPT = "You are a helpful customer support agent for TechCorp."

training_data = [
    create_training_example(
        SYSTEM_PROMPT,
        "How do I reset my password?",
        "I'd be happy to help you reset your password! Here's how:\n\n1. Go to techcorp.com/reset\n2. Enter your email address\n3. Click the link in the email we send you\n4. Create a new password\n\nIf you don't receive the email within 5 minutes, check your spam folder. Need more help?"
    ),
    create_training_example(
        SYSTEM_PROMPT,
        "Your product sucks!",
        "I'm sorry to hear you're frustrated. I'd really like to help make things right. Could you tell me more about what's not working for you? Once I understand the issue, I can either help fix it or connect you with someone who can."
    ),
    # Add 50-100+ examples for good results
]

prepare_dataset(training_data, "training_data.jsonl")

Validating Data

import json
import tiktoken

def validate_training_file(path: str, model: str = "gpt-4o-mini"):
    """Validate training data before upload"""
    enc = tiktoken.encoding_for_model(model)  # match the encoding to the base model you'll fine-tune
    
    errors = []
    warnings = []
    total_tokens = 0
    
    with open(path) as f:
        for i, line in enumerate(f, 1):
            try:
                example = json.loads(line)
            except json.JSONDecodeError:
                errors.append(f"Line {i}: Invalid JSON")
                continue
            
            # Check structure
            if "messages" not in example:
                errors.append(f"Line {i}: Missing 'messages' key")
                continue
            
            messages = example["messages"]
            
            # Check roles
            roles = [m.get("role") for m in messages]
            if "assistant" not in roles:
                errors.append(f"Line {i}: No assistant message")
            if "user" not in roles:
                warnings.append(f"Line {i}: No user message")
            
            # Count tokens
            text = " ".join(m.get("content", "") for m in messages)
            tokens = len(enc.encode(text))
            total_tokens += tokens
            
            if tokens > 16000:
                warnings.append(f"Line {i}: {tokens} tokens (may be truncated)")
    
    # Summary
    with open(path) as f:
        num_examples = sum(1 for _ in f)
    
    print(f"Examples: {num_examples}")
    print(f"Total tokens: {total_tokens:,}")
    print(f"Estimated cost: ${total_tokens * 0.008 / 1000:.2f} (training)")
    print(f"Errors: {len(errors)}")
    print(f"Warnings: {len(warnings)}")
    
    for e in errors[:10]:
        print(f"  ❌ {e}")
    for w in warnings[:10]:
        print(f"  ⚠️ {w}")
    
    return len(errors) == 0

# Validate
validate_training_file("training_data.jsonl")
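
Before uploading, consider holding out a slice of examples as a validation set — the fine-tuning API accepts a separate validation file and reports validation loss during training. A minimal split (the 90/10 ratio and output filenames here are arbitrary choices):

import random

def split_dataset(path: str, val_fraction: float = 0.1):
    """Shuffle and split a JSONL file into train/validation files"""
    with open(path) as f:
        lines = f.readlines()
    random.shuffle(lines)
    n_val = max(1, int(len(lines) * val_fraction))
    with open("val_split.jsonl", "w") as f:
        f.writelines(lines[:n_val])
    with open("train_split.jsonl", "w") as f:
        f.writelines(lines[n_val:])

split_dataset("training_data.jsonl")

Upload the validation file the same way as the training file and pass its ID as validation_file when creating the job below.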

Running Fine-Tuning

from openai import OpenAI

client = OpenAI()

# Step 1: Upload training file
with open("training_data.jsonl", "rb") as f:
    training_file = client.files.create(
        file=f,
        purpose="fine-tune"
    )
print(f"Uploaded file: {training_file.id}")

# Step 2: Create fine-tuning job
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-4o-mini-2024-07-18",  # Base model
    hyperparameters={
        "n_epochs": 3,  # 2-4 is usually good
        "batch_size": "auto",
        "learning_rate_multiplier": "auto"
    },
    suffix="techcorp-support-v1"  # Model name suffix
)
print(f"Job created: {job.id}")

# Step 3: Monitor progress
import time

while True:
    job = client.fine_tuning.jobs.retrieve(job.id)
    print(f"Status: {job.status}")
    
    if job.status in ["succeeded", "failed", "cancelled"]:
        break
    
    time.sleep(60)

# Step 4: Use fine-tuned model
if job.status == "succeeded":
    print(f"Fine-tuned model: {job.fine_tuned_model}")
    
    response = client.chat.completions.create(
        model=job.fine_tuned_model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "I can't log in"}
        ]
    )
    print(response.choices[0].message.content)
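
You can also pull the job's event stream to watch loss and progress, which is handier than the bare status from the polling loop above:

# Inspect recent training events (loss, epoch progress, etc.)
events = client.fine_tuning.jobs.list_events(
    fine_tuning_job_id=job.id,
    limit=10
)
for event in events.data:
    print(f"{event.created_at}: {event.message}")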

Local Fine-Tuning with LoRA

Why LoRA?

LoRA (Low-Rank Adaptation) lets you fine-tune large models on consumer hardware:
Method            VRAM Needed    Training Time    Quality
Full fine-tune    80GB+          Days             Best
LoRA              16-24GB        Hours            Great
QLoRA             8-12GB         Hours            Good
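
As a back-of-envelope check (weights only — activations, gradients, optimizer state, and sequence length all add overhead on top of these figures):

# Rough weight-only memory for a 7B-parameter model
params = 7e9
for method, bits in [
    ("Full fine-tune (fp16)", 16),
    ("LoRA (fp16 base)", 16),
    ("QLoRA (4-bit base)", 4),
]:
    print(f"{method}: ~{params * bits / 8 / 1e9:.1f} GB of weights")

LoRA's savings come from not storing gradients and optimizer state for the frozen base model; QLoRA additionally shrinks the base weights themselves to 4-bit.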

Setup

pip install transformers datasets peft accelerate bitsandbytes trl

QLoRA Fine-Tuning

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# Model configuration
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
OUTPUT_DIR = "./fine-tuned-model"

# QLoRA configuration (4-bit quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank - higher = more capacity, more memory
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Usually <1% of params

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load your dataset
dataset = load_dataset("json", data_files="training_data.jsonl")

def format_prompt(example):
    """Format for Mistral instruction tuning.

    Mistral-7B-Instruct has no dedicated system role, so the system
    prompt is folded into the first user turn. (Llama-2-style <<SYS>>
    tags do not match Mistral's template.)
    """
    messages = example["messages"]
    system = ""
    text = ""
    for msg in messages:
        if msg["role"] == "system":
            system = msg["content"]
        elif msg["role"] == "user":
            user = f"{system}\n\n{msg['content']}" if system else msg["content"]
            text += f"<s>[INST] {user} [/INST]"
            system = ""
        else:
            text += f" {msg['content']}</s>"
    return {"text": text}

dataset = dataset.map(format_prompt)

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    optim="paged_adamw_8bit",
)

# Train
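# Note: in recent trl releases, max_seq_length and dataset_text_field
# are set on an SFTConfig instead of being passed to SFTTrainer directly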
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    args=training_args,
    max_seq_length=2048,
    dataset_text_field="text",
)

trainer.train()

# Save
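# (writes only the LoRA adapter weights — typically tens of MB — not the full base model)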
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Inference with LoRA Model

from peft import PeftModel

# Load base model + LoRA adapter
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)

# Merge for faster inference (optional)
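# (merging into a 4-bit-quantized base can fail on older peft versions;
#  if it does, reload the base model in fp16 and merge there)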
model = model.merge_and_unload()

# Generate
def generate(prompt: str, max_tokens: int = 500) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
        )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
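
Call it with a prompt in the same [INST] template the model was trained on:

# The prompt template must match what the model saw during training
print(generate("<s>[INST] How do I export my data? [/INST]"))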

Fine-Tuning Best Practices

Data Quality > Quantity

# ❌ Bad: Low-quality, inconsistent examples
bad_examples = [
    {"user": "help", "assistant": "ok"},
    {"user": "????", "assistant": "I don't understand"},
]

# ✅ Good: High-quality, consistent format
good_examples = [
    {
        "user": "How do I export my data?",
        "assistant": "I'd be happy to help you export your data! Here's how:\n\n**For CSV export:**\n1. Go to Settings > Data\n2. Click 'Export'\n3. Select 'CSV' format\n4. Choose the date range\n5. Click 'Download'\n\nThe file will be emailed to you within 5 minutes. Let me know if you need help with anything else!"
    }
]
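
These curated pairs drop straight into the helpers from the OpenAI section above:

# Reuse create_training_example / prepare_dataset from earlier
training_data = [
    create_training_example(SYSTEM_PROMPT, ex["user"], ex["assistant"])
    for ex in good_examples
]
prepare_dataset(training_data, "curated_training_data.jsonl")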

Evaluation After Fine-Tuning

def evaluate_fine_tuned_model(
    model_id: str,
    eval_dataset: list[dict],
    baseline_model: str = "gpt-4o-mini"
) -> dict:
    """Compare fine-tuned model to baseline"""
    
    results = {"fine_tuned": [], "baseline": []}
    
    for example in eval_dataset:
        # Fine-tuned response
        ft_response = client.chat.completions.create(
            model=model_id,
            messages=example["messages"][:-1]  # Exclude assistant
        ).choices[0].message.content
        
        # Baseline response
        bl_response = client.chat.completions.create(
            model=baseline_model,
            messages=example["messages"][:-1]
        ).choices[0].message.content
        
        # Score both
        expected = example["messages"][-1]["content"]
        
        ft_score = llm_judge(ft_response, expected)
        bl_score = llm_judge(bl_response, expected)
        
        results["fine_tuned"].append(ft_score)
        results["baseline"].append(bl_score)
    
    return {
        "fine_tuned_avg": sum(results["fine_tuned"]) / len(results["fine_tuned"]),
        "baseline_avg": sum(results["baseline"]) / len(results["baseline"]),
        "improvement": (sum(results["fine_tuned"]) - sum(results["baseline"])) / len(results["fine_tuned"])
    }
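
The loop above assumes an llm_judge scoring helper that isn't defined in this section. A minimal sketch — the judge model, rubric, and 0–1 scale are all arbitrary choices:

def llm_judge(response: str, expected: str) -> float:
    """Score a response against a reference answer on a 0.0-1.0 scale"""
    result = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": (
                "Rate how well the RESPONSE matches the REFERENCE in content, "
                "tone, and format. Reply with a single integer from 0 to 10.\n\n"
                f"REFERENCE:\n{expected}\n\nRESPONSE:\n{response}"
            )
        }]
    )
    try:
        return float(result.choices[0].message.content.strip()) / 10
    except ValueError:
        return 0.0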

Cost Comparison

def calculate_fine_tuning_roi(
    training_tokens: int,
    daily_inference_tokens: int,
    days: int = 30
) -> dict:
    """Calculate if fine-tuning is worth it"""
    
    # OpenAI pricing (as of Dec 2025)
    prices = {
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-4o-mini-ft": {"input": 0.30, "output": 1.20, "training": 3.00},
        "gpt-4o": {"input": 2.50, "output": 10.00},
    }
    
    # Training cost (one-time)
    training_cost = (training_tokens / 1_000_000) * prices["gpt-4o-mini-ft"]["training"]
    
    # Inference cost comparison
    # Assume 50% input, 50% output tokens
    input_tokens = daily_inference_tokens * 0.5
    output_tokens = daily_inference_tokens * 0.5
    
    # Scenario 1: Use base model with long prompts
    base_daily = (
        (input_tokens / 1_000_000) * prices["gpt-4o-mini"]["input"] +
        (output_tokens / 1_000_000) * prices["gpt-4o-mini"]["output"]
    )
    
    # Scenario 2: Fine-tuned with shorter prompts (assume 40% fewer tokens)
    ft_input = input_tokens * 0.6
    ft_output = output_tokens  # Output usually similar
    ft_daily = (
        (ft_input / 1_000_000) * prices["gpt-4o-mini-ft"]["input"] +
        (ft_output / 1_000_000) * prices["gpt-4o-mini-ft"]["output"]
    )
    
    total_base = base_daily * days
    total_ft = training_cost + (ft_daily * days)
    
    return {
        "training_cost": f"${training_cost:.2f}",
        "base_model_monthly": f"${total_base:.2f}",
        "fine_tuned_monthly": f"${total_ft:.2f}",
        "savings": f"${total_base - total_ft:.2f}",
        "break_even_days": training_cost / (base_daily - ft_daily) if base_daily > ft_daily else "Never"
    }

# Example
print(calculate_fine_tuning_roi(
    training_tokens=500_000,      # 500K training tokens
    daily_inference_tokens=1_000_000,  # 1M tokens/day
    days=30
))
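
With the sample inputs above, the numbers work out to a $1.50 training cost, about $11.25/month on the base model, and about $22.20/month fine-tuned (training included) — so break-even here is "Never". The economics favor fine-tuning when it lets a small model replace a larger one (e.g., fine-tuned gpt-4o-mini instead of gpt-4o), or when prompt shrinkage is far greater than the 40% assumed in the function.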

Key Takeaways

Exhaust Alternatives First

Try prompts, few-shot, and RAG before fine-tuning. It’s often unnecessary.

Quality Over Quantity

50 perfect examples beat 500 mediocre ones. Curate carefully.

LoRA for Local

Use QLoRA to fine-tune 7B+ models on consumer GPUs.

Always Evaluate

Compare fine-tuned model to baseline. Measure the improvement.

What’s Next

Evaluation & Testing

Learn how to properly evaluate your fine-tuned models