
Cross-Validation Strategies
Estimated Time: 2-3 hours
Difficulty: Intermediate
Prerequisites: Model Evaluation chapter
Tools: scikit-learn, numpy

Why One Train-Test Split Isn’t Enough

You built a model. It got 95% accuracy on your test set. Ship it? Not so fast.
That 95% might be luck: your test set may simply have been an “easy” one, and a different split could paint a much less flattering picture.

The Lucky Split Problem

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Generate dataset
X, y = make_classification(n_samples=500, n_features=20, random_state=42)

# Try multiple splits
accuracies = []
for seed in range(50):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    accuracies.append(acc)

print(f"Accuracy range: {min(accuracies):.3f} to {max(accuracies):.3f}")
print(f"Standard deviation: {np.std(accuracies):.3f}")
Output:
Accuracy range: 0.820 to 0.920
Standard deviation: 0.024
That’s a 10 percentage point swing just from changing the random split!

K-Fold Cross-Validation

The gold standard: split the data into K parts, train on K-1 of them, test on the remaining one, and rotate until every part has served as the test set once.
Fold 1: [TEST] [Train] [Train] [Train] [Train]
Fold 2: [Train] [TEST] [Train] [Train] [Train]
Fold 3: [Train] [Train] [TEST] [Train] [Train]
Fold 4: [Train] [Train] [Train] [TEST] [Train]
Fold 5: [Train] [Train] [Train] [Train] [TEST]
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Load data
iris = load_iris()
X, y = iris.data, iris.target

# K-Fold cross-validation
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Different values of K
k_values = [3, 5, 10, 15, 20]
results = {}

for k in k_values:
    cv = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    results[k] = {
        'mean': scores.mean(),
        'std': scores.std(),
        'scores': scores
    }
    print(f"K={k}: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot of scores
boxes = [results[k]['scores'] for k in k_values]
axes[0].boxplot(boxes, labels=k_values)
axes[0].set_xlabel('Number of Folds (K)')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('K-Fold Cross-Validation Scores')
axes[0].axhline(y=np.mean([r['mean'] for r in results.values()]), 
                color='red', linestyle='--', label='Average')
axes[0].legend()

# Mean and std
means = [results[k]['mean'] for k in k_values]
stds = [results[k]['std'] for k in k_values]
axes[1].errorbar(k_values, means, yerr=stds, fmt='o-', capsize=5)
axes[1].set_xlabel('Number of Folds (K)')
axes[1].set_ylabel('Mean Accuracy')
axes[1].set_title('Mean Accuracy with Standard Deviation')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
Choosing K:
  • K=5: Good default, balances bias and variance
  • K=10: More reliable estimate, more compute
  • K=n (LOOCV): Lowest bias, highest variance, very expensive
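For intuition on the trade-off, here is a back-of-envelope sketch (assuming the 150-sample iris set used above): larger K gives larger training sets but requires more model fits.
# Rough fold arithmetic; folds are only approximately equal when 150 % K != 0
n_samples = 150
for k in (3, 5, 10, 15, 20, n_samples):
    print(f"K={k:>3}: ~{n_samples - n_samples // k} training samples per fold, {k} model fits")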

Stratified K-Fold

When classes are imbalanced, regular K-Fold can create folds with unequal class distributions.
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.datasets import make_classification

# Create imbalanced dataset
X, y = make_classification(
    n_samples=1000, 
    n_features=20,
    weights=[0.9, 0.1],  # 90% class 0, 10% class 1
    random_state=42
)

print(f"Class distribution: {np.bincount(y)}")

# Regular K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
print("\nRegular K-Fold class distribution:")
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    train_dist = np.bincount(y[train_idx])
    test_dist = np.bincount(y[test_idx])
    print(f"Fold {fold+1}: Train={train_dist}, Test={test_dist}")

# Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("\nStratified K-Fold class distribution:")
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    train_dist = np.bincount(y[train_idx])
    test_dist = np.bincount(y[test_idx])
    print(f"Fold {fold+1}: Train={train_dist}, Test={test_dist}")
Output:
Class distribution: [900 100]

Regular K-Fold class distribution:
Fold 1: Train=[720  80], Test=[180  20]
Fold 2: Train=[718  82], Test=[182  18]
Fold 3: Train=[722  78], Test=[178  22]
Fold 4: Train=[719  81], Test=[181  19]
Fold 5: Train=[721  79], Test=[179  21]

Stratified K-Fold class distribution:
Fold 1: Train=[720  80], Test=[180  20]
Fold 2: Train=[720  80], Test=[180  20]
Fold 3: Train=[720  80], Test=[180  20]
Fold 4: Train=[720  80], Test=[180  20]
Fold 5: Train=[720  80], Test=[180  20]
Stratified preserves class ratios in every fold!
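To see why this matters for the numbers you report, here is a small sketch (reusing the imbalanced X, y and the kf/skf splitters defined above; LogisticRegression is just an illustrative choice) comparing the two splitters' estimates of minority-class F1:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

clf = LogisticRegression(max_iter=1000)
plain_f1 = cross_val_score(clf, X, y, cv=kf, scoring='f1')   # unstratified folds
strat_f1 = cross_val_score(clf, X, y, cv=skf, scoring='f1')  # stratified folds

print(f"KFold F1:           {plain_f1.mean():.3f} (+/- {plain_f1.std()*2:.3f})")
print(f"StratifiedKFold F1: {strat_f1.mean():.3f} (+/- {strat_f1.std()*2:.3f})")
On this dataset the folds differ only slightly, so expect a modest effect; with rarer classes or smaller datasets the gap between the two estimates grows.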

Leave-One-Out Cross-Validation (LOOCV)

Extreme case: K = n (number of samples). Train on all but one, test on one.
from sklearn.model_selection import LeaveOneOut, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
import time

iris = load_iris()
X, y = iris.data, iris.target

model = KNeighborsClassifier(n_neighbors=3)

# Time comparison
start = time.time()
loo_scores = cross_val_score(model, X, y, cv=LeaveOneOut())
loo_time = time.time() - start

start = time.time()
kf_scores = cross_val_score(model, X, y, cv=KFold(n_splits=5))
kf_time = time.time() - start

print(f"LOOCV: {loo_scores.mean():.3f} (+/- {loo_scores.std()*2:.3f})")
print(f"Time: {loo_time:.2f}s, {len(loo_scores)} iterations")

print(f"\n5-Fold: {kf_scores.mean():.3f} (+/- {kf_scores.std()*2:.3f})")
print(f"Time: {kf_time:.2f}s, {len(kf_scores)} iterations")
LOOCV Pitfalls:
  • Computationally expensive (n train-test cycles)
  • High variance in estimates
  • Use only for very small datasets

Time Series Cross-Validation

Time series data is special: you can’t peek at the future!
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import matplotlib.pyplot as plt

# Generate time series data
np.random.seed(42)
n = 100
time_index = np.arange(n)
X = np.random.randn(n, 5)
y = np.sin(time_index * 0.1) + np.random.randn(n) * 0.1

# Time Series Split
tscv = TimeSeriesSplit(n_splits=5)

fig, ax = plt.subplots(figsize=(12, 6))

for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    train_y = np.ones(len(train_idx)) * fold
    test_y = np.ones(len(test_idx)) * fold
    
    ax.scatter(train_idx, train_y, c='blue', marker='s', s=20, label='Train' if fold == 0 else '')
    ax.scatter(test_idx, test_y, c='red', marker='o', s=20, label='Test' if fold == 0 else '')

ax.set_xlabel('Time Index')
ax.set_ylabel('Fold Number')
ax.set_title('Time Series Cross-Validation')
ax.legend()
ax.set_yticks(range(5))
ax.set_yticklabels([f'Fold {i+1}' for i in range(5)])
plt.show()

# Proper time series CV
from sklearn.linear_model import Ridge

scores = []
for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    model = Ridge()
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
    
    print(f"Train: {train_idx[0]}-{train_idx[-1]}, Test: {test_idx[0]}-{test_idx[-1]}, Score: {scores[-1]:.3f}")

print(f"\nMean score: {np.mean(scores):.3f}")
Why it matters:
  • Regular K-Fold on time series: data leakage! You train on the future to predict the past.
  • TimeSeriesSplit: always predicts the future from the past.
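As a quick sanity check (a sketch of my own, not part of the pipeline above), you can count how often each splitter puts training samples after the start of the test window:
from sklearn.model_selection import KFold, TimeSeriesSplit
import numpy as np

idx = np.arange(100).reshape(-1, 1)  # stand-in for 100 time-ordered samples

for name, cv in [("KFold (shuffled)", KFold(n_splits=5, shuffle=True, random_state=0)),
                 ("TimeSeriesSplit", TimeSeriesSplit(n_splits=5))]:
    # a fold "leaks" if any training index lies beyond the earliest test index
    leaky_folds = sum(train.max() > test.min() for train, test in cv.split(idx))
    print(f"{name}: {leaky_folds}/5 folds train on data from after the test period starts")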

Group K-Fold

When data has groups (e.g., multiple samples from same patient), ensure entire groups stay together.
from sklearn.model_selection import GroupKFold
import numpy as np

# Medical data: multiple readings per patient
np.random.seed(42)
n_patients = 20
readings_per_patient = 5

X = np.random.randn(n_patients * readings_per_patient, 10)
y = np.random.randint(0, 2, n_patients * readings_per_patient)
groups = np.repeat(np.arange(n_patients), readings_per_patient)

print(f"Total samples: {len(X)}")
print(f"Unique patients: {len(np.unique(groups))}")

# Group K-Fold
gkf = GroupKFold(n_splits=5)

print("\nGroup K-Fold splits (patients in each fold):")
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups)):
    train_patients = np.unique(groups[train_idx])
    test_patients = np.unique(groups[test_idx])
    
    # Check for overlap
    overlap = np.intersect1d(train_patients, test_patients)
    
    print(f"Fold {fold+1}: Train patients={len(train_patients)}, Test patients={len(test_patients)}, Overlap={len(overlap)}")
When to use Group K-Fold:
  • Medical: Multiple readings per patient
  • E-commerce: Multiple transactions per user
  • Text: Multiple documents per author
  • Any scenario where samples aren’t independent
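For contrast, a short sketch (reusing X and groups from above) shows what plain shuffled K-Fold would do here: the same patient routinely lands on both sides of the split.
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
print("\nRegular K-Fold on the same data (patients leaking across the split):")
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    overlap = np.intersect1d(np.unique(groups[train_idx]), np.unique(groups[test_idx]))
    print(f"Fold {fold+1}: patients appearing in BOTH train and test = {len(overlap)}")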

Nested Cross-Validation

Use nested CV when you need to tune hyperparameters and still report an honest performance estimate. It runs two loops:
  • Outer loop: Evaluate model
  • Inner loop: Tune hyperparameters
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X, y = data.data, data.target

# WRONG: Use same data for tuning and evaluation
model = SVC()
param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}

# Optimistically biased: the same CV folds both choose and score the hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X, y)
print(f"Best params: {grid_search.best_params_}")
print(f"Optimistic score: {grid_search.best_score_:.3f}")

# RIGHT: Nested cross-validation

# Inner CV for tuning (passed to GridSearchCV)
# Outer CV for evaluation (passed to cross_val_score)
nested_score = cross_val_score(
    GridSearchCV(SVC(), param_grid, cv=5),
    X, y, cv=5, scoring='accuracy'
)

print(f"\nNested CV score: {nested_score.mean():.3f} (+/- {nested_score.std()*2:.3f})")
print("This is the unbiased estimate!")
Output:
Best params: {'C': 10, 'gamma': 'scale'}
Optimistic score: 0.977

Nested CV score: 0.964 (+/- 0.024)
This is the unbiased estimate!
The nested CV score is lower but honest!
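If you also want to see which hyperparameters the inner loop picked in each outer fold, one option (a sketch using cross_validate with return_estimator=True, reusing param_grid, X, y from above) is:
from sklearn.model_selection import cross_validate

nested = cross_validate(
    GridSearchCV(SVC(), param_grid, cv=5),
    X, y, cv=5, return_estimator=True
)
for i, est in enumerate(nested['estimator']):
    # each outer-fold estimator is a fitted GridSearchCV with its own best_params_
    print(f"Outer fold {i+1}: {est.best_params_}, test score = {nested['test_score'][i]:.3f}")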

Repeated Cross-Validation

Run K-Fold multiple times with different shuffles for more stable estimates:
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine

wine = load_wine()
X, y = wine.data, wine.target

model = RandomForestClassifier(n_estimators=100, random_state=42)

# Single 5-fold
single_scores = cross_val_score(model, X, y, cv=5)

# Repeated 5-fold (10 repetitions)
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)
repeated_scores = cross_val_score(model, X, y, cv=rkf)

print(f"Single 5-Fold: {single_scores.mean():.3f} (+/- {single_scores.std()*2:.3f})")
print(f"Repeated 5-Fold (10x): {repeated_scores.mean():.3f} (+/- {repeated_scores.std()*2:.3f})")

# Visualize
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

axes[0].hist(single_scores, bins=5, edgecolor='black', alpha=0.7)
axes[0].axvline(single_scores.mean(), color='red', linestyle='--')
axes[0].set_title('Single 5-Fold (n=5)')
axes[0].set_xlabel('Accuracy')
axes[0].set_ylabel('Count')

axes[1].hist(repeated_scores, bins=20, edgecolor='black', alpha=0.7)
axes[1].axvline(repeated_scores.mean(), color='red', linestyle='--')
axes[1].set_title('Repeated 5-Fold 10x (n=50)')
axes[1].set_xlabel('Accuracy')
axes[1].set_ylabel('Count')

plt.tight_layout()
plt.show()

Choosing the Right Strategy

  • General classification: Stratified K-Fold (K=5 or 10)
  • Regression: K-Fold (K=5 or 10)
  • Time series: TimeSeriesSplit
  • Grouped data: GroupKFold
  • Hyperparameter tuning: Nested CV
  • Very small dataset: LOOCV or Repeated K-Fold
  • Imbalanced classes: Stratified K-Fold
  • Critical applications: Repeated Stratified K-Fold
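One way to encode these rules of thumb in code (a sketch; the helper name and defaults are my own, not part of scikit-learn) is a small function that returns the matching splitter:
from sklearn.model_selection import (
    KFold, StratifiedKFold, GroupKFold, TimeSeriesSplit, RepeatedStratifiedKFold
)

def pick_cv(task="classification", time_series=False, grouped=False,
            critical=False, n_splits=5, random_state=42):
    """Map the rules of thumb above to a scikit-learn splitter."""
    if time_series:
        return TimeSeriesSplit(n_splits=n_splits)
    if grouped:
        return GroupKFold(n_splits=n_splits)
    if task == "classification":
        if critical:
            return RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=10,
                                           random_state=random_state)
        return StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    return KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

print(pick_cv(task="classification", critical=True))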

Summary

Cross-validation transforms unreliable single-split estimates into robust performance measures:
  • K-Fold: Standard approach, every sample tested exactly once
  • Stratified: Maintains class balance
  • Time Series: Respects temporal order
  • Group: Keeps related samples together
  • Nested: Unbiased tuning + evaluation
  • Repeated: Reduces variance in estimates
Rule of Thumb: When in doubt, use Stratified 5-Fold for classification and 5-Fold for regression. Add repetition for critical applications.
# Your go-to template (model is any scikit-learn estimator; X, y are your data)
from sklearn.model_selection import cross_val_score, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print(f"Accuracy: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")