Your model achieves 99% accuracy in development. You celebrate. You deploy it. It performs no better than random guessing. What went wrong? Data leakage.
A hospital builds a model to predict pneumonia from chest X-rays:
Here is a simplified reproduction of the problem:
# Simplified example of what went wrong
import pandas as pd
import numpy as np

# Seed the RNG so the demonstration is reproducible run-to-run
# (the original snippet produced different numbers on every execution).
np.random.seed(0)

# Synthetic hospital data: 1000 patients, ~30% imaged with a portable machine.
data = pd.DataFrame({
    'patient_id': range(1000),
    'has_portable_xray': np.random.choice([0, 1], 1000, p=[0.7, 0.3]),
    'pneumonia': np.zeros(1000),
})

# The leak: sick patients often get portable X-rays
# (they're too sick to go to the X-ray room), so the equipment type is
# correlated with the label for reasons unrelated to lung pathology.
sick_mask = data['has_portable_xray'] == 1
data.loc[sick_mask, 'pneumonia'] = np.random.choice(
    [0, 1], sick_mask.sum(), p=[0.2, 0.8]
)

# Model learns: portable X-ray -> pneumonia
# But this is correlation, not causation!
correlation = data['has_portable_xray'].corr(data['pneumonia'])
print(f"Correlation: {correlation:.3f}")  # Very high!
The model learned that the portable X-ray equipment visible in the image predicts pneumonia — not the lung patterns!
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split  # was missing: used below

# Synthetic data so the snippet is runnable stand-alone
# (the original referenced X and y without ever defining them):
# 200 samples, 20 features, binary target.
rng = np.random.default_rng(42)
X = rng.normal(size=(200, 20))
y = rng.integers(0, 2, size=200)

# ❌ WRONG: Select features using all data
selector = SelectKBest(f_classif, k=5)
X_selected = selector.fit_transform(X, y)  # Uses test data!
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2)

# ✅ CORRECT: Select features only on training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
selector = SelectKBest(f_classif, k=5)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
import pandas as pd
import numpy as np

# Seed so the simulated random-walk price series is reproducible.
np.random.seed(0)

# Stock data: a 100-day random-walk price series.
dates = pd.date_range('2023-01-01', periods=100, freq='D')
prices = 100 + np.cumsum(np.random.randn(100))

stock_data = pd.DataFrame({
    'date': dates,
    'price': prices,
    # LEAK: future price! shift(-1) puts tomorrow's price on today's row.
    # (The original used np.roll(prices, -1), which wraps the FIRST price
    # onto the LAST row, silently fabricating a bogus value; shift leaves
    # a NaN on the final row instead.)
    'next_day_price': pd.Series(prices).shift(-1),
})

# ❌ WRONG: Random split
# This might put Feb 15 in training and Feb 14 in test!
# Model could learn from future to predict past

# ✅ CORRECT: Temporal split — everything before the cutoff date trains,
# everything at or after it tests, so training never sees the future.
train_cutoff = dates[int(len(dates) * 0.8)]
train_data = stock_data[stock_data['date'] < train_cutoff]
test_data = stock_data[stock_data['date'] >= train_cutoff]

print(f"Training period: {train_data['date'].min()} to {train_data['date'].max()}")
print(f"Testing period: {test_data['date'].min()} to {test_data['date'].max()}")
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt

# Visualize how TimeSeriesSplit keeps every test fold strictly after its
# training fold, so no future information leaks into training.
tscv = TimeSeriesSplit(n_splits=5)
fig, ax = plt.subplots(figsize=(12, 4))

for fold, (train_indices, test_indices) in enumerate(tscv.split(stock_data)):
    # One horizontal line per fold: the training span in blue,
    # followed (never preceded) by the test span in red.
    ax.plot(train_indices, [fold] * len(train_indices), 'b-', linewidth=3)
    ax.plot(test_indices, [fold] * len(test_indices), 'r-', linewidth=3)

ax.set_xlabel('Time')
ax.set_ylabel('CV Fold')
ax.set_title('Time Series Cross-Validation (No Future Leakage)')
plt.show()
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Sanity check: if features that shouldn't matter dominate the importance
# ranking, the model is probably exploiting a leak rather than real signal.
model = RandomForestClassifier()
model.fit(X_train, y_train)

feature_names = [f'feature_{i}' for i in range(X_train.shape[1])]
importances = pd.DataFrame({
    'feature': feature_names,
    'importance': model.feature_importances_,
})
importances = importances.sort_values('importance', ascending=False)

print("Top features (check if these make sense!):")
print(importances.head(10))
from sklearn.model_selection import cross_val_score  # was missing: used below

# Leakage smoke test: a large gap between the cross-validation score and
# the held-out score suggests data leakage (or a train/test distribution
# shift) — an honest model should score similarly on both.
cv_score = cross_val_score(model, X_train, y_train, cv=5).mean()
holdout_score = model.fit(X_train, y_train).score(X_test, y_test)

gap = abs(cv_score - holdout_score)
if gap > 0.1:
    print(f"🚨 Warning: Large gap between CV ({cv_score:.3f}) and holdout ({holdout_score:.3f})")
    print("Possible data leakage or distribution shift!")