
Data Leakage

The Hidden Danger

Your model achieves 99% accuracy in development. You celebrate. You deploy it. It performs no better than random guessing. What went wrong? Data leakage.

What is Data Leakage?

Data leakage occurs when information that would not be available at prediction time makes its way into the model during training, letting it "cheat" on evaluation. There are three common types:
  1. Target leakage: Features contain information about the target
  2. Train-test contamination: Test data influences training
  3. Temporal leakage: Future information used to predict the past
Data leakage is extremely common and often subtle. It’s responsible for many “too good to be true” results.

Real Example: The 99% Accuracy Trap

A hospital builds a model to predict pneumonia from chest X-rays:
# Simplified example of what went wrong
import pandas as pd
import numpy as np

np.random.seed(0)  # make the simulated data reproducible

# Hospital data
data = pd.DataFrame({
    'patient_id': range(1000),
    'has_portable_xray': np.random.choice([0, 1], 1000, p=[0.7, 0.3]),
    'pneumonia': np.zeros(1000, dtype=int)
})

# The leak: sick patients often get portable X-rays
# (they're too sick to go to the X-ray room)
sick_mask = data['has_portable_xray'] == 1
data.loc[sick_mask, 'pneumonia'] = np.random.choice([0, 1], sick_mask.sum(), p=[0.2, 0.8])

# Model learns: portable X-ray → pneumonia
# But this is correlation, not causation!
correlation = data['has_portable_xray'].corr(data['pneumonia'])
print(f"Correlation: {correlation:.3f}")  # Very high!
The model learned that portable X-ray equipment visible in the image predicts pneumonia, not the lung patterns themselves!

Type 1: Target Leakage

Information derived from the target leaks into features:

Example: Credit Card Fraud

# LEAKY DATASET
fraud_data = pd.DataFrame({
    'transaction_amount': [100, 500, 50, 10000],
    'merchant_category': ['grocery', 'electronics', 'gas', 'jewelry'],
    'is_fraud': [0, 0, 0, 1],
    'fraud_investigation_date': [None, None, None, '2024-01-15'],  # LEAK!
    'chargeback_amount': [0, 0, 0, 10000]  # LEAK!
})

# These features only exist BECAUSE of fraud
# They leak the target!
Problem: fraud_investigation_date and chargeback_amount only exist for fraudulent transactions!

How to Fix

# CLEAN DATASET - only use features available at prediction time
clean_fraud_data = pd.DataFrame({
    'transaction_amount': [100, 500, 50, 10000],
    'merchant_category': ['grocery', 'electronics', 'gas', 'jewelry'],
    'time_since_last_transaction': [3600, 86400, 1800, 120],  # seconds
    'distance_from_home': [2, 50, 5, 500],  # miles
    'is_fraud': [0, 0, 0, 1]
})

# Ask: "Would I have this feature at prediction time?"
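
A practical way to enforce that question in feature engineering is to build every feature only from records that occurred strictly before the event being scored. A minimal sketch, with hypothetical column names:
# Hypothetical transaction log (column names are illustrative)
log = pd.DataFrame({
    'card_id': [1, 1, 1, 2, 2],
    'timestamp': pd.to_datetime(['2024-01-01 09:00', '2024-01-01 10:00',
                                 '2024-01-01 12:00', '2024-01-02 08:00',
                                 '2024-01-02 09:30']),
    'amount': [100, 500, 50, 10000, 20]
}).sort_values(['card_id', 'timestamp'])

# Point-in-time feature: mean of the card's PREVIOUS amounts.
# shift(1) excludes the current row, so the feature never sees
# the transaction it is trying to score.
log['prev_avg_amount'] = (
    log.groupby('card_id')['amount']
       .transform(lambda s: s.shift(1).expanding().mean())
)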

Type 2: Train-Test Contamination

Test data influences the training process:

Example: Data Preprocessing

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np

np.random.seed(42)
X = np.random.randn(1000, 10)
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# ❌ WRONG: Fit scaler on all data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Test data statistics leak in!
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
leaky_accuracy = model.score(X_test, y_test)

# ✅ CORRECT: Fit scaler only on training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # same split for a fair comparison

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit only on train
X_test_scaled = scaler.transform(X_test)  # Transform (don't fit) test

model = LogisticRegression()
model.fit(X_train_scaled, y_train)
clean_accuracy = model.score(X_test_scaled, y_test)

print(f"Leaky accuracy: {leaky_accuracy:.4f}")
print(f"Clean accuracy: {clean_accuracy:.4f}")

Example: Feature Selection

from sklearn.feature_selection import SelectKBest, f_classif

# ❌ WRONG: Select features using all data
selector = SelectKBest(f_classif, k=5)
X_selected = selector.fit_transform(X, y)  # Uses test data!
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2)

# ✅ CORRECT: Select features only on training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

selector = SelectKBest(f_classif, k=5)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
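
The same discipline applies to imputation: learn the fill-in values from the training split only. A minimal sketch, simulating a few missing values in the synthetic X:
from sklearn.impute import SimpleImputer

X_missing = X.copy()
X_missing[::10, 0] = np.nan  # simulate missing values in one column

Xm_train, Xm_test, ym_train, ym_test = train_test_split(X_missing, y, test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy='mean')
Xm_train_imp = imputer.fit_transform(Xm_train)  # means computed from train only
Xm_test_imp = imputer.transform(Xm_test)        # reuse the training means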

Type 3: Temporal Leakage

Using future information to predict the past:

Example: Stock Prediction

import pandas as pd
import numpy as np

# Stock data
dates = pd.date_range('2023-01-01', periods=100, freq='D')
prices = 100 + np.cumsum(np.random.randn(100))

stock_data = pd.DataFrame({
    'date': dates,
    'price': prices,
    'next_day_price': pd.Series(prices).shift(-1)  # LEAK: future price! (last row has no label, so it's NaN)
})

# ❌ WRONG: Random split
# This might put Feb 15 in training and Feb 14 in test!
# Model could learn from future to predict past

# ✅ CORRECT: Temporal split
train_cutoff = dates[int(len(dates) * 0.8)]
train_data = stock_data[stock_data['date'] < train_cutoff]
test_data = stock_data[stock_data['date'] >= train_cutoff]

print(f"Training period: {train_data['date'].min()} to {train_data['date'].max()}")
print(f"Testing period: {test_data['date'].min()} to {test_data['date'].max()}")

Time Series Cross-Validation

from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt

# Correct way to cross-validate time series
tscv = TimeSeriesSplit(n_splits=5)

fig, ax = plt.subplots(figsize=(12, 4))
for i, (train_idx, test_idx) in enumerate(tscv.split(stock_data)):
    ax.plot(train_idx, [i] * len(train_idx), 'b-', linewidth=3)
    ax.plot(test_idx, [i] * len(test_idx), 'r-', linewidth=3)

ax.set_xlabel('Time')
ax.set_ylabel('CV Fold')
ax.set_title('Time Series Cross-Validation (No Future Leakage)')
plt.show()
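
The same splitter plugs straight into cross_val_score, so every fold trains on the past and tests on the future. A sketch using the stock data above (the price-only feature is deliberately naive):
from sklearn.model_selection import cross_val_score

# Predict whether tomorrow's price rises; drop the last row (no label)
features = stock_data[['price']].iloc[:-1]
target = (stock_data['next_day_price'] > stock_data['price']).iloc[:-1].astype(int)

scores = cross_val_score(LogisticRegression(), features, target, cv=tscv)
print(f"Per-fold accuracy: {scores}")  # expect roughly chance on a random walk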

The Pipeline Solution

Use scikit-learn Pipelines to prevent contamination:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Create a pipeline - preprocessing happens inside CV
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(k=5)),
    ('classifier', LogisticRegression())
])

# Cross-validation now correctly separates train/test
scores = cross_val_score(pipeline, X, y, cv=5)
print(f"CV Scores: {scores}")
print(f"Mean: {scores.mean():.3f} ± {scores.std():.3f}")
Always use Pipelines for preprocessing + modeling. They ensure that transformations are properly fit only on training folds.
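
Pipelines also keep hyperparameter tuning honest: pass the whole pipeline to GridSearchCV and the scaler and selector are refit inside every fold, while the held-out test set never touches the search. A short sketch reusing the pipeline above:
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'selector__k': [3, 5, 8],
    'classifier__C': [0.1, 1.0, 10.0]
}
search = GridSearchCV(pipeline, param_grid, cv=5)
search.fit(X_train, y_train)  # tuning sees only the training data

print(f"Best params: {search.best_params_}")
print(f"Held-out score: {search.score(X_test, y_test):.3f}")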

Common Leakage Sources Checklist

Data Collection Issues

  • Timestamps mixed between train/test
  • Data from same entity in both train and test
  • Features computed from all data (global statistics)

Feature Engineering Issues

  • Features derived from target
  • Future information in features
  • Rolling windows including future data (see the sketch below)
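
The rolling-window item deserves a concrete look. A centered window averages over future rows; a trailing window applied after shift(1) uses only the past:
s = pd.Series(range(10), dtype=float)

# ❌ Centered window: each value averages over FUTURE rows
leaky_feature = s.rolling(window=3, center=True).mean()

# ✅ Trailing window on shifted data: current row excluded, past only
safe_feature = s.shift(1).rolling(window=3).mean()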

Preprocessing Issues

  • Scaling fit on all data
  • Imputation fit on all data
  • Feature selection using all data
  • PCA/dimensionality reduction on all data

Validation Issues

  • Random split on time series
  • Same group/patient in train and test (see the sketch after this list)
  • Test set seen during hyperparameter tuning
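
For the group/patient issue, scikit-learn's group-aware splitters keep every row from one entity on the same side of the split. A minimal sketch with a hypothetical patient grouping:
from sklearn.model_selection import GroupShuffleSplit

groups = np.repeat(np.arange(5), 2)  # 10 rows from 5 hypothetical patients

splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X[:10], y[:10], groups=groups))

# No patient appears on both sides of the split
assert set(groups[train_idx]).isdisjoint(groups[test_idx])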

How to Detect Leakage

1. Suspiciously Good Results

# If your model seems too good to be true, check for leakage
# ('accuracy' below stands for whatever held-out metric you just computed)
if accuracy > 0.98:
    print("🚨 Warning: Accuracy suspiciously high!")
    print("Check for data leakage!")

2. Feature Importance Analysis

from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# If irrelevant features are most important, you have leakage
model = RandomForestClassifier()
model.fit(X_train, y_train)

importances = pd.DataFrame({
    'feature': [f'feature_{i}' for i in range(X_train.shape[1])],
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top features (check if these make sense!):")
print(importances.head(10))

3. Validation Gap

# Large gap between CV and holdout = potential leakage
cv_score = cross_val_score(model, X_train, y_train, cv=5).mean()
holdout_score = model.fit(X_train, y_train).score(X_test, y_test)

gap = abs(cv_score - holdout_score)
if gap > 0.1:
    print(f"🚨 Warning: Large gap between CV ({cv_score:.3f}) and holdout ({holdout_score:.3f})")
    print("Possible data leakage or distribution shift!")

Real-World Prevention Strategy

def safe_ml_pipeline(X, y, time_column=None):
    """
    A leakage-safe ML pipeline template.
    """
    # 1. Split FIRST, before any processing
    if time_column:
        # Temporal split
        X_sorted = X.sort_values(time_column)
        split_idx = int(len(X_sorted) * 0.8)
        X_train = X_sorted.iloc[:split_idx]
        X_test = X_sorted.iloc[split_idx:]
        y_train = y.loc[X_train.index]   # .loc: match by index label, not position
        y_test = y.loc[X_test.index]
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
    
    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    
    # 2. Create pipeline (all preprocessing inside)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier())
    ])
    
    # 3. Fit only on training data
    pipeline.fit(X_train, y_train)
    
    # 4. Evaluate on untouched test set
    train_score = pipeline.score(X_train, y_train)
    test_score = pipeline.score(X_test, y_test)
    
    print(f"Train score: {train_score:.3f}")
    print(f"Test score: {test_score:.3f}")
    print(f"Gap: {train_score - test_score:.3f}")
    
    if train_score - test_score > 0.2:
        print("⚠️  Large train-test gap - check for overfitting or leakage")
    
    return pipeline
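
A quick usage sketch with the synthetic X and y from earlier, wrapped in pandas containers so the temporal branch's label-based indexing also works:
safe_model = safe_ml_pipeline(pd.DataFrame(X), pd.Series(y))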

Key Takeaways

  • Split first: always split your data before any preprocessing or analysis
  • Use Pipelines: scikit-learn Pipelines prevent train-test contamination
  • Question features: ask, “Would I have this at prediction time?”
  • Validate results: if it’s too good to be true, it probably is

What’s Next?

Now let’s learn about dimensionality reduction: handling high-dimensional data effectively!

Continue to Dimensionality Reduction

PCA, t-SNE, and handling the curse of dimensionality