Ensemble Methods: Multiple Models Voting

The Wisdom of Crowds

Question: Who’s smarter, one expert or 100 average people? Surprisingly, the crowd often wins!

A Real Experiment

In 1906, statistician Francis Galton visited a county fair. 787 people guessed the weight of an ox:
  • Individual guesses ranged wildly
  • Average of all guesses: 1,197 pounds
  • Actual weight: 1,198 pounds
The crowd was off by 1 pound! This is the core idea behind ensemble learning:
Many weak learners combined can outperform a single strong learner
[Figure: Credit Scoring with Ensemble]

Why Ensembles Work

Imagine 5 decision trees, each 70% accurate:
import numpy as np

# Each model votes
def ensemble_vote(models_correct):
    """
    Returns True if majority of models are correct.
    """
    return sum(models_correct) > len(models_correct) / 2

# Simulate 1000 predictions
np.random.seed(42)
accuracy = 0.7
n_models = 5
n_simulations = 10000

correct = 0
for _ in range(n_simulations):
    # Each model independently right 70% of the time
    votes = np.random.random(n_models) < accuracy
    if ensemble_vote(votes):
        correct += 1

print(f"Single model accuracy: {accuracy:.1%}")
print(f"Ensemble accuracy:     {correct/n_simulations:.1%}")
# Output: ~83%!
The Math: For majority voting with five independent models that are each 70% accurate:

$$P(\text{majority correct}) = \sum_{k=3}^{5} \binom{5}{k} (0.7)^k (0.3)^{5-k} \approx 83.7\%$$
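You can verify the closed form directly with a quick check that uses only Python's standard library:
from math import comb

# Exact probability that at least 3 of 5 independent 70%-accurate models are right
p_majority = sum(comb(5, k) * 0.7**k * 0.3**(5 - k) for k in range(3, 6))
print(f"P(majority correct) = {p_majority:.1%}")  # 83.7%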

Bagging: Bootstrap Aggregating

Idea: Train multiple models on different random samples of data.

How Bagging Works

  1. Create N random samples (with replacement) from training data
  2. Train a model on each sample
  3. Average predictions (regression) or vote (classification)
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from scipy.stats import mode
import numpy as np

# Load data
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

class SimpleBaggingClassifier:
    def __init__(self, base_model, n_estimators=10):
        self.n_estimators = n_estimators
        self.base_model = base_model
        self.models = []
    
    def fit(self, X, y):
        n_samples = len(X)
        self.models = []  # reset so repeated fits don't accumulate models

        for _ in range(self.n_estimators):
            # Bootstrap sample (sample with replacement)
            indices = np.random.choice(n_samples, n_samples, replace=True)
            X_bootstrap = X[indices]
            y_bootstrap = y[indices]
            
            # Train a fresh clone of the base model on the bootstrap sample
            model = clone(self.base_model)
            model.fit(X_bootstrap, y_bootstrap)
            self.models.append(model)
    
    def predict(self, X):
        # Get predictions from all models
        predictions = np.array([model.predict(X) for model in self.models])
        
        # Majority vote across models
        return mode(predictions, axis=0).mode.ravel()

# Test
bagging = SimpleBaggingClassifier(DecisionTreeClassifier(max_depth=5), n_estimators=10)
bagging.fit(X_train, y_train)
predictions = bagging.predict(X_test)
print(f"Bagging Accuracy: {np.mean(predictions == y_test):.2%}")
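For real projects, scikit-learn ships this exact idea as BaggingClassifier; here is a quick equivalent of the class above (using the estimator parameter from recent scikit-learn versions):
from sklearn.ensemble import BaggingClassifier

sk_bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=5),
    n_estimators=10,
    random_state=42
)
sk_bagging.fit(X_train, y_train)
print(f"sklearn Bagging Accuracy: {sk_bagging.score(X_test, y_test):.2%}")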

Random Forest: Bagging + Feature Randomness

Random Forest = Bagging + Random Feature Selection. At each split, the tree considers only a random subset of the features. This makes the individual trees more diverse, which improves the ensemble's performance.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=42
)

# Train Random Forest
rf = RandomForestClassifier(
    n_estimators=100,      # 100 trees
    max_depth=10,          # Limit tree depth
    max_features='sqrt',   # Random subset of features at each split
    random_state=42
)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
print("Random Forest Performance:")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))

# Compare to single tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=10, random_state=42)
tree.fit(X_train, y_train)
print(f"\nSingle Tree Accuracy:    {tree.score(X_test, y_test):.2%}")
print(f"Random Forest Accuracy:  {rf.score(X_test, y_test):.2%}")

Feature Importance

Random Forests tell you which features matter most:
import matplotlib.pyplot as plt
import numpy as np

# Get feature importances
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1][:10]  # Top 10

# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(10), importances[indices])
plt.xticks(range(10), [cancer.feature_names[i] for i in indices], rotation=45, ha='right')
plt.title('Top 10 Feature Importances')
plt.tight_layout()
plt.show()

Boosting: Learning from Mistakes

Key Idea: Train models sequentially, each focusing on what previous models got wrong.

AdaBoost (Adaptive Boosting)

  1. Train a model
  2. Increase weights of misclassified samples
  3. Train next model (focuses on hard examples)
  4. Combine all models with weighted voting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Train AdaBoost
ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),  # "Stumps"
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)
ada.fit(X_train, y_train)

print(f"AdaBoost Accuracy: {ada.score(X_test, y_test):.2%}")
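To see the reweighting in action, here is a minimal from-scratch sketch of discrete AdaBoost, assuming binary labels remapped to ±1 (illustrative only; use sklearn's version in practice):
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Remap 0/1 labels to -1/+1 for the classic AdaBoost formulation
y_train_pm = 2 * y_train - 1
y_test_pm = 2 * y_test - 1

n = len(X_train)
weights = np.full(n, 1 / n)  # start with uniform sample weights
stumps, alphas = [], []

for _ in range(50):
    stump = DecisionTreeClassifier(max_depth=1)
    stump.fit(X_train, y_train_pm, sample_weight=weights)
    pred = stump.predict(X_train)
    err = weights[pred != y_train_pm].sum()            # weighted error rate
    alpha = 0.5 * np.log((1 - err) / max(err, 1e-10))  # this stump's voting weight
    weights *= np.exp(-alpha * y_train_pm * pred)      # up-weight misclassified samples
    weights /= weights.sum()
    stumps.append(stump)
    alphas.append(alpha)

# Weighted vote of all stumps
scores = sum(a * s.predict(X_test) for s, a in zip(stumps, alphas))
print(f"From-scratch AdaBoost accuracy: {np.mean(np.sign(scores) == y_test_pm):.2%}")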

Gradient Boosting

Instead of reweighting samples, fit each tree to the residual errors of the ensemble so far:

$$\text{New Model} = \text{Previous Model} + \text{Learning Rate} \times \text{Tree that predicts the errors}$$
from sklearn.ensemble import GradientBoostingClassifier

# Train Gradient Boosting
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
gb.fit(X_train, y_train)

print(f"Gradient Boosting Accuracy: {gb.score(X_test, y_test):.2%}")
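The residual-fitting loop is easy to write from scratch. Here is a minimal sketch for regression with squared error on synthetic data (it illustrates the update rule, not sklearn's actual implementation):
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Synthetic 1-D regression problem
rng = np.random.default_rng(42)
X_syn = rng.uniform(-3, 3, size=(200, 1))
y_syn = np.sin(X_syn[:, 0]) + rng.normal(0, 0.1, size=200)

learning_rate = 0.1
prediction = np.full_like(y_syn, y_syn.mean())  # start from the mean
trees = []

for _ in range(100):
    residuals = y_syn - prediction      # what the ensemble still gets wrong
    tree = DecisionTreeRegressor(max_depth=2)
    tree.fit(X_syn, residuals)          # fit a small tree to the residuals
    prediction += learning_rate * tree.predict(X_syn)
    trees.append(tree)

print(f"Training MSE after boosting: {np.mean((y_syn - prediction) ** 2):.4f}")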

XGBoost: The Competition Winner

XGBoost (Extreme Gradient Boosting) is often the best choice for tabular data.
# pip install xgboost
from xgboost import XGBClassifier

# Train XGBoost
xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    eval_metric='logloss',
    random_state=42
)
xgb.fit(X_train, y_train)

print(f"XGBoost Accuracy: {xgb.score(X_test, y_test):.2%}")

Why XGBoost Wins

  • Regularization: Built-in L1/L2 regularization
  • Parallel training: Uses all CPU cores
  • Missing values: Handles them automatically
  • Optimized: Carefully engineered for speed
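A short sketch of the first three points in action (reg_alpha, reg_lambda, and n_jobs are standard XGBClassifier parameters; the NaNs are injected purely for illustration):
import numpy as np
from xgboost import XGBClassifier

# Knock out some feature values to show native missing-value handling
X_train_missing = X_train.copy()
X_train_missing[::10, 0] = np.nan

xgb_demo = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=0.1,     # L1 regularization
    reg_lambda=1.0,    # L2 regularization
    n_jobs=-1,         # train on all CPU cores
    eval_metric='logloss',
    random_state=42
)
xgb_demo.fit(X_train_missing, y_train)
print(f"Accuracy with missing values: {xgb_demo.score(X_test, y_test):.2%}")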

Comparison: When to Use What?

from sklearn.ensemble import (
    RandomForestClassifier, 
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score

# Load data
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Compare models
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'Decision Tree': DecisionTreeClassifier(max_depth=5),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'AdaBoost': AdaBoostClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100),
}

print("5-Fold Cross-Validation Scores:\n")
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5)
    print(f"{name:22s}: {scores.mean():.4f} (+/- {scores.std():.4f})")

Bagging vs Boosting

Bagging (Random Forest)

  • Train in parallel
  • Reduce variance (counters overfitting)
  • Works with high-variance models
  • More robust to outliers
  • Harder to overfit

Boosting (XGBoost)

  • Train sequentially
  • Reduce bias (counters underfitting)
  • Learns from mistakes
  • Usually more accurate
  • Can overfit if not tuned

Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Grid search
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Use best model
best_model = grid_search.best_estimator_
print(f"Test score: {best_model.score(X_test, y_test):.4f}")

Voting Classifier: Mix Different Models

Combine different types of models:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Create ensemble of different model types
voting = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=5000)),
        ('rf', RandomForestClassifier(n_estimators=100)),
        ('svc', SVC(probability=True))
    ],
    voting='soft'  # Use probabilities for voting
)

voting.fit(X_train, y_train)
print(f"Voting Ensemble Accuracy: {voting.score(X_test, y_test):.2%}")

Stacking: Models Learn from Models

Train a meta-model on the predictions of base models:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Define base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
    ('svc', SVC(probability=True, random_state=42))
]

# Define meta-model
meta_model = LogisticRegression()

# Create stacking ensemble
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5
)

stacking.fit(X_train, y_train)
print(f"Stacking Accuracy: {stacking.score(X_test, y_test):.2%}")

🚀 Mini Projects

Project 1

Build and tune a Random Forest classifier

Project 2

Gradient Boosting for regression

Project 3

Ensemble comparison on real dataset

Key Takeaways

Crowd Wisdom

Many weak models beat one strong model

Bagging = Parallel

Train on different data samples

Boosting = Sequential

Each model fixes previous mistakes

Random Forest

Best starting point for tabular data

What’s Next?

Now that you understand the main ML algorithms, let’s learn how to properly evaluate and compare models!

Continue to Module 7: Model Evaluation

Learn cross-validation, metrics, and how to avoid common mistakes