import numpy as np


# Each model votes
def ensemble_vote(models_correct):
    """Return True if a strict majority of the models are correct.

    Args:
        models_correct: Sequence (or NumPy array) of booleans, one entry per
            model, where a truthy value means that model predicted correctly.

    Returns:
        A truthy value when more than half of the entries are truthy.
        A tie (only possible with an even number of models) and an empty
        input both count as "no majority".
    """
    return sum(models_correct) > len(models_correct) / 2


# Simulate 10,000 ensemble predictions
np.random.seed(42)
accuracy = 0.7
n_models = 5
n_simulations = 10000

correct = 0
for _ in range(n_simulations):
    # Each model is independently right 70% of the time
    votes = np.random.random(n_models) < accuracy
    if ensemble_vote(votes):
        correct += 1

print(f"Single model accuracy: {accuracy:.1%}")
print(f"Ensemble accuracy: {correct/n_simulations:.1%}")
# Output: ensemble accuracy of roughly 84% (theoretical value: 83.7%)
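The Math: For majority voting with five independent models that are each 70% accurate, the ensemble is correct whenever at least 3 of the 5 models are correct:

$$P(\text{majority correct}) = \sum_{k=3}^{5} \binom{5}{k} (0.7)^{k} (0.3)^{5-k} \approx 83.7\%$$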
Random Forest = Bagging + Random Feature Selection

At each split, only consider a random subset of features (typically sqrt(n_features) for classification, n_features/3 for regression). This makes trees more diverse, improving ensemble performance. Here’s the key insight: if one feature is extremely predictive (like “credit score” for loan approval), regular bagging would make every tree split on that feature first, and all trees would look nearly identical. By forcing each tree to sometimes “ignore” the best feature, we create a diverse committee where each tree brings a different perspective. Diversity is the secret sauce of ensembles — correlated models don’t add value, but diverse models cancel each other’s errors.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.2, random_state=42
)

# Train Random Forest
rf = RandomForestClassifier(
    # 100 trees -- more trees = better (diminishing returns after ~200)
    n_estimators=100,
    # Limit tree depth -- prevents individual trees from memorizing data
    max_depth=10,
    # Only consider sqrt(n) features per split -- the key to diversity
    max_features='sqrt',
    # For reproducibility -- remove in production
    random_state=42,
)

# Practical tip: Random Forest is remarkably forgiving of hyperparameters.
# n_estimators=100 and defaults for everything else is a solid starting point.
# Unlike gradient boosting, more trees in a Random Forest NEVER hurts
# (accuracy plateaus but doesn't decrease) -- it only costs compute time.
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
print("Random Forest Performance:")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))

# Compare to a single tree with the same depth limit and seed
tree = DecisionTreeClassifier(max_depth=10, random_state=42)
tree.fit(X_train, y_train)

print(f"\nSingle Tree Accuracy: {tree.score(X_test, y_test):.2%}")
print(f"Random Forest Accuracy: {rf.score(X_test, y_test):.2%}")
Random Forests tell you which features matter most:
import matplotlib.pyplot as plt

# Get feature importances from the fitted forest, largest first
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1][:10]  # Top 10
# Size the plot from the indices actually selected, so the bar and tick
# counts always match even when the model has fewer than 10 features.
top_n = len(indices)

# Plot
plt.figure(figsize=(10, 6))
plt.bar(range(top_n), importances[indices])
plt.xticks(
    range(top_n),
    [cancer.feature_names[i] for i in indices],
    rotation=45,
    ha='right',
)
plt.title(f'Top {top_n} Feature Importances')
plt.tight_layout()
plt.show()
Instead of reweighting samples, fit each tree to the residual errors — the gap between what we predicted and what actually happened:

$$\text{New Model} = \text{Previous Model} + \text{Learning Rate} \times \text{Tree that predicts errors}$$

Think of it like a team of editors reviewing a document. The first editor makes a rough draft. The second editor only focuses on fixing the first editor’s mistakes. The third editor fixes what the second one missed. Each editor makes the document incrementally better, and the learning rate controls how much you trust each editor’s corrections (a small learning rate means “make cautious edits,” which usually works better).
Regularization: Built-in L1/L2 regularization prevents overfitting without you having to think about it
Parallel training: Uses all CPU cores — tree building is parallelized at the split-finding level
Missing values: Handles them automatically by learning the optimal default direction at each split
Optimized: Carefully engineered for speed with histogram-based splitting and cache-aware computation
When to reach for XGBoost vs Random Forest: XGBoost typically achieves 1-3% higher accuracy on tabular data but requires more careful tuning (learning_rate, max_depth, n_estimators interact heavily). Random Forest is more “set and forget.” If you have time to tune, use XGBoost. If you need a quick, reliable baseline, use Random Forest.
Train a meta-model on the predictions of base models. The idea: each base model has different strengths and weaknesses. A logistic regression meta-learner can figure out “trust the Random Forest on samples like these, but trust the SVM on samples like those.”
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Define base models -- choose models with DIFFERENT inductive biases.
# If all base models make the same mistakes, stacking won't help.
# NOTE(review): SVC is generally sensitive to feature scale; consider
# standardizing its inputs if the features are on very different scales.
base_models = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=42)),      # Tree-based
    ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),  # Sequential boosting
    ('svc', SVC(probability=True, random_state=42)),                       # Margin-based
]

# The meta-model learns: "given these 3 predictions, what's the best final answer?"
# Use a simple model here to avoid overfitting the stacking layer.
meta_model = LogisticRegression()

# cv=5 means base model predictions are generated via cross-validation
# to prevent data leakage -- the meta-model never sees predictions
# that base models made on their own training data.
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
)
stacking.fit(X_train, y_train)

print(f"Stacking Accuracy: {stacking.score(X_test, y_test):.2%}")

# Note: Stacking adds complexity and training time. Use it when
# you need that last 0.5-1% improvement (e.g., Kaggle competitions),
# not as your first approach.