Dimensionality Reduction
The Curse of Dimensionality
As features increase:
- Data becomes sparse (points end up far apart)
- Models need exponentially more data to cover the feature space
- Distance metrics lose meaning (most points look roughly equidistant)
- Training time explodes
This is the “curse of dimensionality.”
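A quick way to see the distance problem is to measure pairwise distances as the number of dimensions grows: the relative gap between the nearest and farthest point shrinks, so "near" and "far" stop being informative. A minimal sketch (the sample size and dimensionalities below are arbitrary choices for the demo):
import numpy as np

# Illustration: distance concentration as dimensionality grows
rng = np.random.default_rng(42)
for d in [2, 10, 100, 1000]:          # arbitrary dimensionalities for the demo
    points = rng.random((500, d))     # 500 random points in the unit cube
    # Distances from the first point to all the others
    dists = np.linalg.norm(points[1:] - points[0], axis=1)
    spread = (dists.max() - dists.min()) / dists.min()
    print(f"d={d:4d}  relative spread of distances: {spread:.2f}")
The printed spread shrinks as d grows, which is exactly why nearest-neighbor style reasoning degrades in high dimensions.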
Why Reduce Dimensions?
- Visualization: plot 100D data in 2D
- Speed: faster training and inference
- Noise reduction: remove noisy features
- Better models: reduce overfitting
PCA: Principal Component Analysis
PCA finds new axes that capture the most variance.
The Intuition
Imagine data shaped like a cigar:
- First principal component: Along the cigar (most variance)
- Second principal component: Across the cigar (second most)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Create cigar-shaped data
np.random.seed(42)
n_samples = 200
# Original data with correlation
x = np.random.randn(n_samples)
y = 2 * x + np.random.randn(n_samples) * 0.5
X = np.column_stack([x, y])
# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Original data
axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], alpha=0.5)
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].set_title('Original Data')
axes[0].set_aspect('equal')
# Draw principal components
mean = X_scaled.mean(axis=0)
for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
    axes[0].arrow(mean[0], mean[1],
                  comp[0] * np.sqrt(var) * 2,
                  comp[1] * np.sqrt(var) * 2,
                  head_width=0.1, head_length=0.1, fc=f'C{i}', ec=f'C{i}',
                  label=f'PC{i+1}: {pca.explained_variance_ratio_[i]:.1%}')
axes[0].legend()
# Transformed data
axes[1].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.5)
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')
axes[1].set_title('PCA Transformed Data')
axes[1].set_aspect('equal')
plt.tight_layout()
plt.show()
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance captured: {pca.explained_variance_ratio_.sum():.1%}")
Math Connection: PCA uses eigendecomposition of the covariance matrix. See Linear Algebra Course for the full theory.
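For intuition, here is a minimal sketch of that computation with NumPy, reusing X_scaled and pca from the example above: the eigenvectors of the covariance matrix are the principal components and the eigenvalues are the variances along them. The result should match scikit-learn's PCA up to the sign of each component.
# PCA "by hand": eigendecomposition of the covariance matrix of the standardized data
cov = np.cov(X_scaled, rowvar=False)        # 2x2 covariance matrix
eigvals, eigvecs = np.linalg.eigh(cov)      # eigh handles symmetric matrices
order = np.argsort(eigvals)[::-1]           # largest variance first
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

print("Variance along each PC:", eigvals)    # ~ pca.explained_variance_
print("Principal components:\n", eigvecs.T)  # ~ pca.components_ (up to sign)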
Choosing the Number of Components
Method 1: Explained Variance
from sklearn.datasets import load_digits
# Load digit images (64 features)
digits = load_digits()
X = digits.data
y = digits.target
print(f"Original shape: {X.shape}") # (1797, 64)
# Fit PCA with all components
pca_full = PCA()
pca_full.fit(X)
# Plot cumulative explained variance
cumsum = np.cumsum(pca_full.explained_variance_ratio_)
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(cumsum) + 1), cumsum, 'b-o')
plt.axhline(0.95, color='r', linestyle='--', label='95% threshold')
plt.axhline(0.90, color='orange', linestyle='--', label='90% threshold')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.legend()
plt.grid(True)
plt.show()
# Find number of components for 95% variance
n_components_95 = np.argmax(cumsum >= 0.95) + 1
print(f"Components for 95% variance: {n_components_95}")
Method 2: Cross-Validated Model Performance
# Use cross-validation to find optimal components
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
n_components_range = [5, 10, 15, 20, 30, 40, 50, 64]
scores = []
for n_comp in n_components_range:
    pipeline = Pipeline([
        ('pca', PCA(n_components=n_comp)),
        ('classifier', LogisticRegression(max_iter=5000))
    ])
    score = cross_val_score(pipeline, X, y, cv=5).mean()
    scores.append(score)
    print(f"n_components={n_comp:2d}: accuracy={score:.3f}")
plt.figure(figsize=(10, 5))
plt.plot(n_components_range, scores, 'b-o')
plt.xlabel('Number of Components')
plt.ylabel('Cross-Validation Accuracy')
plt.title('Model Performance vs PCA Components')
plt.grid(True)
plt.show()
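If you would rather automate this search, the number of components can be treated as an ordinary hyperparameter of the pipeline; a brief sketch using GridSearchCV (the grid values below are arbitrary):
from sklearn.model_selection import GridSearchCV

# Tune the number of PCA components as a pipeline hyperparameter
pipe = Pipeline([
    ('pca', PCA()),
    ('classifier', LogisticRegression(max_iter=5000))
])
param_grid = {'pca__n_components': [10, 20, 30, 40]}  # arbitrary candidate values
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X, y)
print(f"Best n_components: {search.best_params_['pca__n_components']}")
print(f"Best CV accuracy: {search.best_score_:.3f}")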
Visualizing High-Dimensional Data
Digits in 2D
# Reduce 64D to 2D for visualization
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X)
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap='tab10', alpha=0.6)
plt.colorbar(scatter, label='Digit')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title(f'Digits Dataset in 2D (PCA)\nExplained Variance: {pca_2d.explained_variance_ratio_.sum():.1%}')
plt.show()
t-SNE: For Visualization
t-SNE (t-distributed Stochastic Neighbor Embedding) preserves local structure: points that are neighbors in the original space stay neighbors in the embedding.
from sklearn.manifold import TSNE
# t-SNE is slow on large data, sample if needed
sample_size = min(1000, len(X))
indices = np.random.choice(len(X), sample_size, replace=False)
X_sample = X[indices]
y_sample = y[indices]
# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_sample)
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_sample, cmap='tab10', alpha=0.6)
plt.colorbar(scatter, label='Digit')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('Digits Dataset - t-SNE Visualization')
plt.show()
PCA vs t-SNE
| Aspect | PCA | t-SNE |
|---|---|---|
| Speed | Fast | Slow |
| Purpose | Feature reduction, preprocessing | Visualization only |
| Preserves | Global structure | Local structure |
| New data | Can transform | Must refit |
| Interpretable | Yes (loadings) | No |
t-SNE is for visualization only! Don’t use it as a preprocessing step for models.
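One concrete reason: scikit-learn's TSNE has no transform method for unseen data, so it cannot project new samples the way a fitted PCA can. A small sketch of the contrast:
from sklearn.model_selection import train_test_split

X_tr, X_te = train_test_split(X, test_size=0.2, random_state=42)

# PCA: fit on training data, then project new data with the same mapping
pca_prep = PCA(n_components=2).fit(X_tr)
X_te_pca = pca_prep.transform(X_te)        # works

# t-SNE: only fit_transform exists; there is no .transform for new points
tsne_vis = TSNE(n_components=2, random_state=42)
X_tr_tsne = tsne_vis.fit_transform(X_tr)   # fine for a one-off picture
# tsne_vis.transform(X_te)                 # AttributeError: no transform method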
UMAP: Best of Both Worlds
UMAP is faster than t-SNE and preserves more global structure:
# pip install umap-learn
from umap import UMAP
# Apply UMAP
umap_model = UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
X_umap = umap_model.fit_transform(X)
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_umap[:, 0], X_umap[:, 1], c=y, cmap='tab10', alpha=0.6)
plt.colorbar(scatter, label='Digit')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.title('Digits Dataset - UMAP Visualization')
plt.show()
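Unlike t-SNE, a fitted UMAP model can embed new points with .transform, which makes it usable beyond one-off plots (shown here on a few existing samples standing in for new data):
# Embed "new" points with the already-fitted UMAP model
X_new_umap = umap_model.transform(X[:10])  # first 10 samples as stand-ins for new data
print(X_new_umap.shape)                    # (10, 2)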
PCA for Preprocessing
Speed Up Training
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import time
# Load MNIST (larger dataset)
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X_mnist, y_mnist = mnist.data, mnist.target
# Sample for demo
X_sample, _, y_sample, _ = train_test_split(X_mnist, y_mnist, train_size=10000, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2)
print(f"Original features: {X_train.shape[1]}")
# Without PCA
start = time.time()
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
time_no_pca = time.time() - start
acc_no_pca = rf.score(X_test, y_test)
# With PCA (95% variance)
pca = PCA(n_components=0.95) # Keep 95% variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(f"PCA features: {X_train_pca.shape[1]}")
start = time.time()
rf_pca = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pca.fit(X_train_pca, y_train)
time_pca = time.time() - start
acc_pca = rf_pca.score(X_test_pca, y_test)
print(f"\nWithout PCA: {acc_no_pca:.3f} accuracy, {time_no_pca:.1f}s")
print(f"With PCA: {acc_pca:.3f} accuracy, {time_pca:.1f}s")
print(f"Speedup: {time_no_pca/time_pca:.1f}x")
Noise Reduction
# Add noise to images
np.random.seed(42)
noise = np.random.randn(*X_sample.shape) * 50
X_noisy = X_sample + noise
X_noisy = np.clip(X_noisy, 0, 255) # Keep valid pixel range
# Denoise using PCA
pca_denoise = PCA(n_components=100)
X_denoised = pca_denoise.inverse_transform(pca_denoise.fit_transform(X_noisy))
# Visualize
fig, axes = plt.subplots(3, 5, figsize=(12, 8))
for i in range(5):
    axes[0, i].imshow(X_sample[i].reshape(28, 28), cmap='gray')
    axes[0, i].axis('off')
    if i == 0:
        axes[0, i].set_title('Original')
    axes[1, i].imshow(X_noisy[i].reshape(28, 28), cmap='gray')
    axes[1, i].axis('off')
    if i == 0:
        axes[1, i].set_title('Noisy')
    axes[2, i].imshow(X_denoised[i].reshape(28, 28), cmap='gray')
    axes[2, i].axis('off')
    if i == 0:
        axes[2, i].set_title('Denoised (PCA)')
plt.tight_layout()
plt.show()
Feature Selection vs Feature Extraction
| Aspect | Feature Selection | Feature Extraction (PCA) |
|---|---|---|
| Method | Pick best features | Create new features |
| Interpretability | High (original features) | Lower (combinations) |
| Information | May lose some | Preserves variance |
| Examples | SelectKBest, RFE | PCA, LDA |
from sklearn.feature_selection import SelectKBest, f_classif
# Feature selection: pick top k features
selector = SelectKBest(f_classif, k=50)
X_selected = selector.fit_transform(X_train, y_train)
# Feature extraction: create new features
pca = PCA(n_components=50)
X_extracted = pca.fit_transform(X_train)
print(f"Selection: {X_selected.shape}")
print(f"Extraction: {X_extracted.shape}")
LDA: Supervised Dimensionality Reduction
Linear Discriminant Analysis maximizes class separation:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# LDA uses class labels
lda = LinearDiscriminantAnalysis(n_components=9) # Max = n_classes - 1
X_lda = lda.fit_transform(X, y)
# Compare PCA vs LDA
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab10', alpha=0.5)
axes[0].set_title('PCA (Unsupervised)')
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')
# LDA
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)
axes[1].scatter(X_lda[:, 0], X_lda[:, 1], c=y, cmap='tab10', alpha=0.5)
axes[1].set_title('LDA (Supervised)')
axes[1].set_xlabel('LD1')
axes[1].set_ylabel('LD2')
plt.tight_layout()
plt.show()
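Because LDA is fitted with the labels, it can also act as a supervised preprocessing step for another classifier; a minimal sketch reusing the pipeline tools from earlier:
# LDA as supervised dimensionality reduction before a classifier
lda_pipe = Pipeline([
    ('lda', LinearDiscriminantAnalysis(n_components=9)),  # at most n_classes - 1 = 9
    ('classifier', LogisticRegression(max_iter=5000))
])
lda_score = cross_val_score(lda_pipe, X, y, cv=5).mean()
print(f"LDA(9) + LogisticRegression CV accuracy: {lda_score:.3f}")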
When to Use What
- Need a 2D/3D visualization?
  - Small dataset → t-SNE
  - Large dataset → UMAP
- Need reduced features for a model?
  - Labels available (supervised) → LDA
  - No labels (unsupervised) → PCA
Practical Example: Image Compression
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
# Original images
digits = load_digits()
X = digits.data
# Compress with different components
components = [5, 10, 20, 40, 64]
fig, axes = plt.subplots(len(components), 5, figsize=(10, 12))
for row, n_comp in enumerate(components):
    if n_comp < 64:
        pca = PCA(n_components=n_comp)
        X_compressed = pca.fit_transform(X)
        X_reconstructed = pca.inverse_transform(X_compressed)
        compression_ratio = 64 / n_comp
    else:
        X_reconstructed = X
        compression_ratio = 1
    for col in range(5):
        axes[row, col].imshow(X_reconstructed[col].reshape(8, 8), cmap='gray')
        # Hide ticks but keep the axis visible so the row label can show
        axes[row, col].set_xticks([])
        axes[row, col].set_yticks([])
        if col == 0:
            axes[row, col].set_ylabel(f'{n_comp} comp\n({compression_ratio:.1f}x)')
plt.suptitle('Image Compression with PCA')
plt.tight_layout()
plt.show()
Key Takeaways
- PCA for preprocessing: reduce dimensions while keeping most of the variance
- t-SNE/UMAP for visualization: see high-dimensional data in 2D/3D
- LDA for classification: maximize class separation using the labels
- Choose components wisely: use explained variance or cross-validation performance
What’s Next?
Congratulations! You’ve completed the advanced topics. Now let’s bring everything together in a capstone project!
Continue to Capstone Project
Build a complete ML system from scratch