
Dimensionality Reduction

The Curse of Dimensionality

As the number of features increases:
  • Data becomes sparse (points far apart)
  • Models need exponentially more data
  • Distance metrics become meaningless
  • Training time explodes
This is the “curse of dimensionality.”
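
You can see the distance problem directly by sampling random points in increasingly many dimensions and comparing nearest and farthest neighbors; a minimal sketch (illustrative only, separate from the examples below):
import numpy as np

rng = np.random.default_rng(42)
n_points = 500

for d in [2, 10, 100, 1000]:
    points = rng.random((n_points, d))
    # Distances from one reference point to all the others
    dists = np.linalg.norm(points[1:] - points[0], axis=1)
    print(f"d={d:4d}: nearest/farthest distance ratio = {dists.min() / dists.max():.2f}")
# As d grows the ratio approaches 1: "near" and "far" become almost indistinguishable.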

Why Reduce Dimensions?

  • Visualization: plot 100D data in 2D
  • Speed: faster training and inference
  • Noise reduction: remove noisy features
  • Better models: reduce overfitting

PCA: Principal Component Analysis

PCA finds new axes that capture the most variance.

The Intuition

Imagine data shaped like a cigar:
  • First principal component: Along the cigar (most variance)
  • Second principal component: Across the cigar (second most)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Create cigar-shaped data
np.random.seed(42)
n_samples = 200

# Original data with correlation
x = np.random.randn(n_samples)
y = 2 * x + np.random.randn(n_samples) * 0.5
X = np.column_stack([x, y])

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Original data
axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], alpha=0.5)
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].set_title('Original Data')
axes[0].set_aspect('equal')

# Draw principal components
mean = X_scaled.mean(axis=0)
for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
    axes[0].arrow(mean[0], mean[1], 
                  comp[0] * np.sqrt(var) * 2, 
                  comp[1] * np.sqrt(var) * 2,
                  head_width=0.1, head_length=0.1, fc=f'C{i}', ec=f'C{i}',
                  label=f'PC{i+1}: {pca.explained_variance_ratio_[i]:.1%}')
axes[0].legend()

# Transformed data
axes[1].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.5)
axes[1].set_xlabel('PC1')
axes[1].set_ylabel('PC2')
axes[1].set_title('PCA Transformed Data')
axes[1].set_aspect('equal')

plt.tight_layout()
plt.show()

print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance captured: {pca.explained_variance_ratio_.sum():.1%}")
Math Connection: PCA uses eigendecomposition of the covariance matrix. See Linear Algebra Course for the full theory.
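
If you want to see that connection, here is a minimal sketch of the same decomposition done by hand with NumPy, assuming X_scaled and pca from the example above are still in scope:
# PCA "by hand": eigendecomposition of the covariance matrix
cov = np.cov(X_scaled, rowvar=False)            # 2x2 covariance matrix
eigvals, eigvecs = np.linalg.eigh(cov)          # eigh is designed for symmetric matrices
order = np.argsort(eigvals)[::-1]               # sort by descending variance
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

print("Variance along each PC:  ", eigvals)
print("Explained variance ratio:", eigvals / eigvals.sum())
# The columns of eigvecs match the rows of pca.components_ up to sign.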

Choosing the Number of Components

Method 1: Explained Variance

from sklearn.datasets import load_digits

# Load digit images (64 features)
digits = load_digits()
X = digits.data
y = digits.target

print(f"Original shape: {X.shape}")  # (1797, 64)

# Fit PCA with all components
pca_full = PCA()
pca_full.fit(X)

# Plot cumulative explained variance
cumsum = np.cumsum(pca_full.explained_variance_ratio_)

plt.figure(figsize=(10, 5))
plt.plot(range(1, len(cumsum) + 1), cumsum, 'b-o')
plt.axhline(0.95, color='r', linestyle='--', label='95% threshold')
plt.axhline(0.90, color='orange', linestyle='--', label='90% threshold')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.legend()
plt.grid(True)
plt.show()

# Find number of components for 95% variance
n_components_95 = np.argmax(cumsum >= 0.95) + 1
print(f"Components for 95% variance: {n_components_95}")

Method 2: Cross-Validated Model Performance

# Use cross-validation to find optimal components
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

n_components_range = [5, 10, 15, 20, 30, 40, 50, 64]
scores = []

for n_comp in n_components_range:
    pipeline = Pipeline([
        ('pca', PCA(n_components=n_comp)),
        ('classifier', LogisticRegression(max_iter=5000))
    ])
    
    score = cross_val_score(pipeline, X, y, cv=5).mean()
    scores.append(score)
    print(f"n_components={n_comp:2d}: accuracy={score:.3f}")

plt.figure(figsize=(10, 5))
plt.plot(n_components_range, scores, 'b-o')
plt.xlabel('Number of Components')
plt.ylabel('Cross-Validation Accuracy')
plt.title('Model Performance vs PCA Components')
plt.grid(True)
plt.show()
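
To turn the curve into a choice, one simple rule is to take the smallest number of components whose score is within a small tolerance of the best; a minimal sketch using the scores computed above (the 0.005 tolerance is an arbitrary assumption):
scores = np.array(scores)
tolerance = 0.005  # accept anything within 0.5 percentage points of the best score
chosen = next(n for n, s in zip(n_components_range, scores) if s >= scores.max() - tolerance)
print(f"Chosen n_components: {chosen}")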

Visualizing High-Dimensional Data

Digits in 2D

# Reduce 64D to 2D for visualization
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X)

plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap='tab10', alpha=0.6)
plt.colorbar(scatter, label='Digit')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title(f'Digits Dataset in 2D (PCA)\nExplained Variance: {pca_2d.explained_variance_ratio_.sum():.1%}')
plt.show()

t-SNE: For Visualization

t-SNE preserves local structure, meaning points that are neighbors in the original space stay neighbors in the 2D embedding:
from sklearn.manifold import TSNE

# t-SNE is slow on large data, sample if needed
sample_size = min(1000, len(X))
indices = np.random.choice(len(X), sample_size, replace=False)
X_sample = X[indices]
y_sample = y[indices]

# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_sample)

plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_sample, cmap='tab10', alpha=0.6)
plt.colorbar(scatter, label='Digit')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('Digits Dataset - t-SNE Visualization')
plt.show()

PCA vs t-SNE

Aspect          PCA                                t-SNE
Speed           Fast                               Slow
Purpose         Feature reduction, preprocessing   Visualization only
Preserves       Global structure                   Local structure
New data        Can transform                      Must refit
Interpretable   Yes (loadings)                     No
t-SNE is for visualization only! Don’t use it as a preprocessing step for models.
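
The "New data" row is easy to verify: a fitted PCA projects unseen samples with .transform, while scikit-learn's TSNE exposes only fit_transform, so embedding new points means rerunning the whole fit. A small illustration reusing X_sample from above:
pca_check = PCA(n_components=2).fit(X_sample)
new_coords = pca_check.transform(X_sample[:5])   # PCA projects "new" points directly
print(new_coords.shape)                          # (5, 2)
# tsne has no .transform() method: there is no out-of-sample projection for t-SNE.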

UMAP: Best of Both Worlds

UMAP is faster than t-SNE and preserves more global structure:
# pip install umap-learn
from umap import UMAP

# Apply UMAP
umap_model = UMAP(n_components=2, n_neighbors=15, min_dist=0.1, random_state=42)
X_umap = umap_model.fit_transform(X)

plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_umap[:, 0], X_umap[:, 1], c=y, cmap='tab10', alpha=0.6)
plt.colorbar(scatter, label='Digit')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.title('Digits Dataset - UMAP Visualization')
plt.show()
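
Unlike t-SNE, a fitted UMAP model can also embed new samples with .transform (supported by umap-learn), which is part of why it is usable beyond pure visualization; a quick sketch reusing the model above:
# Project a few points into the already-fitted UMAP embedding
new_embedding = umap_model.transform(X[:5])
print(new_embedding.shape)  # (5, 2)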

PCA for Preprocessing

Speed Up Training

from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import time

# Load MNIST (larger dataset)
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X_mnist, y_mnist = mnist.data, mnist.target

# Sample for demo
X_sample, _, y_sample, _ = train_test_split(X_mnist, y_mnist, train_size=10000, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2)

print(f"Original features: {X_train.shape[1]}")

# Without PCA
start = time.time()
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
time_no_pca = time.time() - start
acc_no_pca = rf.score(X_test, y_test)

# With PCA (95% variance)
pca = PCA(n_components=0.95)  # Keep 95% variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"PCA features: {X_train_pca.shape[1]}")

start = time.time()
rf_pca = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pca.fit(X_train_pca, y_train)
time_pca = time.time() - start
acc_pca = rf_pca.score(X_test_pca, y_test)

print(f"\nWithout PCA: {acc_no_pca:.3f} accuracy, {time_no_pca:.1f}s")
print(f"With PCA:    {acc_pca:.3f} accuracy, {time_pca:.1f}s")
print(f"Speedup: {time_no_pca/time_pca:.1f}x")

Noise Reduction

# Add noise to images
np.random.seed(42)
noise = np.random.randn(*X_sample.shape) * 50
X_noisy = X_sample + noise
X_noisy = np.clip(X_noisy, 0, 255)  # Keep valid pixel range

# Denoise using PCA
pca_denoise = PCA(n_components=100)
X_denoised = pca_denoise.inverse_transform(pca_denoise.fit_transform(X_noisy))

# Visualize
fig, axes = plt.subplots(3, 5, figsize=(12, 8))

for i in range(5):
    axes[0, i].imshow(X_sample[i].reshape(28, 28), cmap='gray')
    axes[0, i].axis('off')
    if i == 0:
        axes[0, i].set_title('Original')
    
    axes[1, i].imshow(X_noisy[i].reshape(28, 28), cmap='gray')
    axes[1, i].axis('off')
    if i == 0:
        axes[1, i].set_title('Noisy')
    
    axes[2, i].imshow(X_denoised[i].reshape(28, 28), cmap='gray')
    axes[2, i].axis('off')
    if i == 0:
        axes[2, i].set_title('Denoised (PCA)')

plt.tight_layout()
plt.show()
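
To put a number on what the pictures show, compare each version against the clean images with a simple mean squared error (reusing the arrays above):
mse_noisy = np.mean((X_noisy - X_sample) ** 2)
mse_denoised = np.mean((X_denoised - X_sample) ** 2)
print(f"MSE noisy vs clean:    {mse_noisy:.1f}")
print(f"MSE denoised vs clean: {mse_denoised:.1f}")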

Feature Selection vs Feature Extraction

Aspect            Feature Selection          Feature Extraction (PCA)
Method            Pick best features         Create new features
Interpretability  High (original features)   Lower (combinations)
Information       May lose some              Preserves variance
Examples          SelectKBest, RFE           PCA, LDA
from sklearn.feature_selection import SelectKBest, f_classif

# Feature selection: pick top k features
selector = SelectKBest(f_classif, k=50)
X_selected = selector.fit_transform(X_train, y_train)

# Feature extraction: create new features
pca = PCA(n_components=50)
X_extracted = pca.fit_transform(X_train)

print(f"Selection: {X_selected.shape}")
print(f"Extraction: {X_extracted.shape}")

LDA: Supervised Dimensionality Reduction

Linear Discriminant Analysis maximizes class separation:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA uses class labels
lda = LinearDiscriminantAnalysis(n_components=9)  # Max = n_classes - 1
X_lda = lda.fit_transform(X, y)

# Compare PCA vs LDA
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab10', alpha=0.5)
axes[0].set_title('PCA (Unsupervised)')
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')

# LDA
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)
axes[1].scatter(X_lda[:, 0], X_lda[:, 1], c=y, cmap='tab10', alpha=0.5)
axes[1].set_title('LDA (Supervised)')
axes[1].set_xlabel('LD1')
axes[1].set_ylabel('LD2')

plt.tight_layout()
plt.show()
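
Because LDA uses the labels, its 2D projection tends to separate the digits better than PCA's; a rough comparison of a simple classifier on the two projections above (both projections were fit on all of the data, so treat this as an illustration, not a rigorous benchmark):
for name, features in [('PCA 2D', X_pca), ('LDA 2D', X_lda)]:
    acc = cross_val_score(LogisticRegression(max_iter=5000), features, y, cv=5).mean()
    print(f"{name}: cross-validated accuracy = {acc:.3f}")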

When to Use What

                    Visualization?

           ┌─────────────┴─────────────┐
           │                           │
          Yes                         No
           │                           │
   ┌───────┴───────┐         ┌────────┴────────┐
   │               │         │                 │
Small data    Large data   Supervised?    Unsupervised?
   │               │         │                 │
 t-SNE          UMAP       LDA              PCA

Practical Example: Image Compression

from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

# Original images
digits = load_digits()
X = digits.data

# Compress with different components
components = [5, 10, 20, 40, 64]

fig, axes = plt.subplots(len(components), 5, figsize=(10, 12))

for row, n_comp in enumerate(components):
    if n_comp < 64:
        pca = PCA(n_components=n_comp)
        X_compressed = pca.fit_transform(X)
        X_reconstructed = pca.inverse_transform(X_compressed)
        compression_ratio = 64 / n_comp
    else:
        X_reconstructed = X
        compression_ratio = 1
    
    for col in range(5):
        axes[row, col].imshow(X_reconstructed[col].reshape(8, 8), cmap='gray')
        # Hide ticks but keep the axis alive so the row label stays visible
        axes[row, col].set_xticks([])
        axes[row, col].set_yticks([])
    axes[row, 0].set_ylabel(f'{n_comp} comp\n({compression_ratio:.1f}x)')

plt.suptitle('Image Compression with PCA')
plt.tight_layout()
plt.show()
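
The compression ratios above count only the per-image coefficients; a fuller back-of-the-envelope estimate also stores the components and the mean vector (an illustrative calculation, not a file-format benchmark):
n_samples, n_features = X.shape                  # (1797, 64)
k = 10                                           # example number of components
original_values = n_samples * n_features
compressed_values = n_samples * k + k * n_features + n_features  # scores + components + mean
print(f"k={k}: roughly {original_values / compressed_values:.1f}x fewer values to store")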

Key Takeaways

  • PCA for preprocessing: reduce dimensions while keeping most of the variance
  • t-SNE/UMAP for visualization: see high-dimensional data in 2D/3D
  • LDA for classification: maximize class separation using the labels
  • Choose components wisely: use explained variance or cross-validation performance

What’s Next?

Congratulations! You’ve completed the advanced topics. Now let’s bring everything together in a capstone project!

Continue to Capstone Project

Build a complete ML system from scratch