Support Vector Machines (SVM)

[Figure: SVM maximum margin hyperplane]

The Widest Street Problem

Imagine you’re a city planner. You need to draw a road between two neighborhoods - one residential (blue), one commercial (red). You have many possible routes:
Option A: Barely squeezes through, buildings on both sides
Option B: Wide boulevard with buffer zones on both sides
Which is better? The wide boulevard! If a building is slightly out of place, Option A's route runs straight into it, while Option B still has room to spare. SVM does the same thing: it finds the widest possible street (decision boundary) between the two classes.
[Figure: Image classification with SVM]

The Maximum Margin Classifier

import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC

# Two clearly separated groups
np.random.seed(42)
X_class0 = np.random.randn(20, 2) + np.array([-2, -2])
X_class1 = np.random.randn(20, 2) + np.array([2, 2])
X = np.vstack([X_class0, X_class1])
y = np.array([0] * 20 + [1] * 20)

# Train SVM
svm = SVC(kernel='linear')
svm.fit(X, y)

# Visualize
plt.figure(figsize=(10, 8))
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm', s=50)

# Plot decision boundary
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 50),
                     np.linspace(ylim[0], ylim[1], 50))
Z = svm.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Decision boundary and margins
plt.contour(xx, yy, Z, colors='k', levels=[-1, 0, 1], 
            linestyles=['--', '-', '--'])

# Highlight support vectors
plt.scatter(svm.support_vectors_[:, 0], svm.support_vectors_[:, 1], 
            s=200, linewidth=1, facecolors='none', edgecolors='k',
            label='Support Vectors')

plt.title('SVM: Maximum Margin Classifier')
plt.legend()
plt.show()

print(f"Number of support vectors: {len(svm.support_vectors_)}")

Support Vectors: The Important Points

Not all training points matter equally! Support vectors are the points closest to the decision boundary. They’re the only ones that determine where the boundary goes.
# The model only remembers these points
print("Support vectors per class:", svm.n_support_)
print("Support vector indices:", svm.support_)
Why this matters:
  • SVM is memory efficient (stores only support vectors)
  • Robust to outliers far from the boundary
  • Decision is based on “hardest” examples to classify
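
To see that only the support vectors matter, you can refit on just those points and compare the learned boundary; a minimal sketch reusing X, y, and the linear svm fitted earlier:
# Refit using only the support vectors - the hyperplane barely changes
X_sv = X[svm.support_]
y_sv = y[svm.support_]
svm_sv_only = SVC(kernel='linear').fit(X_sv, y_sv)
print("All points      -> w:", svm.coef_[0], " b:", svm.intercept_[0])
print("Support vectors -> w:", svm_sv_only.coef_[0], " b:", svm_sv_only.intercept_[0])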

The Kernel Trick: Handling Non-Linear Data

What if data isn’t linearly separable?
from sklearn.datasets import make_circles

# Create circular data (can't separate with a line!)
X_circles, y_circles = make_circles(n_samples=200, noise=0.1, factor=0.3)

# Linear SVM fails
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_circles, y_circles)
print(f"Linear SVM accuracy: {svm_linear.score(X_circles, y_circles):.2%}")

# RBF kernel SVM succeeds!
svm_rbf = SVC(kernel='rbf', gamma='auto')
svm_rbf.fit(X_circles, y_circles)
print(f"RBF SVM accuracy: {svm_rbf.score(X_circles, y_circles):.2%}")

Visualizing Kernels

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

kernels = ['linear', 'poly', 'rbf']

for ax, kernel in zip(axes, kernels):
    svm = SVC(kernel=kernel, gamma='auto', degree=3)
    svm.fit(X_circles, y_circles)
    
    # Create mesh for decision boundary
    xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 100),
                         np.linspace(-1.5, 1.5, 100))
    Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
    ax.scatter(X_circles[:, 0], X_circles[:, 1], c=y_circles, cmap='coolwarm')
    ax.set_title(f'{kernel.upper()} Kernel (acc: {svm.score(X_circles, y_circles):.0%})')

plt.tight_layout()
plt.show()

How The Kernel Trick Works (Intuition)

The kernel trick implicitly maps data into a higher-dimensional space where it becomes linearly separable. Picture lifting the 2D circles onto a 3D cone:
  • Inner circle: low on the cone
  • Outer circle: high on the cone
  • Now a flat plane can separate them!
# What the RBF kernel "sees"
from mpl_toolkits.mplot3d import Axes3D

# Map to 3D using a simple transformation
r = np.sqrt(X_circles[:, 0]**2 + X_circles[:, 1]**2)
X_3d = np.column_stack([X_circles, r])

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], c=y_circles, cmap='coolwarm')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('R (distance from origin)')
ax.set_title('Data Lifted to 3D - Now Linearly Separable!')
plt.show()
Math Connection: The kernel function computes dot products in high-dimensional space without actually transforming the data. This is computationally efficient! See Vectors for dot product intuition.
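
As a concrete illustration of that identity, the degree-2 polynomial kernel K(x, z) = (x·z)^2 equals an ordinary dot product after the explicit map phi(x) = (x1^2, sqrt(2)·x1·x2, x2^2). A small numerical check (the vectors here are just illustrative):
# A kernel value equals a dot product in the lifted feature space
def phi(v):
    # Explicit degree-2 feature map for 2D input: (x1^2, sqrt(2)*x1*x2, x2^2)
    return np.array([v[0]**2, np.sqrt(2) * v[0] * v[1], v[1]**2])

x = np.array([1.0, 2.0])
z = np.array([3.0, 0.5])

kernel_value = (x @ z) ** 2      # computed in the original 2D space
lifted_dot = phi(x) @ phi(z)     # computed after explicitly mapping to 3D
print(kernel_value, lifted_dot)  # both print 16.0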

Common Kernels

Kernel            Best For                       Key Parameter
Linear            High-dimensional data, text    -
RBF (Gaussian)    Most non-linear problems       gamma (width)
Polynomial        Feature interactions           degree, coef0
Sigmoid           Neural network-like            gamma, coef0
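
The "Key Parameter" column maps directly onto SVC arguments; for example (the values here are illustrative, not tuned):
# Kernel-specific parameters are passed straight to SVC
svm_poly = SVC(kernel='poly', degree=3, coef0=1)      # polynomial: degree and coef0
svm_rbf  = SVC(kernel='rbf', gamma=0.5)               # RBF: gamma controls the kernel width
svm_sig  = SVC(kernel='sigmoid', gamma=0.1, coef0=0)  # sigmoid: gamma and coef0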

Key Hyperparameters

C: Regularization Parameter

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, C in zip(axes, [0.1, 1, 100]):
    svm = SVC(kernel='rbf', C=C, gamma='auto')
    svm.fit(X_circles, y_circles)
    
    xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 100),
                         np.linspace(-1.5, 1.5, 100))
    Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
    ax.scatter(X_circles[:, 0], X_circles[:, 1], c=y_circles, cmap='coolwarm')
    ax.set_title(f'C = {C}')

plt.tight_layout()
plt.show()
  • Small C: Wide margin, allows some misclassification (soft margin)
  • Large C: Narrow margin, tries to classify every training point correctly (approaching a hard margin; see the support-vector counts below)
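
Counting support vectors makes the trade-off concrete: with a softer margin, more points sit inside it and become support vectors. A quick sketch on the circles data (exact counts will vary with the random seed):
# Typically, fewer support vectors as C grows and the margin tightens
for C in [0.1, 1, 100]:
    model = SVC(kernel='rbf', C=C, gamma='auto').fit(X_circles, y_circles)
    print(f"C = {C:>5}: {len(model.support_vectors_)} support vectors")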

Gamma: RBF Kernel Width

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, gamma in zip(axes, [0.1, 1, 10]):
    svm = SVC(kernel='rbf', gamma=gamma)
    svm.fit(X_circles, y_circles)
    
    xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 100),
                         np.linspace(-1.5, 1.5, 100))
    Z = svm.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
    ax.scatter(X_circles[:, 0], X_circles[:, 1], c=y_circles, cmap='coolwarm')
    ax.set_title(f'gamma = {gamma}')

plt.tight_layout()
plt.show()
  • Small gamma: Smooth decision boundary (underfitting risk)
  • Large gamma: Complex, wiggly boundary (overfitting risk)
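
You can quantify that risk by comparing training accuracy with cross-validated accuracy; a very large gamma typically scores near-perfectly on the training data but worse under cross-validation. A minimal sketch:
from sklearn.model_selection import cross_val_score

# Large gamma tends to memorize the training set but generalize worse
for gamma in [0.1, 1, 10, 100]:
    model = SVC(kernel='rbf', gamma=gamma).fit(X_circles, y_circles)
    cv_acc = cross_val_score(SVC(kernel='rbf', gamma=gamma), X_circles, y_circles, cv=5).mean()
    print(f"gamma = {gamma:>5}: train = {model.score(X_circles, y_circles):.2%}, CV = {cv_acc:.2%}")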

Real Example: Digit Recognition

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Load digits
digits = load_digits()
X, y = digits.data, digits.target

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Grid search for best parameters
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.01, 0.1]
}

grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

SVM for Regression: SVR

The same margin idea works for regression: SVR fits a tube of width epsilon around the data and only penalizes points that fall outside it.

from sklearn.svm import SVR
from sklearn.datasets import make_regression

# Create regression data
X_reg, y_reg = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)

# Fit SVR
svr = SVR(kernel='rbf', C=100, gamma='auto', epsilon=0.1)
svr.fit(X_reg, y_reg)

# Plot
X_test_reg = np.linspace(X_reg.min(), X_reg.max(), 100).reshape(-1, 1)
y_pred_reg = svr.predict(X_test_reg)

plt.figure(figsize=(10, 6))
plt.scatter(X_reg, y_reg, alpha=0.5, label='Data')
plt.plot(X_test_reg, y_pred_reg, 'r-', linewidth=2, label='SVR')
plt.fill_between(X_test_reg.ravel(), 
                 y_pred_reg - svr.epsilon, 
                 y_pred_reg + svr.epsilon, 
                 alpha=0.2, color='red', label='Epsilon tube')
plt.legend()
plt.title('Support Vector Regression')
plt.show()
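
Only points on or outside the epsilon tube end up as support vectors, so widening the tube shrinks the model. A quick check, reusing X_reg and y_reg from above:
# A wider epsilon tube leaves fewer points outside it, hence fewer support vectors
for eps in [0.1, 5, 20]:
    model = SVR(kernel='rbf', C=100, gamma='auto', epsilon=eps).fit(X_reg, y_reg)
    print(f"epsilon = {eps:>4}: {len(model.support_)} support vectors")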

When to Use SVM

Good For

  • High-dimensional data (text, genomics)
  • Clear margin of separation
  • When you need probability estimates (set probability=True; this adds a calibration step and extra training time)
  • Small to medium datasets

Not Great For

  • Very large datasets (slow training)
  • Lots of noise and overlapping classes
  • When interpretability is crucial
  • When you need feature importance

Key Takeaways

  • Maximum Margin: find the widest possible boundary between classes
  • Support Vectors: only the points closest to the boundary determine the decision
  • Kernel Trick: handle non-linear data by implicitly mapping it to higher dimensions
  • Scale Your Data: SVM is sensitive to feature scales! (See the pipeline sketch below.)
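
A common way to make sure scaling always happens together with the model is a pipeline; a minimal sketch reusing X_train, X_test, y_train, and y_test from the digit-recognition example:
from sklearn.pipeline import make_pipeline

# The scaler and the SVM travel together, so test data is always scaled consistently
svm_pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
svm_pipeline.fit(X_train, y_train)
print(f"Pipeline test accuracy: {svm_pipeline.score(X_test, y_test):.2%}")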

What’s Next?

Let’s learn about Naive Bayes - a completely different approach based on probability!

Continue to Module 5b: Naive Bayes

Simple probabilistic classification that works surprisingly well