Mathematical Foundations for Deep Learning
Why Math Matters
Deep learning isn't just "import torch and train." Understanding the mathematics enables you to:
- Debug when models fail (why are gradients exploding?)
- Innovate by designing new architectures and loss functions
- Optimize by understanding what optimizers actually do
- Read papers that describe cutting-edge research
This chapter provides a comprehensive mathematical foundation. If you’re already comfortable with linear algebra and calculus, use this as a reference. If not, work through each section carefully — it will pay dividends throughout your deep learning journey.
Part 1: Linear Algebra
Vectors and Vector Spaces
A vector in $\mathbb{R}^n$ is an ordered collection of $n$ real numbers:
$$\mathbf{x} = \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_n \end{bmatrix} \in \mathbb{R}^n$$
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Vectors in Python
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])
print("Vector x:", x)
print("Vector y:", y)
print("Shape:", x.shape)
Vector Operations
# ========================================
# FUNDAMENTAL VECTOR OPERATIONS
# ========================================
def vector_operations():
x = np.array([1.0, 2.0, 3.0])
y = np.array([4.0, 5.0, 6.0])
# Addition (element-wise)
print("Addition: x + y =", x + y)
# Scalar multiplication
print("Scalar mult: 2 * x =", 2 * x)
# Dot product (inner product)
dot = np.dot(x, y) # = x₁y₁ + x₂y₂ + x₃y₃
print(f"Dot product: x · y = {dot}")
# Magnitude (L2 norm)
magnitude = np.linalg.norm(x) # = √(x₁² + x₂² + x₃²)
print(f"Magnitude ||x|| = {magnitude:.4f}")
# Unit vector (direction)
unit = x / magnitude
print(f"Unit vector: x/||x|| = {unit}")
print(f"Unit vector magnitude: {np.linalg.norm(unit):.4f}") # Should be 1
# Angle between vectors
cos_theta = np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
theta = np.arccos(cos_theta)
print(f"Angle between x and y: {np.degrees(theta):.2f}°")
return x, y
vector_operations()
The Dot Product: Geometric Interpretation
The dot product has a profound geometric meaning:
$$\mathbf{x} \cdot \mathbf{y} = \|\mathbf{x}\| \, \|\mathbf{y}\| \cos(\theta)$$
def visualize_dot_product():
"""Visualize the geometric meaning of dot product."""
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
vectors = [
(np.array([1, 0]), np.array([1, 1]), "Acute angle"),
(np.array([1, 0]), np.array([0, 1]), "Perpendicular"),
(np.array([1, 0]), np.array([-1, 0.5]), "Obtuse angle"),
]
for ax, (x, y, title) in zip(axes, vectors):
ax.quiver(0, 0, x[0], x[1], angles='xy', scale_units='xy', scale=1,
color='blue', label='x', width=0.02)
ax.quiver(0, 0, y[0], y[1], angles='xy', scale_units='xy', scale=1,
color='red', label='y', width=0.02)
dot = np.dot(x, y)
ax.set_title(f"{title}\nx·y = {dot:.2f}")
ax.set_xlim(-1.5, 1.5)
ax.set_ylim(-0.5, 1.5)
ax.grid(True, alpha=0.3)
ax.set_aspect('equal')
ax.legend()
plt.tight_layout()
plt.show()
print("Key insight:")
print(" • x·y > 0: vectors point in similar direction")
print(" • x·y = 0: vectors are perpendicular (orthogonal)")
print(" • x·y < 0: vectors point in opposite directions")
visualize_dot_product()
In Neural Networks: The dot product is the fundamental operation in neurons!
$z = \mathbf{w} \cdot \mathbf{x} + b$ computes how "aligned" the input is with the weight vector.
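As a minimal sketch of that idea (not code from this chapter), a single neuron is just a dot product plus a bias followed by a nonlinearity; the weights and the sigmoid choice here are illustrative:
import numpy as np

def single_neuron(x, w, b):
    """One neuron: pre-activation z = w·x + b, then a sigmoid nonlinearity."""
    z = np.dot(w, x) + b              # how aligned the input is with the weights
    return 1.0 / (1.0 + np.exp(-z))   # sigmoid squashes z into (0, 1)

x = np.array([1.0, 2.0, 3.0])         # input vector
w = np.array([0.5, -0.25, 0.1])       # weight vector
b = 0.1                               # bias
print(single_neuron(x, w, b))         # sigmoid(0.3 + 0.1) ≈ 0.599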
Matrices and Matrix Operations
A matrix is a 2D array of numbers:
$$A = \begin{bmatrix} a_{11} & a_{12} & \cdots & a_{1n} \\ a_{21} & a_{22} & \cdots & a_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ a_{m1} & a_{m2} & \cdots & a_{mn} \end{bmatrix} \in \mathbb{R}^{m \times n}$$
# ========================================
# MATRIX OPERATIONS
# ========================================
def matrix_operations():
A = np.array([
[1, 2, 3],
[4, 5, 6]
]) # 2×3 matrix
B = np.array([
[7, 8],
[9, 10],
[11, 12]
]) # 3×2 matrix
print("Matrix A (2×3):")
print(A)
print("\nMatrix B (3×2):")
print(B)
# Matrix multiplication
# (2×3) @ (3×2) = (2×2)
C = A @ B # or np.matmul(A, B)
print("\nA @ B (2×2):")
print(C)
# How matrix multiplication works:
# C[i,j] = sum(A[i,:] * B[:,j]) = dot product of row i and column j
print("\nManual calculation of C[0,0]:")
print(f" A[0,:] · B[:,0] = {A[0,:]} · {B[:,0]} = {np.dot(A[0,:], B[:,0])}")
# Transpose
print("\nTranspose A^T:")
print(A.T)
# Element-wise multiplication (Hadamard product)
D = np.array([[1, 2], [3, 4]])
E = np.array([[5, 6], [7, 8]])
print("\nElement-wise D ⊙ E:")
print(D * E)
return A, B, C
matrix_operations()
Matrix Multiplication Dimensions
Critical Rule: For $C = A \times B$:
- A is (m×n)
- B is (n×p)
- C is (m×p)
def matrix_dimension_examples():
"""Common matrix dimension patterns in deep learning."""
# Fully connected layer: y = Wx + b
batch_size = 32
input_dim = 784 # e.g., flattened 28×28 image
output_dim = 256 # hidden layer size
X = np.random.randn(batch_size, input_dim) # (32, 784)
W = np.random.randn(input_dim, output_dim) # (784, 256)
b = np.random.randn(output_dim) # (256,)
Y = X @ W + b # (32, 784) @ (784, 256) = (32, 256)
print(f"Input X: {X.shape}")
print(f"Weights W: {W.shape}")
print(f"Bias b: {b.shape}")
print(f"Output Y: {Y.shape}")
print(f"\nOperation: ({batch_size}×{input_dim}) @ ({input_dim}×{output_dim}) = ({batch_size}×{output_dim})")
matrix_dimension_examples()
Eigenvalues and Eigenvectors
Eigenvectors are special directions that only get scaled (not rotated) by a matrix:
$$A\mathbf{v} = \lambda \mathbf{v}$$
def eigenvalue_demo():
"""Eigenvalues and eigenvectors demonstration."""
# Symmetric matrix (always has real eigenvalues)
A = np.array([
[4, 2],
[2, 3]
])
# Compute eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(A)
print("Matrix A:")
print(A)
print(f"\nEigenvalues: {eigenvalues}")
print(f"\nEigenvectors (as columns):")
print(eigenvectors)
# Verify: Av = λv
for i in range(len(eigenvalues)):
v = eigenvectors[:, i]
lam = eigenvalues[i]
Av = A @ v
lambda_v = lam * v
print(f"\nEigenvector {i+1}: v = {v}")
print(f" A @ v = {Av}")
print(f" λ * v = {lambda_v}")
print(f" Equal? {np.allclose(Av, lambda_v)}")
# Visualize
fig, ax = plt.subplots(figsize=(8, 8))
# Draw unit circle and its transformation
theta = np.linspace(0, 2*np.pi, 100)
circle = np.array([np.cos(theta), np.sin(theta)])
transformed = A @ circle
ax.plot(circle[0], circle[1], 'b-', label='Unit circle', alpha=0.5)
ax.plot(transformed[0], transformed[1], 'r-', label='A @ circle', alpha=0.5)
# Draw eigenvectors
for i in range(len(eigenvalues)):
v = eigenvectors[:, i] * 2 # Scale for visibility
ax.arrow(0, 0, v[0], v[1], head_width=0.1, head_length=0.1,
fc=f'C{i+2}', ec=f'C{i+2}', label=f'Eigenvector {i+1} (λ={eigenvalues[i]:.2f})')
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)
ax.set_aspect('equal')
ax.grid(True, alpha=0.3)
ax.legend()
ax.set_title('Eigenvalues: Directions that only scale')
plt.show()
eigenvalue_demo()
In Deep Learning: Eigenvalues are crucial for understanding:
- PCA for dimensionality reduction (see the sketch after this list)
- Weight matrix conditioning (ratio of max/min eigenvalues affects training)
- Hessian analysis for optimization landscapes
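A minimal PCA sketch (not from the chapter's code) using the eigendecomposition of the covariance matrix on synthetic 2-D data; in practice a library routine such as scikit-learn's PCA would be used:
import numpy as np

# Project correlated 2-D data onto its top eigenvector (the first principal component).
rng = np.random.default_rng(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[3, 2], [2, 2]], size=500)

X_centered = X - X.mean(axis=0)                  # PCA assumes centered data
cov = np.cov(X_centered, rowvar=False)           # 2×2 covariance matrix
eigenvalues, eigenvectors = np.linalg.eigh(cov)  # eigh: symmetric input, ascending eigenvalues

top_component = eigenvectors[:, -1]              # direction of maximum variance
X_projected = X_centered @ top_component         # 1-D projection of every sample

print("Eigenvalues (variance along each principal axis):", eigenvalues)
print("Top principal component:", top_component)
print(f"Variance captured by first component: {100 * eigenvalues[-1] / eigenvalues.sum():.1f}%")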
Singular Value Decomposition (SVD)
Any matrix can be decomposed as:
$$A = U \Sigma V^T$$
where $U$ and $V$ have orthonormal columns and $\Sigma$ is diagonal with non-negative singular values.
def svd_demo():
"""Singular Value Decomposition and its applications."""
# Example: Image compression with SVD
from skimage import data
from skimage.color import rgb2gray
# Load image
image = rgb2gray(data.astronaut())
print(f"Original image shape: {image.shape}")
# Compute SVD
U, s, Vt = np.linalg.svd(image, full_matrices=False)
print(f"U shape: {U.shape}")
print(f"Singular values shape: {s.shape}")
print(f"V^T shape: {Vt.shape}")
# Reconstruct with different ranks
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
ranks = [5, 20, 50, 100, len(s)]
for ax, k in zip(axes, ranks):
# Low-rank approximation
reconstructed = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
compression = 100 * (1 - (k * (U.shape[0] + Vt.shape[1])) / image.size)
ax.imshow(reconstructed, cmap='gray')
ax.set_title(f'Rank {k}\n{compression:.1f}% compression')
ax.axis('off')
plt.tight_layout()
plt.show()
# Singular value spectrum
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(s[:100])
plt.xlabel('Index')
plt.ylabel('Singular Value')
plt.title('Singular Value Spectrum')
plt.subplot(1, 2, 2)
plt.plot(np.cumsum(s**2) / np.sum(s**2))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance Explained')
plt.title('Variance Explained')
plt.axhline(y=0.95, color='r', linestyle='--', label='95%')
plt.legend()
plt.tight_layout()
plt.show()
svd_demo()
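The demo above relies on scikit-image for the example image. As a dependency-free sanity check (a small sketch added here, not part of svd_demo), the decomposition can be verified directly on a small random matrix:
import numpy as np

# Verify A = U Σ Vᵀ on a small random matrix.
A = np.random.randn(4, 3)
U, s, Vt = np.linalg.svd(A, full_matrices=False)  # U: (4,3), s: (3,), Vt: (3,3)

A_reconstructed = U @ np.diag(s) @ Vt
print("Reconstruction error:", np.max(np.abs(A - A_reconstructed)))  # ~1e-15

# Singular values come back non-negative and sorted in descending order
print("Singular values:", s)

# The best rank-1 approximation keeps only the largest singular value
A_rank1 = s[0] * np.outer(U[:, 0], Vt[0, :])
print("Rank-1 approximation error (Frobenius):", np.linalg.norm(A - A_rank1))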
Part 2: Calculus for Deep Learning
Derivatives: The Foundation of Learning
A derivative measures the rate of change:
$$f'(x) = \frac{df}{dx} = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$$
def derivative_basics():
"""Understanding derivatives numerically and analytically."""
# Example function: f(x) = x²
def f(x):
return x ** 2
# Numerical derivative
def numerical_derivative(f, x, h=1e-7):
return (f(x + h) - f(x - h)) / (2 * h) # Central difference
# Analytical derivative: f'(x) = 2x
def analytical_derivative(x):
return 2 * x
# Compare at different points
test_points = [-2, -1, 0, 1, 2, 3]
print("x\t\tNumerical\tAnalytical")
print("-" * 40)
for x in test_points:
num = numerical_derivative(f, x)
ana = analytical_derivative(x)
print(f"{x}\t\t{num:.6f}\t{ana:.6f}")
# Visualize
x = np.linspace(-3, 3, 100)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(x, f(x), label='f(x) = x²')
axes[0].set_title('Function')
axes[0].set_xlabel('x')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[1].plot(x, analytical_derivative(x), label="f'(x) = 2x")
axes[1].set_title('Derivative')
axes[1].set_xlabel('x')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
derivative_basics()
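The helper above uses the central difference rather than the one-sided formula from the definition. A small sketch (not in the original) shows why: the central-difference error shrinks like $h^2$ instead of $h$.
import numpy as np

# Compare forward vs central differences for f(x) = sin(x) at x = 1.
f = np.sin
x0 = 1.0
true_grad = np.cos(x0)

for h in [1e-1, 1e-2, 1e-3]:
    forward = (f(x0 + h) - f(x0)) / h            # truncation error ~ O(h)
    central = (f(x0 + h) - f(x0 - h)) / (2 * h)  # truncation error ~ O(h²)
    print(f"h={h:g}: forward error = {abs(forward - true_grad):.2e}, "
          f"central error = {abs(central - true_grad):.2e}")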
The Chain Rule: How Backpropagation Works
For composite functions $f(g(x))$:
$$\frac{d}{dx}\left[f(g(x))\right] = f'(g(x)) \cdot g'(x)$$
def chain_rule_demo():
"""The chain rule is the heart of backpropagation."""
# Example: f(x) = (x² + 1)³
# Let g(x) = x² + 1, h(g) = g³
# Then f(x) = h(g(x))
def g(x):
return x**2 + 1
def h(z):
return z**3
def f(x):
return h(g(x))
# Derivatives
def dg_dx(x):
return 2*x
def dh_dg(z):
return 3*z**2
def df_dx_chain_rule(x):
"""Using chain rule: df/dx = dh/dg * dg/dx"""
return dh_dg(g(x)) * dg_dx(x)
# Verify numerically
def numerical_derivative(f, x, h=1e-7):
return (f(x + h) - f(x - h)) / (2 * h)
x_test = 2.0
print(f"Testing at x = {x_test}")
print(f"Chain rule: df/dx = {df_dx_chain_rule(x_test)}")
print(f"Numerical: df/dx = {numerical_derivative(f, x_test):.6f}")
print(f"Match: {np.isclose(df_dx_chain_rule(x_test), numerical_derivative(f, x_test))}")
# Longer chain: f(x) = sin(exp(x²))
print("\n" + "="*50)
print("Longer chain: f(x) = sin(exp(x²))")
def long_chain(x):
return np.sin(np.exp(x**2))
# df/dx = cos(exp(x²)) * exp(x²) * 2x
def long_chain_derivative(x):
return np.cos(np.exp(x**2)) * np.exp(x**2) * 2*x
x_test = 0.5
print(f"Chain rule: {long_chain_derivative(x_test):.6f}")
print(f"Numerical: {numerical_derivative(long_chain, x_test):.6f}")
chain_rule_demo()
The Chain Rule in Neural Networks: For a loss $L$ at the end of a network with layers $f_1, f_2, \ldots, f_n$:
$$\frac{\partial L}{\partial \theta_1} = \frac{\partial L}{\partial f_n} \cdot \frac{\partial f_n}{\partial f_{n-1}} \cdots \frac{\partial f_2}{\partial f_1} \cdot \frac{\partial f_1}{\partial \theta_1}$$
This is backpropagation: applying the chain rule from output to input!
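A minimal numerical sketch (a hypothetical two-layer scalar network, not taken from the chapter's code) makes the recursion concrete: each layer multiplies the upstream gradient by its own local derivative.
import numpy as np

# Tiny scalar "network": x -> h = w1*x -> a = tanh(h) -> ŷ = w2*a, loss L = (ŷ - y)²
x, y = 1.5, 2.0
w1, w2 = 0.8, -1.1

# Forward pass
h = w1 * x
a = np.tanh(h)
yhat = w2 * a
L = (yhat - y) ** 2

# Backward pass: chain rule applied from the loss back to w1
dL_dyhat = 2 * (yhat - y)       # ∂L/∂ŷ
dyhat_da = w2                   # ∂ŷ/∂a
da_dh = 1 - np.tanh(h) ** 2     # ∂a/∂h (derivative of tanh)
dh_dw1 = x                      # ∂h/∂w1

dL_dw1 = dL_dyhat * dyhat_da * da_dh * dh_dw1
dL_dw2 = dL_dyhat * a           # w2's local derivative is just a

# Verify against a numerical derivative
eps = 1e-6
def loss(w1_, w2_):
    return (w2_ * np.tanh(w1_ * x) - y) ** 2
num_dw1 = (loss(w1 + eps, w2) - loss(w1 - eps, w2)) / (2 * eps)
print("chain rule dL/dw1 =", dL_dw1, " numerical =", num_dw1)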
Partial Derivatives and Gradients
For functions of multiple variables, the gradient is the vector of all partial derivatives:
$$\nabla f = \begin{bmatrix} \frac{\partial f}{\partial x_1} \\ \frac{\partial f}{\partial x_2} \\ \vdots \\ \frac{\partial f}{\partial x_n} \end{bmatrix}$$
def gradient_demo():
"""Gradients for multivariable functions."""
# f(x, y) = x² + y² (paraboloid)
def f(xy):
x, y = xy
return x**2 + y**2
def gradient_f(xy):
x, y = xy
return np.array([2*x, 2*y])
# Visualize the gradient field
x = np.linspace(-2, 2, 20)
y = np.linspace(-2, 2, 20)
X, Y = np.meshgrid(x, y)
Z = X**2 + Y**2
fig = plt.figure(figsize=(15, 5))
# 3D surface
ax1 = fig.add_subplot(131, projection='3d')
ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax1.set_zlabel('f(x,y)')
ax1.set_title('Function: f(x,y) = x² + y²')
# Contour with gradient arrows
ax2 = fig.add_subplot(132)
ax2.contour(X, Y, Z, levels=20, cmap='viridis')
# Gradient field (subsample for clarity)
skip = 2
ax2.quiver(X[::skip, ::skip], Y[::skip, ::skip],
2*X[::skip, ::skip], 2*Y[::skip, ::skip],
color='red', alpha=0.7)
ax2.set_xlabel('x')
ax2.set_ylabel('y')
ax2.set_title('Gradient field (arrows)')
ax2.set_aspect('equal')
# Gradient descent path
ax3 = fig.add_subplot(133)
ax3.contour(X, Y, Z, levels=20, cmap='viridis')
# Simulate gradient descent
point = np.array([1.8, 1.5])
path = [point.copy()]
lr = 0.1
for _ in range(20):
grad = gradient_f(point)
point = point - lr * grad # Gradient descent step
path.append(point.copy())
path = np.array(path)
ax3.plot(path[:, 0], path[:, 1], 'ro-', markersize=4, label='GD path')
ax3.plot(path[0, 0], path[0, 1], 'go', markersize=10, label='Start')
ax3.plot(path[-1, 0], path[-1, 1], 'b*', markersize=15, label='End')
ax3.set_xlabel('x')
ax3.set_ylabel('y')
ax3.set_title('Gradient Descent Path')
ax3.legend()
ax3.set_aspect('equal')
plt.tight_layout()
plt.show()
print(f"Starting point: {path[0]}")
print(f"Ending point: {path[-1]}")
print(f"Minimum at (0, 0)")
gradient_demo()
The Jacobian Matrix
For vector-valued functions $f: \mathbb{R}^n \to \mathbb{R}^m$, the Jacobian is the matrix of all partial derivatives:
$$J = \begin{bmatrix} \frac{\partial f_1}{\partial x_1} & \cdots & \frac{\partial f_1}{\partial x_n} \\ \vdots & \ddots & \vdots \\ \frac{\partial f_m}{\partial x_1} & \cdots & \frac{\partial f_m}{\partial x_n} \end{bmatrix}$$
import torch
def jacobian_demo():
"""Jacobian matrices in PyTorch."""
# Function: f(x) = [x₁², x₁x₂, x₂²]ᵀ
def f(x):
return torch.stack([
x[0]**2,
x[0] * x[1],
x[1]**2
])
x = torch.tensor([2.0, 3.0], requires_grad=True)
# Compute Jacobian
jacobian = torch.autograd.functional.jacobian(f, x)
print("Input x:", x.detach().numpy())
print("f(x):", f(x).detach().numpy())
print("\nJacobian matrix:")
print(jacobian.numpy())
# Manual verification
print("\nManual calculation:")
print("∂f₁/∂x₁ = 2x₁ =", 2*x[0].item())
print("∂f₁/∂x₂ = 0")
print("∂f₂/∂x₁ = x₂ =", x[1].item())
print("∂f₂/∂x₂ = x₁ =", x[0].item())
print("∂f₃/∂x₁ = 0")
print("∂f₃/∂x₂ = 2x₂ =", 2*x[1].item())
jacobian_demo()
The Hessian Matrix
The Hessian is the matrix of second-order partial derivatives:
$$H = \begin{bmatrix} \frac{\partial^2 f}{\partial x_1^2} & \frac{\partial^2 f}{\partial x_1 \partial x_2} & \cdots \\ \frac{\partial^2 f}{\partial x_2 \partial x_1} & \frac{\partial^2 f}{\partial x_2^2} & \cdots \\ \vdots & \vdots & \ddots \end{bmatrix}$$
def hessian_demo():
"""Hessian matrix and curvature analysis."""
# f(x, y) = x² + 4xy + y²
def f(xy):
x, y = xy
return x**2 + 4*x*y + y**2
xy = torch.tensor([1.0, 1.0], requires_grad=True)
# Compute Hessian
hessian = torch.autograd.functional.hessian(
lambda x: x[0]**2 + 4*x[0]*x[1] + x[1]**2, xy
)
print("Hessian matrix:")
print(hessian.numpy())
# Eigenvalues of Hessian tell us about curvature
eigenvalues = np.linalg.eigvalsh(hessian.numpy())
print(f"\nEigenvalues: {eigenvalues}")
if np.all(eigenvalues > 0):
print("→ Positive definite: LOCAL MINIMUM")
elif np.all(eigenvalues < 0):
print("→ Negative definite: LOCAL MAXIMUM")
else:
print("→ Indefinite: SADDLE POINT")
# Condition number: ratio of max/min eigenvalue
condition = max(abs(eigenvalues)) / min(abs(eigenvalues))
print(f"\nCondition number: {condition:.2f}")
print("(High condition number = ill-conditioned = hard to optimize)")
hessian_demo()
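To make that last remark concrete, here is a small sketch (not part of hessian_demo) counting gradient descent steps on a well-conditioned versus an ill-conditioned quadratic; gd_iterations and the step sizes are illustrative choices, and exercises 1 and 5 below explore this further.
import numpy as np

# Gradient descent on f(x) = ½ xᵀ H x for two Hessians with different condition numbers.
def gd_iterations(H, lr, tol=1e-6, max_iter=10000):
    x = np.array([1.0, 1.0])
    for i in range(max_iter):
        x = x - lr * (H @ x)           # gradient of ½ xᵀHx (H symmetric) is Hx
        if np.linalg.norm(x) < tol:    # the minimum is at the origin
            return i + 1
    return max_iter

H_well = np.diag([1.0, 1.0])     # condition number 1
H_ill  = np.diag([1.0, 100.0])   # condition number 100

# The learning rate must satisfy lr < 2/λ_max for stability, so the ill-conditioned
# problem forces a tiny step and needs far more iterations.
print("well-conditioned:", gd_iterations(H_well, lr=0.9), "iterations")
print("ill-conditioned: ", gd_iterations(H_ill, lr=0.019), "iterations")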
Part 3: Probability and Statistics
Random Variables and Distributions
from scipy import stats
def probability_basics():
"""Probability distributions commonly used in deep learning."""
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
x = np.linspace(-5, 5, 1000)
# 1. Gaussian (Normal) Distribution
ax = axes[0, 0]
for mu, sigma in [(0, 1), (0, 2), (2, 0.5)]:
y = stats.norm.pdf(x, mu, sigma)
ax.plot(x, y, label=f'μ={mu}, σ={sigma}')
ax.set_title('Gaussian Distribution')
ax.set_xlabel('x')
ax.set_ylabel('Probability Density')
ax.legend()
ax.grid(True, alpha=0.3)
# 2. Bernoulli Distribution
ax = axes[0, 1]
for p in [0.2, 0.5, 0.8]:
ax.bar([0, 1], [1-p, p], alpha=0.5, label=f'p={p}')
ax.set_title('Bernoulli Distribution')
ax.set_xlabel('x')
ax.set_ylabel('Probability')
ax.set_xticks([0, 1])
ax.legend()
# 3. Categorical (Multinomial) Distribution
ax = axes[0, 2]
categories = ['A', 'B', 'C', 'D']
probs = [0.1, 0.3, 0.4, 0.2]
ax.bar(categories, probs)
ax.set_title('Categorical Distribution')
ax.set_xlabel('Category')
ax.set_ylabel('Probability')
# 4. Uniform Distribution
ax = axes[1, 0]
x_unif = np.linspace(-1, 3, 1000)
y_unif = stats.uniform.pdf(x_unif, loc=0, scale=2)
ax.plot(x_unif, y_unif)
ax.set_title('Uniform Distribution [0, 2]')
ax.set_xlabel('x')
ax.set_ylabel('Probability Density')
ax.grid(True, alpha=0.3)
# 5. Exponential Distribution
ax = axes[1, 1]
x_exp = np.linspace(0, 5, 1000)
for lam in [0.5, 1, 2]:
y_exp = stats.expon.pdf(x_exp, scale=1/lam)
ax.plot(x_exp, y_exp, label=f'λ={lam}')
ax.set_title('Exponential Distribution')
ax.set_xlabel('x')
ax.set_ylabel('Probability Density')
ax.legend()
ax.grid(True, alpha=0.3)
# 6. Beta Distribution (for probabilities of probabilities)
ax = axes[1, 2]
x_beta = np.linspace(0, 1, 1000)
for a, b in [(0.5, 0.5), (2, 2), (2, 5), (5, 2)]:
y_beta = stats.beta.pdf(x_beta, a, b)
ax.plot(x_beta, y_beta, label=f'α={a}, β={b}')
ax.set_title('Beta Distribution')
ax.set_xlabel('x')
ax.set_ylabel('Probability Density')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
probability_basics()
Information Theory: Entropy and Cross-Entropy
Entropy measures uncertainty in a distribution:
$$H(p) = -\sum_x p(x) \log p(x)$$
Cross-Entropy measures how well distribution $q$ approximates $p$:
$$H(p, q) = -\sum_x p(x) \log q(x)$$
def information_theory_demo():
"""Entropy, cross-entropy, and KL divergence."""
# Example: True distribution p and model distribution q
p = np.array([0.7, 0.2, 0.1]) # True labels
q = np.array([0.5, 0.3, 0.2]) # Model predictions
# Entropy of p
entropy_p = -np.sum(p * np.log(p + 1e-10))
print(f"Entropy H(p) = {entropy_p:.4f}")
print(" Measures inherent uncertainty in true distribution")
# Cross-entropy H(p, q)
cross_entropy = -np.sum(p * np.log(q + 1e-10))
print(f"\nCross-Entropy H(p,q) = {cross_entropy:.4f}")
print(" This is our LOSS FUNCTION in classification!")
# KL Divergence = Cross-Entropy - Entropy
kl_divergence = cross_entropy - entropy_p
print(f"\nKL Divergence D_KL(p||q) = {kl_divergence:.4f}")
print(" Extra bits needed due to using q instead of p")
# Verify: KL divergence is always >= 0
print(f"\nKL >= 0: {kl_divergence >= 0}")
# Example: Why cross-entropy loss works
print("\n" + "="*50)
print("WHY CROSS-ENTROPY LOSS WORKS")
print("="*50)
# True label: class 0 (one-hot)
true_label = np.array([1, 0, 0])
# Different predictions
predictions = [
np.array([0.9, 0.05, 0.05]), # Good
np.array([0.6, 0.2, 0.2]), # OK
np.array([0.3, 0.4, 0.3]), # Bad
np.array([0.1, 0.5, 0.4]), # Very bad
]
print("\nTrue label: class 0")
print("Prediction\t\t\tCross-Entropy Loss")
print("-" * 50)
for pred in predictions:
loss = -np.sum(true_label * np.log(pred + 1e-10))
print(f"{pred}\t\t{loss:.4f}")
information_theory_demo()
Maximum Likelihood Estimation
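Maximum likelihood estimation chooses the parameters that make the observed data most probable:
$$\hat{\theta}_{\text{MLE}} = \arg\max_\theta \prod_{i=1}^{n} p(x_i \mid \theta) = \arg\max_\theta \sum_{i=1}^{n} \log p(x_i \mid \theta)$$
In practice we work with the log-likelihood (or minimize its negative), which is exactly what the demo below does for a Gaussian.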
def mle_demo():
"""Maximum Likelihood Estimation - the foundation of training."""
# Generate data from unknown distribution
np.random.seed(42)
true_mu = 5.0
true_sigma = 2.0
data = np.random.normal(true_mu, true_sigma, 100)
# MLE for Gaussian: find μ and σ that maximize likelihood
# For Gaussian, MLE gives: μ̂ = mean(data), σ̂ = std(data)
mu_range = np.linspace(3, 7, 100)
sigma_range = np.linspace(0.5, 4, 100)
# Compute log-likelihood surface
def log_likelihood(mu, sigma, data):
n = len(data)
ll = -n/2 * np.log(2*np.pi) - n*np.log(sigma) - np.sum((data - mu)**2) / (2*sigma**2)
return ll
MU, SIGMA = np.meshgrid(mu_range, sigma_range)
LL = np.zeros_like(MU)
for i in range(len(sigma_range)):
for j in range(len(mu_range)):
LL[i, j] = log_likelihood(mu_range[j], sigma_range[i], data)
# Find MLE
mle_mu = np.mean(data)
mle_sigma = np.std(data)
print(f"True parameters: μ={true_mu}, σ={true_sigma}")
print(f"MLE estimates: μ̂={mle_mu:.4f}, σ̂={mle_sigma:.4f}")
# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Log-likelihood surface
ax = axes[0]
contour = ax.contourf(MU, SIGMA, LL, levels=50, cmap='viridis')
ax.plot(mle_mu, mle_sigma, 'r*', markersize=15, label='MLE')
ax.plot(true_mu, true_sigma, 'w+', markersize=15, mew=3, label='True')
ax.set_xlabel('μ')
ax.set_ylabel('σ')
ax.set_title('Log-Likelihood Surface')
ax.legend()
plt.colorbar(contour, ax=ax)
# Data and fitted distribution
ax = axes[1]
ax.hist(data, bins=20, density=True, alpha=0.7, label='Data')
x = np.linspace(min(data)-1, max(data)+1, 100)
ax.plot(x, stats.norm.pdf(x, mle_mu, mle_sigma), 'r-', lw=2, label='MLE fit')
ax.plot(x, stats.norm.pdf(x, true_mu, true_sigma), 'g--', lw=2, label='True dist')
ax.set_xlabel('x')
ax.set_ylabel('Density')
ax.set_title('Data and Fitted Distribution')
ax.legend()
plt.tight_layout()
plt.show()
print("\nKey insight: Training neural networks = Maximum Likelihood Estimation!")
print(" - Cross-entropy loss = negative log-likelihood")
print(" - Minimizing loss = Maximizing likelihood")
mle_demo()
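A minimal numerical check of that claim (a sketch added here, not part of mle_demo): for a one-hot label, the cross-entropy loss is exactly the negative log-likelihood the model assigns to the correct class.
import numpy as np

# One-hot label for class 0 and a model's predicted class probabilities
true_label = np.array([1.0, 0.0, 0.0])
predicted = np.array([0.7, 0.2, 0.1])

cross_entropy = -np.sum(true_label * np.log(predicted))  # classification loss
neg_log_likelihood = -np.log(predicted[0])               # -log p(correct class)

print(cross_entropy, neg_log_likelihood)  # both ≈ 0.3567
assert np.isclose(cross_entropy, neg_log_likelihood)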
Part 4: Putting It All Together
The Full Picture
def full_picture():
"""How math concepts connect in a neural network."""
print("="*60)
print("MATHEMATICS IN A SINGLE FORWARD-BACKWARD PASS")
print("="*60)
print("""
1. LINEAR ALGEBRA
─────────────────
• Forward pass: y = Wx + b (matrix multiplication)
• Parameters are matrices and vectors
• Batch processing: multiple samples at once
2. CALCULUS
─────────────────
• Loss function: L = f(ŷ, y)
• Gradients: ∂L/∂W, ∂L/∂b via chain rule
• Backpropagation: recursive application of chain rule
• Update: W ← W - η∇L
3. PROBABILITY
─────────────────
• Softmax: converts logits to probabilities
• Cross-entropy: measures prediction quality
• Training = Maximum Likelihood Estimation
• Dropout: approximates Bayesian inference
4. OPTIMIZATION (combines all three)
─────────────────
• Gradient descent navigates loss landscape
• Momentum uses exponential moving averages
• Adam adapts learning rates per parameter
• Curvature (Hessian) affects convergence
""")
full_picture()
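As a compact sketch tying those four ingredients together (a simplified, illustrative example rather than code from this chapter): one linear layer, a softmax, a cross-entropy loss, and a single gradient descent update.
import numpy as np

# 1. Linear algebra: forward pass through one fully connected layer
rng = np.random.default_rng(0)
X = rng.standard_normal((4, 3))            # batch of 4 samples, 3 features
y = np.array([0, 2, 1, 0])                 # integer class labels (3 classes)
W = rng.standard_normal((3, 3)) * 0.1
b = np.zeros(3)

logits = X @ W + b                          # (4, 3)

# 3. Probability: softmax turns logits into class probabilities
probs = np.exp(logits - logits.max(axis=1, keepdims=True))
probs /= probs.sum(axis=1, keepdims=True)

# Cross-entropy loss = negative log-likelihood of the true classes
loss = -np.mean(np.log(probs[np.arange(4), y]))

# 2. Calculus: gradient of the loss w.r.t. the logits for softmax + cross-entropy,
#    then the chain rule through the matrix multiplication
dlogits = probs.copy()
dlogits[np.arange(4), y] -= 1.0
dlogits /= 4
dW = X.T @ dlogits
db = dlogits.sum(axis=0)

# 4. Optimization: one gradient descent update W ← W - η∇L
lr = 0.1
W -= lr * dW
b -= lr * db
print(f"loss after forward pass: {loss:.4f}")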
Exercises
Exercise 1: Implement Gradient Descent from Scratch
Implement gradient descent for the quadratic function $f(x, y) = x^2 + 10y^2$
(an ill-conditioned function). Compare with different learning rates.
def exercise_1():
def f(xy):
return xy[0]**2 + 10*xy[1]**2
def grad_f(xy):
return np.array([2*xy[0], 20*xy[1]])
# TODO: Implement gradient descent
# TODO: Try learning rates 0.01, 0.05, 0.1
# TODO: Plot the optimization paths
pass
Exercise 2: Eigenvalue Analysis of Weight Matrices
Analyze the eigenvalue spectrum of randomly initialized weight matrices.
How does initialization scale affect the eigenvalues?
def exercise_2():
sizes = [100, 500, 1000]
for size in sizes:
W_xavier = np.random.randn(size, size) * np.sqrt(1/size)
W_he = np.random.randn(size, size) * np.sqrt(2/size)
# TODO: Compute eigenvalues
# TODO: Plot eigenvalue distributions
# TODO: Compute condition numbers
pass
Exercise 3: Numerical Gradient Checking
Implement gradient checking to verify backpropagation is correct.
def exercise_3():
"""Compare analytical and numerical gradients."""
def check_gradients(f, grad_f, x, eps=1e-5):
numerical_grad = np.zeros_like(x)
for i in range(len(x)):
x_plus = x.copy()
x_minus = x.copy()
x_plus[i] += eps
x_minus[i] -= eps
numerical_grad[i] = (f(x_plus) - f(x_minus)) / (2*eps)
analytical_grad = grad_f(x)
# TODO: Compute relative error
# TODO: Return True if gradients match
pass
Exercise 4: Information Theory in Classification
Explore how entropy and cross-entropy change during training.
def exercise_4():
# Simulate training: predictions getting better over epochs
epochs = [1, 10, 50, 100]
true_label = np.array([1, 0, 0, 0, 0]) # One-hot
# Predictions at different epochs (getting more confident)
predictions = [
np.array([0.3, 0.2, 0.2, 0.15, 0.15]), # Epoch 1
np.array([0.5, 0.15, 0.15, 0.1, 0.1]), # Epoch 10
np.array([0.8, 0.07, 0.07, 0.03, 0.03]), # Epoch 50
np.array([0.95, 0.02, 0.01, 0.01, 0.01]) # Epoch 100
]
# TODO: Compute entropy of predictions
# TODO: Compute cross-entropy loss
# TODO: Plot how they change over epochs
pass
Exercise 5: Hessian and Optimization Difficulty
Analyze how the condition number of the Hessian affects optimization.
def exercise_5():
# Well-conditioned function
def f1(xy):
return xy[0]**2 + xy[1]**2 # Condition number = 1
# Ill-conditioned function
def f2(xy):
return xy[0]**2 + 100*xy[1]**2 # Condition number = 100
# TODO: Run gradient descent on both
# TODO: Compare number of iterations to converge
# TODO: Visualize the optimization paths
pass