
Mathematical Foundations for Deep Learning

Why Math Matters

Deep learning isn’t just “import torch and train” — understanding the mathematics enables you to:
  • Debug when models fail (why are gradients exploding?)
  • Innovate by designing new architectures and loss functions
  • Optimize by understanding what optimizers actually do
  • Read papers that describe cutting-edge research
This chapter provides a comprehensive mathematical foundation. If you’re already comfortable with linear algebra and calculus, use this as a reference. If not, work through each section carefully — it will pay dividends throughout your deep learning journey.

Part 1: Linear Algebra

Vectors and Vector Spaces

A vector in $\mathbb{R}^n$ is an ordered collection of $n$ real numbers:

$$\mathbf{x} = \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_n \end{bmatrix} \in \mathbb{R}^n$$
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Vectors in Python
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])

print("Vector x:", x)
print("Vector y:", y)
print("Shape:", x.shape)

Vector Operations

# ========================================
# FUNDAMENTAL VECTOR OPERATIONS
# ========================================

def vector_operations():
    x = np.array([1.0, 2.0, 3.0])
    y = np.array([4.0, 5.0, 6.0])
    
    # Addition (element-wise)
    print("Addition: x + y =", x + y)
    
    # Scalar multiplication
    print("Scalar mult: 2 * x =", 2 * x)
    
    # Dot product (inner product)
    dot = np.dot(x, y)  # = x₁y₁ + x₂y₂ + x₃y₃
    print(f"Dot product: x · y = {dot}")
    
    # Magnitude (L2 norm)
    magnitude = np.linalg.norm(x)  # = √(x₁² + x₂² + x₃²)
    print(f"Magnitude ||x|| = {magnitude:.4f}")
    
    # Unit vector (direction)
    unit = x / magnitude
    print(f"Unit vector: x/||x|| = {unit}")
    print(f"Unit vector magnitude: {np.linalg.norm(unit):.4f}")  # Should be 1
    
    # Angle between vectors
    cos_theta = np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
    theta = np.arccos(cos_theta)
    print(f"Angle between x and y: {np.degrees(theta):.2f}°")
    
    return x, y

vector_operations()

The Dot Product: Geometric Interpretation

The dot product has profound geometric meaning:

$$\mathbf{x} \cdot \mathbf{y} = \|\mathbf{x}\| \|\mathbf{y}\| \cos(\theta)$$
def visualize_dot_product():
    """Visualize the geometric meaning of dot product."""
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    
    vectors = [
        (np.array([1, 0]), np.array([1, 1]), "Acute angle"),
        (np.array([1, 0]), np.array([0, 1]), "Perpendicular"),
        (np.array([1, 0]), np.array([-1, 0.5]), "Obtuse angle"),
    ]
    
    for ax, (x, y, title) in zip(axes, vectors):
        ax.quiver(0, 0, x[0], x[1], angles='xy', scale_units='xy', scale=1, 
                  color='blue', label='x', width=0.02)
        ax.quiver(0, 0, y[0], y[1], angles='xy', scale_units='xy', scale=1, 
                  color='red', label='y', width=0.02)
        
        dot = np.dot(x, y)
        ax.set_title(f"{title}\nx·y = {dot:.2f}")
        ax.set_xlim(-1.5, 1.5)
        ax.set_ylim(-0.5, 1.5)
        ax.grid(True, alpha=0.3)
        ax.set_aspect('equal')
        ax.legend()
    
    plt.tight_layout()
    plt.show()
    
    print("Key insight:")
    print("  • x·y > 0: vectors point in similar direction")
    print("  • x·y = 0: vectors are perpendicular (orthogonal)")
    print("  • x·y < 0: vectors point in opposite directions")

visualize_dot_product()
In Neural Networks: The dot product is the fundamental operation in neurons! The pre-activation $z = \mathbf{w} \cdot \mathbf{x} + b$ computes how “aligned” the input is with the weight vector.
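To make this concrete, here is a minimal sketch of a single neuron in NumPy; the input values, weights, and the choice of a sigmoid activation are illustrative, not taken from any particular model.

import numpy as np

x = np.array([0.5, -1.2, 3.0])   # input features (illustrative values)
w = np.array([0.8, 0.1, -0.4])   # weight vector
b = 0.25                         # bias

z = np.dot(w, x) + b             # pre-activation: w · x + b
a = 1 / (1 + np.exp(-z))         # sigmoid squashes z into (0, 1)

print(f"Pre-activation z = {z:.4f}")
print(f"Activation a = {a:.4f}")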

Matrices and Matrix Operations

A matrix is a 2D array of numbers:

$$\mathbf{A} = \begin{bmatrix} a_{11} & a_{12} & \cdots & a_{1n} \\ a_{21} & a_{22} & \cdots & a_{2n} \\ \vdots & \vdots & \ddots & \vdots \\ a_{m1} & a_{m2} & \cdots & a_{mn} \end{bmatrix} \in \mathbb{R}^{m \times n}$$
# ========================================
# MATRIX OPERATIONS
# ========================================

def matrix_operations():
    A = np.array([
        [1, 2, 3],
        [4, 5, 6]
    ])  # 2×3 matrix
    
    B = np.array([
        [7, 8],
        [9, 10],
        [11, 12]
    ])  # 3×2 matrix
    
    print("Matrix A (2×3):")
    print(A)
    print("\nMatrix B (3×2):")
    print(B)
    
    # Matrix multiplication
    # (2×3) @ (3×2) = (2×2)
    C = A @ B  # or np.matmul(A, B)
    print("\nA @ B (2×2):")
    print(C)
    
    # How matrix multiplication works:
    # C[i,j] = sum(A[i,:] * B[:,j]) = dot product of row i and column j
    print("\nManual calculation of C[0,0]:")
    print(f"  A[0,:] · B[:,0] = {A[0,:]} · {B[:,0]} = {np.dot(A[0,:], B[:,0])}")
    
    # Transpose
    print("\nTranspose A^T:")
    print(A.T)
    
    # Element-wise multiplication (Hadamard product)
    D = np.array([[1, 2], [3, 4]])
    E = np.array([[5, 6], [7, 8]])
    print("\nElement-wise D ⊙ E:")
    print(D * E)
    
    return A, B, C

matrix_operations()

Matrix Multiplication Dimensions

Critical Rule: For $\mathbf{C} = \mathbf{A} \times \mathbf{B}$:
  • $\mathbf{A}$ is $(m \times n)$
  • $\mathbf{B}$ is $(n \times p)$
  • $\mathbf{C}$ is $(m \times p)$
The inner dimensions must match!
def matrix_dimension_examples():
    """Common matrix dimension patterns in deep learning."""
    
    # Fully connected layer: y = Wx + b
    batch_size = 32
    input_dim = 784   # e.g., flattened 28×28 image
    output_dim = 256  # hidden layer size
    
    X = np.random.randn(batch_size, input_dim)  # (32, 784)
    W = np.random.randn(input_dim, output_dim)  # (784, 256)
    b = np.random.randn(output_dim)             # (256,)
    
    Y = X @ W + b  # (32, 784) @ (784, 256) = (32, 256)
    
    print(f"Input X: {X.shape}")
    print(f"Weights W: {W.shape}")
    print(f"Bias b: {b.shape}")
    print(f"Output Y: {Y.shape}")
    print(f"\nOperation: ({batch_size}×{input_dim}) @ ({input_dim}×{output_dim}) = ({batch_size}×{output_dim})")

matrix_dimension_examples()

Eigenvalues and Eigenvectors

Eigenvectors are special directions that only get scaled (not rotated) by a matrix:

$$\mathbf{A}\mathbf{v} = \lambda\mathbf{v}$$
def eigenvalue_demo():
    """Eigenvalues and eigenvectors demonstration."""
    # Symmetric matrix (always has real eigenvalues)
    A = np.array([
        [4, 2],
        [2, 3]
    ])
    
    # Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = np.linalg.eig(A)
    
    print("Matrix A:")
    print(A)
    print(f"\nEigenvalues: {eigenvalues}")
    print(f"\nEigenvectors (as columns):")
    print(eigenvectors)
    
    # Verify: Av = λv
    for i in range(len(eigenvalues)):
        v = eigenvectors[:, i]
        lam = eigenvalues[i]
        Av = A @ v
        lambda_v = lam * v
        print(f"\nEigenvector {i+1}: v = {v}")
        print(f"  A @ v     = {Av}")
        print(f"  λ * v     = {lambda_v}")
        print(f"  Equal? {np.allclose(Av, lambda_v)}")
    
    # Visualize
    fig, ax = plt.subplots(figsize=(8, 8))
    
    # Draw unit circle and its transformation
    theta = np.linspace(0, 2*np.pi, 100)
    circle = np.array([np.cos(theta), np.sin(theta)])
    transformed = A @ circle
    
    ax.plot(circle[0], circle[1], 'b-', label='Unit circle', alpha=0.5)
    ax.plot(transformed[0], transformed[1], 'r-', label='A @ circle', alpha=0.5)
    
    # Draw eigenvectors
    for i in range(len(eigenvalues)):
        v = eigenvectors[:, i] * 2  # Scale for visibility
        ax.arrow(0, 0, v[0], v[1], head_width=0.1, head_length=0.1, 
                fc=f'C{i+2}', ec=f'C{i+2}', label=f'Eigenvector {i+1} (λ={eigenvalues[i]:.2f})')
    
    ax.set_xlim(-6, 6)
    ax.set_ylim(-6, 6)
    ax.set_aspect('equal')
    ax.grid(True, alpha=0.3)
    ax.legend()
    ax.set_title('Eigenvalues: Directions that only scale')
    plt.show()

eigenvalue_demo()
In Deep Learning: Eigenvalues are crucial for understanding:
  • PCA for dimensionality reduction (see the sketch after this list)
  • Weight matrix conditioning (ratio of max/min eigenvalues affects training)
  • Hessian analysis for optimization landscapes
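As a quick look at the PCA connection, here is a minimal sketch that recovers principal components from the eigendecomposition of a covariance matrix; the synthetic data, the seed, and the choice of two components are assumptions made for illustration.

import numpy as np

def pca_via_eigendecomposition(X, n_components=2):
    """Sketch of PCA: project data onto the top eigenvectors of its covariance."""
    X_centered = X - X.mean(axis=0)
    cov = np.cov(X_centered, rowvar=False)           # (d, d) covariance matrix
    eigenvalues, eigenvectors = np.linalg.eigh(cov)  # eigh: symmetric matrix
    order = np.argsort(eigenvalues)[::-1]            # sort by descending variance
    components = eigenvectors[:, order[:n_components]]
    return X_centered @ components, eigenvalues[order]

# Synthetic correlated data (illustrative)
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3)) @ np.array([[2.0, 0.0, 0.0],
                                          [1.5, 0.5, 0.0],
                                          [0.0, 0.0, 0.1]])

projected, variances = pca_via_eigendecomposition(X, n_components=2)
print("Projected shape:", projected.shape)           # (200, 2)
print("Variance along each principal axis:", variances)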

Singular Value Decomposition (SVD)

Any matrix can be decomposed as:

$$\mathbf{A} = \mathbf{U}\mathbf{\Sigma}\mathbf{V}^T$$
def svd_demo():
    """Singular Value Decomposition and its applications."""
    # Example: Image compression with SVD
    from skimage import data
    from skimage.color import rgb2gray
    
    # Load image
    image = rgb2gray(data.astronaut())
    print(f"Original image shape: {image.shape}")
    
    # Compute SVD
    U, s, Vt = np.linalg.svd(image, full_matrices=False)
    print(f"U shape: {U.shape}")
    print(f"Singular values shape: {s.shape}")
    print(f"V^T shape: {Vt.shape}")
    
    # Reconstruct with different ranks
    fig, axes = plt.subplots(1, 5, figsize=(20, 4))
    
    ranks = [5, 20, 50, 100, len(s)]
    for ax, k in zip(axes, ranks):
        # Low-rank approximation
        reconstructed = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
        compression = 100 * (1 - (k * (U.shape[0] + Vt.shape[1])) / image.size)
        
        ax.imshow(reconstructed, cmap='gray')
        ax.set_title(f'Rank {k}\n{compression:.1f}% compression')
        ax.axis('off')
    
    plt.tight_layout()
    plt.show()
    
    # Singular value spectrum
    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.plot(s[:100])
    plt.xlabel('Index')
    plt.ylabel('Singular Value')
    plt.title('Singular Value Spectrum')
    
    plt.subplot(1, 2, 2)
    plt.plot(np.cumsum(s**2) / np.sum(s**2))
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Variance Explained')
    plt.title('Variance Explained')
    plt.axhline(y=0.95, color='r', linestyle='--', label='95%')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

svd_demo()

Part 2: Calculus for Deep Learning

Derivatives: The Foundation of Learning

A derivative measures the rate of change:

$$f'(x) = \frac{df}{dx} = \lim_{h \to 0} \frac{f(x+h) - f(x)}{h}$$
def derivative_basics():
    """Understanding derivatives numerically and analytically."""
    
    # Example function: f(x) = x²
    def f(x):
        return x ** 2
    
    # Numerical derivative
    def numerical_derivative(f, x, h=1e-7):
        return (f(x + h) - f(x - h)) / (2 * h)  # Central difference
    
    # Analytical derivative: f'(x) = 2x
    def analytical_derivative(x):
        return 2 * x
    
    # Compare at different points
    test_points = [-2, -1, 0, 1, 2, 3]
    print("x\t\tNumerical\tAnalytical")
    print("-" * 40)
    for x in test_points:
        num = numerical_derivative(f, x)
        ana = analytical_derivative(x)
        print(f"{x}\t\t{num:.6f}\t{ana:.6f}")
    
    # Visualize
    x = np.linspace(-3, 3, 100)
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    
    axes[0].plot(x, f(x), label='f(x) = x²')
    axes[0].set_title('Function')
    axes[0].set_xlabel('x')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    
    axes[1].plot(x, analytical_derivative(x), label="f'(x) = 2x")
    axes[1].set_title('Derivative')
    axes[1].set_xlabel('x')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

derivative_basics()

The Chain Rule: How Backpropagation Works

For composite functions $f(g(x))$:

$$\frac{d}{dx}\left[f(g(x))\right] = f'(g(x)) \cdot g'(x)$$
def chain_rule_demo():
    """The chain rule is the heart of backpropagation."""
    
    # Example: f(x) = (x² + 1)³
    # Let g(x) = x² + 1, h(g) = g³
    # Then f(x) = h(g(x))
    
    def g(x):
        return x**2 + 1
    
    def h(z):
        return z**3
    
    def f(x):
        return h(g(x))
    
    # Derivatives
    def dg_dx(x):
        return 2*x
    
    def dh_dg(z):
        return 3*z**2
    
    def df_dx_chain_rule(x):
        """Using chain rule: df/dx = dh/dg * dg/dx"""
        return dh_dg(g(x)) * dg_dx(x)
    
    # Verify numerically
    def numerical_derivative(f, x, h=1e-7):
        return (f(x + h) - f(x - h)) / (2 * h)
    
    x_test = 2.0
    print(f"Testing at x = {x_test}")
    print(f"Chain rule: df/dx = {df_dx_chain_rule(x_test)}")
    print(f"Numerical:  df/dx = {numerical_derivative(f, x_test):.6f}")
    print(f"Match: {np.isclose(df_dx_chain_rule(x_test), numerical_derivative(f, x_test))}")
    
    # Longer chain: f(x) = sin(exp(x²))
    print("\n" + "="*50)
    print("Longer chain: f(x) = sin(exp(x²))")
    
    def long_chain(x):
        return np.sin(np.exp(x**2))
    
    # df/dx = cos(exp(x²)) * exp(x²) * 2x
    def long_chain_derivative(x):
        return np.cos(np.exp(x**2)) * np.exp(x**2) * 2*x
    
    x_test = 0.5
    print(f"Chain rule: {long_chain_derivative(x_test):.6f}")
    print(f"Numerical:  {numerical_derivative(long_chain, x_test):.6f}")

chain_rule_demo()
The Chain Rule in Neural Networks: For a loss $L$ at the end of a network with layers $f_1, f_2, \ldots, f_n$:

$$\frac{\partial L}{\partial \theta_1} = \frac{\partial L}{\partial f_n} \cdot \frac{\partial f_n}{\partial f_{n-1}} \cdots \frac{\partial f_2}{\partial f_1} \cdot \frac{\partial f_1}{\partial \theta_1}$$

This is backpropagation: applying the chain rule from output to input!
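To see this end to end, here is a minimal sketch of backpropagation through a tiny two-layer network with a squared-error loss; the layer sizes, initialization, and the ReLU/squared-error choices are arbitrary, and one gradient entry is checked numerically.

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=3)               # input
y_true = 1.0                         # target
W1 = rng.normal(size=(4, 3)) * 0.5   # first-layer weights
W2 = rng.normal(size=(1, 4)) * 0.5   # second-layer weights

# Forward pass: h = relu(W1 x), y = W2 h, L = (y - y_true)^2
z1 = W1 @ x
h = np.maximum(z1, 0.0)
y = (W2 @ h)[0]
loss = (y - y_true) ** 2

# Backward pass: chain rule applied layer by layer, output to input
dL_dy = 2 * (y - y_true)             # dL/dy
dL_dW2 = dL_dy * h[None, :]          # dL/dW2 = dL/dy · dy/dW2
dL_dh = dL_dy * W2[0]                # propagate into the hidden layer
dL_dz1 = dL_dh * (z1 > 0)            # ReLU derivative
dL_dW1 = np.outer(dL_dz1, x)         # dL/dW1

# Numerical check of one entry of dL/dW1
eps = 1e-6
W1p = W1.copy()
W1p[0, 0] += eps
loss_p = (W2 @ np.maximum(W1p @ x, 0.0) - y_true) ** 2
print("Analytic dL/dW1[0,0]:", dL_dW1[0, 0])
print("Numeric  dL/dW1[0,0]:", (loss_p[0] - loss) / eps)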

Partial Derivatives and Gradients

For functions of multiple variables, the gradient is the vector of all partial derivatives:

$$\nabla f = \begin{bmatrix} \frac{\partial f}{\partial x_1} \\ \frac{\partial f}{\partial x_2} \\ \vdots \\ \frac{\partial f}{\partial x_n} \end{bmatrix}$$
def gradient_demo():
    """Gradients for multivariable functions."""
    
    # f(x, y) = x² + y² (paraboloid)
    def f(xy):
        x, y = xy
        return x**2 + y**2
    
    def gradient_f(xy):
        x, y = xy
        return np.array([2*x, 2*y])
    
    # Visualize the gradient field
    x = np.linspace(-2, 2, 20)
    y = np.linspace(-2, 2, 20)
    X, Y = np.meshgrid(x, y)
    Z = X**2 + Y**2
    
    fig = plt.figure(figsize=(15, 5))
    
    # 3D surface
    ax1 = fig.add_subplot(131, projection='3d')
    ax1.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
    ax1.set_xlabel('x')
    ax1.set_ylabel('y')
    ax1.set_zlabel('f(x,y)')
    ax1.set_title('Function: f(x,y) = x² + y²')
    
    # Contour with gradient arrows
    ax2 = fig.add_subplot(132)
    ax2.contour(X, Y, Z, levels=20, cmap='viridis')
    
    # Gradient field (subsample for clarity)
    skip = 2
    ax2.quiver(X[::skip, ::skip], Y[::skip, ::skip], 
               2*X[::skip, ::skip], 2*Y[::skip, ::skip],
               color='red', alpha=0.7)
    ax2.set_xlabel('x')
    ax2.set_ylabel('y')
    ax2.set_title('Gradient field (arrows)')
    ax2.set_aspect('equal')
    
    # Gradient descent path
    ax3 = fig.add_subplot(133)
    ax3.contour(X, Y, Z, levels=20, cmap='viridis')
    
    # Simulate gradient descent
    point = np.array([1.8, 1.5])
    path = [point.copy()]
    lr = 0.1
    
    for _ in range(20):
        grad = gradient_f(point)
        point = point - lr * grad  # Gradient descent step
        path.append(point.copy())
    
    path = np.array(path)
    ax3.plot(path[:, 0], path[:, 1], 'ro-', markersize=4, label='GD path')
    ax3.plot(path[0, 0], path[0, 1], 'go', markersize=10, label='Start')
    ax3.plot(path[-1, 0], path[-1, 1], 'b*', markersize=15, label='End')
    ax3.set_xlabel('x')
    ax3.set_ylabel('y')
    ax3.set_title('Gradient Descent Path')
    ax3.legend()
    ax3.set_aspect('equal')
    
    plt.tight_layout()
    plt.show()
    
    print(f"Starting point: {path[0]}")
    print(f"Ending point: {path[-1]}")
    print(f"Minimum at (0, 0)")

gradient_demo()

The Jacobian Matrix

For vector-valued functions $\mathbf{f}: \mathbb{R}^n \to \mathbb{R}^m$, the Jacobian is the matrix of all partial derivatives:

$$\mathbf{J} = \begin{bmatrix} \frac{\partial f_1}{\partial x_1} & \cdots & \frac{\partial f_1}{\partial x_n} \\ \vdots & \ddots & \vdots \\ \frac{\partial f_m}{\partial x_1} & \cdots & \frac{\partial f_m}{\partial x_n} \end{bmatrix}$$
import torch

def jacobian_demo():
    """Jacobian matrices in PyTorch."""
    
    # Function: f(x) = [x₁², x₁x₂, x₂²]ᵀ
    def f(x):
        return torch.stack([
            x[0]**2,
            x[0] * x[1],
            x[1]**2
        ])
    
    x = torch.tensor([2.0, 3.0], requires_grad=True)
    
    # Compute Jacobian
    jacobian = torch.autograd.functional.jacobian(f, x)
    
    print("Input x:", x.detach().numpy())
    print("f(x):", f(x).detach().numpy())
    print("\nJacobian matrix:")
    print(jacobian.numpy())
    
    # Manual verification
    print("\nManual calculation:")
    print("∂f₁/∂x₁ = 2x₁ =", 2*x[0].item())
    print("∂f₁/∂x₂ = 0")
    print("∂f₂/∂x₁ = x₂ =", x[1].item())
    print("∂f₂/∂x₂ = x₁ =", x[0].item())
    print("∂f₃/∂x₁ = 0")
    print("∂f₃/∂x₂ = 2x₂ =", 2*x[1].item())

jacobian_demo()

The Hessian Matrix

The Hessian is the matrix of second-order partial derivatives:

$$\mathbf{H} = \begin{bmatrix} \frac{\partial^2 f}{\partial x_1^2} & \frac{\partial^2 f}{\partial x_1 \partial x_2} & \cdots \\ \frac{\partial^2 f}{\partial x_2 \partial x_1} & \frac{\partial^2 f}{\partial x_2^2} & \cdots \\ \vdots & \vdots & \ddots \end{bmatrix}$$
def hessian_demo():
    """Hessian matrix and curvature analysis."""
    
    # f(x, y) = x² + 4xy + y²
    def f(xy):
        x, y = xy
        return x**2 + 4*x*y + y**2
    
    xy = torch.tensor([1.0, 1.0], requires_grad=True)
    
    # Compute Hessian
    hessian = torch.autograd.functional.hessian(f, xy)
    
    print("Hessian matrix:")
    print(hessian.numpy())
    
    # Eigenvalues of Hessian tell us about curvature
    eigenvalues = np.linalg.eigvalsh(hessian.numpy())
    print(f"\nEigenvalues: {eigenvalues}")
    
    if np.all(eigenvalues > 0):
        print("→ Positive definite: LOCAL MINIMUM")
    elif np.all(eigenvalues < 0):
        print("→ Negative definite: LOCAL MAXIMUM")
    else:
        print("→ Indefinite: SADDLE POINT")
    
    # Condition number: ratio of max/min eigenvalue
    condition = max(abs(eigenvalues)) / min(abs(eigenvalues))
    print(f"\nCondition number: {condition:.2f}")
    print("(High condition number = ill-conditioned = hard to optimize)")

hessian_demo()

Part 3: Probability and Statistics

Random Variables and Distributions

from scipy import stats

def probability_basics():
    """Probability distributions commonly used in deep learning."""
    
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    x = np.linspace(-5, 5, 1000)
    
    # 1. Gaussian (Normal) Distribution
    ax = axes[0, 0]
    for mu, sigma in [(0, 1), (0, 2), (2, 0.5)]:
        y = stats.norm.pdf(x, mu, sigma)
        ax.plot(x, y, label=f'μ={mu}, σ={sigma}')
    ax.set_title('Gaussian Distribution')
    ax.set_xlabel('x')
    ax.set_ylabel('Probability Density')
    ax.legend()
    ax.grid(True, alpha=0.3)
    
    # 2. Bernoulli Distribution
    ax = axes[0, 1]
    for p in [0.2, 0.5, 0.8]:
        ax.bar([0, 1], [1-p, p], alpha=0.5, label=f'p={p}')
    ax.set_title('Bernoulli Distribution')
    ax.set_xlabel('x')
    ax.set_ylabel('Probability')
    ax.set_xticks([0, 1])
    ax.legend()
    
    # 3. Categorical (Multinomial) Distribution
    ax = axes[0, 2]
    categories = ['A', 'B', 'C', 'D']
    probs = [0.1, 0.3, 0.4, 0.2]
    ax.bar(categories, probs)
    ax.set_title('Categorical Distribution')
    ax.set_xlabel('Category')
    ax.set_ylabel('Probability')
    
    # 4. Uniform Distribution
    ax = axes[1, 0]
    x_unif = np.linspace(-1, 3, 1000)
    y_unif = stats.uniform.pdf(x_unif, loc=0, scale=2)
    ax.plot(x_unif, y_unif)
    ax.set_title('Uniform Distribution [0, 2]')
    ax.set_xlabel('x')
    ax.set_ylabel('Probability Density')
    ax.grid(True, alpha=0.3)
    
    # 5. Exponential Distribution
    ax = axes[1, 1]
    x_exp = np.linspace(0, 5, 1000)
    for lam in [0.5, 1, 2]:
        y_exp = stats.expon.pdf(x_exp, scale=1/lam)
        ax.plot(x_exp, y_exp, label=f'λ={lam}')
    ax.set_title('Exponential Distribution')
    ax.set_xlabel('x')
    ax.set_ylabel('Probability Density')
    ax.legend()
    ax.grid(True, alpha=0.3)
    
    # 6. Beta Distribution (for probabilities of probabilities)
    ax = axes[1, 2]
    x_beta = np.linspace(0, 1, 1000)
    for a, b in [(0.5, 0.5), (2, 2), (2, 5), (5, 2)]:
        y_beta = stats.beta.pdf(x_beta, a, b)
        ax.plot(x_beta, y_beta, label=f'α={a}, β={b}')
    ax.set_title('Beta Distribution')
    ax.set_xlabel('x')
    ax.set_ylabel('Probability Density')
    ax.legend()
    ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

probability_basics()

Information Theory: Entropy and Cross-Entropy

Entropy measures the uncertainty in a distribution:

$$H(p) = -\sum_{x} p(x) \log p(x)$$

Cross-entropy measures how well a distribution $q$ approximates $p$:

$$H(p, q) = -\sum_{x} p(x) \log q(x)$$
def information_theory_demo():
    """Entropy, cross-entropy, and KL divergence."""
    
    # Example: True distribution p and model distribution q
    p = np.array([0.7, 0.2, 0.1])  # True labels
    q = np.array([0.5, 0.3, 0.2])  # Model predictions
    
    # Entropy of p
    entropy_p = -np.sum(p * np.log(p + 1e-10))
    print(f"Entropy H(p) = {entropy_p:.4f}")
    print("  Measures inherent uncertainty in true distribution")
    
    # Cross-entropy H(p, q)
    cross_entropy = -np.sum(p * np.log(q + 1e-10))
    print(f"\nCross-Entropy H(p,q) = {cross_entropy:.4f}")
    print("  This is our LOSS FUNCTION in classification!")
    
    # KL Divergence = Cross-Entropy - Entropy
    kl_divergence = cross_entropy - entropy_p
    print(f"\nKL Divergence D_KL(p||q) = {kl_divergence:.4f}")
    print("  Extra bits needed due to using q instead of p")
    
    # Verify: KL divergence is always >= 0
    print(f"\nKL >= 0: {kl_divergence >= 0}")
    
    # Example: Why cross-entropy loss works
    print("\n" + "="*50)
    print("WHY CROSS-ENTROPY LOSS WORKS")
    print("="*50)
    
    # True label: class 0 (one-hot)
    true_label = np.array([1, 0, 0])
    
    # Different predictions
    predictions = [
        np.array([0.9, 0.05, 0.05]),  # Good
        np.array([0.6, 0.2, 0.2]),    # OK
        np.array([0.3, 0.4, 0.3]),    # Bad
        np.array([0.1, 0.5, 0.4]),    # Very bad
    ]
    
    print("\nTrue label: class 0")
    print("Prediction\t\t\tCross-Entropy Loss")
    print("-" * 50)
    
    for pred in predictions:
        loss = -np.sum(true_label * np.log(pred + 1e-10))
        print(f"{pred}\t\t{loss:.4f}")

information_theory_demo()

Maximum Likelihood Estimation

def mle_demo():
    """Maximum Likelihood Estimation - the foundation of training."""
    
    # Generate data from unknown distribution
    np.random.seed(42)
    true_mu = 5.0
    true_sigma = 2.0
    data = np.random.normal(true_mu, true_sigma, 100)
    
    # MLE for Gaussian: find μ and σ that maximize likelihood
    # For Gaussian, MLE gives: μ̂ = mean(data), σ̂ = std(data)
    
    mu_range = np.linspace(3, 7, 100)
    sigma_range = np.linspace(0.5, 4, 100)
    
    # Compute log-likelihood surface
    def log_likelihood(mu, sigma, data):
        n = len(data)
        ll = -n/2 * np.log(2*np.pi) - n*np.log(sigma) - np.sum((data - mu)**2) / (2*sigma**2)
        return ll
    
    MU, SIGMA = np.meshgrid(mu_range, sigma_range)
    LL = np.zeros_like(MU)
    
    for i in range(len(sigma_range)):
        for j in range(len(mu_range)):
            LL[i, j] = log_likelihood(mu_range[j], sigma_range[i], data)
    
    # Find MLE
    mle_mu = np.mean(data)
    mle_sigma = np.std(data)
    
    print(f"True parameters: μ={true_mu}, σ={true_sigma}")
    print(f"MLE estimates:   μ̂={mle_mu:.4f}, σ̂={mle_sigma:.4f}")
    
    # Visualize
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Log-likelihood surface
    ax = axes[0]
    contour = ax.contourf(MU, SIGMA, LL, levels=50, cmap='viridis')
    ax.plot(mle_mu, mle_sigma, 'r*', markersize=15, label='MLE')
    ax.plot(true_mu, true_sigma, 'w+', markersize=15, mew=3, label='True')
    ax.set_xlabel('μ')
    ax.set_ylabel('σ')
    ax.set_title('Log-Likelihood Surface')
    ax.legend()
    plt.colorbar(contour, ax=ax)
    
    # Data and fitted distribution
    ax = axes[1]
    ax.hist(data, bins=20, density=True, alpha=0.7, label='Data')
    x = np.linspace(min(data)-1, max(data)+1, 100)
    ax.plot(x, stats.norm.pdf(x, mle_mu, mle_sigma), 'r-', lw=2, label='MLE fit')
    ax.plot(x, stats.norm.pdf(x, true_mu, true_sigma), 'g--', lw=2, label='True dist')
    ax.set_xlabel('x')
    ax.set_ylabel('Density')
    ax.set_title('Data and Fitted Distribution')
    ax.legend()
    
    plt.tight_layout()
    plt.show()
    
    print("\nKey insight: Training neural networks = Maximum Likelihood Estimation!")
    print("  - Cross-entropy loss = negative log-likelihood")
    print("  - Minimizing loss = Maximizing likelihood")

mle_demo()
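The equivalence can be checked directly: for one-hot labels, the average cross-entropy equals the negative log-likelihood of the labels under the model's predicted probabilities. A minimal sketch with made-up predictions:

import numpy as np

# Toy classification data: 4 samples, 3 classes (values are illustrative)
labels = np.array([0, 2, 1, 0])
probs = np.array([
    [0.7, 0.2, 0.1],
    [0.1, 0.3, 0.6],
    [0.2, 0.5, 0.3],
    [0.9, 0.05, 0.05],
])

# Negative log-likelihood: -mean log p(correct class)
nll = -np.mean(np.log(probs[np.arange(len(labels)), labels]))

# Cross-entropy with one-hot targets gives the same number
one_hot = np.eye(3)[labels]
cross_entropy = -np.mean(np.sum(one_hot * np.log(probs), axis=1))

print(f"Negative log-likelihood: {nll:.4f}")
print(f"Cross-entropy loss:      {cross_entropy:.4f}")  # identical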

Part 4: Putting It All Together

The Full Picture

def full_picture():
    """How math concepts connect in a neural network."""
    
    print("="*60)
    print("MATHEMATICS IN A SINGLE FORWARD-BACKWARD PASS")
    print("="*60)
    
    print("""
    1. LINEAR ALGEBRA
       ─────────────────
       • Forward pass: y = Wx + b (matrix multiplication)
       • Parameters are matrices and vectors
       • Batch processing: multiple samples at once
    
    2. CALCULUS
       ─────────────────
       • Loss function: L = f(ŷ, y)
       • Gradients: ∂L/∂W, ∂L/∂b via chain rule
       • Backpropagation: recursive application of chain rule
       • Update: W ← W - η∇L
    
    3. PROBABILITY
       ─────────────────
       • Softmax: converts logits to probabilities
       • Cross-entropy: measures prediction quality
       • Training = Maximum Likelihood Estimation
       • Dropout: approximates Bayesian inference
    
    4. OPTIMIZATION (combines all three)
       ─────────────────
       • Gradient descent navigates loss landscape
       • Momentum uses exponential moving averages
       • Adam adapts learning rates per parameter
       • Curvature (Hessian) affects convergence
    """)

full_picture()
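As a compact illustration of how these pieces combine, here is a minimal sketch of one training step of a single-layer softmax classifier in NumPy; the data, layer sizes, and learning rate are arbitrary choices for the example.

import numpy as np

rng = np.random.default_rng(0)
batch_size, input_dim, num_classes = 8, 5, 3

X = rng.normal(size=(batch_size, input_dim))         # inputs
y = rng.integers(0, num_classes, size=batch_size)    # integer class labels
W = rng.normal(size=(input_dim, num_classes)) * 0.1
b = np.zeros(num_classes)
lr = 0.1

# 1. Linear algebra: the forward pass is a matrix multiplication
logits = X @ W + b                                    # (8, 3)

# 2. Probability: softmax turns logits into class probabilities
exp = np.exp(logits - logits.max(axis=1, keepdims=True))
probs = exp / exp.sum(axis=1, keepdims=True)

# Cross-entropy loss = negative log-likelihood of the true labels
loss = -np.mean(np.log(probs[np.arange(batch_size), y]))

# 3. Calculus: gradients via the chain rule (softmax + cross-entropy)
dlogits = probs.copy()
dlogits[np.arange(batch_size), y] -= 1
dlogits /= batch_size
dW = X.T @ dlogits                                    # (5, 3)
db = dlogits.sum(axis=0)

# 4. Optimization: one gradient descent update
W -= lr * dW
b -= lr * db

print(f"Loss before update: {loss:.4f}")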

Exercises

Exercise 1: Implement gradient descent for the quadratic function $f(x, y) = x^2 + 10y^2$ (an ill-conditioned function). Compare the results with different learning rates.
def exercise_1():
    def f(xy):
        return xy[0]**2 + 10*xy[1]**2
    
    def grad_f(xy):
        return np.array([2*xy[0], 20*xy[1]])
    
    # TODO: Implement gradient descent
    # TODO: Try learning rates 0.01, 0.05, 0.1
    # TODO: Plot the optimization paths
    pass
Exercise 2: Analyze the eigenvalue spectrum of randomly initialized weight matrices. How does the initialization scale affect the eigenvalues?
def exercise_2():
    sizes = [100, 500, 1000]
    
    for size in sizes:
        W_xavier = np.random.randn(size, size) * np.sqrt(1/size)
        W_he = np.random.randn(size, size) * np.sqrt(2/size)
        
        # TODO: Compute eigenvalues
        # TODO: Plot eigenvalue distributions
        # TODO: Compute condition numbers
    pass
Exercise 3: Implement gradient checking to verify that backpropagation is correct.
def exercise_3():
    """Compare analytical and numerical gradients."""
    def check_gradients(f, grad_f, x, eps=1e-5):
        numerical_grad = np.zeros_like(x)
        
        for i in range(len(x)):
            x_plus = x.copy()
            x_minus = x.copy()
            x_plus[i] += eps
            x_minus[i] -= eps
            numerical_grad[i] = (f(x_plus) - f(x_minus)) / (2*eps)
        
        analytical_grad = grad_f(x)
        
        # TODO: Compute relative error
        # TODO: Return True if gradients match
        pass
Exercise 4: Explore how entropy and cross-entropy change during training.
def exercise_4():
    # Simulate training: predictions getting better over epochs
    epochs = [1, 10, 50, 100]
    
    true_label = np.array([1, 0, 0, 0, 0])  # One-hot
    
    # Predictions at different epochs (getting more confident)
    predictions = [
        np.array([0.3, 0.2, 0.2, 0.15, 0.15]),  # Epoch 1
        np.array([0.5, 0.15, 0.15, 0.1, 0.1]),  # Epoch 10
        np.array([0.8, 0.07, 0.07, 0.03, 0.03]), # Epoch 50
        np.array([0.95, 0.02, 0.01, 0.01, 0.01]) # Epoch 100
    ]
    
    # TODO: Compute entropy of predictions
    # TODO: Compute cross-entropy loss
    # TODO: Plot how they change over epochs
    pass
Exercise 5: Analyze how the condition number of the Hessian affects optimization.
def exercise_5():
    # Well-conditioned function
    def f1(xy):
        return xy[0]**2 + xy[1]**2  # Condition number = 1
    
    # Ill-conditioned function
    def f2(xy):
        return xy[0]**2 + 100*xy[1]**2  # Condition number = 100
    
    # TODO: Run gradient descent on both
    # TODO: Compare number of iterations to converge
    # TODO: Visualize the optimization paths
    pass

What’s Next?

Now that you have a solid mathematical foundation, you’re ready to understand deep learning at a fundamental level. Continue to: