Skip to main content
CNN Architectures

The Evolution of CNN Architectures

Timeline of Innovation

1998: LeNet-5      → First practical CNN (32x32 images)
2012: AlexNet      → Deep learning revolution (227x227)
2014: VGGNet       → Deeper is better (224x224)
2014: GoogLeNet    → Inception modules, efficient depth
2015: ResNet       → Skip connections, 1000+ layers possible
2016: DenseNet     → Dense connections, feature reuse
2017: MobileNet    → Efficient architectures
2019: EfficientNet → Compound scaling
2020: Vision Transformers → Attention in vision
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import List, Tuple, Optional

# Seed PyTorch's global RNG so the randomly initialised layer weights and the
# torch.randn inputs used by the snippets below are repeatable between runs.
torch.manual_seed(42)

VGGNet: Simplicity and Depth

Design Philosophy

VGGNet proved that depth with small filters beats shallow networks with large filters. Key insights:
  • Use only 3×3 convolutions (the receptive field grows with depth)
  • Double channels when halving spatial dimensions
  • Simple, uniform architecture
class VGGBlock(nn.Module):
    """One VGG stage: ``num_convs`` × (3×3 conv + ReLU), then a 2×2 max-pool.

    The 3×3 convolutions use padding 1, so they keep the spatial size; the
    trailing max-pool (kernel 2, stride 2) halves height and width.

    Args:
        in_channels: Channels of the tensor entering the stage.
        out_channels: Channels produced by every convolution in the stage.
        num_convs: Number of conv + ReLU pairs placed before the pooling layer.
    """

    def __init__(self, in_channels: int, out_channels: int, num_convs: int):
        super().__init__()

        stages = []
        fan_in = in_channels
        for _ in range(num_convs):
            stages += [
                nn.Conv2d(fan_in, out_channels, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
            ]
            # Only the first convolution sees the stage input width.
            fan_in = out_channels
        stages.append(nn.MaxPool2d(kernel_size=2, stride=2))

        self.block = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the conv stack followed by pooling and return the result."""
        pooled = self.block(x)
        return pooled


class VGG16(nn.Module):
    """VGG-16: five ``VGGBlock`` stages followed by a three-layer MLP head.

    The first classifier layer is sized for a 512×7×7 feature map, which is
    what the five pooling stages produce from a 224×224 input.

    Args:
        num_classes: Width of the final linear layer (number of logits).
    """

    def __init__(self, num_classes: int = 1000):
        super().__init__()

        # (stage width, conv layers in the stage); each stage halves H and W:
        # 224 -> 112 -> 56 -> 28 -> 14 -> 7
        stage_specs = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

        stages = []
        prev_width = 3
        for width, depth in stage_specs:
            stages.append(VGGBlock(prev_width, width, depth))
            prev_width = width
        self.features = nn.Sequential(*stages)

        hidden = 4096
        head = [
            nn.Linear(512 * 7 * 7, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(hidden, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        feature_maps = self.features(x)
        # Collapse everything except the batch dimension for the linear head.
        flat = feature_maps.view(feature_maps.size(0), -1)
        return self.classifier(flat)


# Smoke test: build VGG-16, run a single random 3×224×224 input through it,
# and print the shape of the logits plus the total number of parameters.
vgg = VGG16(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = vgg(x)
print(f"VGG16 output shape: {output.shape}")
print(f"Parameters: {sum(p.numel() for p in vgg.parameters()):,}")

Why 3×3 Filters?

Two 3×3 convolutions = one 5×5 receptive field, but:
  • Fewer parameters: $2 \times 3^2 = 18$ vs $5^2 = 25$ weights per input–output channel pair
  • More non-linearities: 2 ReLU activations vs 1
  • Better regularization: More structured model
def receptive_field_analysis():
    """Print a parameter-count comparison of one 5×5 conv vs two stacked 3×3 convs.

    Both variants map 64 channels to 64 channels. The function only builds
    the layers and counts their parameters; nothing is run on data and
    nothing is returned.
    """

    def count_params(module):
        return sum(p.numel() for p in module.parameters())

    print("Receptive Field Analysis")
    print("=" * 50)

    # Single large kernel.
    single_5x5 = nn.Conv2d(64, 64, kernel_size=5, padding=2)
    n_single = count_params(single_5x5)

    # Two small kernels with a non-linearity in between.
    stacked_3x3 = nn.Sequential(
        nn.Conv2d(64, 64, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, padding=1),
    )
    n_stacked = count_params(stacked_3x3)

    saving_pct = (1 - n_stacked / n_single) * 100

    print(f"5×5 conv parameters: {n_single:,}")
    print(f"2× 3×3 conv parameters: {n_stacked:,}")
    print("Both have 5×5 receptive field")
    print(f"Parameter savings: {saving_pct:.1f}%")

# Run the comparison immediately so the numbers are printed with the tutorial.
receptive_field_analysis()

GoogLeNet/Inception: Multi-Scale Processing

The Inception Module

Process input at multiple scales simultaneously:
class InceptionModuleV1(nn.Module):
    """GoogLeNet-style Inception module with four parallel branches.

    Every branch preserves the spatial size, so the branch outputs can be
    concatenated along the channel axis. The output therefore has
    ``out_1x1 + out_3x3 + out_5x5 + out_pool`` channels.

    Args:
        in_channels: Channels of the input tensor.
        out_1x1: Output channels of the plain 1×1 branch.
        reduce_3x3: Channels after the 1×1 reduction in front of the 3×3 conv.
        out_3x3: Output channels of the 3×3 branch.
        reduce_5x5: Channels after the 1×1 reduction in front of the 5×5 conv.
        out_5x5: Output channels of the 5×5 branch.
        out_pool: Output channels of the 1×1 conv that follows the max-pool.
    """

    def __init__(
        self,
        in_channels: int,
        out_1x1: int,
        reduce_3x3: int,
        out_3x3: int,
        reduce_5x5: int,
        out_5x5: int,
        out_pool: int
    ):
        super().__init__()

        def conv_relu(c_in, c_out, k, pad=0):
            # Convolution immediately followed by an in-place ReLU.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=k, padding=pad),
                nn.ReLU(inplace=True),
            ]

        # 1×1 only.
        self.branch1 = nn.Sequential(*conv_relu(in_channels, out_1x1, 1))

        # 1×1 channel reduction, then 3×3.
        self.branch2 = nn.Sequential(
            *conv_relu(in_channels, reduce_3x3, 1),
            *conv_relu(reduce_3x3, out_3x3, 3, pad=1),
        )

        # 1×1 channel reduction, then 5×5.
        self.branch3 = nn.Sequential(
            *conv_relu(in_channels, reduce_5x5, 1),
            *conv_relu(reduce_5x5, out_5x5, 5, pad=2),
        )

        # Stride-1 3×3 max-pool (size preserving), then 1×1 projection.
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            *conv_relu(in_channels, out_pool, 1),
        )

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them channel-wise."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        return torch.cat([branch(x) for branch in branches], dim=1)


# Smoke test: build one Inception module with explicit branch widths and
# push a random 192-channel 28×28 feature map through it.
inception = InceptionModuleV1(
    in_channels=192,
    out_1x1=64,
    reduce_3x3=96, out_3x3=128,
    reduce_5x5=16, out_5x5=32,
    out_pool=32
)
x = torch.randn(1, 192, 28, 28)
output = inception(x)
print(f"Inception output: {output.shape}")  # expected [1, 256, 28, 28]: 64 + 128 + 32 + 32 channels

Inception V2/V3: Factorized Convolutions

class InceptionModuleV3(nn.Module):
    """Inception-v3 style module built from factorized 7×7 convolutions.

    A 7×7 convolution is replaced by a 1×7 convolution followed by a 7×1
    convolution (asymmetric factorization). Four branches run in parallel
    and are concatenated along the channel axis:

    1. 1×1
    2. 1×1 -> 1×7 -> 7×1
    3. 1×1 -> 1×7 -> 7×1 -> 1×7 -> 7×1
    4. 3×3 average pool (stride 1) -> 1×1

    Every convolution is followed by BatchNorm and ReLU, and every branch
    keeps the spatial size of its input.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Channels of the concatenated output. The four branches
            contribute ``out_channels // 4`` channels each, so the value must
            be a positive multiple of 4 (768 gives 192 per branch).
        c7: Width of the intermediate 1×7 / 7×1 convolutions in branches 2
            and 3.

    Raises:
        ValueError: If ``out_channels`` is not a positive multiple of 4 or
            ``c7`` is not positive.
    """

    def __init__(self, in_channels: int, out_channels: int, c7: int = 128):
        super().__init__()

        if out_channels <= 0 or out_channels % 4 != 0:
            raise ValueError(
                f"out_channels must be a positive multiple of 4, got {out_channels}"
            )
        if c7 <= 0:
            raise ValueError(f"c7 must be positive, got {c7}")

        # Each of the four branches supplies an equal share of the output.
        branch_out = out_channels // 4

        def unit(c_in, c_out, kernel=1, pad=0):
            # Conv -> BatchNorm -> ReLU, the basic building unit of the module.
            return [
                nn.Conv2d(c_in, c_out, kernel_size=kernel, padding=pad),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            ]

        row = {"kernel": (1, 7), "pad": (0, 3)}  # 1×7, size preserving
        col = {"kernel": (7, 1), "pad": (3, 0)}  # 7×1, size preserving

        # Branch 1: 1×1
        self.branch1 = nn.Sequential(*unit(in_channels, branch_out))

        # Branch 2: 1×1 -> 1×7 -> 7×1
        self.branch2 = nn.Sequential(
            *unit(in_channels, c7),
            *unit(c7, c7, **row),
            *unit(c7, branch_out, **col),
        )

        # Branch 3: 1×1 followed by two factorized 7×7 convolutions
        self.branch3 = nn.Sequential(
            *unit(in_channels, c7),
            *unit(c7, c7, **row),
            *unit(c7, c7, **col),
            *unit(c7, c7, **row),
            *unit(c7, branch_out, **col),
        )

        # Branch 4: average pool -> 1×1
        self.branch4 = nn.Sequential(
            nn.AvgPool2d(kernel_size=3, stride=1, padding=1),
            *unit(in_channels, branch_out),
        )

    def forward(self, x):
        """Return the channel-wise concatenation of the four branch outputs."""
        return torch.cat([
            self.branch1(x),
            self.branch2(x),
            self.branch3(x),
            self.branch4(x)
        ], dim=1)


# Smoke test: run a random 768-channel 17×17 feature map through the module
# and print the shape of the concatenated output.
inception_v3 = InceptionModuleV3(in_channels=768, out_channels=768)
x = torch.randn(1, 768, 17, 17)
output = inception_v3(x)
print(f"Inception V3 output: {output.shape}")

ResNet: Skip Connections

The Residual Block

Instead of learning $\mathcal{H}(x)$ directly, learn the residual $\mathcal{F}(x) = \mathcal{H}(x) - x$, so that $\text{output} = \mathcal{F}(x) + x$.
class BasicBlock(nn.Module):
    """Two-layer residual block (3×3 conv -> 3×3 conv) used by ResNet-18/34.

    Args:
        in_channels: Channels of the block input.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution; the second always uses 1.
        downsample: Optional module applied to the input to produce the
            shortcut. When ``None`` the input itself is the shortcut, so its
            shape must already match the residual branch.
    """

    # Ratio of block output channels to ``out_channels``.
    expansion = 1

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None
    ):
        super().__init__()

        # Both convolutions are 3×3, padded to keep size, and bias-free
        # because a BatchNorm follows each of them.
        conv_kwargs = dict(kernel_size=3, padding=1, bias=False)

        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(out_channels)

        self.conv2 = nn.Conv2d(out_channels, out_channels, stride=1, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.downsample = downsample
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(F(x) + shortcut(x))``."""
        residual = self.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))

        shortcut = x if self.downsample is None else self.downsample(x)

        # Skip connection: add the shortcut before the final activation.
        residual += shortcut
        return self.relu(residual)


class Bottleneck(nn.Module):
    """Three-layer residual block (1×1 -> 3×3 -> 1×1) used by ResNet-50/101/152.

    The first 1×1 convolution maps to ``out_channels``, the 3×3 convolution
    carries the stride, and the last 1×1 convolution widens the result to
    ``out_channels * expansion``.

    Args:
        in_channels: Channels of the block input.
        out_channels: Width of the two inner convolutions.
        stride: Stride of the 3×3 convolution.
        downsample: Optional module applied to the input to produce the
            shortcut. When ``None`` the input itself is added, so it must
            already have ``out_channels * expansion`` channels.
    """

    # Block output has ``expansion`` times ``out_channels`` channels.
    expansion = 4

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None
    ):
        super().__init__()

        widened = out_channels * self.expansion

        # Reduce: 1×1
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)

        # Transform: 3×3 (this is where any spatial downsampling happens)
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Expand: 1×1
        self.conv3 = nn.Conv2d(out_channels, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)

        self.downsample = downsample
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(F(x) + shortcut(x))``."""
        residual = self.conv1(x)
        residual = self.relu(self.bn1(residual))

        residual = self.conv2(residual)
        residual = self.relu(self.bn2(residual))

        residual = self.conv3(residual)
        residual = self.bn3(residual)

        shortcut = x if self.downsample is None else self.downsample(x)

        residual += shortcut
        return self.relu(residual)


class ResNet(nn.Module):
    """Generic ResNet: 7×7 stem, four residual stages, pooled linear head.

    Args:
        block: Residual block class. It must expose a class attribute
            ``expansion`` and be constructible as
            ``block(in_channels, out_channels, stride, downsample)`` and
            ``block(in_channels, out_channels)``.
        layers: Number of blocks in each of the four stages; entries 0..3
            are read.
        num_classes: Width of the final linear layer.
    """

    def __init__(
        self,
        block: type,
        layers: List[int],
        num_classes: int = 1000
    ):
        super().__init__()

        # Running channel count; updated by ``_make_layer`` as stages are built.
        self.in_channels = 64

        # Stem: 7×7 stride-2 conv, BN, ReLU, then 3×3 stride-2 max-pool.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four stages; every stage after the first halves the resolution.
        stage_plan = (
            ("layer1", 64, 1),
            ("layer2", 128, 2),
            ("layer3", 256, 2),
            ("layer4", 512, 2),
        )
        for idx, (attr, width, stride) in enumerate(stage_plan):
            self.add_module(attr, self._make_layer(block, width, layers[idx], stride=stride))

        # Head: global average pool then a linear classifier.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        self._init_weights()

    def _make_layer(
        self,
        block: type,
        out_channels: int,
        num_blocks: int,
        stride: int = 1
    ) -> nn.Sequential:
        """Build one stage of ``num_blocks`` blocks and advance ``self.in_channels``.

        Only the first block receives ``stride``. It also receives a 1×1
        conv + BN projection shortcut whenever the stride or the channel
        count changes across the block.
        """
        expanded = out_channels * block.expansion

        shortcut = None
        if stride != 1 or self.in_channels != expanded:
            shortcut = nn.Sequential(
                nn.Conv2d(self.in_channels, expanded, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(expanded),
            )

        first = block(self.in_channels, out_channels, stride, shortcut)
        self.in_channels = expanded

        # Remaining blocks keep the resolution and the (expanded) width.
        rest = [block(self.in_channels, out_channels) for _ in range(1, num_blocks)]

        return nn.Sequential(first, *rest)

    def _init_weights(self):
        """Kaiming-normal init for convolutions; BatchNorm scale 1 and shift 0."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        feats = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            feats = stage(feats)

        pooled = self.avgpool(feats)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)


# Factory functions
def resnet18(num_classes: int = 1000) -> nn.Module:
    """Build ResNet-18: ``BasicBlock`` stages of depth 2-2-2-2."""
    stage_depths = [2, 2, 2, 2]
    return ResNet(block=BasicBlock, layers=stage_depths, num_classes=num_classes)


def resnet34(num_classes: int = 1000) -> nn.Module:
    """Build ResNet-34: ``BasicBlock`` stages of depth 3-4-6-3."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(block=BasicBlock, layers=stage_depths, num_classes=num_classes)


def resnet50(num_classes: int = 1000) -> nn.Module:
    """Build ResNet-50: ``Bottleneck`` stages of depth 3-4-6-3."""
    stage_depths = [3, 4, 6, 3]
    return ResNet(block=Bottleneck, layers=stage_depths, num_classes=num_classes)


def resnet101(num_classes: int = 1000) -> nn.Module:
    """Build ResNet-101: ``Bottleneck`` stages of depth 3-4-23-3."""
    stage_depths = [3, 4, 23, 3]
    return ResNet(block=Bottleneck, layers=stage_depths, num_classes=num_classes)


def resnet152(num_classes: int = 1000) -> nn.Module:
    """Build ResNet-152: ``Bottleneck`` stages of depth 3-8-36-3."""
    stage_depths = [3, 8, 36, 3]
    return ResNet(block=Bottleneck, layers=stage_depths, num_classes=num_classes)


# Smoke test: build ResNet-50, run a single random 3×224×224 input through
# it, and print the logits shape plus the total number of parameters.
resnet = resnet50(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = resnet(x)
print(f"ResNet-50 output: {output.shape}")
print(f"Parameters: {sum(p.numel() for p in resnet.parameters()):,}")

Pre-Activation ResNet

class PreActBlock(nn.Module):
    """Pre-activation residual block: BN -> ReLU -> conv, applied twice.

    Unlike the post-activation block, no ReLU is applied after the addition.
    When a ``downsample`` module is given it is fed the pre-activated input
    (the output of the first BN + ReLU), not the raw input.

    Args:
        in_channels: Channels of the block input.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution.
        downsample: Optional module producing the shortcut tensor.
    """

    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super().__init__()

        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )

        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            padding=1,
            bias=False,
        )

        self.downsample = downsample
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``F(x) + shortcut`` with no activation after the sum."""
        pre_activated = self.relu(self.bn1(x))

        # Identity shortcut uses the raw input; a projection shortcut uses
        # the pre-activated tensor.
        if self.downsample is None:
            shortcut = x
        else:
            shortcut = self.downsample(pre_activated)

        hidden = self.conv1(pre_activated)
        hidden = self.relu(self.bn2(hidden))
        result = self.conv2(hidden)

        result += shortcut
        return result

DenseNet: Feature Reuse

Dense Connections

Each layer receives input from ALL preceding layers:
class DenseLayer(nn.Module):
    """One DenseNet layer: BN-ReLU-1×1 conv bottleneck, then BN-ReLU-3×3 conv.

    Args:
        in_channels: Total channels of all feature maps fed to the layer.
        growth_rate: Channels the layer produces.
        bn_size: Bottleneck multiplier; the 1×1 conv outputs
            ``bn_size * growth_rate`` channels.
    """

    def __init__(self, in_channels: int, growth_rate: int, bn_size: int = 4):
        super().__init__()

        bottleneck_width = bn_size * growth_rate

        # 1×1 bottleneck
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(in_channels, bottleneck_width, kernel_size=1, bias=False)

        # 3×3 convolution that emits the new feature maps
        self.bn2 = nn.BatchNorm2d(bottleneck_width)
        self.conv2 = nn.Conv2d(
            bottleneck_width, growth_rate, kernel_size=3, padding=1, bias=False
        )

    def forward(self, features: List[torch.Tensor]) -> torch.Tensor:
        """Concatenate ``features`` channel-wise and return the new feature maps."""
        stacked = torch.cat(features, dim=1)

        squeezed = self.conv1(F.relu(self.bn1(stacked)))
        return self.conv2(F.relu(self.bn2(squeezed)))


class DenseBlock(nn.Module):
    """Stack of ``DenseLayer`` modules with dense (concatenative) connectivity.

    Layer ``i`` receives ``in_channels + i * growth_rate`` channels: the block
    input plus the outputs of all earlier layers. The block output has
    ``in_channels + num_layers * growth_rate`` channels.

    Args:
        num_layers: Number of dense layers in the block.
        in_channels: Channels of the block input.
        growth_rate: Channels added by every layer.
    """

    def __init__(
        self,
        num_layers: int,
        in_channels: int,
        growth_rate: int
    ):
        super().__init__()

        self.layers = nn.ModuleList([
            DenseLayer(in_channels + idx * growth_rate, growth_rate)
            for idx in range(num_layers)
        ])

    def forward(self, x):
        """Return the input concatenated with every layer's output."""
        collected = [x]
        for layer in self.layers:
            # Each layer sees everything produced so far.
            collected.append(layer(collected))

        return torch.cat(collected, dim=1)


class TransitionLayer(nn.Module):
    """BN -> ReLU -> 1×1 conv -> 2×2 average pool, placed between dense blocks.

    The 1×1 convolution changes the channel count from ``in_channels`` to
    ``out_channels``; the pooling halves height and width.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.bn = nn.BatchNorm2d(in_channels)
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Return the channel-projected, spatially halved feature map."""
        activated = F.relu(self.bn(x))
        projected = self.conv(activated)
        return self.pool(projected)


class DenseNet(nn.Module):
    """DenseNet: conv stem, dense blocks separated by transitions, linear head.

    Args:
        growth_rate: Channels added by each dense layer.
        block_config: Number of dense layers in each dense block.
        num_init_features: Output channels of the stem convolution.
        compression: Factor applied to the channel count by every transition
            layer (the result is truncated to an int).
        num_classes: Width of the final linear layer.
    """

    def __init__(
        self,
        growth_rate: int = 32,
        block_config: Tuple[int, ...] = (6, 12, 24, 16),
        num_init_features: int = 64,
        compression: float = 0.5,
        num_classes: int = 1000
    ):
        super().__init__()

        # Stem: 7×7 stride-2 conv, BN, ReLU, 3×3 stride-2 max-pool.
        stem = [
            nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(num_init_features),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        ]
        self.features = nn.Sequential(*stem)

        channels = num_init_features
        last_stage = len(block_config) - 1
        for stage_idx, depth in enumerate(block_config):
            tag = stage_idx + 1

            self.features.add_module(
                f'denseblock{tag}', DenseBlock(depth, channels, growth_rate)
            )
            channels = channels + depth * growth_rate

            # No transition after the final dense block.
            if stage_idx == last_stage:
                continue

            compressed = int(channels * compression)
            self.features.add_module(
                f'transition{tag}', TransitionLayer(channels, compressed)
            )
            channels = compressed

        self.features.add_module('final_bn', nn.BatchNorm2d(channels))

        self.classifier = nn.Linear(channels, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        activated = F.relu(self.features(x))
        pooled = F.adaptive_avg_pool2d(activated, (1, 1))
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)


# Factory functions
def densenet121(num_classes: int = 1000) -> nn.Module:
    """Build DenseNet-121: growth rate 32, dense blocks of 6-12-24-16 layers."""
    return DenseNet(
        growth_rate=32,
        block_config=(6, 12, 24, 16),
        num_init_features=64,
        num_classes=num_classes,
    )


def densenet169(num_classes: int = 1000) -> nn.Module:
    """Build DenseNet-169: growth rate 32, dense blocks of 6-12-32-32 layers."""
    return DenseNet(
        growth_rate=32,
        block_config=(6, 12, 32, 32),
        num_init_features=64,
        num_classes=num_classes,
    )


def densenet201(num_classes: int = 1000) -> nn.Module:
    """Build DenseNet-201: growth rate 32, dense blocks of 6-12-48-32 layers."""
    return DenseNet(
        growth_rate=32,
        block_config=(6, 12, 48, 32),
        num_init_features=64,
        num_classes=num_classes,
    )


# Smoke test: build DenseNet-121, run a single random 3×224×224 input
# through it, and print the logits shape plus the total parameter count.
densenet = densenet121(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = densenet(x)
print(f"DenseNet-121 output: {output.shape}")
print(f"Parameters: {sum(p.numel() for p in densenet.parameters()):,}")

EfficientNet: Compound Scaling

The Scaling Problem

Width: more channels per layer. Depth: more layers. Resolution: higher input resolution. EfficientNet scales all three together with a compound coefficient $\phi$: $\text{depth} = \alpha^\phi, \quad \text{width} = \beta^\phi, \quad \text{resolution} = \gamma^\phi$, subject to $\alpha \cdot \beta^2 \cdot \gamma^2 \approx 2$.
class MBConvBlock(nn.Module):
    """Mobile inverted bottleneck (MBConv) with Squeeze-and-Excitation.

    Pipeline: optional 1×1 expansion -> depthwise k×k conv -> SE channel
    gating -> 1×1 linear projection. A residual connection is used only when
    the block keeps both resolution (``stride == 1``) and channel count
    (``in_channels == out_channels``). In training mode the residual branch
    is subject to stochastic depth.

    Args:
        in_channels: Channels of the block input.
        out_channels: Channels of the block output.
        kernel_size: Kernel size of the depthwise convolution; padding is
            ``kernel_size // 2``.
        stride: Stride of the depthwise convolution.
        expand_ratio: Width multiplier of the hidden (expanded) representation.
            The expansion conv is skipped when this is 1.
        se_ratio: Squeeze width as a fraction of ``in_channels`` (at least
            one channel is always kept).
        drop_rate: Probability of dropping the residual branch for a sample
            during training (stochastic depth). Use 0 to disable it.

    Raises:
        ValueError: If ``drop_rate`` is outside ``[0, 1)``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        expand_ratio: int,
        se_ratio: float = 0.25,
        drop_rate: float = 0.2
    ):
        super().__init__()

        # A rate of 1 would divide by zero when rescaling the kept samples.
        if not 0.0 <= drop_rate < 1.0:
            raise ValueError(f"drop_rate must be in [0, 1), got {drop_rate}")

        self.stride = stride
        self.use_residual = (stride == 1 and in_channels == out_channels)

        hidden_dim = in_channels * expand_ratio

        layers = []

        # Expansion phase (skipped when there is nothing to expand)
        if expand_ratio != 1:
            layers.extend([
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.SiLU(inplace=True)  # Swish activation
            ])

        # Depthwise convolution: one filter per channel (groups == channels)
        layers.extend([
            nn.Conv2d(
                hidden_dim, hidden_dim, kernel_size,
                stride=stride, padding=kernel_size // 2,
                groups=hidden_dim, bias=False
            ),
            nn.BatchNorm2d(hidden_dim),
            nn.SiLU(inplace=True)
        ])

        self.conv = nn.Sequential(*layers)

        # Squeeze-and-Excitation: the squeeze width is derived from the block
        # input width, not from the expanded width.
        se_channels = max(1, int(in_channels * se_ratio))
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(hidden_dim, se_channels, 1),
            nn.SiLU(inplace=True),
            nn.Conv2d(se_channels, hidden_dim, 1),
            nn.Sigmoid()
        )

        # Linear projection (no activation after the BatchNorm)
        self.project = nn.Sequential(
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels)
        )

        # Stochastic depth (drop path) probability
        self.drop_rate = drop_rate

    def forward(self, x):
        """Apply the block; add the input back when the residual path is active."""
        identity = x

        x = self.conv(x)
        x = x * self.se(x)  # per-channel gate in (0, 1)
        x = self.project(x)

        if self.use_residual:
            x = self._drop_path(x) + identity

        return x

    def _drop_path(self, x):
        """Stochastic depth: zero the whole branch for randomly chosen samples.

        Outside training, or with ``drop_rate == 0``, ``x`` is returned
        unchanged. Otherwise each sample is kept with probability
        ``1 - drop_rate`` and kept samples are divided by that probability.
        """
        if not self.training or self.drop_rate <= 0:
            return x

        keep_prob = 1 - self.drop_rate
        # One Bernoulli draw per sample, broadcast over the remaining dims.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        return x.div(keep_prob) * random_tensor


class EfficientNet(nn.Module):
    """EfficientNet backbone built from the B0 stage table.

    ``width_mult`` scales every channel count (rounded by ``_round_channels``)
    and ``depth_mult`` scales the number of blocks per stage (rounded up by
    ``_round_layers``). With both multipliers at 1.0 the B0 layout is used
    as-is.

    Args:
        num_classes: Width of the final linear layer.
        width_mult: Channel multiplier.
        depth_mult: Per-stage block-count multiplier.
    """

    def __init__(self, num_classes: int = 1000, width_mult: float = 1.0, depth_mult: float = 1.0):
        super().__init__()

        # B0 stage table:
        # (expand_ratio, channels, num_layers, stride, kernel_size)
        stage_specs = (
            (1, 16, 1, 1, 3),
            (6, 24, 2, 2, 3),
            (6, 40, 2, 2, 5),
            (6, 80, 3, 2, 3),
            (6, 112, 3, 1, 5),
            (6, 192, 4, 2, 5),
            (6, 320, 1, 1, 3),
        )

        # Stem: 3×3 stride-2 conv + BN + SiLU
        stem_width = self._round_channels(32, width_mult)
        self.stem = nn.Sequential(
            nn.Conv2d(3, stem_width, 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(stem_width),
            nn.SiLU(inplace=True),
        )

        # MBConv stages
        mbconv_blocks = []
        in_channels = stem_width
        for expand_ratio, base_width, base_depth, first_stride, kernel_size in stage_specs:
            stage_width = self._round_channels(base_width, width_mult)
            stage_depth = self._round_layers(base_depth, depth_mult)

            for block_idx in range(stage_depth):
                # Only the first block of a stage may change the resolution.
                block_stride = first_stride if block_idx == 0 else 1
                mbconv_blocks.append(
                    MBConvBlock(in_channels, stage_width, kernel_size, block_stride, expand_ratio)
                )
                in_channels = stage_width

        self.blocks = nn.Sequential(*mbconv_blocks)

        # Head: 1×1 conv, global pooling, dropout, classifier
        head_channels = self._round_channels(1280, width_mult)
        self.head = nn.Sequential(
            nn.Conv2d(in_channels, head_channels, 1, bias=False),
            nn.BatchNorm2d(head_channels),
            nn.SiLU(inplace=True),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(head_channels, num_classes),
        )

    @staticmethod
    def _round_channels(channels: int, mult: float) -> int:
        """Scale ``channels`` by ``mult`` and snap to a multiple of 8 (minimum 8)."""
        snapped = int(channels * mult + 4) // 8 * 8
        return int(max(8, snapped))

    @staticmethod
    def _round_layers(layers: int, mult: float) -> int:
        """Scale a block count by ``mult``, rounding up."""
        scaled = layers * mult
        return int(np.ceil(scaled))

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        return self.head(self.blocks(self.stem(x)))


# Factory functions with compound scaling
def efficientnet_b0(num_classes: int = 1000) -> nn.Module:
    """Build the baseline model: width ×1.0, depth ×1.0."""
    return EfficientNet(num_classes, width_mult=1.0, depth_mult=1.0)


def efficientnet_b1(num_classes: int = 1000) -> nn.Module:
    """Build the B1 variant: width ×1.0, depth ×1.1."""
    return EfficientNet(num_classes, width_mult=1.0, depth_mult=1.1)


def efficientnet_b2(num_classes: int = 1000) -> nn.Module:
    """Build the B2 variant: width ×1.1, depth ×1.2."""
    return EfficientNet(num_classes, width_mult=1.1, depth_mult=1.2)


def efficientnet_b3(num_classes: int = 1000) -> nn.Module:
    """Build the B3 variant: width ×1.2, depth ×1.4."""
    return EfficientNet(num_classes, width_mult=1.2, depth_mult=1.4)


def efficientnet_b4(num_classes: int = 1000) -> nn.Module:
    """Build the B4 variant: width ×1.4, depth ×1.8."""
    return EfficientNet(num_classes, width_mult=1.4, depth_mult=1.8)


# Smoke test: build EfficientNet-B0, run a single random 3×224×224 input
# through it, and print the logits shape plus the total parameter count.
effnet = efficientnet_b0(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = effnet(x)
print(f"EfficientNet-B0 output: {output.shape}")
print(f"Parameters: {sum(p.numel() for p in effnet.parameters()):,}")

ResNeXt: Cardinality

Split-Transform-Merge

Increase “cardinality” (number of parallel paths) instead of depth/width:
class ResNeXtBlock(nn.Module):
    """Bottleneck residual block whose 3×3 convolution is grouped.

    The inner width is ``int(out_channels * base_width / 64) * cardinality``
    and the 3×3 convolution splits it into ``cardinality`` groups. The block
    outputs ``out_channels * expansion`` channels.

    Args:
        in_channels: Channels of the block input.
        out_channels: Base width; the output has ``expansion`` times as many
            channels.
        stride: Stride of the grouped 3×3 convolution.
        cardinality: Number of groups in the 3×3 convolution.
        base_width: Channels per group when ``out_channels`` is 64.
        downsample: Optional module producing the shortcut tensor.

    Note:
        ``downsample`` is the sixth positional parameter here, whereas
        ``ResNet._make_layer`` passes it as the fourth positional argument;
        pass it by keyword when constructing this block.
    """

    expansion = 4

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        cardinality: int = 32,
        base_width: int = 4,
        downsample: Optional[nn.Module] = None
    ):
        super().__init__()

        # Channels per group, then total width across all groups.
        per_group = int(out_channels * (base_width / 64))
        inner_width = per_group * cardinality
        widened = out_channels * self.expansion

        self.conv1 = nn.Conv2d(in_channels, inner_width, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(inner_width)

        # Grouped 3×3: each group only mixes its own slice of channels.
        self.conv2 = nn.Conv2d(
            inner_width,
            inner_width,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=cardinality,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(inner_width)

        self.conv3 = nn.Conv2d(inner_width, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(F(x) + shortcut(x))``."""
        residual = self.conv1(x)
        residual = self.relu(self.bn1(residual))

        residual = self.conv2(residual)
        residual = self.relu(self.bn2(residual))

        residual = self.conv3(residual)
        residual = self.bn3(residual)

        shortcut = x if self.downsample is None else self.downsample(x)

        residual += shortcut
        return self.relu(residual)


# Smoke test: run a random 256-channel 56×56 feature map through one block.
# No downsample module is given, so the input itself is used as the shortcut.
resnext_block = ResNeXtBlock(256, 64, cardinality=32, base_width=4)
x = torch.randn(1, 256, 56, 56)
output = resnext_block(x)
print(f"ResNeXt block output: {output.shape}")

Architecture Comparison

def _count_macs(model, sample):
    """Count multiply-accumulates of all Conv2d and Linear layers for one forward pass.

    Forward hooks record, for every ``nn.Conv2d`` and ``nn.Linear`` that is
    actually executed, ``output elements × multiply-adds per output element``.
    Other layer types (normalisation, pooling, activations) are not counted,
    so the result is a rough estimate. The model is switched to eval mode and
    run under ``torch.no_grad()``; the hooks are always removed afterwards.
    """
    total = 0

    def conv_hook(module, inputs, output):
        nonlocal total
        kernel_h, kernel_w = module.kernel_size
        # Each output element sums over the kernel window of one group's inputs.
        per_output = kernel_h * kernel_w * (module.in_channels // module.groups)
        total += output.numel() * per_output

    def linear_hook(module, inputs, output):
        nonlocal total
        total += output.numel() * module.in_features

    handles = []
    for module in model.modules():
        if isinstance(module, nn.Conv2d):
            handles.append(module.register_forward_hook(conv_hook))
        elif isinstance(module, nn.Linear):
            handles.append(module.register_forward_hook(linear_hook))

    model.eval()
    try:
        with torch.no_grad():
            model(sample)
    finally:
        for handle in handles:
            handle.remove()

    return total


def compare_architectures(architectures: Optional[dict] = None, input_size: int = 224):
    """Print parameter counts and estimated MACs for a set of CNNs.

    Args:
        architectures: Mapping from display name to model. When ``None`` the
            tutorial's VGG-16, ResNet-50, DenseNet-121 and EfficientNet-B0
            are built with 1000 classes.
        input_size: Side length of the square 3-channel input used for the
            forward pass that the MAC estimate is based on.

    Every model is put into eval mode as a side effect. Nothing is returned;
    the table is written to stdout.
    """

    if architectures is None:
        architectures = {
            'VGG-16': VGG16(num_classes=1000),
            'ResNet-50': resnet50(num_classes=1000),
            'DenseNet-121': densenet121(num_classes=1000),
            'EfficientNet-B0': efficientnet_b0(num_classes=1000),
        }

    macs_header = f"MACs ({input_size}×{input_size})"

    print("CNN Architecture Comparison")
    print("="*70)
    print(f"{'Architecture':<20} {'Parameters':>15} {macs_header:>20}")
    print("-"*70)

    x = torch.randn(1, 3, input_size, input_size)

    for name, model in architectures.items():
        params = sum(p.numel() for p in model.parameters())

        # Rough estimate: convolution and linear layers only.
        macs = _count_macs(model, x)

        print(f"{name:<20} {params:>15,} {macs:>20,}")

    print("-"*70)
    print("\nKey Design Choices:")
    print("  VGG:         Only 3×3 convs, simple but heavy")
    print("  ResNet:      Skip connections, can go very deep")
    print("  DenseNet:    Feature reuse, efficient parameters")
    print("  EfficientNet: Compound scaling, best efficiency")

# Print the comparison table for the architectures defined in this tutorial.
compare_architectures()

Exercises

Add Squeeze-and-Excitation to any architecture:
class SEBlock(nn.Module):
    """Squeeze-and-Excitation: rescale each channel by a learned gate in (0, 1).

    The input is globally average-pooled to one value per channel
    ("squeeze"), passed through a two-layer bottleneck MLP ending in a
    sigmoid ("excite"), and the resulting per-channel weights multiply the
    input.

    Args:
        channels: Number of channels of the input tensor.
        reduction: Bottleneck reduction factor. The hidden width is
            ``channels // reduction`` but never less than 1, so the gate
            still depends on the input when ``channels < reduction``.
    """

    def __init__(self, channels, reduction=16):
        super().__init__()
        # Guard against a zero-width bottleneck for narrow inputs.
        hidden = max(1, channels // reduction)

        self.squeeze = nn.AdaptiveAvgPool2d(1)
        self.excite = nn.Sequential(
            nn.Linear(channels, hidden),
            nn.ReLU(),
            nn.Linear(hidden, channels),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` scaled channel-wise; ``x`` is unpacked as ``(b, c, h, w)``."""
        b, c, _, _ = x.shape
        y = self.squeeze(x).view(b, c)       # (b, c) channel descriptors
        y = self.excite(y).view(b, c, 1, 1)  # gates, broadcast over h and w
        return x * y
Implement RegNet with its simple design rules:
def generate_regnet_config(w0, wa, wm, depth, group_w, q=8):
    """Generate the per-block widths of a RegNet.

    For block ``i`` the continuous width is ``u_i = w0 + wa * i``. It is
    snapped to the nearest power of ``wm`` times ``w0``
    (``w0 * wm ** round(log(u_i / w0) / log(wm))``), rounded to a multiple
    of ``q``, and finally rounded to a multiple of the group width so that
    grouped convolutions divide evenly. Consecutive blocks that end up with
    the same width form one stage.

    Args:
        w0: Initial width (must be positive).
        wa: Width slope per block (must be non-negative).
        wm: Width multiplier used for quantisation (must be greater than 1).
        depth: Number of blocks; a non-positive value yields an empty list.
        group_w: Group width of the grouped convolutions (must be positive).
        q: Base quantum every width is rounded to before the group-width
            adjustment (must be positive).

    Returns:
        A list of ``depth`` integer widths, one per block.

    Raises:
        ValueError: If any of the constraints above is violated.
    """
    import math  # local import: the module header does not import math

    if w0 <= 0 or wa < 0 or wm <= 1:
        raise ValueError(
            f"need w0 > 0, wa >= 0 and wm > 1, got w0={w0}, wa={wa}, wm={wm}"
        )
    if group_w <= 0 or q <= 0:
        raise ValueError(f"group_w and q must be positive, got group_w={group_w}, q={q}")

    log_wm = math.log(wm)

    widths = []
    for i in range(depth):
        continuous = w0 + wa * i

        # Snap to the nearest power of wm (relative to w0).
        exponent = round(math.log(continuous / w0) / log_wm)
        w = w0 * wm ** exponent

        # Round to the base quantum; never let the width collapse to zero.
        w = max(q, int(round(w / q) * q))

        # A group cannot be wider than the layer itself.
        g = min(group_w, w)
        w = int(round(w / g) * g)

        widths.append(w)
    return widths

What’s Next?