The Evolution of CNN Architectures
Timeline of Innovation
Copy
1998: LeNet-5 → First practical CNN (32x32 images)
2012: AlexNet → Deep learning revolution (227x227)
2014: VGGNet → Deeper is better (224x224)
2014: GoogLeNet → Inception modules, efficient depth
2015: ResNet → Skip connections, 1000+ layers possible
2016: DenseNet → Dense connections, feature reuse
2017: MobileNet → Efficient architectures
2019: EfficientNet → Compound scaling
2020: Vision Transformers → Attention in vision
Copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import List, Tuple, Optional
# Seed PyTorch's global RNG so the random weights and random test inputs
# created below are the same on every run.
torch.manual_seed(42)
VGGNet: Simplicity and Depth
Design Philosophy
VGGNet showed that a deep stack of small filters beats a shallow network with large filters. Key insights:
- Use only 3×3 convolutions (the receptive field grows with depth)
- Double channels when halving spatial dimensions
- Simple, uniform architecture
Copy
class VGGBlock(nn.Module):
    """VGG-style stage: ``num_convs`` 3x3 conv + ReLU pairs, then a 2x2 max-pool.

    Every convolution uses ``kernel_size=3, padding=1`` and therefore keeps the
    spatial size; the trailing max-pool (kernel 2, stride 2) halves it.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by every convolution in the block.
        num_convs: Number of conv + ReLU pairs; must be at least 1.

    Raises:
        ValueError: If ``num_convs`` is smaller than 1 (the block would then
            contain only the pooling layer and never produce ``out_channels``).
    """

    def __init__(self, in_channels: int, out_channels: int, num_convs: int):
        super().__init__()
        if num_convs < 1:
            raise ValueError(f"num_convs must be >= 1, got {num_convs}")

        layers = []
        for i in range(num_convs):
            # Only the first convolution sees the block's input channels.
            conv_in = in_channels if i == 0 else out_channels
            layers.append(nn.Conv2d(conv_in, out_channels, kernel_size=3, padding=1))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the conv/ReLU stack and the final pooling to ``x``."""
        return self.block(x)
class VGG16(nn.Module):
    """VGG-16 classifier: five ``VGGBlock`` stages followed by three linear layers.

    Args:
        num_classes: Output size of the final linear layer.

    The feature extractor halves the spatial size five times.  An adaptive
    average pool then resizes the feature map to 7x7 so that the classifier's
    fixed ``512 * 7 * 7`` input size also holds for resolutions other than
    224x224; for a 224x224 input the map is already 7x7 and the pool leaves it
    unchanged.
    """

    def __init__(self, num_classes: int = 1000):
        super().__init__()
        # (num_convs, out_channels) per stage; sizes shown for a 224x224 input.
        config = [
            (2, 64),   # Block 1: 224 -> 112
            (2, 128),  # Block 2: 112 -> 56
            (3, 256),  # Block 3: 56 -> 28
            (3, 512),  # Block 4: 28 -> 14
            (3, 512),  # Block 5: 14 -> 7
        ]
        blocks = []
        in_channels = 3
        for num_convs, out_channels in config:
            blocks.append(VGGBlock(in_channels, out_channels, num_convs))
            in_channels = out_channels
        self.features = nn.Sequential(*blocks)

        # Parameter-free; guarantees the 7x7 map the first linear layer expects.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return self.classifier(x)
# Smoke test: build VGG-16, run one random 224x224 RGB image through it and
# print the output shape and the total number of parameters.
vgg = VGG16(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = vgg(x)
print(f"VGG16 output shape: {output.shape}")
print(f"Parameters: {sum(p.numel() for p in vgg.parameters()):,}")
Why 3×3 Filters?
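Two stacked 3×3 convolutions cover the same 5×5 receptive field as a single 5×5 convolution, but:
- Fewer parameters: $2 \times 3^2 = 18$ weights vs $5^2 = 25$ (per input/output channel pair)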
- More non-linearities: 2 ReLU activations vs 1
- Better regularization: More structured model
Copy
def receptive_field_analysis(channels: int = 64) -> Tuple[int, int]:
    """Compare the parameter cost of one 5x5 conv with two stacked 3x3 convs.

    Both variants map ``channels`` to ``channels`` feature maps.  The function
    prints both parameter counts (weights and biases) and the relative saving
    of the 3x3 stack.

    Args:
        channels: Input and output channel count used for every convolution.

    Returns:
        ``(params_5x5, params_3x3_stack)``.
    """
    print("Receptive Field Analysis")
    print("=" * 50)

    # One 5x5 conv
    conv5x5 = nn.Conv2d(channels, channels, kernel_size=5, padding=2)
    params_5x5 = sum(p.numel() for p in conv5x5.parameters())

    # Two 3x3 convs (the ReLU in between adds no parameters)
    conv3x3_stack = nn.Sequential(
        nn.Conv2d(channels, channels, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(channels, channels, kernel_size=3, padding=1),
    )
    params_3x3 = sum(p.numel() for p in conv3x3_stack.parameters())

    print(f"5×5 conv parameters: {params_5x5:,}")
    print(f"2× 3×3 conv parameters: {params_3x3:,}")
    print("Both have 5×5 receptive field")
    print(f"Parameter savings: {(1 - params_3x3/params_5x5)*100:.1f}%")
    return params_5x5, params_3x3


receptive_field_analysis()
GoogLeNet/Inception: Multi-Scale Processing
The Inception Module
Process the input at multiple scales simultaneously:
class InceptionModuleV1(nn.Module):
    """Original Inception module from GoogLeNet.

    Four branches process the same input in parallel and their outputs are
    concatenated along the channel dimension.  All branches keep the spatial
    size, so the result has ``out_1x1 + out_3x3 + out_5x5 + out_pool`` channels.

    Args:
        in_channels: Channels of the input feature map.
        out_1x1: Output channels of the plain 1x1 branch.
        reduce_3x3: Channels after the 1x1 reduction in front of the 3x3 conv.
        out_3x3: Output channels of the 3x3 branch.
        reduce_5x5: Channels after the 1x1 reduction in front of the 5x5 conv.
        out_5x5: Output channels of the 5x5 branch.
        out_pool: Output channels of the 1x1 conv after the max-pool.
    """

    def __init__(
        self,
        in_channels: int,
        out_1x1: int,
        reduce_3x3: int,
        out_3x3: int,
        reduce_5x5: int,
        out_5x5: int,
        out_pool: int
    ):
        super().__init__()

        # Branch 1: 1×1 convolution
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, out_1x1, kernel_size=1),
            nn.ReLU(inplace=True),
        )

        # Branch 2: 1×1 reduction then 3×3 convolution
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, reduce_3x3, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(reduce_3x3, out_3x3, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        )

        # Branch 3: 1×1 reduction then 5×5 convolution
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, reduce_5x5, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(reduce_5x5, out_5x5, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
        )

        # Branch 4: 3×3 max pool (stride 1, so size is kept) then 1×1 convolution
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels, out_pool, kernel_size=1),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them channel-wise."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        return torch.cat([branch(x) for branch in branches], dim=1)
# Smoke test: the branch widths 64 + 128 + 32 + 32 add up to 256 output
# channels, and the 28x28 spatial size is preserved.
inception = InceptionModuleV1(
in_channels=192,
out_1x1=64,
reduce_3x3=96, out_3x3=128,
reduce_5x5=16, out_5x5=32,
out_pool=32
)
x = torch.randn(1, 192, 28, 28)
output = inception(x)
print(f"Inception output: {output.shape}") # [1, 256, 28, 28]
Inception V2/V3: Factorized Convolutions
Copy
class InceptionModuleV3(nn.Module):
    """Inception V3 style module with factorized (asymmetric) 7x7 convolutions.

    A 7x7 convolution is replaced by a 1x7 followed by a 7x1 convolution.  Four
    branches run in parallel; each yields ``out_channels // 4`` channels and the
    results are concatenated, so the module outputs ``out_channels`` channels at
    the input's spatial size.  With ``out_channels=768`` every branch is 192
    wide.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Total output channels; must be divisible by 4.
        channels_7x7: Intermediate width inside the factorized 7x7 branches.

    Raises:
        ValueError: If ``out_channels`` is not a positive multiple of 4.
    """

    def __init__(self, in_channels: int, out_channels: int, channels_7x7: int = 128):
        super().__init__()
        if out_channels <= 0 or out_channels % 4 != 0:
            raise ValueError(
                f"out_channels must be a positive multiple of 4, got {out_channels}"
            )
        branch_out = out_channels // 4
        c7 = channels_7x7  # Channels for 7×7 factorization
        unit = self._conv_bn_relu

        # Branch 1: 1×1
        self.branch1 = nn.Sequential(*unit(in_channels, branch_out, 1))

        # Branch 2: 1×1 -> 1×7 -> 7×1
        self.branch2 = nn.Sequential(
            *unit(in_channels, c7, 1),
            *unit(c7, c7, (1, 7), (0, 3)),
            *unit(c7, branch_out, (7, 1), (3, 0)),
        )

        # Branch 3: 1×1 followed by two factorized 7×7 convolutions
        self.branch3 = nn.Sequential(
            *unit(in_channels, c7, 1),
            *unit(c7, c7, (1, 7), (0, 3)),
            *unit(c7, c7, (7, 1), (3, 0)),
            *unit(c7, c7, (1, 7), (0, 3)),
            *unit(c7, branch_out, (7, 1), (3, 0)),
        )

        # Branch 4: Pool -> 1×1
        self.branch4 = nn.Sequential(
            nn.AvgPool2d(kernel_size=3, stride=1, padding=1),
            *unit(in_channels, branch_out, 1),
        )

    @staticmethod
    def _conv_bn_relu(in_ch, out_ch, kernel_size, padding=0):
        """Return ``[Conv2d, BatchNorm2d, ReLU]`` for splicing into a flat Sequential."""
        return [
            nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, padding=padding),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ]

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them channel-wise."""
        return torch.cat([
            self.branch1(x),
            self.branch2(x),
            self.branch3(x),
            self.branch4(x),
        ], dim=1)
# Smoke test: run one random 768-channel 17x17 feature map through the module
# and print the resulting shape.
inception_v3 = InceptionModuleV3(in_channels=768, out_channels=768)
x = torch.randn(1, 768, 17, 17)
output = inception_v3(x)
print(f"Inception V3 output: {output.shape}")
ResNet: Skip Connections
The Residual Block
Instead of learning the mapping $H(x)$ directly, learn the residual $F(x) = H(x) - x$; the block's output is then $F(x) + x$.
class BasicBlock(nn.Module):
    """Basic residual block for ResNet-18/34: two 3x3 conv + BN layers plus a shortcut.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution (2 halves the spatial size).
        downsample: Optional module applied to the input to form the shortcut.
            It has to be supplied whenever the main path changes the shape
            (``stride != 1`` or ``in_channels != out_channels``); otherwise the
            addition in ``forward`` fails on mismatched shapes.
    """

    # Output channels are ``out_channels * expansion``.
    expansion = 1

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None
    ):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(F(x) + shortcut(x))``."""
        identity = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += identity  # The residual connection!
        return self.relu(out)
class Bottleneck(nn.Module):
    """Bottleneck residual block for ResNet-50/101/152.

    The main path is 1x1 (reduce) -> 3x3 -> 1x1 (expand), each followed by
    batch norm.  The block outputs ``out_channels * expansion`` channels, and
    the stride is applied in the 3x3 convolution.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Width of the reduced (bottleneck) representation.
        stride: Stride of the 3x3 convolution.
        downsample: Optional module applied to the input to form the shortcut;
            needed whenever the main path changes the tensor's shape.
    """

    expansion = 4

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None
    ):
        super().__init__()
        expanded = out_channels * self.expansion

        # 1×1 convolution to reduce channels
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)

        # 3×3 convolution (carries the stride)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # 1×1 convolution to expand channels
        self.conv3 = nn.Conv2d(out_channels, expanded, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(expanded)

        self.downsample = downsample
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(F(x) + shortcut(x))``."""
        identity = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += identity
        return self.relu(out)
class ResNet(nn.Module):
    """ResNet: a conv stem, four residual stages, global average pool and a linear head.

    Args:
        block: Residual block class.  It is called as
            ``block(in_channels, out_channels, stride, downsample)`` and must
            expose an ``expansion`` attribute (output channels are
            ``out_channels * expansion``).
        layers: Number of blocks in each of the four stages.
        num_classes: Output size of the final linear layer.
    """

    def __init__(
        self,
        block: type,
        layers: List[int],
        num_classes: int = 1000
    ):
        super().__init__()
        # Running channel count; updated by ``_make_layer`` after each stage.
        self.in_channels = 64

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Residual layers
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Classifier
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Initialize weights
        self._init_weights()

    def _make_layer(
        self,
        block: type,
        out_channels: int,
        num_blocks: int,
        stride: int = 1
    ) -> nn.Sequential:
        """Build one stage of ``num_blocks`` blocks.

        Only the first block receives ``stride`` and, when the stage changes
        resolution or channel count, a 1x1 conv + BN projection shortcut.
        """
        stage_out = out_channels * block.expansion

        downsample = None
        if stride != 1 or self.in_channels != stage_out:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.in_channels, stage_out,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(stage_out)
            )

        layers = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = stage_out
        for _ in range(1, num_blocks):
            layers.append(block(self.in_channels, out_channels))
        return nn.Sequential(*layers)

    def _init_weights(self):
        """Kaiming-normal init for conv weights; BN weight = 1 and bias = 0."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return self.fc(x)
# Factory functions: the block type and the per-stage block counts are the
# only things that differ between the variants.
def resnet18(num_classes: int = 1000) -> ResNet:
    """Build ResNet-18 (``BasicBlock``, stages of 2/2/2/2 blocks)."""
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)


def resnet34(num_classes: int = 1000) -> ResNet:
    """Build ResNet-34 (``BasicBlock``, stages of 3/4/6/3 blocks)."""
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)


def resnet50(num_classes: int = 1000) -> ResNet:
    """Build ResNet-50 (``Bottleneck``, stages of 3/4/6/3 blocks)."""
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)


def resnet101(num_classes: int = 1000) -> ResNet:
    """Build ResNet-101 (``Bottleneck``, stages of 3/4/23/3 blocks)."""
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)


def resnet152(num_classes: int = 1000) -> ResNet:
    """Build ResNet-152 (``Bottleneck``, stages of 3/8/36/3 blocks)."""
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)
# Smoke test: build ResNet-50, run one random 224x224 RGB image through it and
# print the output shape and the total number of parameters.
resnet = resnet50(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = resnet(x)
print(f"ResNet-50 output: {output.shape}")
print(f"Parameters: {sum(p.numel() for p in resnet.parameters()):,}")
Pre-Activation ResNet
Copy
class PreActBlock(nn.Module):
    """Pre-activation residual block (BN-ReLU-Conv).

    Batch norm and ReLU come before each convolution, and nothing is applied
    after the residual addition.  When a ``downsample`` module is given, it is
    applied to the pre-activated input (after the first BN + ReLU) rather than
    to the raw input.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by both convolutions.
        stride: Stride of the first convolution.
        downsample: Optional module producing the shortcut; needed whenever
            the main path changes the tensor's shape.
    """

    expansion = 1

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None
    ):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, padding=1, bias=False
        )
        self.downsample = downsample
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``F(x) + shortcut`` with no activation after the addition."""
        # bn1 allocates a new tensor, so the in-place ReLU never touches ``x``.
        pre = self.relu(self.bn1(x))

        identity = x if self.downsample is None else self.downsample(pre)

        out = self.conv1(pre)
        out = self.conv2(self.relu(self.bn2(out)))
        out += identity
        return out
DenseNet: Feature Reuse
Dense Connections
Each layer receives input from ALL preceding layers:
class DenseLayer(nn.Module):
    """A single dense layer in DenseNet: BN-ReLU-1x1 conv, then BN-ReLU-3x3 conv.

    Args:
        in_channels: Total channels of all concatenated input feature maps.
        growth_rate: Number of new feature channels this layer produces.
        bn_size: The 1x1 bottleneck produces ``bn_size * growth_rate`` channels.
    """

    def __init__(self, in_channels: int, growth_rate: int, bn_size: int = 4):
        super().__init__()
        bottleneck_channels = bn_size * growth_rate

        # Bottleneck
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(
            in_channels, bottleneck_channels,
            kernel_size=1, bias=False
        )

        # Main convolution
        self.bn2 = nn.BatchNorm2d(bottleneck_channels)
        self.conv2 = nn.Conv2d(
            bottleneck_channels, growth_rate,
            kernel_size=3, padding=1, bias=False
        )

    def forward(self, features: List[torch.Tensor]) -> torch.Tensor:
        """Return the ``growth_rate`` new feature channels.

        ``features`` is the list of all preceding feature maps; they are
        concatenated along the channel dimension.  A single tensor is accepted
        as well and treated as a one-element list.
        """
        if isinstance(features, torch.Tensor):
            features = [features]

        # Concatenate all previous features
        x = torch.cat(features, dim=1)
        out = self.conv1(F.relu(self.bn1(x)))
        out = self.conv2(F.relu(self.bn2(out)))
        return out
class DenseBlock(nn.Module):
    """Dense block: a chain of ``DenseLayer`` modules with growing input width.

    Layer ``i`` receives the block input plus the outputs of all earlier layers,
    i.e. ``in_channels + i * growth_rate`` channels.  The block returns the
    concatenation of its input and every layer's output, which has
    ``in_channels + num_layers * growth_rate`` channels.

    Args:
        num_layers: Number of dense layers in the block.
        in_channels: Channels of the block input.
        growth_rate: Channels added by each layer.
        bn_size: Bottleneck multiplier forwarded to every ``DenseLayer``.
    """

    def __init__(
        self,
        num_layers: int,
        in_channels: int,
        growth_rate: int,
        bn_size: int = 4
    ):
        super().__init__()
        self.layers = nn.ModuleList(
            DenseLayer(in_channels + i * growth_rate, growth_rate, bn_size)
            for i in range(num_layers)
        )

    def forward(self, x):
        """Return the input concatenated with the output of every layer."""
        features = [x]
        for layer in self.layers:
            features.append(layer(features))
        return torch.cat(features, dim=1)
class TransitionLayer(nn.Module):
    """Transition layer between dense blocks: BN-ReLU-1x1 conv, then 2x2 average pool.

    The 1x1 convolution changes the channel count from ``in_channels`` to
    ``out_channels``; the pooling (kernel 2, stride 2) halves height and width.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels after the 1x1 convolution.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.bn = nn.BatchNorm2d(in_channels)
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Return the channel-reduced, spatially halved feature map."""
        activated = F.relu(self.bn(x))
        return self.pool(self.conv(activated))
class DenseNet(nn.Module):
    """DenseNet: conv stem, dense blocks separated by transition layers, linear head.

    Args:
        growth_rate: Channels added by every dense layer.
        block_config: Number of dense layers in each dense block.
        num_init_features: Output channels of the stem convolution.
        compression: Factor applied to the channel count by each transition
            layer (the result is truncated to an int).
        num_classes: Output size of the final linear layer.
    """

    def __init__(
        self,
        growth_rate: int = 32,
        block_config: Tuple[int, ...] = (6, 12, 24, 16),
        num_init_features: int = 64,
        compression: float = 0.5,
        num_classes: int = 1000
    ):
        super().__init__()

        # Initial convolution
        self.features = nn.Sequential(
            nn.Conv2d(3, num_init_features, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(num_init_features),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # Dense blocks
        num_features = num_init_features
        last_index = len(block_config) - 1
        for i, num_layers in enumerate(block_config):
            block = DenseBlock(num_layers, num_features, growth_rate)
            self.features.add_module(f'denseblock{i+1}', block)
            num_features = num_features + num_layers * growth_rate

            # Every block except the last is followed by a transition layer.
            if i != last_index:
                compressed = int(num_features * compression)
                trans = TransitionLayer(num_features, compressed)
                self.features.add_module(f'transition{i+1}', trans)
                num_features = compressed

        # Final batch norm
        self.features.add_module('final_bn', nn.BatchNorm2d(num_features))

        # Classifier
        self.classifier = nn.Linear(num_features, num_classes)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = F.relu(x)  # activation that follows the final batch norm
        x = F.adaptive_avg_pool2d(x, (1, 1))
        x = torch.flatten(x, 1)
        return self.classifier(x)
# Factory functions: all variants use growth rate 32 and 64 stem features;
# only the number of layers per dense block differs.
def densenet121(num_classes: int = 1000) -> DenseNet:
    """Build DenseNet-121 (dense blocks of 6/12/24/16 layers)."""
    return DenseNet(32, (6, 12, 24, 16), 64, num_classes=num_classes)


def densenet169(num_classes: int = 1000) -> DenseNet:
    """Build DenseNet-169 (dense blocks of 6/12/32/32 layers)."""
    return DenseNet(32, (6, 12, 32, 32), 64, num_classes=num_classes)


def densenet201(num_classes: int = 1000) -> DenseNet:
    """Build DenseNet-201 (dense blocks of 6/12/48/32 layers)."""
    return DenseNet(32, (6, 12, 48, 32), 64, num_classes=num_classes)
# Smoke test: build DenseNet-121, run one random 224x224 RGB image through it
# and print the output shape and the total number of parameters.
densenet = densenet121(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = densenet(x)
print(f"DenseNet-121 output: {output.shape}")
print(f"Parameters: {sum(p.numel() for p in densenet.parameters()):,}")
EfficientNet: Compound Scaling
The Scaling Problem
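A network can be scaled along three dimensions:
- Width: more channels per layer
- Depth: more layers
- Resolution: higher input resolution
EfficientNet scales all three together with a single compound coefficient $\phi$: depth $= \alpha^{\phi}$, width $= \beta^{\phi}$, resolution $= \gamma^{\phi}$, subject to $\alpha \cdot \beta^{2} \cdot \gamma^{2} \approx 2$.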
class MBConvBlock(nn.Module):
    """Mobile Inverted Bottleneck (MBConv) with Squeeze-and-Excitation.

    Pipeline: optional 1x1 expansion -> depthwise conv -> squeeze-and-excitation
    gating -> 1x1 projection.  A residual connection (with stochastic depth
    during training) is used when ``stride == 1`` and the channel count is
    unchanged.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels after the projection.
        kernel_size: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        expand_ratio: Hidden width is ``in_channels * expand_ratio``; the
            expansion conv is skipped when this is 1.
        se_ratio: The SE bottleneck has ``max(1, int(in_channels * se_ratio))``
            channels.
        drop_rate: Probability of dropping the residual branch per sample
            during training (stochastic depth).

    Raises:
        ValueError: If ``drop_rate`` is outside ``[0, 1)``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        expand_ratio: int,
        se_ratio: float = 0.25,
        drop_rate: float = 0.2
    ):
        super().__init__()
        if not 0.0 <= drop_rate < 1.0:
            # drop_rate == 1 would divide by a keep probability of zero.
            raise ValueError(f"drop_rate must be in [0, 1), got {drop_rate}")

        self.stride = stride
        self.use_residual = (stride == 1 and in_channels == out_channels)
        hidden_dim = in_channels * expand_ratio

        layers = []

        # Expansion phase
        if expand_ratio != 1:
            layers.extend([
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.SiLU(inplace=True)  # Swish activation
            ])

        # Depthwise convolution (groups == channels)
        layers.extend([
            nn.Conv2d(
                hidden_dim, hidden_dim, kernel_size,
                stride=stride, padding=kernel_size // 2,
                groups=hidden_dim, bias=False
            ),
            nn.BatchNorm2d(hidden_dim),
            nn.SiLU(inplace=True)
        ])
        self.conv = nn.Sequential(*layers)

        # Squeeze-and-Excitation; the bottleneck is sized from the block's
        # input channels, not from the expanded width.
        se_channels = max(1, int(in_channels * se_ratio))
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(hidden_dim, se_channels, 1),
            nn.SiLU(inplace=True),
            nn.Conv2d(se_channels, hidden_dim, 1),
            nn.Sigmoid()
        )

        # Projection (no activation afterwards)
        self.project = nn.Sequential(
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels)
        )

        # Stochastic depth (drop path)
        self.drop_rate = drop_rate

    def forward(self, x):
        """Apply the block; adds the input back when the residual is enabled."""
        identity = x
        x = self.conv(x)
        x = x * self.se(x)  # SE attention
        x = self.project(x)

        if self.use_residual:
            if self.training and self.drop_rate > 0:
                x = self._drop_path(x)
            x = x + identity
        return x

    def _drop_path(self, x):
        """Stochastic depth: zero whole samples and rescale the survivors.

        Each sample is kept with probability ``1 - drop_rate``; kept samples
        are divided by that probability.  Returns ``x`` unchanged outside
        training.
        """
        if not self.training:
            return x
        keep_prob = 1 - self.drop_rate
        # One random draw per sample, broadcast over all remaining dimensions.
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        return x.div(keep_prob) * random_tensor
class EfficientNet(nn.Module):
    """EfficientNet built from the B0 stage table, scaled by width and depth multipliers.

    With both multipliers at 1.0 this is the B0 layout; larger multipliers
    widen the channels and repeat the blocks of every stage more often.

    Args:
        num_classes: Output size of the final linear layer.
        width_mult: Multiplier applied to every channel count (stem, stages, head).
        depth_mult: Multiplier applied to the number of blocks in every stage.
    """

    def __init__(self, num_classes: int = 1000, width_mult: float = 1.0, depth_mult: float = 1.0):
        super().__init__()

        # EfficientNet-B0 configuration
        # (expand_ratio, channels, num_layers, stride, kernel_size)
        config = [
            (1, 16, 1, 1, 3),
            (6, 24, 2, 2, 3),
            (6, 40, 2, 2, 5),
            (6, 80, 3, 2, 3),
            (6, 112, 3, 1, 5),
            (6, 192, 4, 2, 5),
            (6, 320, 1, 1, 3),
        ]

        # Stem
        out_channels = self._round_channels(32, width_mult)
        self.stem = nn.Sequential(
            nn.Conv2d(3, out_channels, 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.SiLU(inplace=True)
        )

        # Build blocks
        layers = []
        in_channels = out_channels
        for expand_ratio, channels, base_layers, stride, kernel_size in config:
            out_channels = self._round_channels(channels, width_mult)
            repeats = self._round_layers(base_layers, depth_mult)
            for i in range(repeats):
                layers.append(MBConvBlock(
                    in_channels,
                    out_channels,
                    kernel_size,
                    stride if i == 0 else 1,  # only the first block of a stage strides
                    expand_ratio
                ))
                in_channels = out_channels
        self.blocks = nn.Sequential(*layers)

        # Head
        head_channels = self._round_channels(1280, width_mult)
        self.head = nn.Sequential(
            nn.Conv2d(in_channels, head_channels, 1, bias=False),
            nn.BatchNorm2d(head_channels),
            nn.SiLU(inplace=True),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(head_channels, num_classes)
        )

    @staticmethod
    def _round_channels(channels: int, mult: float) -> int:
        """Scale ``channels`` by ``mult`` and round to a multiple of 8 (minimum 8).

        If rounding would shrink the scaled value by more than 10%, the next
        multiple of 8 is used instead, as in the reference EfficientNet
        implementation.
        """
        scaled = channels * mult
        rounded = max(8, int(scaled + 4) // 8 * 8)
        if rounded < 0.9 * scaled:
            rounded += 8
        return int(rounded)

    @staticmethod
    def _round_layers(layers: int, mult: float) -> int:
        """Scale the number of blocks in a stage, rounding up."""
        return int(np.ceil(layers * mult))

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        x = self.stem(x)
        x = self.blocks(x)
        return self.head(x)
# Factory functions with compound scaling: each variant is the same network
# with a different (width_mult, depth_mult) pair.
def efficientnet_b0(num_classes: int = 1000) -> EfficientNet:
    """Build EfficientNet-B0 (width 1.0, depth 1.0)."""
    return EfficientNet(num_classes, width_mult=1.0, depth_mult=1.0)


def efficientnet_b1(num_classes: int = 1000) -> EfficientNet:
    """Build EfficientNet-B1 (width 1.0, depth 1.1)."""
    return EfficientNet(num_classes, width_mult=1.0, depth_mult=1.1)


def efficientnet_b2(num_classes: int = 1000) -> EfficientNet:
    """Build EfficientNet-B2 (width 1.1, depth 1.2)."""
    return EfficientNet(num_classes, width_mult=1.1, depth_mult=1.2)


def efficientnet_b3(num_classes: int = 1000) -> EfficientNet:
    """Build EfficientNet-B3 (width 1.2, depth 1.4)."""
    return EfficientNet(num_classes, width_mult=1.2, depth_mult=1.4)


def efficientnet_b4(num_classes: int = 1000) -> EfficientNet:
    """Build EfficientNet-B4 (width 1.4, depth 1.8)."""
    return EfficientNet(num_classes, width_mult=1.4, depth_mult=1.8)
# Smoke test: build EfficientNet-B0, run one random 224x224 RGB image through
# it and print the output shape and the total number of parameters.
effnet = efficientnet_b0(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
output = effnet(x)
print(f"EfficientNet-B0 output: {output.shape}")
print(f"Parameters: {sum(p.numel() for p in effnet.parameters()):,}")
ResNeXt: Cardinality
Split-Transform-Merge
Increase “cardinality” (the number of parallel paths) instead of depth/width:
class ResNeXtBlock(nn.Module):
    """ResNeXt bottleneck block: 1x1 -> grouped 3x3 -> 1x1 with a residual shortcut.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Base width; the block outputs ``out_channels * expansion``.
        stride: Stride of the grouped 3x3 convolution.
        cardinality: Number of groups in the 3x3 convolution.
        base_width: Channels per group when ``out_channels`` is 64; scales
            linearly with ``out_channels``.
        downsample: Optional module producing the shortcut; needed whenever
            the main path changes the tensor's shape.

    Raises:
        ValueError: If the settings yield a grouped convolution of width 0.

    Note:
        The positional order differs from ``BasicBlock``/``Bottleneck``, where
        ``downsample`` is the fourth parameter.  ``ResNet._make_layer`` passes
        ``downsample`` as the fourth positional argument, so pass it by keyword
        when using this block.
    """

    expansion = 4

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        cardinality: int = 32,
        base_width: int = 4,
        downsample: Optional[nn.Module] = None
    ):
        super().__init__()

        # Total width of the grouped convolution:
        # (channels per group) * (number of groups).
        D = int(out_channels * (base_width / 64)) * cardinality
        if D <= 0:
            raise ValueError(
                "out_channels * base_width / 64 must be at least 1 "
                f"(got out_channels={out_channels}, base_width={base_width})"
            )
        expanded = out_channels * self.expansion

        self.conv1 = nn.Conv2d(in_channels, D, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(D)

        # Grouped convolution
        self.conv2 = nn.Conv2d(
            D, D, kernel_size=3, stride=stride, padding=1,
            groups=cardinality, bias=False
        )
        self.bn2 = nn.BatchNorm2d(D)

        self.conv3 = nn.Conv2d(D, expanded, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(expanded)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(F(x) + shortcut(x))``."""
        identity = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += identity
        return self.relu(out)
# Smoke test: input and output both have 256 channels (64 * expansion) and the
# stride is 1, so the identity shortcut works without a downsample module.
resnext_block = ResNeXtBlock(256, 64, cardinality=32, base_width=4)
x = torch.randn(1, 256, 56, 56)
output = resnext_block(x)
print(f"ResNeXt block output: {output.shape}")
Architecture Comparison
Copy
def _count_macs(model: nn.Module, x: torch.Tensor) -> int:
    """Count multiply-accumulates of all Conv2d and Linear layers for one forward pass.

    Forward hooks record, for every ``nn.Conv2d`` and ``nn.Linear`` call, the
    number of output elements times the number of multiplications per output
    element.  Other layer types are not counted.  The model is switched to eval
    mode and run under ``torch.no_grad()``.
    """
    total = 0

    def _hook(module, inputs, output):
        nonlocal total
        if isinstance(module, nn.Conv2d):
            kernel_h, kernel_w = module.kernel_size
            per_output = (module.in_channels // module.groups) * kernel_h * kernel_w
        else:  # nn.Linear
            per_output = module.in_features
        total += output.numel() * per_output

    handles = [
        m.register_forward_hook(_hook)
        for m in model.modules()
        if isinstance(m, (nn.Conv2d, nn.Linear))
    ]
    model.eval()
    try:
        with torch.no_grad():
            model(x)
    finally:
        # Always detach the hooks so the model is left unmodified.
        for handle in handles:
            handle.remove()
    return total


def compare_architectures():
    """Print parameter counts and MACs of the architectures defined in this file.

    Each model is built with 1000 output classes and evaluated on a single
    random 224x224 RGB input.
    """
    architectures = {
        'VGG-16': VGG16(num_classes=1000),
        'ResNet-50': resnet50(num_classes=1000),
        'DenseNet-121': densenet121(num_classes=1000),
        'EfficientNet-B0': efficientnet_b0(num_classes=1000),
    }

    print("CNN Architecture Comparison")
    print("="*70)
    print(f"{'Architecture':<20} {'Parameters':>15} {'MACs (224×224)':>20}")
    print("-"*70)

    x = torch.randn(1, 3, 224, 224)
    for name, model in architectures.items():
        params = sum(p.numel() for p in model.parameters())
        macs = _count_macs(model, x)
        print(f"{name:<20} {params:>15,} {macs:>20,}")

    print("-"*70)
    print("\nKey Design Choices:")
    print(" VGG: Only 3×3 convs, simple but heavy")
    print(" ResNet: Skip connections, can go very deep")
    print(" DenseNet: Feature reuse, efficient parameters")
    print(" EfficientNet: Compound scaling, best efficiency")


compare_architectures()
Exercises
Exercise 1: Implement SENet
Exercise 1: Implement SENet
Add Squeeze-and-Excitation to any architecture:
Copy
class SEBlock(nn.Module):
    """Squeeze-and-Excitation: rescale each channel by a learned gate in (0, 1).

    The input is globally average-pooled to one value per channel ("squeeze"),
    passed through a two-layer bottleneck MLP ending in a sigmoid ("excite"),
    and the resulting per-channel weights multiply the input.

    Args:
        channels: Channels of the input feature map.
        reduction: The bottleneck has ``channels // reduction`` units, but at
            least 1 so that small channel counts do not produce an empty layer.
    """

    def __init__(self, channels: int, reduction: int = 16):
        super().__init__()
        hidden = max(1, channels // reduction)
        self.squeeze = nn.AdaptiveAvgPool2d(1)
        self.excite = nn.Sequential(
            nn.Linear(channels, hidden),
            nn.ReLU(),
            nn.Linear(hidden, channels),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``x`` scaled channel-wise; ``x`` must be a 4-D ``(b, c, h, w)`` tensor."""
        b, c, _, _ = x.shape
        y = self.squeeze(x).view(b, c)
        y = self.excite(y).view(b, c, 1, 1)
        return x * y
Exercise 2: Build RegNet
Exercise 2: Build RegNet
Implement RegNet with its simple design rules:
Copy
def generate_regnet_config(w0, wa, wm, depth, group_w):
    """Generate the per-block widths of a RegNet.

    For block ``i`` the continuous width ``u = w0 + wa * i`` is snapped to the
    nearest power-of-``wm`` multiple of ``w0`` (``w0 * wm ** k`` with
    ``k = round(log(u / w0) / log(wm))``) and then rounded to a multiple of
    ``group_w`` (at least one group).  Consecutive blocks that share a width
    form one stage.

    Args:
        w0: Initial width; must be positive.
        wa: Width slope per block; must be non-negative.
        wm: Width multiplier used for quantization; must be greater than 1.
        depth: Number of blocks.
        group_w: Group width that every block width must be a multiple of;
            must be positive.

    Returns:
        A list of ``depth`` integer widths.

    Raises:
        ValueError: If any argument is outside the ranges given above.
    """
    import math

    if w0 <= 0 or wa < 0 or wm <= 1 or group_w <= 0:
        raise ValueError(
            "expected w0 > 0, wa >= 0, wm > 1 and group_w > 0, got "
            f"w0={w0}, wa={wa}, wm={wm}, group_w={group_w}"
        )

    log_wm = math.log(wm)
    widths = []
    for i in range(depth):
        continuous = w0 + wa * i
        # Snap to the nearest w0 * wm**k so that neighbouring blocks share widths.
        k = round(math.log(continuous / w0) / log_wm)
        quantized = w0 * wm ** k
        # Make the width divisible by the group width, with at least one group.
        groups = max(1, round(quantized / group_w))
        widths.append(int(groups * group_w))
    return widths
Exercise 3: Architecture Search
Exercise 3: Architecture Search
Run a simple grid search over architecture hyperparameters:
Copy
# Grid search over ResNet depth and width multiplier.
# NOTE: `create_resnet` and `train_and_eval` are not defined in this file; they
# are placeholders to be implemented as part of the exercise.
for depth in [18, 34, 50]:
    for width_mult in [0.5, 1.0, 1.5]:
        model = create_resnet(depth, width_mult)
        accuracy = train_and_eval(model, epochs=10)
        print(f"Depth={depth}, Width={width_mult}: {accuracy:.2f}%")