Efficient Architectures

Efficient Neural Network Design

The Efficiency Challenge

Mobile and edge devices have strict constraints:
  • Compute: Limited FLOPs/second
  • Memory: Small RAM and storage
  • Power: Battery limitations
  • Latency: Real-time requirements
We need models that are:
  • Small (fewer parameters)
  • Fast (fewer FLOPs)
  • Accurate (still useful!)
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from typing import List, Tuple, Optional, Callable

torch.manual_seed(42)

Depthwise Separable Convolutions

The foundation of efficient architectures. A standard convolution costs

$$\text{Standard Conv: } O(K^2 \cdot C_{in} \cdot C_{out} \cdot H \cdot W)$$

while factorizing it into a depthwise step and a pointwise step costs

$$\text{Depthwise Separable: } O(K^2 \cdot C_{in} \cdot H \cdot W) + O(C_{in} \cdot C_{out} \cdot H \cdot W)$$

The ratio between the two is $\frac{1}{C_{out}} + \frac{1}{K^2}$, so for a 3×3 kernel the savings approach $K^2 = 9$x when $C_{out}$ is large.
class DepthwiseSeparableConv(nn.Module):
    """
    Depthwise Separable Convolution.
    
    Factorizes standard conv into:
    1. Depthwise: Spatial filtering per channel
    2. Pointwise: Channel mixing via 1x1 conv
    
    Reduction factor: K² (typically 8-9x fewer FLOPs)
    """
    
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: int = 1,
        bias: bool = False
    ):
        super().__init__()
        
        # Depthwise: each input channel has its own filter
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=in_channels,  # Key: groups = in_channels
            bias=bias
        )
        
        # Pointwise: 1x1 conv for channel mixing
        self.pointwise = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            bias=bias
        )
        
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.depthwise(x)
        x = self.bn1(x)
        x = F.relu(x)
        
        x = self.pointwise(x)
        x = self.bn2(x)
        x = F.relu(x)
        
        return x
    
    @staticmethod
    def compute_savings(in_c: int, out_c: int, k: int = 3) -> float:
        """Compute FLOPs savings ratio."""
        standard = k * k * in_c * out_c
        separable = k * k * in_c + in_c * out_c
        return standard / separable


# Example: savings for 256 -> 256 with 3x3 kernel
savings = DepthwiseSeparableConv.compute_savings(256, 256, 3)
print(f"FLOPs reduction: {savings:.1f}x")  # ~8-9x

MobileNet Family

MobileNetV1

class MobileNetV1Block(nn.Module):
    """MobileNetV1 basic block."""
    
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1
    ):
        super().__init__()
        self.conv = DepthwiseSeparableConv(
            in_channels, out_channels, stride=stride
        )
    
    def forward(self, x):
        return self.conv(x)


class MobileNetV1(nn.Module):
    """
    MobileNetV1: Efficient CNN using depthwise separable convolutions.
    
    Key ideas:
    1. Replace all standard convs with depthwise separable
    2. Width multiplier α: scale all channel counts
    3. Resolution multiplier ρ: scale input resolution
    """
    
    def __init__(
        self,
        num_classes: int = 1000,
        width_mult: float = 1.0,
        input_size: int = 224
    ):
        super().__init__()
        
        self.width_mult = width_mult
        
        def c(channels):
            return int(channels * width_mult)
        
        # Configuration: (out_channels, stride)
        config = [
            (64, 1),
            (128, 2),
            (128, 1),
            (256, 2),
            (256, 1),
            (512, 2),
            (512, 1), (512, 1), (512, 1), (512, 1), (512, 1),  # 5x
            (1024, 2),
            (1024, 1)
        ]
        
        # First conv
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, c(32), 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(c(32)),
            nn.ReLU(inplace=True)
        )
        
        # Build blocks
        layers = []
        in_channels = c(32)
        
        for out_channels, stride in config:
            layers.append(MobileNetV1Block(in_channels, c(out_channels), stride))
            in_channels = c(out_channels)
        
        self.features = nn.Sequential(*layers)
        
        # Classifier
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Linear(c(1024), num_classes)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x
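
A short smoke test (my own sketch) shows how the width multiplier α trades parameters for capacity; at α = 1.0 the count should land near the ~4.2M reported for MobileNetV1:

for alpha in (1.0, 0.5):
    model = MobileNetV1(num_classes=1000, width_mult=alpha)
    n_params = sum(p.numel() for p in model.parameters())
    out = model(torch.randn(1, 3, 224, 224))
    print(f"alpha={alpha}: {n_params/1e6:.2f}M params, output {tuple(out.shape)}")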

MobileNetV2

class InvertedResidual(nn.Module):
    """
    MobileNetV2 Inverted Residual Block.
    
    Key innovations:
    1. Inverted bottleneck: expand -> depthwise -> project
    2. Linear bottleneck: no ReLU after final projection
    3. Residual connection when stride=1 and in_c=out_c
    """
    
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        expand_ratio: int = 6
    ):
        super().__init__()
        
        self.stride = stride
        self.use_residual = stride == 1 and in_channels == out_channels
        
        hidden_dim = in_channels * expand_ratio
        
        layers = []
        
        # Expansion (1x1 conv)
        if expand_ratio != 1:
            layers.extend([
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True)
            ])
        
        # Depthwise conv
        layers.extend([
            nn.Conv2d(
                hidden_dim, hidden_dim, 3,
                stride=stride, padding=1, groups=hidden_dim, bias=False
            ),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True)
        ])
        
        # Projection (linear - no activation!)
        layers.extend([
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels)
        ])
        
        self.conv = nn.Sequential(*layers)
    
    def forward(self, x):
        if self.use_residual:
            return x + self.conv(x)
        return self.conv(x)
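
A minimal check (assuming a 32-channel input for illustration) confirms the residual rule: the skip connection is only active when stride is 1 and the channel counts match.

x = torch.randn(1, 32, 56, 56)
block_res = InvertedResidual(32, 32, stride=1)    # residual path active
block_down = InvertedResidual(32, 64, stride=2)   # no residual (shape changes)
print(block_res.use_residual, tuple(block_res(x).shape))    # True (1, 32, 56, 56)
print(block_down.use_residual, tuple(block_down(x).shape))  # False (1, 64, 28, 28)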


class MobileNetV2(nn.Module):
    """
    MobileNetV2: Inverted Residuals and Linear Bottlenecks.
    
    Improvements over V1:
    - Inverted residual structure
    - Linear bottlenecks (preserve information)
    - Residual connections for better gradients
    """
    
    def __init__(self, num_classes: int = 1000, width_mult: float = 1.0):
        super().__init__()
        
        def c(channels):
            return max(8, int(channels * width_mult))
        
        # (expand_ratio, out_channels, num_blocks, stride)
        config = [
            (1, 16, 1, 1),
            (6, 24, 2, 2),
            (6, 32, 3, 2),
            (6, 64, 4, 2),
            (6, 96, 3, 1),
            (6, 160, 3, 2),
            (6, 320, 1, 1),
        ]
        
        # First conv
        self.features = [nn.Sequential(
            nn.Conv2d(3, c(32), 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(c(32)),
            nn.ReLU6(inplace=True)
        )]
        
        # Build inverted residual blocks
        in_channels = c(32)
        
        for t, out_c, n, s in config:
            out_channels = c(out_c)
            for i in range(n):
                stride = s if i == 0 else 1
                self.features.append(
                    InvertedResidual(in_channels, out_channels, stride, t)
                )
                in_channels = out_channels
        
        # Last conv
        self.features.append(nn.Sequential(
            nn.Conv2d(in_channels, c(1280), 1, bias=False),
            nn.BatchNorm2d(c(1280)),
            nn.ReLU6(inplace=True)
        ))
        
        self.features = nn.Sequential(*self.features)
        
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(c(1280), num_classes)
        )
    
    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x
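
As a rough check of the implementation above (my own sketch), the parameter count should land near the ~3.4M reported for width 1.0:

model = MobileNetV2(width_mult=1.0)
n_params = sum(p.numel() for p in model.parameters())
print(f"MobileNetV2: {n_params/1e6:.2f}M parameters")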

MobileNetV3

class SEBlock(nn.Module):
    """Squeeze-and-Excitation block."""
    
    def __init__(self, channels: int, reduction: int = 4):
        super().__init__()
        
        reduced = max(1, channels // reduction)
        
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, reduced, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(reduced, channels, 1),
            nn.Hardsigmoid(inplace=True)
        )
    
    def forward(self, x):
        return x * self.se(x)


class MobileNetV3Block(nn.Module):
    """
    MobileNetV3 block with SE and h-swish.
    
    Improvements:
    - Squeeze-and-Excitation attention
    - h-swish activation (efficient approximation)
    - Neural Architecture Search optimized
    """
    
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        expand_ratio: float,
        use_se: bool,
        use_hs: bool  # Use h-swish
    ):
        super().__init__()
        
        self.use_residual = stride == 1 and in_channels == out_channels
        
        hidden_dim = int(in_channels * expand_ratio)
        
        # Choose activation
        activation = nn.Hardswish if use_hs else nn.ReLU
        
        layers = []
        
        # Expansion
        if expand_ratio != 1:
            layers.extend([
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                activation(inplace=True)
            ])
        
        # Depthwise
        layers.extend([
            nn.Conv2d(
                hidden_dim, hidden_dim, kernel_size,
                stride=stride, padding=kernel_size//2,
                groups=hidden_dim, bias=False
            ),
            nn.BatchNorm2d(hidden_dim),
            activation(inplace=True)
        ])
        
        # SE block
        if use_se:
            layers.append(SEBlock(hidden_dim))
        
        # Projection
        layers.extend([
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels)
        ])
        
        self.conv = nn.Sequential(*layers)
    
    def forward(self, x):
        if self.use_residual:
            return x + self.conv(x)
        return self.conv(x)


class MobileNetV3Small(nn.Module):
    """MobileNetV3-Small: Optimized for mobile."""
    
    def __init__(self, num_classes: int = 1000):
        super().__init__()
        
        # Configuration from NAS
        # (kernel, expand, out, SE, HS, stride)
        config = [
            (3, 1, 16, True, False, 2),
            (3, 4.5, 24, False, False, 2),
            (3, 3.67, 24, False, False, 1),
            (5, 4, 40, True, True, 2),
            (5, 6, 40, True, True, 1),
            (5, 6, 40, True, True, 1),
            (5, 3, 48, True, True, 1),
            (5, 3, 48, True, True, 1),
            (5, 6, 96, True, True, 2),
            (5, 6, 96, True, True, 1),
            (5, 6, 96, True, True, 1),
        ]
        
        # First conv
        layers = [nn.Sequential(
            nn.Conv2d(3, 16, 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(16),
            nn.Hardswish(inplace=True)
        )]
        
        # Build blocks
        in_c = 16
        for k, exp, out_c, se, hs, s in config:
            layers.append(MobileNetV3Block(in_c, out_c, k, s, exp, se, hs))
            in_c = out_c
        
        # Last stages
        layers.append(nn.Sequential(
            nn.Conv2d(96, 576, 1, bias=False),
            nn.BatchNorm2d(576),
            nn.Hardswish(inplace=True)
        ))
        
        self.features = nn.Sequential(*layers)
        
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        
        self.classifier = nn.Sequential(
            nn.Linear(576, 1024),
            nn.Hardswish(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(1024, num_classes)
        )
    
    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x
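
A quick forward-pass smoke test for the sketch above (using a hypothetical 10-class head):

model = MobileNetV3Small(num_classes=10)
out = model(torch.randn(2, 3, 224, 224))
print(out.shape)  # torch.Size([2, 10])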

ShuffleNet

Channel Shuffle Operation

def channel_shuffle(x: torch.Tensor, groups: int) -> torch.Tensor:
    """
    Channel shuffle operation for group convolutions.
    
    Enables information flow between groups by shuffling channels.
    """
    batch, channels, height, width = x.shape
    
    # Reshape to (batch, groups, channels_per_group, H, W)
    x = x.view(batch, groups, channels // groups, height, width)
    
    # Transpose groups and channels_per_group
    x = x.transpose(1, 2).contiguous()
    
    # Flatten back
    x = x.view(batch, channels, height, width)
    
    return x
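
A tiny example makes the shuffle concrete: with 2 groups of 3 channels, the layout [0, 1, 2 | 3, 4, 5] is interleaved so each half of the next layer's split sees channels originating from both groups.

x = torch.arange(6, dtype=torch.float32).view(1, 6, 1, 1)
shuffled = channel_shuffle(x, groups=2)
print(x.flatten().tolist())         # [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
print(shuffled.flatten().tolist())  # [0.0, 3.0, 1.0, 4.0, 2.0, 5.0]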


class ShuffleNetV2Block(nn.Module):
    """
    ShuffleNetV2 building block.
    
    Key innovations:
    1. Channel split instead of pointwise group conv
    2. Channel shuffle for cross-group communication
    3. Efficient memory access patterns
    """
    
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1
    ):
        super().__init__()
        
        self.stride = stride
        branch_channels = out_channels // 2
        
        if stride == 1:
            self.branch1 = nn.Identity()
        else:
            self.branch1 = nn.Sequential(
                # Depthwise
                nn.Conv2d(in_channels, in_channels, 3, stride, 1,
                          groups=in_channels, bias=False),
                nn.BatchNorm2d(in_channels),
                # Pointwise
                nn.Conv2d(in_channels, branch_channels, 1, bias=False),
                nn.BatchNorm2d(branch_channels),
                nn.ReLU(inplace=True)
            )
        
        # Main branch
        in_c = in_channels if stride > 1 else branch_channels
        
        self.branch2 = nn.Sequential(
            # Pointwise
            nn.Conv2d(in_c, branch_channels, 1, bias=False),
            nn.BatchNorm2d(branch_channels),
            nn.ReLU(inplace=True),
            # Depthwise
            nn.Conv2d(branch_channels, branch_channels, 3, stride, 1,
                      groups=branch_channels, bias=False),
            nn.BatchNorm2d(branch_channels),
            # Pointwise
            nn.Conv2d(branch_channels, branch_channels, 1, bias=False),
            nn.BatchNorm2d(branch_channels),
            nn.ReLU(inplace=True)
        )
    
    def forward(self, x):
        if self.stride == 1:
            # Channel split
            x1, x2 = x.chunk(2, dim=1)
            out = torch.cat([x1, self.branch2(x2)], dim=1)
        else:
            out = torch.cat([self.branch1(x), self.branch2(x)], dim=1)
        
        # Channel shuffle
        out = channel_shuffle(out, 2)
        
        return out


class ShuffleNetV2(nn.Module):
    """
    ShuffleNetV2: Practical Guidelines for Efficient CNN Design.
    
    Design principles:
    G1: Equal channel width minimizes memory access cost
    G2: Excessive group convolution increases MAC
    G3: Network fragmentation reduces parallelism
    G4: Element-wise operations are non-negligible
    """
    
    def __init__(
        self,
        num_classes: int = 1000,
        width_mult: float = 1.0
    ):
        super().__init__()
        
        # Width configurations
        if width_mult == 0.5:
            out_channels = [24, 48, 96, 192, 1024]
        elif width_mult == 1.0:
            out_channels = [24, 116, 232, 464, 1024]
        elif width_mult == 1.5:
            out_channels = [24, 176, 352, 704, 1024]
        elif width_mult == 2.0:
            out_channels = [24, 244, 488, 976, 2048]
        else:
            raise ValueError(f"Unsupported width_mult: {width_mult}")
        
        # First conv
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, out_channels[0], 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(out_channels[0]),
            nn.ReLU(inplace=True)
        )
        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
        
        # Stages
        self.stage2 = self._make_stage(out_channels[0], out_channels[1], 4)
        self.stage3 = self._make_stage(out_channels[1], out_channels[2], 8)
        self.stage4 = self._make_stage(out_channels[2], out_channels[3], 4)
        
        # Last conv
        self.conv5 = nn.Sequential(
            nn.Conv2d(out_channels[3], out_channels[4], 1, bias=False),
            nn.BatchNorm2d(out_channels[4]),
            nn.ReLU(inplace=True)
        )
        
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(out_channels[4], num_classes)
    
    def _make_stage(self, in_c, out_c, num_blocks):
        layers = [ShuffleNetV2Block(in_c, out_c, stride=2)]
        for _ in range(num_blocks - 1):
            layers.append(ShuffleNetV2Block(out_c, out_c, stride=1))
        return nn.Sequential(*layers)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = self.conv5(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
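
Smoke test (my own addition); at width 1.0 the parameter count should land near the ~2.3M in the comparison table further below:

model = ShuffleNetV2(width_mult=1.0)
n_params = sum(p.numel() for p in model.parameters())
print(f"ShuffleNetV2 1.0x: {n_params/1e6:.2f}M parameters")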

EfficientNet

Compound Scaling

class EfficientNetConfig:
    """
    EfficientNet compound scaling.
    
    Key insight: Scale depth, width, and resolution together.
    
    φ: compound coefficient
    depth = α^φ
    width = β^φ  
    resolution = γ^φ
    
    Subject to: α × β² × γ² ≈ 2 (FLOPs double)
    """
    
    # Base model (EfficientNet-B0)
    BASE_WIDTH = 1.0
    BASE_DEPTH = 1.0
    BASE_RESOLUTION = 224
    
    # Scaling coefficients (found via grid search)
    ALPHA = 1.2   # Depth
    BETA = 1.1    # Width
    GAMMA = 1.15  # Resolution
    
    # Model configurations
    CONFIGS = {
        'b0': (1.0, 1.0, 224),
        'b1': (1.0, 1.1, 240),
        'b2': (1.1, 1.2, 260),
        'b3': (1.2, 1.4, 300),
        'b4': (1.4, 1.8, 380),
        'b5': (1.6, 2.2, 456),
        'b6': (1.8, 2.6, 528),
        'b7': (2.0, 3.1, 600),
    }
    
    @classmethod
    def get_config(cls, model_name: str):
        if model_name not in cls.CONFIGS:
            raise ValueError(f"Unknown model: {model_name}")
        
        width_mult, depth_mult, resolution = cls.CONFIGS[model_name]
        return width_mult, depth_mult, resolution
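
To make the scaling rule concrete, here is the raw compound-scaling arithmetic for a few values of φ (my own worked example; the released B1–B7 configurations above were further tuned by hand, so these numbers only approximate that table). Note that α × β² × γ² ≈ 1.92, so FLOPs roughly double with each increment of φ:

for phi in range(4):
    depth = EfficientNetConfig.ALPHA ** phi
    width = EfficientNetConfig.BETA ** phi
    resolution = EfficientNetConfig.BASE_RESOLUTION * EfficientNetConfig.GAMMA ** phi
    flops_factor = (EfficientNetConfig.ALPHA
                    * EfficientNetConfig.BETA ** 2
                    * EfficientNetConfig.GAMMA ** 2) ** phi
    print(f"phi={phi}: depth x{depth:.2f}, width x{width:.2f}, "
          f"resolution ~{resolution:.0f}, FLOPs x{flops_factor:.1f}")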


class MBConv(nn.Module):
    """
    Mobile Inverted Bottleneck with Squeeze-and-Excitation.
    Building block of EfficientNet.
    """
    
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        expand_ratio: int,
        se_ratio: float = 0.25,
        drop_connect_rate: float = 0.0
    ):
        super().__init__()
        
        self.use_residual = stride == 1 and in_channels == out_channels
        self.drop_connect_rate = drop_connect_rate
        
        hidden_dim = in_channels * expand_ratio
        
        layers = []
        
        # Expansion
        if expand_ratio != 1:
            layers.extend([
                nn.Conv2d(in_channels, hidden_dim, 1, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.SiLU(inplace=True)  # Swish activation
            ])
        
        # Depthwise
        layers.extend([
            nn.Conv2d(
                hidden_dim, hidden_dim, kernel_size,
                stride=stride, padding=kernel_size//2,
                groups=hidden_dim, bias=False
            ),
            nn.BatchNorm2d(hidden_dim),
            nn.SiLU(inplace=True)
        ])
        
        # SE block
        se_channels = max(1, int(in_channels * se_ratio))
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(hidden_dim, se_channels, 1),
            nn.SiLU(inplace=True),
            nn.Conv2d(se_channels, hidden_dim, 1),
            nn.Sigmoid()
        )
        
        # Projection
        layers.extend([
            nn.Conv2d(hidden_dim, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels)
        ])
        
        self.conv = nn.Sequential(*layers)
    
    def _drop_connect(self, x: torch.Tensor) -> torch.Tensor:
        """Stochastic depth / drop connect."""
        if not self.training or self.drop_connect_rate == 0:
            return x
        
        keep_prob = 1 - self.drop_connect_rate
        random_tensor = keep_prob + torch.rand(
            (x.shape[0], 1, 1, 1),
            dtype=x.dtype,
            device=x.device
        )
        random_tensor.floor_()
        
        return x / keep_prob * random_tensor
    
    def forward(self, x):
        out = self.conv[:-2](x)    # expansion + depthwise (everything before projection)
        out = out * self.se(out)   # SE attention on the depthwise output
        out = self.conv[-2:](out)  # linear projection (1x1 conv + BN, no activation)
        
        if self.use_residual:
            out = self._drop_connect(out) + x
        
        return out


class EfficientNet(nn.Module):
    """
    EfficientNet: Rethinking Model Scaling for CNNs.
    
    Achieves state-of-the-art accuracy with far fewer parameters
    through compound scaling.
    """
    
    def __init__(
        self,
        model_name: str = 'b0',
        num_classes: int = 1000,
        drop_connect_rate: float = 0.2
    ):
        super().__init__()
        
        width_mult, depth_mult, resolution = EfficientNetConfig.get_config(model_name)
        
        def c(channels):
            """Round channels to divisible by 8."""
            return int(math.ceil(channels * width_mult / 8) * 8)
        
        def d(num_layers):
            """Scale depth."""
            return int(math.ceil(num_layers * depth_mult))
        
        # Block configuration: (expand, channels, layers, kernel, stride)
        block_config = [
            (1, 16, 1, 3, 1),
            (6, 24, 2, 3, 2),
            (6, 40, 2, 5, 2),
            (6, 80, 3, 3, 2),
            (6, 112, 3, 5, 1),
            (6, 192, 4, 5, 2),
            (6, 320, 1, 3, 1),
        ]
        
        # Stem
        self.stem = nn.Sequential(
            nn.Conv2d(3, c(32), 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(c(32)),
            nn.SiLU(inplace=True)
        )
        
        # Build blocks
        blocks = []
        total_blocks = sum(d(n) for _, _, n, _, _ in block_config)
        block_idx = 0
        in_channels = c(32)
        
        for expand, out_c, num_layers, kernel, stride in block_config:
            out_channels = c(out_c)
            
            for layer_idx in range(d(num_layers)):
                # Drop connect rate increases with depth
                drop_rate = drop_connect_rate * block_idx / total_blocks
                
                blocks.append(MBConv(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=kernel,
                    stride=stride if layer_idx == 0 else 1,
                    expand_ratio=expand,
                    drop_connect_rate=drop_rate
                ))
                
                in_channels = out_channels
                block_idx += 1
        
        self.blocks = nn.Sequential(*blocks)
        
        # Head
        head_channels = c(1280)
        self.head = nn.Sequential(
            nn.Conv2d(in_channels, head_channels, 1, bias=False),
            nn.BatchNorm2d(head_channels),
            nn.SiLU(inplace=True),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Dropout(0.2),
            nn.Linear(head_channels, num_classes)
        )
    
    def forward(self, x):
        x = self.stem(x)
        x = self.blocks(x)
        x = self.head(x)
        return x
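
A forward-pass check for the B0 variant sketched above (hypothetical 10-class head); the parameter count should be roughly B0-scale (~5M with the full 1000-class head):

model = EfficientNet('b0', num_classes=10)
out = model(torch.randn(1, 3, 224, 224))
n_params = sum(p.numel() for p in model.parameters())
print(out.shape, f"~{n_params/1e6:.1f}M params")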

GhostNet

class GhostModule(nn.Module):
    """
    Ghost Module: Generate more features from cheap operations.
    
    Idea: Many feature maps are similar (redundant).
    Generate a few "intrinsic" features, then create "ghosts" cheaply.
    """
    
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 1,
        ratio: int = 2,  # Ghost ratio
        dw_kernel: int = 3,  # Depthwise kernel for ghosts
        stride: int = 1,
        relu: bool = True  # GhostNet's projection modules skip the ReLU
    ):
        super().__init__()
        
        # Number of intrinsic features
        self.intrinsic_channels = out_channels // ratio
        self.ghost_channels = self.intrinsic_channels * (ratio - 1)
        
        # Primary conv (generates intrinsic features)
        self.primary_conv = nn.Sequential(
            nn.Conv2d(in_channels, self.intrinsic_channels, kernel_size,
                      stride=stride, padding=kernel_size//2, bias=False),
            nn.BatchNorm2d(self.intrinsic_channels),
            nn.ReLU(inplace=True) if relu else nn.Identity()
        )
        
        # Cheap operation (generates ghost features)
        self.cheap_operation = nn.Sequential(
            nn.Conv2d(self.intrinsic_channels, self.ghost_channels, dw_kernel,
                      stride=1, padding=dw_kernel//2,
                      groups=self.intrinsic_channels, bias=False),
            nn.BatchNorm2d(self.ghost_channels),
            nn.ReLU(inplace=True) if relu else nn.Identity()
        )
    
    def forward(self, x):
        intrinsic = self.primary_conv(x)
        ghosts = self.cheap_operation(intrinsic)
        return torch.cat([intrinsic, ghosts], dim=1)
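
The channel accounting is easy to verify: with ratio=2, half the output channels come from the primary conv and half are ghosts generated by the cheap depthwise conv.

ghost = GhostModule(in_channels=16, out_channels=64, ratio=2)
out = ghost(torch.randn(1, 16, 32, 32))
print(out.shape)  # torch.Size([1, 64, 32, 32]) = 32 intrinsic + 32 ghost channels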


class GhostBottleneck(nn.Module):
    """Ghost Bottleneck for GhostNet."""
    
    def __init__(
        self,
        in_channels: int,
        hidden_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        use_se: bool = False
    ):
        super().__init__()
        
        self.stride = stride
        
        # Ghost module 1 (expansion)
        self.ghost1 = GhostModule(in_channels, hidden_channels)
        
        # Depthwise conv (if stride > 1)
        if stride > 1:
            self.dw_conv = nn.Sequential(
                nn.Conv2d(hidden_channels, hidden_channels, kernel_size,
                          stride=stride, padding=kernel_size//2,
                          groups=hidden_channels, bias=False),
                nn.BatchNorm2d(hidden_channels)
            )
        
        # SE block
        self.se = SEBlock(hidden_channels) if use_se else nn.Identity()
        
        # Ghost module 2 (projection, no activation)
        self.ghost2 = GhostModule(hidden_channels, out_channels, relu=False)
        
        # Skip connection
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, in_channels, kernel_size,
                          stride=stride, padding=kernel_size//2,
                          groups=in_channels, bias=False),
                nn.BatchNorm2d(in_channels),
                nn.Conv2d(in_channels, out_channels, 1, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        else:
            self.shortcut = nn.Identity()
    
    def forward(self, x):
        residual = self.shortcut(x)
        
        out = self.ghost1(x)
        
        if self.stride > 1:
            out = self.dw_conv(out)
        
        out = self.se(out)
        out = self.ghost2(out)
        
        return out + residual
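
Smoke test for the bottleneck (my own example, using the stride-2 variant with SE):

block = GhostBottleneck(16, 48, 24, stride=2, use_se=True)
out = block(torch.randn(1, 16, 32, 32))
print(out.shape)  # torch.Size([1, 24, 16, 16])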

Model Comparison

def compare_efficiency():
    """Compare efficient architectures."""
    
    comparison = """
    ╔══════════════════════════════════════════════════════════════════════╗
    ║                 EFFICIENT ARCHITECTURE COMPARISON                     ║
    ╠══════════════════════════════════════════════════════════════════════╣
    ║                                                                       ║
    ║  Model            Params    FLOPs      Top-1     Latency*            ║
    ║  ────────────────────────────────────────────────────────────────    ║
    ║  MobileNetV1      4.2M      569M       70.6%     33ms                ║
    ║  MobileNetV2      3.4M      300M       72.0%     29ms                ║
    ║  MobileNetV3-S    2.5M      56M        67.4%     15ms                ║
    ║  MobileNetV3-L    5.4M      219M       75.2%     27ms                ║
    ║                                                                       ║
    ║  ShuffleNetV2     2.3M      146M       69.4%     20ms                ║
    ║                                                                       ║
    ║  EfficientNet-B0  5.3M      390M       77.1%     35ms                ║
    ║  EfficientNet-B1  7.8M      700M       79.1%     52ms                ║
    ║  EfficientNet-B4  19M       4.2B       82.9%     166ms               ║
    ║                                                                       ║
    ║  GhostNet         5.2M      141M       73.9%     25ms                ║
    ║                                                                       ║
    ║  *Latency on mobile device (Pixel 3)                                 ║
    ║                                                                       ║
    ╠══════════════════════════════════════════════════════════════════════╣
    ║                      KEY DESIGN PRINCIPLES                            ║
    ╠══════════════════════════════════════════════════════════════════════╣
    ║                                                                       ║
    ║  1. Depthwise Separable Convolutions                                  ║
    ║     • 8-9x fewer FLOPs than standard conv                            ║
    ║     • Used by all efficient architectures                            ║
    ║                                                                       ║
    ║  2. Inverted Residuals                                                ║
    ║     • Expand → Depthwise → Project                                   ║
    ║     • Better gradient flow                                           ║
    ║                                                                       ║
    ║  3. Squeeze-and-Excitation                                            ║
    ║     • Channel attention with minimal overhead                        ║
    ║     • 0.5-1% accuracy boost                                          ║
    ║                                                                       ║
    ║  4. Neural Architecture Search                                        ║
    ║     • MobileNetV3, EfficientNet optimized by NAS                     ║
    ║     • Better than hand-designed                                       ║
    ║                                                                       ║
    ║  5. Compound Scaling                                                  ║
    ║     • Scale depth, width, resolution together                        ║
    ║     • EfficientNet's key innovation                                  ║
    ║                                                                       ║
    ╚══════════════════════════════════════════════════════════════════════╝
    """
    print(comparison)

compare_efficiency()

Exercises

1. Measure actual FLOPs and latency:

from thop import profile

model = MobileNetV2()
x = torch.randn(1, 3, 224, 224)

flops, params = profile(model, inputs=(x,))
print(f"FLOPs: {flops/1e9:.2f}G, Params: {params/1e6:.2f}M")

2. Create your own efficient block combining:
  • Depthwise separable convolution
  • Squeeze-and-excitation
  • Residual connection
Then benchmark it against the MobileNetV2 block.

3. Experiment with different width/depth trade-offs:
  • Wide and shallow
  • Narrow and deep
  • Balanced
Which works best for your task?

What’s Next?