Object Detection

The Detection Problem

Object detection = classification + localization. For each object in an image, we need:
  • What: Class label (car, person, dog, …)
  • Where: Bounding box (x, y, w, h)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from typing import List, Tuple, Dict, Optional

torch.manual_seed(42)

Detection Paradigms

Two-Stage Detectors

Generate proposals → Classify and refine
class RegionProposalNetwork(nn.Module):
    """
    Region Proposal Network (RPN) from Faster R-CNN.
    
    Generates object proposals from feature maps.
    """
    
    def __init__(
        self,
        in_channels: int,
        anchor_sizes: List[int] = [128, 256, 512],
        aspect_ratios: List[float] = [0.5, 1.0, 2.0]
    ):
        super().__init__()
        
        self.anchor_sizes = anchor_sizes
        self.aspect_ratios = aspect_ratios
        self.num_anchors = len(anchor_sizes) * len(aspect_ratios)
        
        # Shared conv
        self.conv = nn.Conv2d(in_channels, in_channels, 3, padding=1)
        
        # Classification head (object vs background)
        self.cls_head = nn.Conv2d(in_channels, self.num_anchors * 2, 1)
        
        # Regression head (box deltas)
        self.reg_head = nn.Conv2d(in_channels, self.num_anchors * 4, 1)
    
    def generate_anchors(
        self,
        feature_map: torch.Tensor,
        stride: int = 16
    ) -> torch.Tensor:
        """Generate anchor boxes for feature map."""
        
        _, _, H, W = feature_map.shape
        device = feature_map.device
        
        # Grid of anchor centers
        shifts_x = torch.arange(0, W, device=device) * stride + stride // 2
        shifts_y = torch.arange(0, H, device=device) * stride + stride // 2
        shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing='ij')
        
        shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=-1)
        shifts = shifts.reshape(-1, 4)
        
        # Base anchors (centered at origin)
        base_anchors = []
        for size in self.anchor_sizes:
            for ratio in self.aspect_ratios:
                h = size / (ratio ** 0.5)
                w = size * (ratio ** 0.5)
                base_anchors.append([-w/2, -h/2, w/2, h/2])
        
        base_anchors = torch.tensor(base_anchors, device=device)
        
        # All anchors
        anchors = shifts.unsqueeze(1) + base_anchors.unsqueeze(0)
        anchors = anchors.reshape(-1, 4)
        
        return anchors
    
    def forward(
        self,
        features: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            features: [B, C, H, W] backbone features
        
        Returns:
            objectness: [B, num_anchors*H*W, 2] objectness scores
            bbox_deltas: [B, num_anchors*H*W, 4] box regression
            anchors: [num_anchors*H*W, 4] anchor boxes
        """
        batch_size = features.size(0)
        
        x = F.relu(self.conv(features))
        
        # Classification
        objectness = self.cls_head(x)  # [B, A*2, H, W]
        objectness = objectness.permute(0, 2, 3, 1).reshape(batch_size, -1, 2)
        
        # Regression
        bbox_deltas = self.reg_head(x)  # [B, A*4, H, W]
        bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).reshape(batch_size, -1, 4)
        
        # Generate anchors
        anchors = self.generate_anchors(features)
        
        return objectness, bbox_deltas, anchors


class RoIPooling(nn.Module):
    """
    Region of Interest Pooling.
    Extract fixed-size features from variable-size proposals.
    """
    
    def __init__(self, output_size: int = 7, spatial_scale: float = 1/16):
        super().__init__()
        self.output_size = output_size
        self.spatial_scale = spatial_scale
    
    def forward(
        self,
        features: torch.Tensor,
        rois: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
            features: [B, C, H, W] feature maps
            rois: [N, 5] (batch_idx, x1, y1, x2, y2)
        
        Returns:
            pooled: [N, C, output_size, output_size]
        """
        # Scale RoIs to feature map coordinates
        rois_scaled = rois.clone()
        rois_scaled[:, 1:] *= self.spatial_scale
        
        # Use torchvision's roi_pool
        pooled = torchvision.ops.roi_pool(
            features, rois_scaled, 
            output_size=self.output_size,
            spatial_scale=1.0  # Already scaled
        )
        
        return pooled

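# A quick shape check for the pooling layer above (a minimal sketch with dummy
# tensors; the feature map size and RoI coordinates are arbitrary illustrations):
roi_pool_demo = RoIPooling(output_size=7, spatial_scale=1/16)
demo_features = torch.randn(1, 256, 50, 50)        # e.g. an 800x800 image at stride 16
demo_rois = torch.tensor([
    [0.0, 100.0, 100.0, 400.0, 300.0],             # (batch_idx, x1, y1, x2, y2) in image coords
    [0.0,  50.0, 200.0, 250.0, 600.0],
])
pooled = roi_pool_demo(demo_features, demo_rois)
print(pooled.shape)  # torch.Size([2, 256, 7, 7])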

class FasterRCNN(nn.Module):
    """
    Faster R-CNN: Two-stage object detector.
    
    Architecture:
    1. Backbone (ResNet, etc.) → Feature maps
    2. RPN → Region proposals
    3. RoI Pooling → Fixed-size features
    4. Detection head → Classes + refined boxes
    """
    
    def __init__(
        self,
        num_classes: int,
        backbone: nn.Module,
        backbone_out_channels: int = 2048
    ):
        super().__init__()
        
        self.num_classes = num_classes
        self.backbone = backbone
        
        # RPN
        self.rpn = RegionProposalNetwork(backbone_out_channels)
        
        # RoI pooling
        self.roi_pool = RoIPooling(output_size=7)
        
        # Detection head
        self.fc1 = nn.Linear(backbone_out_channels * 7 * 7, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        
        self.cls_head = nn.Linear(1024, num_classes)
        self.reg_head = nn.Linear(1024, num_classes * 4)
    
    def forward(
        self,
        images: torch.Tensor,
        targets: Optional[List[Dict]] = None
    ) -> Dict[str, torch.Tensor]:
        """
        Args:
            images: [B, 3, H, W] input images
            targets: List of dicts with 'boxes' and 'labels' (training only)
        """
        # Backbone
        features = self.backbone(images)
        
        # RPN
        objectness, rpn_deltas, anchors = self.rpn(features)
        
        # Get proposals (simplified; in practice filter with NMS, see the sketch after this class)
        proposals = self._get_proposals(anchors, rpn_deltas, objectness)
        
        # RoI pooling
        roi_features = self.roi_pool(features, proposals)
        roi_features = roi_features.flatten(start_dim=1)
        
        # Detection head
        x = F.relu(self.fc1(roi_features))
        x = F.relu(self.fc2(x))
        
        cls_logits = self.cls_head(x)
        box_deltas = self.reg_head(x)
        
        return {
            'cls_logits': cls_logits,
            'box_deltas': box_deltas,
            'rpn_objectness': objectness,
            'rpn_deltas': rpn_deltas
        }
    
    def _get_proposals(
        self,
        anchors: torch.Tensor,
        deltas: torch.Tensor,
        scores: torch.Tensor,
        top_k: int = 1000
    ) -> torch.Tensor:
        """Get top-k proposals after NMS."""
        # Apply deltas to anchors
        proposals = self._apply_deltas(anchors, deltas[0])
        
        # Get objectness scores
        scores = F.softmax(scores[0], dim=-1)[:, 1]
        
        # Top-k
        _, top_idx = scores.topk(min(top_k, len(scores)))
        proposals = proposals[top_idx]
        
        # Add batch index
        batch_idx = torch.zeros(len(proposals), 1, device=proposals.device)
        proposals = torch.cat([batch_idx, proposals], dim=1)
        
        return proposals
    
    def _apply_deltas(
        self,
        anchors: torch.Tensor,
        deltas: torch.Tensor
    ) -> torch.Tensor:
        """Apply box deltas to anchors."""
        # Convert anchors to (cx, cy, w, h)
        widths = anchors[:, 2] - anchors[:, 0]
        heights = anchors[:, 3] - anchors[:, 1]
        cx = anchors[:, 0] + 0.5 * widths
        cy = anchors[:, 1] + 0.5 * heights
        
        # Apply deltas
        dx, dy, dw, dh = deltas.unbind(-1)
        pred_cx = dx * widths + cx
        pred_cy = dy * heights + cy
        pred_w = torch.exp(dw) * widths
        pred_h = torch.exp(dh) * heights
        
        # Convert back to (x1, y1, x2, y2)
        pred_boxes = torch.stack([
            pred_cx - 0.5 * pred_w,
            pred_cy - 0.5 * pred_h,
            pred_cx + 0.5 * pred_w,
            pred_cy + 0.5 * pred_h
        ], dim=-1)
        
        return pred_boxes

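In the full Faster R-CNN pipeline, proposals are also filtered with non-maximum suppression before RoI pooling. Below is a minimal sketch of that step using torchvision.ops.nms; the helper name, 0.7 IoU threshold, and top-k value are illustrative assumptions, not part of the class above.

def filter_proposals_with_nms(
    proposals: torch.Tensor,      # [N, 4] boxes in (x1, y1, x2, y2)
    scores: torch.Tensor,         # [N] objectness scores
    iou_threshold: float = 0.7,
    post_nms_top_k: int = 1000
) -> torch.Tensor:
    """Keep high-scoring, non-overlapping proposals (sketch of the standard RPN filtering)."""
    keep = torchvision.ops.nms(proposals, scores, iou_threshold)
    keep = keep[:post_nms_top_k]  # nms returns indices sorted by decreasing score
    return proposals[keep]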
YOLO: Single-Stage Detection

YOLOv1 Core Concepts

class YOLOv1Head(nn.Module):
    """
    YOLOv1: You Only Look Once.
    
    Divides image into S×S grid:
    - Each cell predicts B bounding boxes
    - Each box: (x, y, w, h, confidence)
    - Plus C class probabilities per cell
    
    Output: S × S × (B*5 + C)
    """
    
    def __init__(
        self,
        num_classes: int = 20,
        grid_size: int = 7,
        num_boxes: int = 2
    ):
        super().__init__()
        
        self.S = grid_size
        self.B = num_boxes
        self.C = num_classes
        
        self.output_size = self.S * self.S * (self.B * 5 + self.C)
        
        # Final layers (after backbone)
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * 7 * 7, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),
            nn.Linear(4096, self.output_size)
        )
    
    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """
        Returns:
            predictions: [B, S, S, B*5+C]
        """
        batch_size = features.size(0)
        
        x = self.fc(features)
        x = x.view(batch_size, self.S, self.S, self.B * 5 + self.C)
        
        return x
    
    def decode_predictions(
        self,
        predictions: torch.Tensor,
        confidence_threshold: float = 0.5
    ) -> List[Dict]:
        """Decode YOLO predictions to boxes."""
        batch_size = predictions.size(0)
        results = []
        
        for b in range(batch_size):
            boxes = []
            scores = []
            labels = []
            
            for i in range(self.S):
                for j in range(self.S):
                    cell = predictions[b, i, j]
                    
                    # Class probabilities
                    class_probs = cell[self.B * 5:]
                    
                    for box_idx in range(self.B):
                        # Box predictions
                        box_offset = box_idx * 5
                        x = (cell[box_offset] + j) / self.S
                        y = (cell[box_offset + 1] + i) / self.S
                        w = cell[box_offset + 2] ** 2  # network regresses sqrt(w), sqrt(h);
                        h = cell[box_offset + 3] ** 2  # squaring recovers the size (and keeps it positive)
                        conf = cell[box_offset + 4]
                        
                        # Class-specific confidence
                        class_scores = conf * class_probs
                        
                        best_class = class_scores.argmax()
                        best_score = class_scores[best_class]
                        
                        if best_score > confidence_threshold:
                            # Convert to (x1, y1, x2, y2)
                            x1 = x - w / 2
                            y1 = y - h / 2
                            x2 = x + w / 2
                            y2 = y + h / 2
                            
                            boxes.append([x1, y1, x2, y2])
                            scores.append(best_score.item())
                            labels.append(best_class.item())
            
            results.append({
                'boxes': torch.tensor(boxes),
                'scores': torch.tensor(scores),
                'labels': torch.tensor(labels)
            })
        
        return results

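A quick sanity check of the YOLOv1 output layout (a minimal sketch; the 1024-channel 7x7 tensor is a stand-in for the Darknet backbone's final feature map):

yolo_head = YOLOv1Head(num_classes=20, grid_size=7, num_boxes=2)
dummy_features = torch.randn(2, 1024, 7, 7)   # stand-in for backbone output
preds = yolo_head(dummy_features)
print(preds.shape)  # torch.Size([2, 7, 7, 30]) since B*5 + C = 2*5 + 20 = 30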
YOLOv3 Architecture

class DarknetBlock(nn.Module):
    """Darknet residual block."""
    
    def __init__(self, in_channels: int):
        super().__init__()
        
        hidden = in_channels // 2
        
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, hidden, 1, bias=False),
            nn.BatchNorm2d(hidden),
            nn.LeakyReLU(0.1)
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(hidden, in_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(in_channels),
            nn.LeakyReLU(0.1)
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.conv2(self.conv1(x))


class YOLOv3Neck(nn.Module):
    """
    YOLOv3 Feature Pyramid Neck.
    Multi-scale predictions for better small object detection.
    """
    
    def __init__(self, backbone_channels: List[int] = [256, 512, 1024]):
        super().__init__()
        
        # Lateral connections
        self.lateral1 = nn.Conv2d(backbone_channels[2], 512, 1)
        self.lateral2 = nn.Conv2d(backbone_channels[1], 256, 1)
        self.lateral3 = nn.Conv2d(backbone_channels[0], 128, 1)
        
        # Feature fusion
        self.fuse1 = self._make_fuse_block(512 + 256, 256)
        self.fuse2 = self._make_fuse_block(256 + 128, 128)
    
    def _make_fuse_block(self, in_channels: int, out_channels: int):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1),
            nn.Conv2d(out_channels, out_channels * 2, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels * 2),
            nn.LeakyReLU(0.1),
            nn.Conv2d(out_channels * 2, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1)
        )
    
    def forward(
        self,
        features: Tuple[torch.Tensor, ...]
    ) -> Tuple[torch.Tensor, ...]:
        """
        Args:
            features: (C3, C4, C5) from backbone
        
        Returns:
            (P3, P4, P5) multi-scale features
        """
        c3, c4, c5 = features
        
        # Top-down pathway
        p5 = self.lateral1(c5)
        
        p5_upsampled = F.interpolate(p5, size=c4.shape[2:], mode='nearest')
        p4 = self.fuse1(torch.cat([p5_upsampled, self.lateral2(c4)], dim=1))
        
        p4_upsampled = F.interpolate(p4, size=c3.shape[2:], mode='nearest')
        p3 = self.fuse2(torch.cat([p4_upsampled, self.lateral3(c3)], dim=1))
        
        return p3, p4, p5


class YOLOv3Head(nn.Module):
    """YOLOv3 detection head for one scale."""
    
    def __init__(
        self,
        in_channels: int,
        num_classes: int,
        num_anchors: int = 3
    ):
        super().__init__()
        
        self.num_classes = num_classes
        self.num_anchors = num_anchors
        
        # 5 = (x, y, w, h, objectness)
        out_channels = num_anchors * (5 + num_classes)
        
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, in_channels * 2, 3, padding=1, bias=False),
            nn.BatchNorm2d(in_channels * 2),
            nn.LeakyReLU(0.1),
            nn.Conv2d(in_channels * 2, out_channels, 1)
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Returns:
            predictions: [B, A, H, W, 5+C]
        """
        batch_size = x.size(0)
        
        x = self.conv(x)  # [B, A*(5+C), H, W]
        
        # Reshape
        x = x.view(batch_size, self.num_anchors, 5 + self.num_classes, 
                   x.size(2), x.size(3))
        x = x.permute(0, 1, 3, 4, 2)  # [B, A, H, W, 5+C]
        
        return x

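Wiring the neck and per-scale heads together (a minimal sketch with dummy backbone features; the 416x416 input size and 80-class setting are assumptions consistent with the defaults above):

neck = YOLOv3Neck(backbone_channels=[256, 512, 1024])
heads = nn.ModuleList([
    YOLOv3Head(in_channels=c, num_classes=80, num_anchors=3)
    for c in (128, 256, 512)   # channels of (P3, P4, P5) produced by the neck
])

# Dummy C3/C4/C5 features for a 416x416 input (strides 8, 16, 32)
c3 = torch.randn(1, 256, 52, 52)
c4 = torch.randn(1, 512, 26, 26)
c5 = torch.randn(1, 1024, 13, 13)

p3, p4, p5 = neck((c3, c4, c5))
multi_scale_preds = [head(p) for head, p in zip(heads, (p3, p4, p5))]
print([p.shape for p in multi_scale_preds])
# [torch.Size([1, 3, 52, 52, 85]), torch.Size([1, 3, 26, 26, 85]), torch.Size([1, 3, 13, 13, 85])]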
Anchor-Free Detection

FCOS (Fully Convolutional One-Stage)

class FCOSHead(nn.Module):
    """
    FCOS: Fully Convolutional One-Stage Object Detection.
    
    No anchors! Predicts:
    - Per-pixel classification
    - Distance to box edges (l, t, r, b)
    - Centerness (suppress low-quality predictions)
    """
    
    def __init__(
        self,
        in_channels: int,
        num_classes: int,
        num_convs: int = 4
    ):
        super().__init__()
        
        self.num_classes = num_classes
        
        # Shared convolutions
        cls_convs = []
        reg_convs = []
        
        for _ in range(num_convs):
            cls_convs.append(nn.Conv2d(in_channels, in_channels, 3, padding=1))
            cls_convs.append(nn.GroupNorm(32, in_channels))
            cls_convs.append(nn.ReLU())
            
            reg_convs.append(nn.Conv2d(in_channels, in_channels, 3, padding=1))
            reg_convs.append(nn.GroupNorm(32, in_channels))
            reg_convs.append(nn.ReLU())
        
        self.cls_tower = nn.Sequential(*cls_convs)
        self.reg_tower = nn.Sequential(*reg_convs)
        
        # Prediction heads
        self.cls_logits = nn.Conv2d(in_channels, num_classes, 3, padding=1)
        self.bbox_pred = nn.Conv2d(in_channels, 4, 3, padding=1)  # l, t, r, b
        self.centerness = nn.Conv2d(in_channels, 1, 3, padding=1)
        
        # Learnable scale (FCOS uses one such scale per FPN level; this head covers a single level)
        self.scales = nn.Parameter(torch.ones(1))
    
    def forward(self, features: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Returns:
            cls_logits: [B, C, H, W] classification
            bbox_pred: [B, 4, H, W] box regression
            centerness: [B, 1, H, W] centerness
        """
        cls_feat = self.cls_tower(features)
        reg_feat = self.reg_tower(features)
        
        cls_logits = self.cls_logits(cls_feat)
        bbox_pred = self.scales * self.bbox_pred(reg_feat)
        bbox_pred = F.relu(bbox_pred)  # Distances must be positive
        centerness = self.centerness(reg_feat)
        
        return {
            'cls_logits': cls_logits,
            'bbox_pred': bbox_pred,
            'centerness': centerness
        }

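# The centerness target is computed from the regression targets rather than
# predicted directly; as a reference, this is the formula from the FCOS paper
# (a minimal sketch, assuming per-location distances l, t, r, b to the box edges):
def centerness_target(ltrb: torch.Tensor) -> torch.Tensor:
    """centerness = sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b)))"""
    l, t, r, b = ltrb.unbind(-1)
    lr = torch.min(l, r) / torch.max(l, r).clamp(min=1e-6)
    tb = torch.min(t, b) / torch.max(t, b).clamp(min=1e-6)
    return torch.sqrt(lr * tb)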

class CenterNet(nn.Module):
    """
    CenterNet: Objects as Points.
    
    Predicts:
    - Heatmap of object centers
    - Size (w, h) at each center
    - Local offset for sub-pixel accuracy
    """
    
    def __init__(
        self,
        backbone: nn.Module,
        num_classes: int,
        head_channels: int = 64
    ):
        super().__init__()
        
        self.backbone = backbone
        self.num_classes = num_classes
        
        # Upsampling to recover resolution
        self.deconv_layers = self._make_deconv_layers(
            in_channels=2048,  # Assuming ResNet
            out_channels=head_channels
        )
        
        # Prediction heads
        self.heatmap = nn.Sequential(
            nn.Conv2d(head_channels, head_channels, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(head_channels, num_classes, 1)
        )
        
        self.wh = nn.Sequential(
            nn.Conv2d(head_channels, head_channels, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(head_channels, 2, 1)  # width, height
        )
        
        self.offset = nn.Sequential(
            nn.Conv2d(head_channels, head_channels, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(head_channels, 2, 1)  # x_offset, y_offset
        )
    
    def _make_deconv_layers(self, in_channels: int, out_channels: int):
        layers = []
        channels = [256, 128, 64]
        
        for c in channels:
            layers.extend([
                nn.ConvTranspose2d(in_channels, c, 4, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(c),
                nn.ReLU()
            ])
            in_channels = c
        
        return nn.Sequential(*layers)
    
    def forward(self, images: torch.Tensor) -> Dict[str, torch.Tensor]:
        features = self.backbone(images)
        features = self.deconv_layers(features)
        
        heatmap = self.heatmap(features)
        heatmap = torch.sigmoid(heatmap)  # Normalize to [0, 1]
        
        wh = self.wh(features)
        offset = self.offset(features)
        
        return {
            'heatmap': heatmap,
            'wh': wh,
            'offset': offset
        }
    
    @staticmethod
    def decode_detections(
        heatmap: torch.Tensor,
        wh: torch.Tensor,
        offset: torch.Tensor,
        k: int = 100
    ) -> Dict[str, torch.Tensor]:
        """Decode CenterNet predictions."""
        batch, num_classes, h, w = heatmap.shape
        
        # Find local peaks
        heatmap_max = F.max_pool2d(heatmap, 3, stride=1, padding=1)
        keep = (heatmap == heatmap_max).float()
        heatmap = heatmap * keep
        
        # Top-k peaks
        heatmap_flat = heatmap.view(batch, -1)
        topk_scores, topk_indices = heatmap_flat.topk(k)
        
        # Get class and position
        topk_classes = topk_indices // (h * w)
        topk_indices = topk_indices % (h * w)
        topk_y = topk_indices // w
        topk_x = topk_indices % w
        
        # Get size and offset
        wh = wh.view(batch, 2, -1).permute(0, 2, 1)
        offset = offset.view(batch, 2, -1).permute(0, 2, 1)
        
        # Gather predictions at peak locations
        topk_wh = wh.gather(1, topk_indices.unsqueeze(-1).expand(-1, -1, 2))
        topk_offset = offset.gather(1, topk_indices.unsqueeze(-1).expand(-1, -1, 2))
        
        # Compute boxes
        topk_cx = topk_x.float() + topk_offset[..., 0]
        topk_cy = topk_y.float() + topk_offset[..., 1]
        
        boxes = torch.stack([
            topk_cx - topk_wh[..., 0] / 2,
            topk_cy - topk_wh[..., 1] / 2,
            topk_cx + topk_wh[..., 0] / 2,
            topk_cy + topk_wh[..., 1] / 2
        ], dim=-1)
        
        return {
            'boxes': boxes,
            'scores': topk_scores,
            'labels': topk_classes
        }

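For training, CenterNet places a Gaussian peak on the class heatmap at each ground-truth center and regresses size/offset only at those locations. Below is a minimal sketch of the heatmap target; the helper name is illustrative and the radius is taken as a parameter (the paper derives it from the box size).

def draw_center_gaussian(heatmap: torch.Tensor, cx: int, cy: int, radius: int) -> None:
    """Splat a 2D Gaussian onto one class channel of the target heatmap, in place.
    
    heatmap: [H, W] single-class channel; cx, cy: center in heatmap coordinates.
    """
    sigma = max(radius / 3.0, 1e-6)
    h, w = heatmap.shape
    y0, y1 = max(0, cy - radius), min(h, cy + radius + 1)
    x0, x1 = max(0, cx - radius), min(w, cx + radius + 1)
    ys = torch.arange(y0, y1, dtype=torch.float32).unsqueeze(1)
    xs = torch.arange(x0, x1, dtype=torch.float32).unsqueeze(0)
    gaussian = torch.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))
    # Keep the maximum where nearby objects overlap
    heatmap[y0:y1, x0:x1] = torch.maximum(heatmap[y0:y1, x0:x1], gaussian)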
DETR: Detection Transformer

class DETR(nn.Module):
    """
    DETR: DEtection TRansformer.
    
    End-to-end object detection with transformers.
    No NMS, no anchors!
    """
    
    def __init__(
        self,
        backbone: nn.Module,
        num_classes: int,
        hidden_dim: int = 256,
        nheads: int = 8,
        num_encoder_layers: int = 6,
        num_decoder_layers: int = 6,
        num_queries: int = 100
    ):
        super().__init__()
        
        self.backbone = backbone
        self.num_queries = num_queries
        
        # Project backbone features
        self.input_proj = nn.Conv2d(2048, hidden_dim, 1)
        
        # Transformer
        self.transformer = nn.Transformer(
            d_model=hidden_dim,
            nhead=nheads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=2048,
            dropout=0.1,
            batch_first=True
        )
        
        # Object queries (learned)
        self.query_embed = nn.Embedding(num_queries, hidden_dim)
        
        # Position encoding
        self.pos_encoder = PositionalEncoding2D(hidden_dim)
        
        # Prediction heads
        self.class_embed = nn.Linear(hidden_dim, num_classes + 1)  # +1 for "no object"
        self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
    
    def forward(self, images: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Args:
            images: [B, 3, H, W]
        
        Returns:
            class_logits: [B, num_queries, num_classes+1]
            pred_boxes: [B, num_queries, 4]
        """
        batch_size = images.size(0)
        
        # Backbone
        features = self.backbone(images)  # [B, C, H', W']
        
        # Project and flatten
        features = self.input_proj(features)  # [B, hidden_dim, H', W']
        
        # Position encoding
        pos_embed = self.pos_encoder(features)  # [B, hidden_dim, H', W']
        
        # Flatten spatial dimensions
        features = features.flatten(2).permute(0, 2, 1)  # [B, H'*W', hidden_dim]
        pos_embed = pos_embed.flatten(2).permute(0, 2, 1)  # [B, H'*W', hidden_dim]
        
        # Add position encoding
        features = features + pos_embed
        
        # Object queries
        query_embed = self.query_embed.weight.unsqueeze(0).expand(batch_size, -1, -1)
        tgt = torch.zeros_like(query_embed)
        
        # Transformer (simplified: nn.TransformerDecoder has no query_pos argument,
        # so the learned query embeddings are added to the decoder input once;
        # the original DETR adds them inside every decoder layer)
        memory = self.transformer.encoder(features)
        hs = self.transformer.decoder(tgt + query_embed, memory)
        
        # Predictions
        class_logits = self.class_embed(hs)  # [B, num_queries, num_classes+1]
        pred_boxes = self.bbox_embed(hs).sigmoid()  # [B, num_queries, 4]
        
        return {
            'class_logits': class_logits,
            'pred_boxes': pred_boxes
        }


class PositionalEncoding2D(nn.Module):
    """2D positional encoding for spatial features."""
    
    def __init__(self, hidden_dim: int, temperature: float = 10000):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.temperature = temperature
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [B, C, H, W]
        
        Returns:
            pos_encoding: [B, C, H, W]
        """
        batch, _, h, w = x.shape
        
        y_embed = torch.arange(h, device=x.device).float().unsqueeze(1).expand(h, w)
        x_embed = torch.arange(w, device=x.device).float().unsqueeze(0).expand(h, w)
        
        # Each spatial dimension gets hidden_dim // 2 channels (sin/cos pairs),
        # so the concatenated encoding matches the feature channels exactly
        dim_t = torch.arange(self.hidden_dim // 4, device=x.device).float()
        dim_t = self.temperature ** (4 * dim_t / self.hidden_dim)
        
        pos_x = x_embed.unsqueeze(-1) / dim_t
        pos_y = y_embed.unsqueeze(-1) / dim_t
        
        pos_x = torch.stack([pos_x.sin(), pos_x.cos()], dim=-1).flatten(-2)  # [H, W, hidden_dim//2]
        pos_y = torch.stack([pos_y.sin(), pos_y.cos()], dim=-1).flatten(-2)  # [H, W, hidden_dim//2]
        
        pos = torch.cat([pos_y, pos_x], dim=-1).permute(2, 0, 1)  # [hidden_dim, H, W]
        
        return pos.unsqueeze(0).expand(batch, -1, -1, -1)


class MLP(nn.Module):
    """Simple MLP for box prediction."""
    
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        
        layers = []
        for i in range(num_layers):
            in_dim = input_dim if i == 0 else hidden_dim
            out_dim = output_dim if i == num_layers - 1 else hidden_dim
            layers.append(nn.Linear(in_dim, out_dim))
            if i < num_layers - 1:
                layers.append(nn.ReLU())
        
        self.layers = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.layers(x)


class HungarianMatcher:
    """
    Hungarian matching for DETR training.
    Matches predictions to ground truth using optimal assignment.
    """
    
    def __init__(
        self,
        cost_class: float = 1.0,
        cost_bbox: float = 5.0,
        cost_giou: float = 2.0
    ):
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
    
    @torch.no_grad()
    def __call__(
        self,
        outputs: Dict[str, torch.Tensor],
        targets: List[Dict[str, torch.Tensor]]
    ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
        """
        Returns list of (pred_indices, target_indices) for each batch element.
        """
        from scipy.optimize import linear_sum_assignment
        
        batch_size, num_queries = outputs['class_logits'].shape[:2]
        
        # Flatten predictions
        pred_probs = outputs['class_logits'].softmax(-1)
        pred_boxes = outputs['pred_boxes']
        
        indices = []
        
        for b in range(batch_size):
            target_labels = targets[b]['labels']
            target_boxes = targets[b]['boxes']
            
            num_targets = len(target_labels)
            
            if num_targets == 0:
                indices.append((
                    torch.tensor([], dtype=torch.long),
                    torch.tensor([], dtype=torch.long)
                ))
                continue
            
            # Classification cost
            cost_class = -pred_probs[b, :, target_labels]
            
            # L1 box cost
            cost_bbox = torch.cdist(pred_boxes[b], target_boxes, p=1)
            
            # GIoU cost
            cost_giou = -self._giou(pred_boxes[b], target_boxes)
            
            # Total cost
            C = (
                self.cost_class * cost_class +
                self.cost_bbox * cost_bbox +
                self.cost_giou * cost_giou
            )
            
            # Hungarian algorithm
            row_ind, col_ind = linear_sum_assignment(C.cpu().numpy())
            
            indices.append((
                torch.tensor(row_ind, dtype=torch.long),
                torch.tensor(col_ind, dtype=torch.long)
            ))
        
        return indices
    
    def _giou(self, boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
        """Compute GIoU between two sets of boxes."""
        return torchvision.ops.generalized_box_iou(boxes1, boxes2)

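With the matcher in place, here is a sketch of how the classification part of the DETR loss would be assembled. This is simplified: the L1/GIoU box terms, auxiliary decoder losses, and the down-weighting of the no-object class are omitted, and the helper name is illustrative.

def detr_classification_loss(
    outputs: Dict[str, torch.Tensor],
    targets: List[Dict[str, torch.Tensor]],
    indices: List[Tuple[torch.Tensor, torch.Tensor]],
    num_classes: int
) -> torch.Tensor:
    """Cross-entropy over all queries; unmatched queries get the 'no object' label."""
    logits = outputs['class_logits']                  # [B, num_queries, num_classes+1]
    B, Q, _ = logits.shape
    # Default every query to "no object" (index num_classes), then fill matched ones
    target_classes = torch.full((B, Q), num_classes, dtype=torch.long, device=logits.device)
    for b, (pred_idx, tgt_idx) in enumerate(indices):
        target_classes[b, pred_idx] = targets[b]['labels'][tgt_idx]
    return F.cross_entropy(logits.flatten(0, 1), target_classes.flatten())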
Comparison

def detection_comparison():
    """Compare detection approaches."""
    
    comparison = """
    ╔════════════════════════════════════════════════════════════════════╗
    ║                  OBJECT DETECTION COMPARISON                        ║
    ╠════════════════════════════════════════════════════════════════════╣
    ║                                                                     ║
    ║  Method          mAP     FPS     Pros              Cons            ║
    ║  ─────────────────────────────────────────────────────────────     ║
    ║  Faster R-CNN    42.0    ~5      High accuracy     Slow            ║
    ║  YOLOv3          33.0    ~45     Fast              Lower accuracy  ║
    ║  YOLOv5          50.7    ~140    Fast + accurate   Complex         ║
    ║  FCOS            44.7    ~23     Simple, no anchor Still needs NMS ║
    ║  CenterNet       45.1    ~28     Simple            Fixed scales    ║
    ║  DETR            42.0    ~28     End-to-end        Slow training   ║
    ║                                                                     ║
    ╠════════════════════════════════════════════════════════════════════╣
    ║                     WHEN TO USE WHAT                                ║
    ╠════════════════════════════════════════════════════════════════════╣
    ║                                                                     ║
    ║  • Real-time (>30 FPS): YOLOv5, YOLOv8                             ║
    ║  • High accuracy: Faster R-CNN, Cascade R-CNN                      ║
    ║  • Simple training: FCOS, CenterNet                                 ║
    ║  • Research/flexibility: DETR, Deformable DETR                     ║
    ║  • Small objects: FPN-based methods, high resolution               ║
    ║  • Dense scenes: Anchor-free methods                               ║
    ║                                                                     ║
    ╚════════════════════════════════════════════════════════════════════╝
    """
    print(comparison)

detection_comparison()

Exercises

Implement GIoU, DIoU, and CIoU losses:
def giou_loss(pred_boxes, target_boxes):
    # Generalized IoU
    pass

def diou_loss(pred_boxes, target_boxes):
    # Distance IoU
    pass

def ciou_loss(pred_boxes, target_boxes):
    # Complete IoU
    pass
Implement multi-scale training for YOLO:
  • Randomly resize input during training
  • Adjust anchor scales accordingly
  • Handle variable batch sizes
Implement Soft-NMS for better overlapping object handling:
def soft_nms(boxes, scores, sigma=0.5, threshold=0.001):
    # Decay overlapping box scores instead of removing
    pass

What’s Next?