Object Detection
The Detection Problem
Object detection = classification + localization. For each object in an image, we need:
- What: a class label (car, person, dog, …)
- Where: a bounding box (x, y, w, h)
Predicted boxes are compared against ground truth with Intersection-over-Union (IoU); a minimal IoU helper is sketched after the imports below.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from typing import List, Tuple, Dict, Optional
torch.manual_seed(42)
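Detection quality is measured by how well predicted boxes overlap the ground truth, via Intersection-over-Union (IoU). A minimal sketch, assuming boxes in (x1, y1, x2, y2) format (torchvision.ops.box_iou provides the same computation):

def box_iou(boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
    """Pairwise IoU between two sets of boxes in (x1, y1, x2, y2) format."""
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    # Intersection corners for every (box1, box2) pair
    lt = torch.max(boxes1[:, None, :2], boxes2[None, :, :2])  # [N, M, 2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[None, :, 2:])  # [N, M, 2]
    wh = (rb - lt).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]
    union = area1[:, None] + area2[None, :] - inter
    return inter / union.clamp(min=1e-6)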
Detection Paradigms
Two-Stage Detectors
Generate proposals → Classify and refine
class RegionProposalNetwork(nn.Module):
"""
Region Proposal Network (RPN) from Faster R-CNN.
Generates object proposals from feature maps.
"""
def __init__(
self,
in_channels: int,
anchor_sizes: List[int] = [128, 256, 512],
aspect_ratios: List[float] = [0.5, 1.0, 2.0]
):
super().__init__()
self.anchor_sizes = anchor_sizes
self.aspect_ratios = aspect_ratios
self.num_anchors = len(anchor_sizes) * len(aspect_ratios)
# Shared conv
self.conv = nn.Conv2d(in_channels, in_channels, 3, padding=1)
# Classification head (object vs background)
self.cls_head = nn.Conv2d(in_channels, self.num_anchors * 2, 1)
# Regression head (box deltas)
self.reg_head = nn.Conv2d(in_channels, self.num_anchors * 4, 1)
def generate_anchors(
self,
feature_map: torch.Tensor,
stride: int = 16
) -> torch.Tensor:
"""Generate anchor boxes for feature map."""
_, _, H, W = feature_map.shape
device = feature_map.device
# Grid of anchor centers
shifts_x = torch.arange(0, W, device=device) * stride + stride // 2
shifts_y = torch.arange(0, H, device=device) * stride + stride // 2
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing='ij')
shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=-1)
shifts = shifts.reshape(-1, 4)
# Base anchors (centered at origin)
base_anchors = []
for size in self.anchor_sizes:
for ratio in self.aspect_ratios:
h = size / (ratio ** 0.5)
w = size * (ratio ** 0.5)
base_anchors.append([-w/2, -h/2, w/2, h/2])
base_anchors = torch.tensor(base_anchors, device=device)
# All anchors
anchors = shifts.unsqueeze(1) + base_anchors.unsqueeze(0)
anchors = anchors.reshape(-1, 4)
return anchors
def forward(
self,
features: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Args:
features: [B, C, H, W] backbone features
Returns:
objectness: [B, num_anchors*H*W, 2] objectness scores
bbox_deltas: [B, num_anchors*H*W, 4] box regression
anchors: [num_anchors*H*W, 4] anchor boxes
"""
batch_size = features.size(0)
x = F.relu(self.conv(features))
# Classification
objectness = self.cls_head(x) # [B, A*2, H, W]
objectness = objectness.permute(0, 2, 3, 1).reshape(batch_size, -1, 2)
# Regression
bbox_deltas = self.reg_head(x) # [B, A*4, H, W]
bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).reshape(batch_size, -1, 4)
# Generate anchors
anchors = self.generate_anchors(features)
return objectness, bbox_deltas, anchors
class RoIPooling(nn.Module):
"""
Region of Interest Pooling.
Extract fixed-size features from variable-size proposals.
"""
def __init__(self, output_size: int = 7, spatial_scale: float = 1/16):
super().__init__()
self.output_size = output_size
self.spatial_scale = spatial_scale
def forward(
self,
features: torch.Tensor,
rois: torch.Tensor
) -> torch.Tensor:
"""
Args:
features: [B, C, H, W] feature maps
rois: [N, 5] (batch_idx, x1, y1, x2, y2)
Returns:
pooled: [N, C, output_size, output_size]
"""
# Scale RoIs to feature map coordinates
rois_scaled = rois.clone()
rois_scaled[:, 1:] *= self.spatial_scale
# Use torchvision's roi_pool
pooled = torchvision.ops.roi_pool(
features, rois_scaled,
output_size=self.output_size,
spatial_scale=1.0 # Already scaled
)
return pooled
class FasterRCNN(nn.Module):
"""
Faster R-CNN: Two-stage object detector.
Architecture:
1. Backbone (ResNet, etc.) → Feature maps
2. RPN → Region proposals
3. RoI Pooling → Fixed-size features
4. Detection head → Classes + refined boxes
"""
def __init__(
self,
num_classes: int,
backbone: nn.Module,
backbone_out_channels: int = 2048
):
super().__init__()
self.num_classes = num_classes
self.backbone = backbone
# RPN
self.rpn = RegionProposalNetwork(backbone_out_channels)
# RoI pooling
self.roi_pool = RoIPooling(output_size=7)
# Detection head
self.fc1 = nn.Linear(backbone_out_channels * 7 * 7, 1024)
self.fc2 = nn.Linear(1024, 1024)
self.cls_head = nn.Linear(1024, num_classes)
self.reg_head = nn.Linear(1024, num_classes * 4)
def forward(
self,
images: torch.Tensor,
targets: Optional[List[Dict]] = None
) -> Dict[str, torch.Tensor]:
"""
Args:
images: [B, 3, H, W] input images
targets: List of dicts with 'boxes' and 'labels' (training only)
"""
# Backbone
features = self.backbone(images)
# RPN
objectness, rpn_deltas, anchors = self.rpn(features)
# Get proposals (simplified - in practice use NMS)
proposals = self._get_proposals(anchors, rpn_deltas, objectness)
# RoI pooling
roi_features = self.roi_pool(features, proposals)
roi_features = roi_features.flatten(start_dim=1)
# Detection head
x = F.relu(self.fc1(roi_features))
x = F.relu(self.fc2(x))
cls_logits = self.cls_head(x)
box_deltas = self.reg_head(x)
return {
'cls_logits': cls_logits,
'box_deltas': box_deltas,
'rpn_objectness': objectness,
'rpn_deltas': rpn_deltas
}
def _get_proposals(
self,
anchors: torch.Tensor,
deltas: torch.Tensor,
scores: torch.Tensor,
top_k: int = 1000
) -> torch.Tensor:
"""Get top-k proposals after NMS."""
# Apply deltas to anchors
proposals = self._apply_deltas(anchors, deltas[0])
# Get objectness scores
scores = F.softmax(scores[0], dim=-1)[:, 1]
# Top-k
_, top_idx = scores.topk(min(top_k, len(scores)))
proposals = proposals[top_idx]
# Add batch index
batch_idx = torch.zeros(len(proposals), 1, device=proposals.device)
proposals = torch.cat([batch_idx, proposals], dim=1)
return proposals
def _apply_deltas(
self,
anchors: torch.Tensor,
deltas: torch.Tensor
) -> torch.Tensor:
"""Apply box deltas to anchors."""
# Convert anchors to (cx, cy, w, h)
widths = anchors[:, 2] - anchors[:, 0]
heights = anchors[:, 3] - anchors[:, 1]
cx = anchors[:, 0] + 0.5 * widths
cy = anchors[:, 1] + 0.5 * heights
# Apply deltas
dx, dy, dw, dh = deltas.unbind(-1)
pred_cx = dx * widths + cx
pred_cy = dy * heights + cy
pred_w = torch.exp(dw) * widths
pred_h = torch.exp(dh) * heights
# Convert back to (x1, y1, x2, y2)
pred_boxes = torch.stack([
pred_cx - 0.5 * pred_w,
pred_cy - 0.5 * pred_h,
pred_cx + 0.5 * pred_w,
pred_cy + 0.5 * pred_h
], dim=-1)
return pred_boxes
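A quick shape check of the pieces above, with random tensors standing in for backbone features (the sizes are illustrative, not required):

features = torch.randn(1, 2048, 32, 32)          # dummy backbone output
rpn = RegionProposalNetwork(in_channels=2048)
objectness, deltas, anchors = rpn(features)
print(objectness.shape, deltas.shape, anchors.shape)
# [1, 9216, 2], [1, 9216, 4], [9216, 4]  (9 anchors x 32 x 32 locations)

# Pool two hand-written proposals: (batch_idx, x1, y1, x2, y2) in image coordinates
rois = torch.tensor([[0, 0.0, 0.0, 128.0, 128.0],
                     [0, 64.0, 64.0, 256.0, 256.0]])
pooled = RoIPooling(output_size=7)(features, rois)
print(pooled.shape)  # [2, 2048, 7, 7]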
YOLO: Single-Stage Detection
YOLOv1 Core Concepts
class YOLOv1Head(nn.Module):
"""
YOLOv1: You Only Look Once.
Divides image into S×S grid:
- Each cell predicts B bounding boxes
- Each box: (x, y, w, h, confidence)
- Plus C class probabilities per cell
Output: S × S × (B*5 + C)
"""
def __init__(
self,
num_classes: int = 20,
grid_size: int = 7,
num_boxes: int = 2
):
super().__init__()
self.S = grid_size
self.B = num_boxes
self.C = num_classes
self.output_size = self.S * self.S * (self.B * 5 + self.C)
# Final layers (after backbone)
self.fc = nn.Sequential(
nn.Flatten(),
nn.Linear(1024 * 7 * 7, 4096),
nn.LeakyReLU(0.1),
nn.Dropout(0.5),
nn.Linear(4096, self.output_size)
)
def forward(self, features: torch.Tensor) -> torch.Tensor:
"""
Returns:
predictions: [B, S, S, B*5+C]
"""
batch_size = features.size(0)
x = self.fc(features)
x = x.view(batch_size, self.S, self.S, self.B * 5 + self.C)
return x
def decode_predictions(
self,
predictions: torch.Tensor,
confidence_threshold: float = 0.5
) -> List[Dict]:
"""Decode YOLO predictions to boxes."""
batch_size = predictions.size(0)
results = []
for b in range(batch_size):
boxes = []
scores = []
labels = []
for i in range(self.S):
for j in range(self.S):
cell = predictions[b, i, j]
# Class probabilities
class_probs = cell[self.B * 5:]
for box_idx in range(self.B):
# Box predictions
box_offset = box_idx * 5
x = (cell[box_offset] + j) / self.S
y = (cell[box_offset + 1] + i) / self.S
                        # YOLOv1 regresses sqrt(w), sqrt(h); square to recover the box size
                        w = cell[box_offset + 2] ** 2
                        h = cell[box_offset + 3] ** 2
conf = cell[box_offset + 4]
# Class-specific confidence
class_scores = conf * class_probs
best_class = class_scores.argmax()
best_score = class_scores[best_class]
if best_score > confidence_threshold:
# Convert to (x1, y1, x2, y2)
x1 = x - w / 2
y1 = y - h / 2
x2 = x + w / 2
y2 = y + h / 2
boxes.append([x1, y1, x2, y2])
scores.append(best_score.item())
labels.append(best_class.item())
results.append({
'boxes': torch.tensor(boxes),
'scores': torch.tensor(scores),
'labels': torch.tensor(labels)
})
return results
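A shape check with dummy features, assuming a backbone that ends in a 1024×7×7 map as in the original paper:

head = YOLOv1Head(num_classes=20, grid_size=7, num_boxes=2)
dummy_features = torch.randn(2, 1024, 7, 7)
preds = head(dummy_features)
print(preds.shape)  # [2, 7, 7, 30]  ->  2 boxes * 5 + 20 classes per cell
detections = head.decode_predictions(preds, confidence_threshold=0.5)
print(len(detections), detections[0].keys())  # 2, dict_keys(['boxes', 'scores', 'labels'])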
YOLOv3 Architecture
class DarknetBlock(nn.Module):
"""Darknet residual block."""
def __init__(self, in_channels: int):
super().__init__()
hidden = in_channels // 2
self.conv1 = nn.Sequential(
nn.Conv2d(in_channels, hidden, 1, bias=False),
nn.BatchNorm2d(hidden),
nn.LeakyReLU(0.1)
)
self.conv2 = nn.Sequential(
nn.Conv2d(hidden, in_channels, 3, padding=1, bias=False),
nn.BatchNorm2d(in_channels),
nn.LeakyReLU(0.1)
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return x + self.conv2(self.conv1(x))
class YOLOv3Neck(nn.Module):
"""
YOLOv3 Feature Pyramid Neck.
Multi-scale predictions for better small object detection.
"""
def __init__(self, backbone_channels: List[int] = [256, 512, 1024]):
super().__init__()
# Lateral connections
self.lateral1 = nn.Conv2d(backbone_channels[2], 512, 1)
self.lateral2 = nn.Conv2d(backbone_channels[1], 256, 1)
self.lateral3 = nn.Conv2d(backbone_channels[0], 128, 1)
# Feature fusion
self.fuse1 = self._make_fuse_block(512 + 256, 256)
self.fuse2 = self._make_fuse_block(256 + 128, 128)
def _make_fuse_block(self, in_channels: int, out_channels: int):
return nn.Sequential(
nn.Conv2d(in_channels, out_channels, 1, bias=False),
nn.BatchNorm2d(out_channels),
nn.LeakyReLU(0.1),
nn.Conv2d(out_channels, out_channels * 2, 3, padding=1, bias=False),
nn.BatchNorm2d(out_channels * 2),
nn.LeakyReLU(0.1),
nn.Conv2d(out_channels * 2, out_channels, 1, bias=False),
nn.BatchNorm2d(out_channels),
nn.LeakyReLU(0.1)
)
def forward(
self,
features: Tuple[torch.Tensor, ...]
) -> Tuple[torch.Tensor, ...]:
"""
Args:
features: (C3, C4, C5) from backbone
Returns:
(P3, P4, P5) multi-scale features
"""
c3, c4, c5 = features
# Top-down pathway
p5 = self.lateral1(c5)
p5_upsampled = F.interpolate(p5, size=c4.shape[2:], mode='nearest')
p4 = self.fuse1(torch.cat([p5_upsampled, self.lateral2(c4)], dim=1))
p4_upsampled = F.interpolate(p4, size=c3.shape[2:], mode='nearest')
p3 = self.fuse2(torch.cat([p4_upsampled, self.lateral3(c3)], dim=1))
return p3, p4, p5
class YOLOv3Head(nn.Module):
"""YOLOv3 detection head for one scale."""
def __init__(
self,
in_channels: int,
num_classes: int,
num_anchors: int = 3
):
super().__init__()
self.num_classes = num_classes
self.num_anchors = num_anchors
# 5 = (x, y, w, h, objectness)
out_channels = num_anchors * (5 + num_classes)
self.conv = nn.Sequential(
nn.Conv2d(in_channels, in_channels * 2, 3, padding=1, bias=False),
nn.BatchNorm2d(in_channels * 2),
nn.LeakyReLU(0.1),
nn.Conv2d(in_channels * 2, out_channels, 1)
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Returns:
predictions: [B, A, H, W, 5+C]
"""
batch_size = x.size(0)
x = self.conv(x) # [B, A*(5+C), H, W]
# Reshape
x = x.view(batch_size, self.num_anchors, 5 + self.num_classes,
x.size(2), x.size(3))
x = x.permute(0, 1, 3, 4, 2) # [B, A, H, W, 5+C]
return x
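Wiring the neck to one head per scale, with random tensors standing in for Darknet-53 outputs (spatial sizes assume a 416×416 input and strides 8/16/32):

c3 = torch.randn(1, 256, 52, 52)    # stride 8
c4 = torch.randn(1, 512, 26, 26)    # stride 16
c5 = torch.randn(1, 1024, 13, 13)   # stride 32
neck = YOLOv3Neck()
p3, p4, p5 = neck((c3, c4, c5))
heads = [YOLOv3Head(c, num_classes=80) for c in (128, 256, 512)]
for head, p in zip(heads, (p3, p4, p5)):
    print(head(p).shape)  # [1, 3, 52, 52, 85], [1, 3, 26, 26, 85], [1, 3, 13, 13, 85]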
Anchor-Free Detection
FCOS (Fully Convolutional One-Stage)
class FCOSHead(nn.Module):
"""
FCOS: Fully Convolutional One-Stage Object Detection.
No anchors! Predicts:
- Per-pixel classification
- Distance to box edges (l, t, r, b)
- Centerness (suppress low-quality predictions)
"""
def __init__(
self,
in_channels: int,
num_classes: int,
num_convs: int = 4
):
super().__init__()
self.num_classes = num_classes
# Shared convolutions
cls_convs = []
reg_convs = []
for _ in range(num_convs):
cls_convs.append(nn.Conv2d(in_channels, in_channels, 3, padding=1))
cls_convs.append(nn.GroupNorm(32, in_channels))
cls_convs.append(nn.ReLU())
reg_convs.append(nn.Conv2d(in_channels, in_channels, 3, padding=1))
reg_convs.append(nn.GroupNorm(32, in_channels))
reg_convs.append(nn.ReLU())
self.cls_tower = nn.Sequential(*cls_convs)
self.reg_tower = nn.Sequential(*reg_convs)
# Prediction heads
self.cls_logits = nn.Conv2d(in_channels, num_classes, 3, padding=1)
self.bbox_pred = nn.Conv2d(in_channels, 4, 3, padding=1) # l, t, r, b
self.centerness = nn.Conv2d(in_channels, 1, 3, padding=1)
        # Learnable scale on the regression output (full FCOS keeps one per FPN level;
        # this standalone head holds a single one)
        self.scales = nn.Parameter(torch.ones(1))
def forward(self, features: torch.Tensor) -> Dict[str, torch.Tensor]:
"""
Returns:
cls_logits: [B, C, H, W] classification
bbox_pred: [B, 4, H, W] box regression
centerness: [B, 1, H, W] centerness
"""
cls_feat = self.cls_tower(features)
reg_feat = self.reg_tower(features)
cls_logits = self.cls_logits(cls_feat)
bbox_pred = self.scales * self.bbox_pred(reg_feat)
bbox_pred = F.relu(bbox_pred) # Distances must be positive
centerness = self.centerness(reg_feat)
return {
'cls_logits': cls_logits,
'bbox_pred': bbox_pred,
'centerness': centerness
}
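The head above predicts centerness, but the target it is trained against is not shown. A sketch of the standard FCOS centerness formula, computed from the per-location regression targets (l, t, r, b):

def centerness_target(ltrb: torch.Tensor) -> torch.Tensor:
    """centerness = sqrt( (min(l,r)/max(l,r)) * (min(t,b)/max(t,b)) ).

    ltrb: [N, 4] distances from a location to the left/top/right/bottom edges
    of its assigned box. Returns values in [0, 1]: 1 at the box center,
    approaching 0 near the edges, so off-center predictions are down-weighted.
    """
    l, t, r, b = ltrb.unbind(-1)
    lr = torch.min(l, r) / torch.max(l, r).clamp(min=1e-6)
    tb = torch.min(t, b) / torch.max(t, b).clamp(min=1e-6)
    return torch.sqrt(lr * tb)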
class CenterNet(nn.Module):
"""
CenterNet: Objects as Points.
Predicts:
- Heatmap of object centers
- Size (w, h) at each center
- Local offset for sub-pixel accuracy
"""
def __init__(
self,
backbone: nn.Module,
num_classes: int,
head_channels: int = 64
):
super().__init__()
self.backbone = backbone
self.num_classes = num_classes
# Upsampling to recover resolution
self.deconv_layers = self._make_deconv_layers(
in_channels=2048, # Assuming ResNet
out_channels=head_channels
)
# Prediction heads
self.heatmap = nn.Sequential(
nn.Conv2d(head_channels, head_channels, 3, padding=1),
nn.ReLU(),
nn.Conv2d(head_channels, num_classes, 1)
)
self.wh = nn.Sequential(
nn.Conv2d(head_channels, head_channels, 3, padding=1),
nn.ReLU(),
nn.Conv2d(head_channels, 2, 1) # width, height
)
self.offset = nn.Sequential(
nn.Conv2d(head_channels, head_channels, 3, padding=1),
nn.ReLU(),
nn.Conv2d(head_channels, 2, 1) # x_offset, y_offset
)
def _make_deconv_layers(self, in_channels: int, out_channels: int):
layers = []
channels = [256, 128, 64]
for c in channels:
layers.extend([
nn.ConvTranspose2d(in_channels, c, 4, stride=2, padding=1, bias=False),
nn.BatchNorm2d(c),
nn.ReLU()
])
in_channels = c
return nn.Sequential(*layers)
def forward(self, images: torch.Tensor) -> Dict[str, torch.Tensor]:
features = self.backbone(images)
features = self.deconv_layers(features)
heatmap = self.heatmap(features)
heatmap = torch.sigmoid(heatmap) # Normalize to [0, 1]
wh = self.wh(features)
offset = self.offset(features)
return {
'heatmap': heatmap,
'wh': wh,
'offset': offset
}
@staticmethod
def decode_detections(
heatmap: torch.Tensor,
wh: torch.Tensor,
offset: torch.Tensor,
k: int = 100
) -> Dict[str, torch.Tensor]:
"""Decode CenterNet predictions."""
batch, num_classes, h, w = heatmap.shape
# Find local peaks
heatmap_max = F.max_pool2d(heatmap, 3, stride=1, padding=1)
keep = (heatmap == heatmap_max).float()
heatmap = heatmap * keep
# Top-k peaks
heatmap_flat = heatmap.view(batch, -1)
topk_scores, topk_indices = heatmap_flat.topk(k)
# Get class and position
topk_classes = topk_indices // (h * w)
topk_indices = topk_indices % (h * w)
topk_y = topk_indices // w
topk_x = topk_indices % w
# Get size and offset
wh = wh.view(batch, 2, -1).permute(0, 2, 1)
offset = offset.view(batch, 2, -1).permute(0, 2, 1)
# Gather predictions at peak locations
topk_wh = wh.gather(1, topk_indices.unsqueeze(-1).expand(-1, -1, 2))
topk_offset = offset.gather(1, topk_indices.unsqueeze(-1).expand(-1, -1, 2))
# Compute boxes
topk_cx = topk_x.float() + topk_offset[..., 0]
topk_cy = topk_y.float() + topk_offset[..., 1]
boxes = torch.stack([
topk_cx - topk_wh[..., 0] / 2,
topk_cy - topk_wh[..., 1] / 2,
topk_cx + topk_wh[..., 0] / 2,
topk_cy + topk_wh[..., 1] / 2
], dim=-1)
return {
'boxes': boxes,
'scores': topk_scores,
'labels': topk_classes
}
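Decoding random outputs to see the shapes (128×128 assumes an output stride of 4 on a 512×512 input; the resulting boxes are in feature-map coordinates and would still need scaling back to pixels):

heatmap = torch.rand(1, 80, 128, 128)     # pretend post-sigmoid class heatmaps
wh = torch.rand(1, 2, 128, 128) * 10.0
offset = torch.rand(1, 2, 128, 128)
dets = CenterNet.decode_detections(heatmap, wh, offset, k=10)
print(dets['boxes'].shape, dets['scores'].shape, dets['labels'].shape)
# [1, 10, 4], [1, 10], [1, 10]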
DETR: Detection Transformer
class DETR(nn.Module):
"""
DETR: DEtection TRansformer.
End-to-end object detection with transformers.
No NMS, no anchors!
"""
def __init__(
self,
backbone: nn.Module,
num_classes: int,
hidden_dim: int = 256,
nheads: int = 8,
num_encoder_layers: int = 6,
num_decoder_layers: int = 6,
num_queries: int = 100
):
super().__init__()
self.backbone = backbone
self.num_queries = num_queries
# Project backbone features
self.input_proj = nn.Conv2d(2048, hidden_dim, 1)
# Transformer
self.transformer = nn.Transformer(
d_model=hidden_dim,
nhead=nheads,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
dim_feedforward=2048,
dropout=0.1,
batch_first=True
)
# Object queries (learned)
self.query_embed = nn.Embedding(num_queries, hidden_dim)
# Position encoding
self.pos_encoder = PositionalEncoding2D(hidden_dim)
# Prediction heads
self.class_embed = nn.Linear(hidden_dim, num_classes + 1) # +1 for "no object"
self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
def forward(self, images: torch.Tensor) -> Dict[str, torch.Tensor]:
"""
Args:
images: [B, 3, H, W]
Returns:
class_logits: [B, num_queries, num_classes+1]
pred_boxes: [B, num_queries, 4]
"""
batch_size = images.size(0)
# Backbone
features = self.backbone(images) # [B, C, H', W']
# Project and flatten
features = self.input_proj(features) # [B, hidden_dim, H', W']
# Position encoding
pos_embed = self.pos_encoder(features) # [B, hidden_dim, H', W']
# Flatten spatial dimensions
features = features.flatten(2).permute(0, 2, 1) # [B, H'*W', hidden_dim]
pos_embed = pos_embed.flatten(2).permute(0, 2, 1) # [B, H'*W', hidden_dim]
# Add position encoding
features = features + pos_embed
        # Object queries (learned). DETR adds these as positional embeddings inside
        # every decoder layer, but nn.TransformerDecoder has no such hook, so this
        # simplified version feeds the queries in directly as the decoder input.
        query_embed = self.query_embed.weight.unsqueeze(0).expand(batch_size, -1, -1)
        # Transformer
        memory = self.transformer.encoder(features)
        hs = self.transformer.decoder(query_embed, memory)
# Predictions
class_logits = self.class_embed(hs) # [B, num_queries, num_classes+1]
pred_boxes = self.bbox_embed(hs).sigmoid() # [B, num_queries, 4]
return {
'class_logits': class_logits,
'pred_boxes': pred_boxes
}
class PositionalEncoding2D(nn.Module):
"""2D positional encoding for spatial features."""
def __init__(self, hidden_dim: int, temperature: float = 10000):
super().__init__()
self.hidden_dim = hidden_dim
self.temperature = temperature
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Args:
x: [B, C, H, W]
Returns:
pos_encoding: [B, C, H, W]
"""
batch, _, h, w = x.shape
y_embed = torch.arange(h, device=x.device).float().unsqueeze(1).expand(h, w)
x_embed = torch.arange(w, device=x.device).float().unsqueeze(0).expand(h, w)
        # Each axis gets hidden_dim/2 channels (sin/cos pairs), so the frequency
        # vector needs hidden_dim/4 entries for the concatenated encoding to match
        # the feature channels.
        dim_t = torch.arange(self.hidden_dim // 4, device=x.device).float()
        dim_t = self.temperature ** (4 * dim_t / self.hidden_dim)
pos_x = x_embed.unsqueeze(-1) / dim_t
pos_y = y_embed.unsqueeze(-1) / dim_t
pos_x = torch.stack([pos_x.sin(), pos_x.cos()], dim=-1).flatten(-2)
pos_y = torch.stack([pos_y.sin(), pos_y.cos()], dim=-1).flatten(-2)
pos = torch.cat([pos_y, pos_x], dim=-1).permute(2, 0, 1)
return pos.unsqueeze(0).expand(batch, -1, -1, -1)
class MLP(nn.Module):
"""Simple MLP for box prediction."""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
layers = []
for i in range(num_layers):
in_dim = input_dim if i == 0 else hidden_dim
out_dim = output_dim if i == num_layers - 1 else hidden_dim
layers.append(nn.Linear(in_dim, out_dim))
if i < num_layers - 1:
layers.append(nn.ReLU())
self.layers = nn.Sequential(*layers)
def forward(self, x):
return self.layers(x)
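A shape check with a trivial stand-in backbone (a single strided conv to 2048 channels; any real backbone producing [B, 2048, H', W'] features would do):

toy_backbone = nn.Sequential(nn.Conv2d(3, 2048, kernel_size=32, stride=32))
detr = DETR(toy_backbone, num_classes=91)
out = detr(torch.randn(1, 3, 256, 256))
print(out['class_logits'].shape, out['pred_boxes'].shape)
# [1, 100, 92], [1, 100, 4]  -> one (class, box) prediction per object query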
class HungarianMatcher:
"""
Hungarian matching for DETR training.
Matches predictions to ground truth using optimal assignment.
"""
def __init__(
self,
cost_class: float = 1.0,
cost_bbox: float = 5.0,
cost_giou: float = 2.0
):
self.cost_class = cost_class
self.cost_bbox = cost_bbox
self.cost_giou = cost_giou
@torch.no_grad()
def __call__(
self,
outputs: Dict[str, torch.Tensor],
targets: List[Dict[str, torch.Tensor]]
) -> List[Tuple[torch.Tensor, torch.Tensor]]:
"""
Returns list of (pred_indices, target_indices) for each batch element.
"""
from scipy.optimize import linear_sum_assignment
batch_size, num_queries = outputs['class_logits'].shape[:2]
# Flatten predictions
pred_probs = outputs['class_logits'].softmax(-1)
pred_boxes = outputs['pred_boxes']
indices = []
for b in range(batch_size):
target_labels = targets[b]['labels']
target_boxes = targets[b]['boxes']
num_targets = len(target_labels)
if num_targets == 0:
indices.append((torch.tensor([]), torch.tensor([])))
continue
# Classification cost
cost_class = -pred_probs[b, :, target_labels]
# L1 box cost
cost_bbox = torch.cdist(pred_boxes[b], target_boxes, p=1)
# GIoU cost
cost_giou = -self._giou(pred_boxes[b], target_boxes)
# Total cost
C = (
self.cost_class * cost_class +
self.cost_bbox * cost_bbox +
self.cost_giou * cost_giou
)
# Hungarian algorithm
row_ind, col_ind = linear_sum_assignment(C.cpu().numpy())
indices.append((
torch.tensor(row_ind, dtype=torch.long),
torch.tensor(col_ind, dtype=torch.long)
))
return indices
def _giou(self, boxes1: torch.Tensor, boxes2: torch.Tensor) -> torch.Tensor:
"""Compute GIoU between two sets of boxes."""
return torchvision.ops.generalized_box_iou(boxes1, boxes2)
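A minimal matching call on dummy predictions and targets (requires scipy). The boxes here are already in (x1, y1, x2, y2) format so the GIoU term is valid; DETR's normalized (cx, cy, w, h) outputs would be converted first:

xy1 = torch.rand(1, 100, 2) * 0.5
box_wh = torch.rand(1, 100, 2) * 0.5
outputs = {
    'class_logits': torch.randn(1, 100, 92),               # 91 classes + "no object"
    'pred_boxes': torch.cat([xy1, xy1 + box_wh], dim=-1)   # valid corner boxes
}
targets = [{
    'labels': torch.tensor([3, 17]),
    'boxes': torch.tensor([[0.10, 0.20, 0.40, 0.50],
                           [0.50, 0.50, 0.90, 0.80]])
}]
matcher = HungarianMatcher()
(pred_idx, tgt_idx), = matcher(outputs, targets)
print(pred_idx, tgt_idx)  # the two queries matched to the two ground-truth boxes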
Comparison
def detection_comparison():
"""Compare detection approaches."""
comparison = """
╔════════════════════════════════════════════════════════════════════╗
║ OBJECT DETECTION COMPARISON ║
╠════════════════════════════════════════════════════════════════════╣
║ ║
║ Method mAP FPS Pros Cons ║
║ ───────────────────────────────────────────────────────────── ║
║ Faster R-CNN 42.0 ~5 High accuracy Slow ║
║ YOLOv3 33.0 ~45 Fast Lower accuracy ║
║ YOLOv5 50.7 ~140 Fast + accurate Complex ║
║ FCOS 44.7 ~23 Simple, no anchor NMS still needed ║
║ CenterNet 45.1 ~28 Simple Fixed scales ║
║ DETR 42.0 ~28 End-to-end Slow training ║
║ ║
╠════════════════════════════════════════════════════════════════════╣
║ WHEN TO USE WHAT ║
╠════════════════════════════════════════════════════════════════════╣
║ ║
║ • Real-time (>30 FPS): YOLOv5, YOLOv8 ║
║ • High accuracy: Faster R-CNN, Cascade R-CNN ║
║ • Simple training: FCOS, CenterNet ║
║ • Research/flexibility: DETR, Deformable DETR ║
║ • Small objects: FPN-based methods, high resolution ║
║ • Dense scenes: Anchor-free methods ║
║ ║
╚════════════════════════════════════════════════════════════════════╝
"""
print(comparison)
detection_comparison()
Exercises
Exercise 1: Implement IoU Loss
Implement GIoU, DIoU, and CIoU losses:
def giou_loss(pred_boxes, target_boxes):
# Generalized IoU
pass
def diou_loss(pred_boxes, target_boxes):
# Distance IoU
pass
def ciou_loss(pred_boxes, target_boxes):
# Complete IoU
pass
Exercise 2: Multi-Scale Training
Implement multi-scale training for YOLO:
- Randomly resize input during training
- Adjust anchor scales accordingly
- Handle variable batch sizes
Exercise 3: Soft-NMS
Implement Soft-NMS for better overlapping object handling:
def soft_nms(boxes, scores, sigma=0.5, threshold=0.001):
# Decay overlapping box scores instead of removing
pass