Adversarial Robustness

Adversarial Machine Learning

The Vulnerability of Neural Networks

Neural networks are surprisingly vulnerable to adversarial examples: inputs with small, deliberately crafted perturbations that cause misclassification while appearing essentially unchanged to humans.
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, Optional, Callable, Dict

torch.manual_seed(42)
Adversary's goal: $\min_{\delta} \|\delta\|$ subject to $f(x + \delta) \neq f(x)$
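
As a concrete check of this objective, the sketch below tests whether a candidate perturbation stays within an L-infinity budget and actually changes a model's prediction. The helper name and the choice of the L-infinity norm are illustrative (the rest of this page also works with L-infinity), and it relies on the imports above.
def within_budget_and_misclassified(
    model: nn.Module,
    x: torch.Tensor,
    delta: torch.Tensor,
    epsilon: float
) -> bool:
    """Check both parts of the adversary's goal for a candidate perturbation (illustrative helper)."""
    # The perturbation must stay inside the L-infinity ball of radius epsilon
    if delta.abs().max().item() > epsilon:
        return False
    with torch.no_grad():
        clean_pred = model(x).argmax(dim=1)
        adv_pred = model(x + delta).argmax(dim=1)
    # Success means the predicted class changed for at least one input
    return bool((clean_pred != adv_pred).any().item())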

Adversarial Attacks

Fast Gradient Sign Method (FGSM)

The foundational one-step attack by Goodfellow et al.: $x_{adv} = x + \epsilon \cdot \text{sign}(\nabla_x L(f(x), y))$
class FGSM:
    """
    Fast Gradient Sign Method attack.
    
    Simple, fast, but not the strongest attack.
    """
    
    def __init__(self, model: nn.Module, epsilon: float = 0.03):
        """
        Args:
            model: Target model to attack
            epsilon: Perturbation budget (L-infinity)
        """
        self.model = model
        self.epsilon = epsilon
    
    def attack(
        self,
        images: torch.Tensor,
        labels: torch.Tensor
    ) -> torch.Tensor:
        """
        Generate adversarial examples.
        
        Args:
            images: [N, C, H, W] clean images
            labels: [N] true labels
        
        Returns:
            adversarial: [N, C, H, W] adversarial images
        """
        images = images.clone().detach().requires_grad_(True)
        
        # Forward pass
        outputs = self.model(images)
        loss = F.cross_entropy(outputs, labels)
        
        # Backward pass
        self.model.zero_grad()
        loss.backward()
        
        # Create perturbation
        grad_sign = images.grad.sign()
        perturbation = self.epsilon * grad_sign
        
        # Apply perturbation
        adversarial = images + perturbation
        
        # Clamp to valid range
        adversarial = torch.clamp(adversarial, 0, 1)
        
        return adversarial.detach()
    
    def targeted_attack(
        self,
        images: torch.Tensor,
        target_labels: torch.Tensor
    ) -> torch.Tensor:
        """Generate targeted adversarial examples."""
        
        images = images.clone().detach().requires_grad_(True)
        
        # Forward pass
        outputs = self.model(images)
        
        # Minimize loss for target class (gradient descent)
        loss = F.cross_entropy(outputs, target_labels)
        
        self.model.zero_grad()
        loss.backward()
        
        # Subtract gradient (move toward target)
        grad_sign = images.grad.sign()
        perturbation = -self.epsilon * grad_sign  # Negative!
        
        adversarial = images + perturbation
        adversarial = torch.clamp(adversarial, 0, 1)
        
        return adversarial.detach()


# Example usage
def fgsm_example():
    model = nn.Sequential(
        nn.Flatten(),
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10)
    )
    
    attack = FGSM(model, epsilon=0.3)
    
    # Generate adversarial examples
    images = torch.rand(10, 1, 28, 28)
    labels = torch.randint(0, 10, (10,))
    
    adv_images = attack.attack(images, labels)
    
    # Measure perturbation
    perturbation = (adv_images - images).abs().max()
    print(f"Max perturbation: {perturbation:.4f}")

Projected Gradient Descent (PGD)

An iterative version of FGSM with projection back onto the epsilon ball, often described as the strongest first-order attack:
class PGD:
    """
    Projected Gradient Descent attack.
    
    Strong iterative attack - the standard for evaluating robustness.
    """
    
    def __init__(
        self,
        model: nn.Module,
        epsilon: float = 0.03,
        alpha: float = 0.01,
        num_iter: int = 40,
        random_start: bool = True
    ):
        """
        Args:
            epsilon: Total perturbation budget (L-infinity)
            alpha: Step size per iteration
            num_iter: Number of attack iterations
            random_start: Start from random perturbation
        """
        self.model = model
        self.epsilon = epsilon
        self.alpha = alpha
        self.num_iter = num_iter
        self.random_start = random_start
    
    def attack(
        self,
        images: torch.Tensor,
        labels: torch.Tensor
    ) -> torch.Tensor:
        """Generate PGD adversarial examples."""
        
        original = images.clone()
        
        if self.random_start:
            # Random start within epsilon ball
            images = images + torch.empty_like(images).uniform_(
                -self.epsilon, self.epsilon
            )
            images = torch.clamp(images, 0, 1)
        
        for _ in range(self.num_iter):
            images = images.clone().detach().requires_grad_(True)
            
            # Forward pass
            outputs = self.model(images)
            loss = F.cross_entropy(outputs, labels)
            
            # Backward pass
            self.model.zero_grad()
            loss.backward()
            
            # Gradient step
            grad_sign = images.grad.sign()
            images = images + self.alpha * grad_sign
            
            # Project back to epsilon ball
            perturbation = images - original
            perturbation = torch.clamp(perturbation, -self.epsilon, self.epsilon)
            images = original + perturbation
            
            # Clamp to valid range
            images = torch.clamp(images, 0, 1)
        
        return images.detach()
    
    def attack_with_restarts(
        self,
        images: torch.Tensor,
        labels: torch.Tensor,
        num_restarts: int = 10
    ) -> torch.Tensor:
        """PGD with multiple random restarts."""
        
        best_adv = None
        best_loss = float('-inf')
        
        for _ in range(num_restarts):
            adv = self.attack(images, labels)
            
            with torch.no_grad():
                outputs = self.model(adv)
                loss = F.cross_entropy(outputs, labels)
            
            if loss > best_loss:
                best_loss = loss
                best_adv = adv
        
        return best_adv


class AutoPGD:
    """
    Auto-PGD: Automatically tuned PGD attack.
    Part of AutoAttack - a reliable attack for robustness evaluation.
    """
    
    def __init__(
        self,
        model: nn.Module,
        epsilon: float = 0.03,
        num_iter: int = 100,
        loss_type: str = 'ce'  # 'ce' or 'dlr'
    ):
        self.model = model
        self.epsilon = epsilon
        self.num_iter = num_iter
        self.loss_type = loss_type
    
    def _dlr_loss(self, outputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Difference of Logits Ratio loss."""
        # Sort outputs
        sorted_outputs, _ = outputs.sort(dim=1, descending=True)
        
        # y  = correct-class logit
        # y' = highest incorrect logit
        # denominator uses the largest and third-largest logits overall
        y = outputs.gather(1, labels.unsqueeze(1)).squeeze()
        
        # Mask correct class
        mask = torch.ones_like(outputs, dtype=torch.bool)
        mask.scatter_(1, labels.unsqueeze(1), False)
        y_prime = outputs[mask].view(outputs.shape[0], -1).max(dim=1)[0]
        
        # DLR loss
        loss = -(y - y_prime) / (sorted_outputs[:, 0] - sorted_outputs[:, 2] + 1e-8)
        
        return loss.mean()
    
    def attack(
        self,
        images: torch.Tensor,
        labels: torch.Tensor
    ) -> torch.Tensor:
        """Auto-PGD attack with step size adaptation."""
        
        original = images.clone()
        
        # Initialize with random start
        images = images + torch.empty_like(images).uniform_(
            -self.epsilon, self.epsilon
        )
        images = torch.clamp(images, 0, 1)
        
        # Adaptive step size
        step_size = 2 * self.epsilon
        
        best_adv = images.clone()
        best_loss = float('-inf')
        
        for i in range(self.num_iter):
            images = images.clone().detach().requires_grad_(True)
            
            outputs = self.model(images)
            
            if self.loss_type == 'dlr':
                loss = self._dlr_loss(outputs, labels)
            else:
                loss = F.cross_entropy(outputs, labels)
            
            self.model.zero_grad()
            loss.backward()
            
            # Gradient step
            grad = images.grad
            images = images + step_size * grad.sign()
            
            # Project
            perturbation = images - original
            perturbation = torch.clamp(perturbation, -self.epsilon, self.epsilon)
            images = original + perturbation
            images = torch.clamp(images, 0, 1)
            
            # Update best
            with torch.no_grad():
                current_loss = loss.item()
                if current_loss > best_loss:
                    best_loss = current_loss
                    best_adv = images.clone()
            
            # Adapt step size
            if i % 10 == 0 and i > 0:
                step_size *= 0.75
        
        return best_adv
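
A minimal usage sketch for the PGD attack above, mirroring the FGSM example; the toy model, epsilon, and batch shapes are placeholders.
def pgd_example():
    model = nn.Sequential(
        nn.Flatten(),
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10)
    )

    attack = PGD(model, epsilon=0.3, alpha=0.01, num_iter=40)

    images = torch.rand(10, 1, 28, 28)
    labels = torch.randint(0, 10, (10,))

    adv_images = attack.attack(images, labels)

    # The perturbation should never exceed the epsilon budget
    max_perturbation = (adv_images - images).abs().max()
    print(f"Max perturbation: {max_perturbation:.4f}")

    # Compare accuracy on clean vs adversarial inputs
    with torch.no_grad():
        clean_acc = (model(images).argmax(dim=1) == labels).float().mean()
        adv_acc = (model(adv_images).argmax(dim=1) == labels).float().mean()
    print(f"Clean accuracy: {clean_acc:.2f}, PGD accuracy: {adv_acc:.2f}")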

C&W Attack

The Carlini & Wagner (C&W) attack formulates adversarial example generation as an optimization problem:
class CWAttack:
    """
    Carlini & Wagner L2 attack.
    
    Powerful optimization-based attack that finds minimal perturbations.
    """
    
    def __init__(
        self,
        model: nn.Module,
        c: float = 1.0,
        kappa: float = 0,
        num_iter: int = 1000,
        lr: float = 0.01
    ):
        """
        Args:
            c: Weight for classification loss
            kappa: Confidence margin
            num_iter: Optimization steps
            lr: Learning rate
        """
        self.model = model
        self.c = c
        self.kappa = kappa
        self.num_iter = num_iter
        self.lr = lr
    
    def attack(
        self,
        images: torch.Tensor,
        labels: torch.Tensor,
        targeted: bool = False,
        target_labels: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Generate C&W adversarial examples."""
        
        batch_size = images.shape[0]
        
        # Use tanh space for box constraints: x = 0.5 * (tanh(w) + 1)
        # Clamp slightly inside (-1, 1) so arctanh stays finite for pixels at exactly 0 or 1
        w = torch.arctanh(
            torch.clamp(2 * images - 1, -1 + 1e-6, 1 - 1e-6)
        ).detach().requires_grad_(True)
        
        optimizer = torch.optim.Adam([w], lr=self.lr)
        
        for _ in range(self.num_iter):
            optimizer.zero_grad()
            
            # Convert back to image space
            adv_images = 0.5 * (torch.tanh(w) + 1)
            
            # Forward pass
            outputs = self.model(adv_images)
            
            # L2 distance loss
            l2_loss = ((adv_images - images) ** 2).sum(dim=(1, 2, 3)).mean()
            
            # Classification loss
            if targeted:
                # Minimize f(x_adv) for target class
                target_logits = outputs.gather(1, target_labels.unsqueeze(1)).squeeze()
                other_logits = outputs.clone()
                other_logits.scatter_(1, target_labels.unsqueeze(1), float('-inf'))
                max_other = other_logits.max(dim=1)[0]
                
                f_loss = F.relu(max_other - target_logits + self.kappa).mean()
            else:
                # Maximize loss for true class
                true_logits = outputs.gather(1, labels.unsqueeze(1)).squeeze()
                other_logits = outputs.clone()
                other_logits.scatter_(1, labels.unsqueeze(1), float('-inf'))
                max_other = other_logits.max(dim=1)[0]
                
                f_loss = F.relu(true_logits - max_other + self.kappa).mean()
            
            # Total loss
            loss = l2_loss + self.c * f_loss
            
            loss.backward()
            optimizer.step()
        
        # Final adversarial images
        adv_images = 0.5 * (torch.tanh(w) + 1)
        
        return adv_images.detach()
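
A usage sketch for the C&W attack that also reports the average L2 norm of the perturbation it finds; the toy model and the reduced iteration count are placeholders, and the full attack typically also binary-searches over c.
def cw_example():
    model = nn.Sequential(
        nn.Flatten(),
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10)
    )

    attack = CWAttack(model, c=1.0, num_iter=100, lr=0.01)

    images = torch.rand(10, 1, 28, 28)
    labels = torch.randint(0, 10, (10,))

    adv_images = attack.attack(images, labels)

    # C&W optimizes for small L2 perturbations rather than a fixed budget
    l2_norms = (adv_images - images).flatten(start_dim=1).norm(dim=1)
    print(f"Mean L2 perturbation: {l2_norms.mean():.4f}")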

Adversarial Defenses

Adversarial Training

The most effective known empirical defense: train on adversarial examples generated on the fly:
class AdversarialTrainer:
    """
    Adversarial training framework.
    
    Key insight: Train on worst-case perturbations.
    """
    
    def __init__(
        self,
        model: nn.Module,
        optimizer: torch.optim.Optimizer,
        epsilon: float = 0.03,
        attack_steps: int = 10,
        attack_lr: float = 0.01
    ):
        self.model = model
        self.optimizer = optimizer
        self.epsilon = epsilon
        self.attack_steps = attack_steps
        self.attack_lr = attack_lr
        
        self.pgd = PGD(
            model,
            epsilon=epsilon,
            alpha=attack_lr,
            num_iter=attack_steps
        )
    
    def train_step(
        self,
        images: torch.Tensor,
        labels: torch.Tensor
    ) -> Tuple[float, float]:
        """
        Single adversarial training step.
        
        Returns:
            clean_loss: Loss on clean examples
            adv_loss: Loss on adversarial examples
        """
        self.model.train()
        
        # Generate adversarial examples
        self.model.eval()
        adv_images = self.pgd.attack(images, labels)
        self.model.train()
        
        # Train on adversarial examples
        self.optimizer.zero_grad()
        
        adv_outputs = self.model(adv_images)
        adv_loss = F.cross_entropy(adv_outputs, labels)
        
        adv_loss.backward()
        self.optimizer.step()
        
        # Compute clean loss for monitoring
        with torch.no_grad():
            clean_outputs = self.model(images)
            clean_loss = F.cross_entropy(clean_outputs, labels)
        
        return clean_loss.item(), adv_loss.item()
    
    def train_epoch(self, dataloader):
        """Train for one epoch."""
        
        total_clean_loss = 0
        total_adv_loss = 0
        n_batches = 0
        
        for images, labels in dataloader:
            clean_loss, adv_loss = self.train_step(images, labels)
            total_clean_loss += clean_loss
            total_adv_loss += adv_loss
            n_batches += 1
        
        return total_clean_loss / n_batches, total_adv_loss / n_batches


class TRADESTrainer:
    """
    TRADES: Theoretically-motivated adversarial training.
    
    Separates natural accuracy and robustness objectives.
    """
    
    def __init__(
        self,
        model: nn.Module,
        optimizer: torch.optim.Optimizer,
        epsilon: float = 0.03,
        beta: float = 6.0,  # Robustness weight
        attack_steps: int = 10
    ):
        self.model = model
        self.optimizer = optimizer
        self.epsilon = epsilon
        self.beta = beta
        self.attack_steps = attack_steps
    
    def train_step(
        self,
        images: torch.Tensor,
        labels: torch.Tensor
    ) -> float:
        """TRADES training step."""
        
        self.model.eval()
        
        # Generate adversarial examples (maximize KL divergence)
        adv_images = images.clone().detach()
        adv_images += torch.empty_like(adv_images).uniform_(-self.epsilon, self.epsilon)
        adv_images = torch.clamp(adv_images, 0, 1)
        
        with torch.no_grad():
            natural_outputs = self.model(images)
        
        for _ in range(self.attack_steps):
            adv_images = adv_images.clone().detach().requires_grad_(True)
            
            adv_outputs = self.model(adv_images)
            
            # KL divergence from natural outputs
            loss = F.kl_div(
                F.log_softmax(adv_outputs, dim=1),
                F.softmax(natural_outputs, dim=1),
                reduction='batchmean'
            )
            
            self.model.zero_grad()
            loss.backward()
            
            adv_images = adv_images + (self.epsilon / self.attack_steps) * adv_images.grad.sign()
            adv_images = torch.clamp(
                adv_images,
                images - self.epsilon,
                images + self.epsilon
            )
            adv_images = torch.clamp(adv_images, 0, 1)
        
        # Training step
        self.model.train()
        self.optimizer.zero_grad()
        
        # Natural loss
        natural_outputs = self.model(images)
        natural_loss = F.cross_entropy(natural_outputs, labels)
        
        # Robustness loss (KL divergence)
        adv_outputs = self.model(adv_images)
        robust_loss = F.kl_div(
            F.log_softmax(adv_outputs, dim=1),
            F.softmax(natural_outputs.detach(), dim=1),
            reduction='batchmean'
        )
        
        # Combined loss
        loss = natural_loss + self.beta * robust_loss
        
        loss.backward()
        self.optimizer.step()
        
        return loss.item()
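
A sketch of wiring the AdversarialTrainer into a training loop; the toy model, optimizer settings, epoch count, and the assumption that train_loader yields MNIST-shaped (images, labels) batches are all placeholders.
def adversarial_training_example(train_loader):
    model = nn.Sequential(
        nn.Flatten(),
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10)
    )
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    trainer = AdversarialTrainer(
        model, optimizer, epsilon=0.3, attack_steps=10, attack_lr=0.01
    )

    for epoch in range(5):
        clean_loss, adv_loss = trainer.train_epoch(train_loader)
        print(f"Epoch {epoch}: clean loss {clean_loss:.4f}, adv loss {adv_loss:.4f}")

    return model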

Input Preprocessing Defenses

class InputPreprocessing:
    """Preprocessing-based defenses (generally broken by adaptive attacks)."""
    
    @staticmethod
    def jpeg_compression(images: torch.Tensor, quality: int = 50) -> torch.Tensor:
        """Apply JPEG compression as defense."""
        # Note: This is easily broken by adaptive attacks
        import io
        from PIL import Image
        import torchvision.transforms as T
        
        compressed = []
        for img in images:
            # Convert to PIL
            pil_img = T.ToPILImage()(img)
            
            # Compress
            buffer = io.BytesIO()
            pil_img.save(buffer, format='JPEG', quality=quality)
            buffer.seek(0)
            
            # Reload
            compressed_img = Image.open(buffer)
            compressed.append(T.ToTensor()(compressed_img))
        
        return torch.stack(compressed)
    
    @staticmethod
    def spatial_smoothing(images: torch.Tensor, kernel_size: int = 3) -> torch.Tensor:
        """Apply spatial smoothing."""
        kernel = torch.ones(1, 1, kernel_size, kernel_size) / (kernel_size ** 2)
        
        smoothed = []
        for c in range(images.shape[1]):
            channel = images[:, c:c+1]
            smoothed_channel = F.conv2d(channel, kernel, padding=kernel_size//2)
            smoothed.append(smoothed_channel)
        
        return torch.cat(smoothed, dim=1)
    
    @staticmethod
    def bit_depth_reduction(images: torch.Tensor, bits: int = 4) -> torch.Tensor:
        """Reduce bit depth of images."""
        factor = 2 ** (8 - bits)
        return torch.round(images * 255 / factor) * factor / 255


class RandomizedDefense:
    """
    Randomized defenses add stochasticity to break gradient-based attacks.
    """
    
    @staticmethod
    def random_resize_padding(
        images: torch.Tensor,
        min_size: int = 200,
        max_size: int = 224
    ) -> torch.Tensor:
        """Random resizing and padding."""
        
        batch_size = images.shape[0]
        
        # Random new size
        new_size = torch.randint(min_size, max_size + 1, (1,)).item()
        
        # Resize
        resized = F.interpolate(images, size=new_size, mode='bilinear')
        
        # Random padding to max_size
        pad_total = max_size - new_size
        pad_left = torch.randint(0, pad_total + 1, (1,)).item()
        pad_top = torch.randint(0, pad_total + 1, (1,)).item()
        
        padded = F.pad(
            resized,
            (pad_left, pad_total - pad_left, pad_top, pad_total - pad_top)
        )
        
        return padded
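
A sketch of applying one of these preprocessing defenses at inference time; as noted above, such defenses are generally broken by adaptive attacks, so this is for illustration only. The helper name is hypothetical.
def preprocess_then_predict(model: nn.Module, images: torch.Tensor) -> torch.Tensor:
    """Apply bit-depth reduction before classification (illustrative only)."""
    defended = InputPreprocessing.bit_depth_reduction(images, bits=4)
    with torch.no_grad():
        return model(defended).argmax(dim=1)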

Certified Defenses

Randomized Smoothing

Provable robustness via randomization:
class RandomizedSmoothing:
    """
    Randomized Smoothing: Certifiably robust classifier.
    
    Key idea: Average predictions over Gaussian noise.
    """
    
    def __init__(
        self,
        base_classifier: nn.Module,
        sigma: float = 0.25,
        n_samples: int = 100
    ):
        self.base_classifier = base_classifier
        self.sigma = sigma
        self.n_samples = n_samples
    
    def predict(self, x: torch.Tensor) -> torch.Tensor:
        """Smoothed prediction (majority vote)."""
        
        counts = torch.zeros(x.shape[0], 10)  # Assuming 10 classes
        
        with torch.no_grad():
            for _ in range(self.n_samples):
                # Add Gaussian noise
                noise = torch.randn_like(x) * self.sigma
                noisy_x = x + noise
                
                # Get prediction
                outputs = self.base_classifier(noisy_x)
                preds = outputs.argmax(dim=1)
                
                # Count
                for i, pred in enumerate(preds):
                    counts[i, pred] += 1
        
        return counts.argmax(dim=1)
    
    def certify(
        self,
        x: torch.Tensor,
        n_samples: int = 10000,
        alpha: float = 0.001
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Certify robustness radius.
        
        Returns:
            predictions: Certified predictions
            radii: Certified L2 radii
        """
        from scipy.stats import norm, binomtest
        
        # Count predictions
        counts = torch.zeros(x.shape[0], 10)
        
        with torch.no_grad():
            for _ in range(n_samples):
                noise = torch.randn_like(x) * self.sigma
                outputs = self.base_classifier(x + noise)
                preds = outputs.argmax(dim=1)
                
                for i, pred in enumerate(preds):
                    counts[i, pred] += 1
        
        predictions = []
        radii = []
        
        for i in range(x.shape[0]):
            # Top class and count
            top_class = counts[i].argmax().item()
            top_count = counts[i, top_class].item()
            
            # Statistical test for majority
            p_value = binomtest(int(top_count), n_samples, 0.5).pvalue
            
            if p_value < alpha:
                # Compute certified radius
                p_lower = self._lower_confidence_bound(top_count, n_samples, alpha)
                radius = self.sigma * norm.ppf(p_lower)
                
                predictions.append(top_class)
                radii.append(max(0, radius))
            else:
                predictions.append(-1)  # Abstain
                radii.append(0)
        
        return torch.tensor(predictions), torch.tensor(radii)
    
    def _lower_confidence_bound(
        self,
        successes: int,
        trials: int,
        alpha: float
    ) -> float:
        """Compute lower confidence bound using Clopper-Pearson."""
        from scipy.stats import beta
        return beta.ppf(alpha, successes, trials - successes + 1)


class IBPCertifiedDefense:
    """
    Interval Bound Propagation for certified defense.
    
    Propagates bounds through the network to certify robustness.
    """
    
    def __init__(self, model: nn.Module, epsilon: float = 0.03):
        self.model = model
        self.epsilon = epsilon
    
    def compute_bounds(
        self,
        x: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Compute output bounds for epsilon-ball around x.
        
        Returns:
            lower_bounds: Lower bound on each output
            upper_bounds: Upper bound on each output
        """
        # Initial bounds
        lower = x - self.epsilon
        upper = x + self.epsilon
        
        for layer in self.model:
            if isinstance(layer, nn.Linear):
                lower, upper = self._linear_bounds(layer, lower, upper)
            elif isinstance(layer, nn.ReLU):
                lower, upper = self._relu_bounds(lower, upper)
            elif isinstance(layer, nn.Flatten):
                lower = lower.flatten(start_dim=1)
                upper = upper.flatten(start_dim=1)
        
        return lower, upper
    
    def _linear_bounds(
        self,
        layer: nn.Linear,
        lower: torch.Tensor,
        upper: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Propagate bounds through linear layer."""
        
        weight = layer.weight
        bias = layer.bias if layer.bias is not None else 0
        
        # Positive and negative weights
        pos_weight = F.relu(weight)
        neg_weight = -F.relu(-weight)
        
        # New bounds
        new_lower = (
            F.linear(lower, pos_weight) +
            F.linear(upper, neg_weight) +
            bias
        )
        new_upper = (
            F.linear(upper, pos_weight) +
            F.linear(lower, neg_weight) +
            bias
        )
        
        return new_lower, new_upper
    
    def _relu_bounds(
        self,
        lower: torch.Tensor,
        upper: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Propagate bounds through ReLU."""
        return F.relu(lower), F.relu(upper)
    
    def certified_accuracy(
        self,
        x: torch.Tensor,
        labels: torch.Tensor
    ) -> float:
        """Compute certified accuracy."""
        
        lower, upper = self.compute_bounds(x)
        
        # Check if true class lower bound > all other upper bounds
        certified = 0
        
        for i in range(x.shape[0]):
            true_class = labels[i].item()
            true_lower = lower[i, true_class]
            
            # Mask true class
            other_upper = upper[i].clone()
            other_upper[true_class] = float('-inf')
            max_other = other_upper.max()
            
            if true_lower > max_other:
                certified += 1
        
        return certified / x.shape[0]
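
A usage sketch for certification with randomized smoothing; the base classifier, noise level, and sample counts are placeholders. In practice, the base classifier should be trained with Gaussian noise augmentation at the same sigma for the certified radii to be meaningful.
def smoothing_example():
    base_model = nn.Sequential(
        nn.Flatten(),
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10)
    )

    smoothed = RandomizedSmoothing(base_model, sigma=0.25, n_samples=100)

    images = torch.rand(4, 1, 28, 28)

    # Smoothed (majority-vote) predictions
    preds = smoothed.predict(images)

    # Certified predictions and L2 radii (-1 means abstain)
    cert_preds, radii = smoothed.certify(images, n_samples=1000, alpha=0.001)
    print(f"Smoothed predictions: {preds.tolist()}")
    print(f"Certified predictions: {cert_preds.tolist()}")
    print(f"Certified radii: {radii.tolist()}")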

Robust Architecture Design

class RobustArchitectureDesign:
    """
    Architectural choices that improve robustness.
    """
    
    @staticmethod
    def create_robust_cnn():
        """CNN with robustness-enhancing features."""
        
        return nn.Sequential(
            # Larger kernel sizes (more robust to small perturbations)
            nn.Conv2d(3, 64, 5, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            
            # Smooth activation functions
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.SiLU(),  # Smoother than ReLU
            nn.MaxPool2d(2),
            
            nn.Conv2d(128, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.SiLU(),
            nn.AdaptiveAvgPool2d(1),
            
            nn.Flatten(),
            nn.Linear(256, 10)
        )
    
    @staticmethod
    def lipschitz_constrained_layer(
        in_features: int,
        out_features: int,
        lipschitz_bound: float = 1.0
    ) -> nn.Module:
        """Linear layer with Lipschitz constraint."""
        
        class LipschitzLinear(nn.Module):
            def __init__(self):
                super().__init__()
                self.weight = nn.Parameter(
                    torch.randn(out_features, in_features) * 0.01
                )
                self.bias = nn.Parameter(torch.zeros(out_features))
                self.bound = lipschitz_bound
            
            def forward(self, x):
                # Spectral normalization
                u = torch.randn(self.weight.shape[1], 1, device=x.device)
                
                for _ in range(3):  # Power iteration
                    v = self.weight @ u
                    v = v / v.norm()
                    u = self.weight.T @ v
                    u = u / u.norm()
                
                sigma = (v.T @ self.weight @ u).item()
                
                # Scale weight if needed
                weight = self.weight
                if sigma > self.bound:
                    weight = weight * self.bound / sigma
                
                return F.linear(x, weight, self.bias)
        
        return LipschitzLinear()


class WideResNetRobust(nn.Module):
    """
    Wide ResNet architecture commonly used for adversarial training.
    Wider networks tend to be more robust.
    """
    
    def __init__(self, depth: int = 28, widen_factor: int = 10, num_classes: int = 10):
        super().__init__()
        
        nChannels = [16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor]
        
        self.conv1 = nn.Conv2d(3, nChannels[0], 3, padding=1)
        
        self.block1 = self._make_block(nChannels[0], nChannels[1], depth // 6)
        self.block2 = self._make_block(nChannels[1], nChannels[2], depth // 6, stride=2)
        self.block3 = self._make_block(nChannels[2], nChannels[3], depth // 6, stride=2)
        
        self.bn = nn.BatchNorm2d(nChannels[3])
        self.relu = nn.ReLU()
        self.fc = nn.Linear(nChannels[3], num_classes)
    
    def _make_block(self, in_c, out_c, num_blocks, stride=1):
        layers = [self._residual_block(in_c, out_c, stride)]
        for _ in range(1, num_blocks):
            layers.append(self._residual_block(out_c, out_c))
        return nn.Sequential(*layers)
    
    def _residual_block(self, in_c, out_c, stride=1):
        # Note: simplified pre-activation block; the skip connection is omitted for brevity
        return nn.Sequential(
            nn.BatchNorm2d(in_c),
            nn.ReLU(),
            nn.Conv2d(in_c, out_c, 3, stride=stride, padding=1),
            nn.BatchNorm2d(out_c),
            nn.ReLU(),
            nn.Conv2d(out_c, out_c, 3, padding=1)
        )
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.relu(self.bn(x))
        x = F.adaptive_avg_pool2d(x, 1)
        x = x.view(x.size(0), -1)
        return self.fc(x)
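
A quick sanity check of the architecture above; the CIFAR-10 input shape and the parameter-count printout are just informational.
def wide_resnet_example():
    model = WideResNetRobust(depth=28, widen_factor=10, num_classes=10)

    x = torch.randn(2, 3, 32, 32)
    logits = model(x)
    print(f"Output shape: {tuple(logits.shape)}")  # (2, 10)

    n_params = sum(p.numel() for p in model.parameters())
    print(f"Parameters: {n_params / 1e6:.1f}M")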

Robustness Evaluation

class RobustnessEvaluator:
    """Comprehensive robustness evaluation."""
    
    def __init__(self, model: nn.Module):
        self.model = model
        self.model.eval()
    
    def evaluate(
        self,
        test_loader,
        epsilon: float = 0.03
    ) -> Dict[str, float]:
        """Full robustness evaluation."""
        
        results = {
            'clean_accuracy': 0,
            'fgsm_accuracy': 0,
            'pgd_10_accuracy': 0,
            'pgd_20_accuracy': 0,
            'autopgd_accuracy': 0
        }
        
        fgsm = FGSM(self.model, epsilon)
        pgd_10 = PGD(self.model, epsilon, num_iter=10)
        pgd_20 = PGD(self.model, epsilon, num_iter=20)
        autopgd = AutoPGD(self.model, epsilon)
        
        n_correct = {k: 0 for k in results}
        n_total = 0
        
        for images, labels in test_loader:
            n_total += len(labels)
            
            with torch.no_grad():
                # Clean accuracy
                clean_pred = self.model(images).argmax(dim=1)
                n_correct['clean_accuracy'] += (clean_pred == labels).sum().item()
            
            # FGSM
            fgsm_images = fgsm.attack(images, labels)
            with torch.no_grad():
                fgsm_pred = self.model(fgsm_images).argmax(dim=1)
                n_correct['fgsm_accuracy'] += (fgsm_pred == labels).sum().item()
            
            # PGD-10
            pgd_images = pgd_10.attack(images, labels)
            with torch.no_grad():
                pgd_pred = self.model(pgd_images).argmax(dim=1)
                n_correct['pgd_10_accuracy'] += (pgd_pred == labels).sum().item()
            
            # PGD-20
            pgd20_images = pgd_20.attack(images, labels)
            with torch.no_grad():
                pgd20_pred = self.model(pgd20_images).argmax(dim=1)
                n_correct['pgd_20_accuracy'] += (pgd20_pred == labels).sum().item()
            
            # AutoPGD
            autopgd_images = autopgd.attack(images, labels)
            with torch.no_grad():
                auto_pred = self.model(autopgd_images).argmax(dim=1)
                n_correct['autopgd_accuracy'] += (auto_pred == labels).sum().item()
        
        for k in results:
            results[k] = n_correct[k] / n_total
        
        return results
    
    def robustness_curve(
        self,
        images: torch.Tensor,
        labels: torch.Tensor,
        epsilons: list = [0.01, 0.02, 0.03, 0.05, 0.1, 0.2]
    ) -> Dict[float, float]:
        """Accuracy vs epsilon curve."""
        
        results = {}
        
        for eps in epsilons:
            pgd = PGD(self.model, eps, num_iter=20)
            adv_images = pgd.attack(images, labels)
            
            with torch.no_grad():
                preds = self.model(adv_images).argmax(dim=1)
                accuracy = (preds == labels).float().mean().item()
            
            results[eps] = accuracy
        
        return results


def evaluate_robustness():
    """Evaluation guidelines."""
    
    guidelines = """
    ╔════════════════════════════════════════════════════════════════╗
    ║               ROBUSTNESS EVALUATION GUIDELINES                 ║
    ╠════════════════════════════════════════════════════════════════╣
    ║                                                                ║
    ║  1. STANDARD EVALUATIONS                                       ║
    ║     • Clean accuracy (baseline)                                ║
    ║     • FGSM accuracy (weak attack)                              ║
    ║     • PGD-20 with restarts (strong attack)                     ║
    ║     • AutoAttack (state-of-the-art)                            ║
    ║                                                                ║
    ║  2. EPSILON RANGES (L∞ normalized to [0,1])                    ║
    ║     • MNIST: ε = 0.3                                           ║
    ║     • CIFAR-10: ε = 8/255 ≈ 0.031                              ║
    ║     • ImageNet: ε = 4/255 ≈ 0.016                              ║
    ║                                                                ║
    ║  3. AVOID COMMON PITFALLS                                      ║
    ║     • Don't rely on weak attacks                               ║
    ║     • Use adaptive attacks for defense evaluation              ║
    ║     • Report worst-case across multiple attacks                ║
    ║     • Include certified accuracy if applicable                 ║
    ║                                                                ║
    ║  4. BENCHMARKS                                                 ║
    ║     • RobustBench: robustbench.github.io                       ║
    ║     • AutoAttack: standardized evaluation                      ║
    ║                                                                ║
    ╚════════════════════════════════════════════════════════════════╝
    """
    print(guidelines)

evaluate_robustness()
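
A sketch for visualizing the accuracy-vs-epsilon curve with matplotlib (imported at the top of this page); the model, images, and labels are placeholders supplied by the caller.
def plot_robustness_curve(model: nn.Module, images: torch.Tensor, labels: torch.Tensor):
    evaluator = RobustnessEvaluator(model)
    curve = evaluator.robustness_curve(images, labels)

    epsilons = list(curve.keys())
    accuracies = list(curve.values())

    plt.plot(epsilons, accuracies, marker='o')
    plt.xlabel('Epsilon (L-infinity)')
    plt.ylabel('Robust accuracy')
    plt.title('Accuracy vs perturbation budget')
    plt.show()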

Exercises

1. Implement the query-efficient Square Attack:
class SquareAttack:
    # Black-box attack using only model outputs
    # No gradients needed!
    pass

2. Compare TRADES and standard PGD adversarial training:
  • Train models with both methods
  • Compare the clean vs robust accuracy tradeoff
  • Evaluate with AutoAttack

3. Implement and evaluate randomized smoothing:
  • Train a smoothed classifier
  • Compute certified radii
  • Plot certified accuracy vs radius

What’s Next?