Adversarial Machine Learning
The Vulnerability of Neural Networks
Neural networks are surprisingly vulnerable to adversarial examples - inputs deliberately perturbed to cause misclassification while appearing essentially unchanged to humans.
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from typing import Callable, Dict, Optional, Tuple
torch.manual_seed(42)
Adversarial Attacks
Fast Gradient Sign Method (FGSM)
The foundational one-step attack by Goodfellow et al.: $x_{\text{adv}} = x + \epsilon \cdot \mathrm{sign}(\nabla_x \mathcal{L}(f(x), y))$
class FGSM:
"""
Fast Gradient Sign Method attack.
Simple, fast, but not the strongest attack.
"""
def __init__(self, model: nn.Module, epsilon: float = 0.03):
"""
Args:
model: Target model to attack
epsilon: Perturbation budget (L-infinity)
"""
self.model = model
self.epsilon = epsilon
def attack(
self,
images: torch.Tensor,
labels: torch.Tensor
) -> torch.Tensor:
"""
Generate adversarial examples.
Args:
images: [N, C, H, W] clean images
labels: [N] true labels
Returns:
adversarial: [N, C, H, W] adversarial images
"""
images = images.clone().detach().requires_grad_(True)
# Forward pass
outputs = self.model(images)
loss = F.cross_entropy(outputs, labels)
# Backward pass
self.model.zero_grad()
loss.backward()
# Create perturbation
grad_sign = images.grad.sign()
perturbation = self.epsilon * grad_sign
# Apply perturbation
adversarial = images + perturbation
# Clamp to valid range
adversarial = torch.clamp(adversarial, 0, 1)
return adversarial.detach()
def targeted_attack(
self,
images: torch.Tensor,
target_labels: torch.Tensor
) -> torch.Tensor:
"""Generate targeted adversarial examples."""
images = images.clone().detach().requires_grad_(True)
# Forward pass
outputs = self.model(images)
# Minimize loss for target class (gradient descent)
loss = F.cross_entropy(outputs, target_labels)
self.model.zero_grad()
loss.backward()
# Subtract gradient (move toward target)
grad_sign = images.grad.sign()
perturbation = -self.epsilon * grad_sign # Negative!
adversarial = images + perturbation
adversarial = torch.clamp(adversarial, 0, 1)
return adversarial.detach()
# Example usage
def fgsm_example():
model = nn.Sequential(
nn.Flatten(),
nn.Linear(784, 256),
nn.ReLU(),
nn.Linear(256, 10)
)
attack = FGSM(model, epsilon=0.3)
# Generate adversarial examples
images = torch.rand(10, 1, 28, 28)
labels = torch.randint(0, 10, (10,))
adv_images = attack.attack(images, labels)
# Measure perturbation
perturbation = (adv_images - images).abs().max()
print(f"Max perturbation: {perturbation:.4f}")
Projected Gradient Descent (PGD)
Often regarded as the strongest first-order attack and the de facto standard for empirical robustness evaluation - iterative FGSM with projection back onto the $\epsilon$-ball after each step: $x^{t+1} = \Pi_{\|x'-x\|_\infty \le \epsilon}\left(x^{t} + \alpha \cdot \mathrm{sign}(\nabla_x \mathcal{L}(f(x^{t}), y))\right)$
class PGD:
"""
Projected Gradient Descent attack.
Strong iterative attack - the standard for evaluating robustness.
"""
def __init__(
self,
model: nn.Module,
epsilon: float = 0.03,
alpha: float = 0.01,
num_iter: int = 40,
random_start: bool = True
):
"""
Args:
epsilon: Total perturbation budget (L-infinity)
alpha: Step size per iteration
num_iter: Number of attack iterations
random_start: Start from random perturbation
"""
self.model = model
self.epsilon = epsilon
self.alpha = alpha
self.num_iter = num_iter
self.random_start = random_start
def attack(
self,
images: torch.Tensor,
labels: torch.Tensor
) -> torch.Tensor:
"""Generate PGD adversarial examples."""
original = images.clone()
if self.random_start:
# Random start within epsilon ball
images = images + torch.empty_like(images).uniform_(
-self.epsilon, self.epsilon
)
images = torch.clamp(images, 0, 1)
for _ in range(self.num_iter):
images = images.clone().detach().requires_grad_(True)
# Forward pass
outputs = self.model(images)
loss = F.cross_entropy(outputs, labels)
# Backward pass
self.model.zero_grad()
loss.backward()
# Gradient step
grad_sign = images.grad.sign()
images = images + self.alpha * grad_sign
# Project back to epsilon ball
perturbation = images - original
perturbation = torch.clamp(perturbation, -self.epsilon, self.epsilon)
images = original + perturbation
# Clamp to valid range
images = torch.clamp(images, 0, 1)
return images.detach()
def attack_with_restarts(
self,
images: torch.Tensor,
labels: torch.Tensor,
num_restarts: int = 10
) -> torch.Tensor:
"""PGD with multiple random restarts."""
best_adv = None
best_loss = float('-inf')
for _ in range(num_restarts):
adv = self.attack(images, labels)
with torch.no_grad():
outputs = self.model(adv)
loss = F.cross_entropy(outputs, labels)
if loss > best_loss:
best_loss = loss
best_adv = adv
return best_adv
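A minimal usage sketch of the PGD attack - the untrained two-layer model and random data below are purely illustrative stand-ins for a trained classifier and real images:
def pgd_example():
    model = nn.Sequential(nn.Flatten(), nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10))
    model.eval()
    images = torch.rand(8, 1, 28, 28)
    labels = torch.randint(0, 10, (8,))
    attack = PGD(model, epsilon=0.3, alpha=0.01, num_iter=40)
    adv_images = attack.attack(images, labels)
    # The perturbation should stay within the L-infinity budget
    print(f"Max perturbation: {(adv_images - images).abs().max():.4f}")
    with torch.no_grad():
        clean_acc = (model(images).argmax(dim=1) == labels).float().mean()
        adv_acc = (model(adv_images).argmax(dim=1) == labels).float().mean()
    print(f"Accuracy: clean {clean_acc:.2f} -> adversarial {adv_acc:.2f}")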
class AutoPGD:
"""
    Auto-PGD: PGD with an automatically adapted step size.
    Part of AutoAttack - a reliable ensemble for robustness evaluation.
    (The step-size schedule below is a simplified version of the original.)
"""
def __init__(
self,
model: nn.Module,
epsilon: float = 0.03,
num_iter: int = 100,
loss_type: str = 'ce' # 'ce' or 'dlr'
):
self.model = model
self.epsilon = epsilon
self.num_iter = num_iter
self.loss_type = loss_type
def _dlr_loss(self, outputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
"""Difference of Logits Ratio loss."""
# Sort outputs
sorted_outputs, _ = outputs.sort(dim=1, descending=True)
        # z_y: logit of the true class
        # z_p: highest logit among the incorrect classes
        # Denominator: gap between the largest and third-largest logits (per the DLR definition)
y = outputs.gather(1, labels.unsqueeze(1)).squeeze()
# Mask correct class
mask = torch.ones_like(outputs, dtype=torch.bool)
mask.scatter_(1, labels.unsqueeze(1), False)
y_prime = outputs[mask].view(outputs.shape[0], -1).max(dim=1)[0]
# DLR loss
loss = -(y - y_prime) / (sorted_outputs[:, 0] - sorted_outputs[:, 2] + 1e-8)
return loss.mean()
def attack(
self,
images: torch.Tensor,
labels: torch.Tensor
) -> torch.Tensor:
"""Auto-PGD attack with step size adaptation."""
original = images.clone()
# Initialize with random start
images = images + torch.empty_like(images).uniform_(
-self.epsilon, self.epsilon
)
images = torch.clamp(images, 0, 1)
# Adaptive step size
step_size = 2 * self.epsilon
best_adv = images.clone()
best_loss = float('-inf')
for i in range(self.num_iter):
images = images.clone().detach().requires_grad_(True)
outputs = self.model(images)
if self.loss_type == 'dlr':
loss = self._dlr_loss(outputs, labels)
else:
loss = F.cross_entropy(outputs, labels)
self.model.zero_grad()
loss.backward()
# Gradient step
grad = images.grad
images = images + step_size * grad.sign()
# Project
perturbation = images - original
perturbation = torch.clamp(perturbation, -self.epsilon, self.epsilon)
images = original + perturbation
images = torch.clamp(images, 0, 1)
# Update best
with torch.no_grad():
current_loss = loss.item()
if current_loss > best_loss:
best_loss = current_loss
best_adv = images.clone()
# Adapt step size
if i % 10 == 0 and i > 0:
step_size *= 0.75
return best_adv
C&W Attack
Carlini & Wagner - optimization-based attack:Copy
class CWAttack:
"""
Carlini & Wagner L2 attack.
Powerful optimization-based attack that finds minimal perturbations.
"""
def __init__(
self,
model: nn.Module,
c: float = 1.0,
kappa: float = 0,
num_iter: int = 1000,
lr: float = 0.01
):
"""
Args:
c: Weight for classification loss
kappa: Confidence margin
num_iter: Optimization steps
lr: Learning rate
"""
self.model = model
self.c = c
self.kappa = kappa
self.num_iter = num_iter
self.lr = lr
def attack(
self,
images: torch.Tensor,
labels: torch.Tensor,
targeted: bool = False,
target_labels: Optional[torch.Tensor] = None
) -> torch.Tensor:
"""Generate C&W adversarial examples."""
batch_size = images.shape[0]
        # Use tanh space for box constraints: x = 0.5 * (tanh(w) + 1)
        # Clamp slightly inside (-1, 1) so arctanh stays finite at pixel values 0 and 1
        w = torch.arctanh(
            torch.clamp(2 * images - 1, -1 + 1e-6, 1 - 1e-6)
        ).clone().detach().requires_grad_(True)
optimizer = torch.optim.Adam([w], lr=self.lr)
for _ in range(self.num_iter):
optimizer.zero_grad()
# Convert back to image space
adv_images = 0.5 * (torch.tanh(w) + 1)
# Forward pass
outputs = self.model(adv_images)
# L2 distance loss
l2_loss = ((adv_images - images) ** 2).sum(dim=(1, 2, 3)).mean()
# Classification loss
if targeted:
# Minimize f(x_adv) for target class
target_logits = outputs.gather(1, target_labels.unsqueeze(1)).squeeze()
other_logits = outputs.clone()
other_logits.scatter_(1, target_labels.unsqueeze(1), float('-inf'))
max_other = other_logits.max(dim=1)[0]
f_loss = F.relu(max_other - target_logits + self.kappa).mean()
else:
# Maximize loss for true class
true_logits = outputs.gather(1, labels.unsqueeze(1)).squeeze()
other_logits = outputs.clone()
other_logits.scatter_(1, labels.unsqueeze(1), float('-inf'))
max_other = other_logits.max(dim=1)[0]
f_loss = F.relu(true_logits - max_other + self.kappa).mean()
# Total loss
loss = l2_loss + self.c * f_loss
loss.backward()
optimizer.step()
# Final adversarial images
adv_images = 0.5 * (torch.tanh(w) + 1)
return adv_images.detach()
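A hedged usage sketch for the C&W attack - the toy model and random inputs are illustrative only, and the constant c would normally be tuned by binary search, which is omitted here:
def cw_example():
    model = nn.Sequential(nn.Flatten(), nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10))
    model.eval()
    images = torch.rand(4, 1, 28, 28)
    labels = torch.randint(0, 10, (4,))
    attack = CWAttack(model, c=1.0, num_iter=200, lr=0.01)
    adv_images = attack.attack(images, labels)
    # C&W minimizes L2 distortion rather than respecting a fixed epsilon budget
    l2 = ((adv_images - images) ** 2).sum(dim=(1, 2, 3)).sqrt()
    print(f"Mean L2 distortion: {l2.mean():.4f}")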
Adversarial Defenses
Adversarial Training
The most effective known empirical defense - train the model directly on adversarial examples:
class AdversarialTrainer:
"""
Adversarial training framework.
Key insight: Train on worst-case perturbations.
"""
def __init__(
self,
model: nn.Module,
optimizer: torch.optim.Optimizer,
epsilon: float = 0.03,
attack_steps: int = 10,
attack_lr: float = 0.01
):
self.model = model
self.optimizer = optimizer
self.epsilon = epsilon
self.attack_steps = attack_steps
self.attack_lr = attack_lr
self.pgd = PGD(
model,
epsilon=epsilon,
alpha=attack_lr,
num_iter=attack_steps
)
def train_step(
self,
images: torch.Tensor,
labels: torch.Tensor
) -> Tuple[float, float]:
"""
Single adversarial training step.
Returns:
clean_loss: Loss on clean examples
adv_loss: Loss on adversarial examples
"""
        # Generate adversarial examples with the model in eval mode
        # (so BatchNorm/Dropout behave deterministically during the attack)
        self.model.eval()
        adv_images = self.pgd.attack(images, labels)
        self.model.train()
# Train on adversarial examples
self.optimizer.zero_grad()
adv_outputs = self.model(adv_images)
adv_loss = F.cross_entropy(adv_outputs, labels)
adv_loss.backward()
self.optimizer.step()
# Compute clean loss for monitoring
with torch.no_grad():
clean_outputs = self.model(images)
clean_loss = F.cross_entropy(clean_outputs, labels)
return clean_loss.item(), adv_loss.item()
def train_epoch(self, dataloader):
"""Train for one epoch."""
total_clean_loss = 0
total_adv_loss = 0
n_batches = 0
for images, labels in dataloader:
clean_loss, adv_loss = self.train_step(images, labels)
total_clean_loss += clean_loss
total_adv_loss += adv_loss
n_batches += 1
return total_clean_loss / n_batches, total_adv_loss / n_batches
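A minimal training-loop sketch using the trainer above - the model, optimizer, and DataLoader are placeholders; any loader yielding (images, labels) batches with pixels in [0, 1] would work:
def adversarial_training_example(train_loader):
    model = nn.Sequential(nn.Flatten(), nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    trainer = AdversarialTrainer(model, optimizer, epsilon=0.3, attack_steps=10, attack_lr=0.01)
    for epoch in range(5):
        clean_loss, adv_loss = trainer.train_epoch(train_loader)
        print(f"epoch {epoch}: clean loss {clean_loss:.3f}, adv loss {adv_loss:.3f}")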
class TRADESTrainer:
"""
TRADES: Theoretically-motivated adversarial training.
Separates natural accuracy and robustness objectives.
"""
def __init__(
self,
model: nn.Module,
optimizer: torch.optim.Optimizer,
epsilon: float = 0.03,
beta: float = 6.0, # Robustness weight
attack_steps: int = 10
):
self.model = model
self.optimizer = optimizer
self.epsilon = epsilon
self.beta = beta
self.attack_steps = attack_steps
def train_step(
self,
images: torch.Tensor,
labels: torch.Tensor
) -> float:
"""TRADES training step."""
self.model.eval()
# Generate adversarial examples (maximize KL divergence)
adv_images = images.clone().detach()
adv_images += torch.empty_like(adv_images).uniform_(-self.epsilon, self.epsilon)
adv_images = torch.clamp(adv_images, 0, 1)
with torch.no_grad():
natural_outputs = self.model(images)
for _ in range(self.attack_steps):
adv_images = adv_images.clone().detach().requires_grad_(True)
adv_outputs = self.model(adv_images)
# KL divergence from natural outputs
loss = F.kl_div(
F.log_softmax(adv_outputs, dim=1),
F.softmax(natural_outputs, dim=1),
reduction='batchmean'
)
self.model.zero_grad()
loss.backward()
adv_images = adv_images + (self.epsilon / self.attack_steps) * adv_images.grad.sign()
adv_images = torch.clamp(
adv_images,
images - self.epsilon,
images + self.epsilon
)
            adv_images = torch.clamp(adv_images, 0, 1)
        # Detach so the training step does not backpropagate through the attack graph
        adv_images = adv_images.detach()
        # Training step
        self.model.train()
self.optimizer.zero_grad()
# Natural loss
natural_outputs = self.model(images)
natural_loss = F.cross_entropy(natural_outputs, labels)
# Robustness loss (KL divergence)
adv_outputs = self.model(adv_images)
robust_loss = F.kl_div(
F.log_softmax(adv_outputs, dim=1),
F.softmax(natural_outputs.detach(), dim=1),
reduction='batchmean'
)
# Combined loss
loss = natural_loss + self.beta * robust_loss
loss.backward()
self.optimizer.step()
return loss.item()
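For reference, the objective this training step approximates is the TRADES trade-off between natural cross-entropy and an adversarial KL term, weighted by $\beta$:
$\min_{f} \; \mathbb{E}_{(x,y)} \left[ \mathcal{L}_{CE}(f(x), y) + \beta \cdot \max_{x' : \|x' - x\|_\infty \le \epsilon} \mathrm{KL}\big(f(x) \,\|\, f(x')\big) \right]$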
Input Preprocessing Defenses
class InputPreprocessing:
"""Preprocessing-based defenses (generally broken by adaptive attacks)."""
@staticmethod
def jpeg_compression(images: torch.Tensor, quality: int = 50) -> torch.Tensor:
"""Apply JPEG compression as defense."""
# Note: This is easily broken by adaptive attacks
import io
from PIL import Image
import torchvision.transforms as T
compressed = []
for img in images:
# Convert to PIL
pil_img = T.ToPILImage()(img)
# Compress
buffer = io.BytesIO()
pil_img.save(buffer, format='JPEG', quality=quality)
buffer.seek(0)
# Reload
compressed_img = Image.open(buffer)
compressed.append(T.ToTensor()(compressed_img))
return torch.stack(compressed)
@staticmethod
def spatial_smoothing(images: torch.Tensor, kernel_size: int = 3) -> torch.Tensor:
"""Apply spatial smoothing."""
kernel = torch.ones(1, 1, kernel_size, kernel_size) / (kernel_size ** 2)
smoothed = []
for c in range(images.shape[1]):
channel = images[:, c:c+1]
smoothed_channel = F.conv2d(channel, kernel, padding=kernel_size//2)
smoothed.append(smoothed_channel)
return torch.cat(smoothed, dim=1)
@staticmethod
def bit_depth_reduction(images: torch.Tensor, bits: int = 4) -> torch.Tensor:
"""Reduce bit depth of images."""
factor = 2 ** (8 - bits)
return torch.round(images * 255 / factor) * factor / 255
class RandomizedDefense:
"""
Randomized defenses add stochasticity to break gradient-based attacks.
"""
@staticmethod
def random_resize_padding(
images: torch.Tensor,
min_size: int = 200,
max_size: int = 224
) -> torch.Tensor:
"""Random resizing and padding."""
batch_size = images.shape[0]
# Random new size
new_size = torch.randint(min_size, max_size + 1, (1,)).item()
# Resize
resized = F.interpolate(images, size=new_size, mode='bilinear')
# Random padding to max_size
pad_total = max_size - new_size
pad_left = torch.randint(0, pad_total + 1, (1,)).item()
pad_top = torch.randint(0, pad_total + 1, (1,)).item()
padded = F.pad(
resized,
(pad_left, pad_total - pad_left, pad_top, pad_total - pad_top)
)
return padded
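A small sketch of how these preprocessing defenses would be chained at inference time - the model argument is a placeholder, and, as noted above, such defenses are largely broken by adaptive attacks:
def preprocess_then_predict(model: nn.Module, images: torch.Tensor) -> torch.Tensor:
    # Apply cheap input transformations before classification
    x = InputPreprocessing.bit_depth_reduction(images, bits=4)
    x = InputPreprocessing.spatial_smoothing(x, kernel_size=3)
    with torch.no_grad():
        return model(x).argmax(dim=1)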
Certified Defenses
Randomized Smoothing
Provable robustness guarantees via Gaussian randomization:
class RandomizedSmoothing:
"""
Randomized Smoothing: Certifiably robust classifier.
Key idea: Average predictions over Gaussian noise.
"""
def __init__(
self,
base_classifier: nn.Module,
sigma: float = 0.25,
n_samples: int = 100
):
self.base_classifier = base_classifier
self.sigma = sigma
self.n_samples = n_samples
def predict(self, x: torch.Tensor) -> torch.Tensor:
"""Smoothed prediction (majority vote)."""
counts = torch.zeros(x.shape[0], 10) # Assuming 10 classes
with torch.no_grad():
for _ in range(self.n_samples):
# Add Gaussian noise
noise = torch.randn_like(x) * self.sigma
noisy_x = x + noise
# Get prediction
outputs = self.base_classifier(noisy_x)
preds = outputs.argmax(dim=1)
# Count
for i, pred in enumerate(preds):
counts[i, pred] += 1
return counts.argmax(dim=1)
def certify(
self,
x: torch.Tensor,
n_samples: int = 10000,
alpha: float = 0.001
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Certify robustness radius.
Returns:
predictions: Certified predictions
radii: Certified L2 radii
"""
        from scipy.stats import norm
        from scipy.stats import binomtest  # binom_test was removed from recent SciPy releases
# Count predictions
counts = torch.zeros(x.shape[0], 10)
with torch.no_grad():
for _ in range(n_samples):
noise = torch.randn_like(x) * self.sigma
outputs = self.base_classifier(x + noise)
preds = outputs.argmax(dim=1)
for i, pred in enumerate(preds):
counts[i, pred] += 1
predictions = []
radii = []
for i in range(x.shape[0]):
# Top class and count
top_class = counts[i].argmax().item()
top_count = counts[i, top_class].item()
# Statistical test for majority
            p_value = binomtest(int(top_count), n_samples, 0.5).pvalue
if p_value < alpha:
# Compute certified radius
p_lower = self._lower_confidence_bound(top_count, n_samples, alpha)
radius = self.sigma * norm.ppf(p_lower)
predictions.append(top_class)
radii.append(max(0, radius))
else:
predictions.append(-1) # Abstain
radii.append(0)
return torch.tensor(predictions), torch.tensor(radii)
def _lower_confidence_bound(
self,
successes: int,
trials: int,
alpha: float
) -> float:
"""Compute lower confidence bound using Clopper-Pearson."""
from scipy.stats import beta
return beta.ppf(alpha, successes, trials - successes + 1)
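A usage sketch for randomized smoothing - the base classifier here is untrained and illustrative; in practice it must be trained with Gaussian noise augmentation at the same sigma for the certificates to be meaningful:
def smoothing_example():
    base = nn.Sequential(nn.Flatten(), nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))
    base.eval()
    smoothed = RandomizedSmoothing(base, sigma=0.25, n_samples=100)
    x = torch.rand(2, 1, 28, 28)
    preds, radii = smoothed.certify(x, n_samples=1000, alpha=0.001)
    for pred, radius in zip(preds.tolist(), radii.tolist()):
        label = "abstain" if pred == -1 else f"class {pred}"
        print(f"{label}, certified L2 radius {radius:.3f}")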
class IBPCertifiedDefense:
"""
Interval Bound Propagation for certified defense.
    Propagates interval bounds layer by layer to certify robustness.
    Assumes the model is an nn.Sequential of Linear / ReLU / Flatten layers.
"""
def __init__(self, model: nn.Module, epsilon: float = 0.03):
self.model = model
self.epsilon = epsilon
def compute_bounds(
self,
x: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Compute output bounds for epsilon-ball around x.
Returns:
lower_bounds: Lower bound on each output
upper_bounds: Upper bound on each output
"""
# Initial bounds
lower = x - self.epsilon
upper = x + self.epsilon
for layer in self.model:
if isinstance(layer, nn.Linear):
lower, upper = self._linear_bounds(layer, lower, upper)
elif isinstance(layer, nn.ReLU):
lower, upper = self._relu_bounds(lower, upper)
elif isinstance(layer, nn.Flatten):
lower = lower.flatten(start_dim=1)
upper = upper.flatten(start_dim=1)
return lower, upper
def _linear_bounds(
self,
layer: nn.Linear,
lower: torch.Tensor,
upper: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Propagate bounds through linear layer."""
weight = layer.weight
bias = layer.bias if layer.bias is not None else 0
# Positive and negative weights
pos_weight = F.relu(weight)
neg_weight = -F.relu(-weight)
# New bounds
new_lower = (
F.linear(lower, pos_weight) +
F.linear(upper, neg_weight) +
bias
)
new_upper = (
F.linear(upper, pos_weight) +
F.linear(lower, neg_weight) +
bias
)
return new_lower, new_upper
def _relu_bounds(
self,
lower: torch.Tensor,
upper: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Propagate bounds through ReLU."""
return F.relu(lower), F.relu(upper)
def certified_accuracy(
self,
x: torch.Tensor,
labels: torch.Tensor
) -> float:
"""Compute certified accuracy."""
lower, upper = self.compute_bounds(x)
# Check if true class lower bound > all other upper bounds
certified = 0
for i in range(x.shape[0]):
true_class = labels[i].item()
true_lower = lower[i, true_class]
# Mask true class
other_upper = upper[i].clone()
other_upper[true_class] = float('-inf')
max_other = other_upper.max()
if true_lower > max_other:
certified += 1
return certified / x.shape[0]
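A quick sketch of IBP certification on a tiny fully-connected model - untrained and illustrative; real certified training optimizes these bounds directly during training:
def ibp_example():
    model = nn.Sequential(nn.Flatten(), nn.Linear(784, 64), nn.ReLU(), nn.Linear(64, 10))
    defense = IBPCertifiedDefense(model, epsilon=0.01)
    x = torch.rand(16, 1, 28, 28)
    labels = torch.randint(0, 10, (16,))
    acc = defense.certified_accuracy(x, labels)
    print(f"Certified accuracy at eps=0.01: {acc:.2%}")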
Robust Architecture Design
class RobustArchitectureDesign:
"""
Architectural choices that improve robustness.
"""
@staticmethod
def create_robust_cnn():
"""CNN with robustness-enhancing features."""
return nn.Sequential(
# Larger kernel sizes (more robust to small perturbations)
nn.Conv2d(3, 64, 5, padding=2),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2),
# Smooth activation functions
nn.Conv2d(64, 128, 3, padding=1),
nn.BatchNorm2d(128),
nn.SiLU(), # Smoother than ReLU
nn.MaxPool2d(2),
nn.Conv2d(128, 256, 3, padding=1),
nn.BatchNorm2d(256),
nn.SiLU(),
nn.AdaptiveAvgPool2d(1),
nn.Flatten(),
nn.Linear(256, 10)
)
@staticmethod
def lipschitz_constrained_layer(
in_features: int,
out_features: int,
lipschitz_bound: float = 1.0
) -> nn.Module:
"""Linear layer with Lipschitz constraint."""
class LipschitzLinear(nn.Module):
def __init__(self):
super().__init__()
self.weight = nn.Parameter(
torch.randn(out_features, in_features) * 0.01
)
self.bias = nn.Parameter(torch.zeros(out_features))
self.bound = lipschitz_bound
def forward(self, x):
# Spectral normalization
u = torch.randn(self.weight.shape[1], 1, device=x.device)
for _ in range(3): # Power iteration
v = self.weight @ u
v = v / v.norm()
u = self.weight.T @ v
u = u / u.norm()
sigma = (v.T @ self.weight @ u).item()
# Scale weight if needed
weight = self.weight
if sigma > self.bound:
weight = weight * self.bound / sigma
return F.linear(x, weight, self.bias)
return LipschitzLinear()
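A small check of the Lipschitz-constrained layer (the sizes are arbitrary); the layer rescales its weight whenever the power-iteration estimate of the spectral norm exceeds the bound:
def lipschitz_example():
    layer = RobustArchitectureDesign.lipschitz_constrained_layer(128, 64, lipschitz_bound=1.0)
    x = torch.randn(32, 128)
    out = layer(x)
    print(out.shape)  # expected: torch.Size([32, 64])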
class WideResNetRobust(nn.Module):
"""
Wide ResNet architecture commonly used for adversarial training.
Wider networks tend to be more robust.
"""
def __init__(self, depth: int = 28, widen_factor: int = 10, num_classes: int = 10):
super().__init__()
nChannels = [16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor]
self.conv1 = nn.Conv2d(3, nChannels[0], 3, padding=1)
self.block1 = self._make_block(nChannels[0], nChannels[1], depth // 6)
self.block2 = self._make_block(nChannels[1], nChannels[2], depth // 6, stride=2)
self.block3 = self._make_block(nChannels[2], nChannels[3], depth // 6, stride=2)
self.bn = nn.BatchNorm2d(nChannels[3])
self.relu = nn.ReLU()
self.fc = nn.Linear(nChannels[3], num_classes)
def _make_block(self, in_c, out_c, num_blocks, stride=1):
layers = [self._residual_block(in_c, out_c, stride)]
for _ in range(1, num_blocks):
layers.append(self._residual_block(out_c, out_c))
return nn.Sequential(*layers)
    def _residual_block(self, in_c, out_c, stride=1):
        # Pre-activation residual block: conv body plus an (optionally projected) skip connection
        class Block(nn.Module):
            def __init__(self):
                super().__init__()
                self.body = nn.Sequential(
                    nn.BatchNorm2d(in_c), nn.ReLU(),
                    nn.Conv2d(in_c, out_c, 3, stride=stride, padding=1),
                    nn.BatchNorm2d(out_c), nn.ReLU(),
                    nn.Conv2d(out_c, out_c, 3, padding=1)
                )
                self.shortcut = (nn.Identity() if stride == 1 and in_c == out_c
                                 else nn.Conv2d(in_c, out_c, 1, stride=stride))
            def forward(self, x):
                return self.body(x) + self.shortcut(x)
        return Block()
def forward(self, x):
x = self.conv1(x)
x = self.block1(x)
x = self.block2(x)
x = self.block3(x)
x = self.relu(self.bn(x))
x = F.adaptive_avg_pool2d(x, 1)
x = x.view(x.size(0), -1)
return self.fc(x)
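A quick shape check for the WideResNet variant above, assuming CIFAR-10-sized inputs:
def wrn_example():
    model = WideResNetRobust(depth=28, widen_factor=10, num_classes=10)
    x = torch.randn(2, 3, 32, 32)
    logits = model(x)
    print(logits.shape)  # expected: torch.Size([2, 10])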
Robustness Evaluation
class RobustnessEvaluator:
"""Comprehensive robustness evaluation."""
def __init__(self, model: nn.Module):
self.model = model
self.model.eval()
def evaluate(
self,
test_loader,
epsilon: float = 0.03
) -> Dict[str, float]:
"""Full robustness evaluation."""
results = {
'clean_accuracy': 0,
'fgsm_accuracy': 0,
'pgd_accuracy': 0,
'pgd_20_accuracy': 0,
'autopgd_accuracy': 0
}
fgsm = FGSM(self.model, epsilon)
pgd_10 = PGD(self.model, epsilon, num_iter=10)
pgd_20 = PGD(self.model, epsilon, num_iter=20)
autopgd = AutoPGD(self.model, epsilon)
n_correct = {k: 0 for k in results}
n_total = 0
for images, labels in test_loader:
n_total += len(labels)
with torch.no_grad():
# Clean accuracy
clean_pred = self.model(images).argmax(dim=1)
n_correct['clean_accuracy'] += (clean_pred == labels).sum().item()
# FGSM
fgsm_images = fgsm.attack(images, labels)
with torch.no_grad():
fgsm_pred = self.model(fgsm_images).argmax(dim=1)
n_correct['fgsm_accuracy'] += (fgsm_pred == labels).sum().item()
# PGD-10
pgd_images = pgd_10.attack(images, labels)
with torch.no_grad():
pgd_pred = self.model(pgd_images).argmax(dim=1)
n_correct['pgd_accuracy'] += (pgd_pred == labels).sum().item()
# PGD-20
pgd20_images = pgd_20.attack(images, labels)
with torch.no_grad():
pgd20_pred = self.model(pgd20_images).argmax(dim=1)
n_correct['pgd_20_accuracy'] += (pgd20_pred == labels).sum().item()
# AutoPGD
autopgd_images = autopgd.attack(images, labels)
with torch.no_grad():
auto_pred = self.model(autopgd_images).argmax(dim=1)
n_correct['autopgd_accuracy'] += (auto_pred == labels).sum().item()
for k in results:
results[k] = n_correct[k] / n_total
return results
def robustness_curve(
self,
images: torch.Tensor,
labels: torch.Tensor,
epsilons: list = [0.01, 0.02, 0.03, 0.05, 0.1, 0.2]
) -> Dict[float, float]:
"""Accuracy vs epsilon curve."""
results = {}
for eps in epsilons:
pgd = PGD(self.model, eps, num_iter=20)
adv_images = pgd.attack(images, labels)
with torch.no_grad():
preds = self.model(adv_images).argmax(dim=1)
accuracy = (preds == labels).float().mean().item()
results[eps] = accuracy
return results
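A usage sketch for the evaluator - test_loader is any DataLoader of images in [0, 1]; epsilon = 8/255 matches the CIFAR-10 convention listed in the guidelines below:
def evaluation_example(model: nn.Module, test_loader):
    evaluator = RobustnessEvaluator(model)
    results = evaluator.evaluate(test_loader, epsilon=8 / 255)
    for attack_name, accuracy in results.items():
        print(f"{attack_name}: {accuracy:.2%}")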
def evaluate_robustness():
"""Evaluation guidelines."""
guidelines = """
╔════════════════════════════════════════════════════════════════╗
║ ROBUSTNESS EVALUATION GUIDELINES ║
╠════════════════════════════════════════════════════════════════╣
║ ║
║ 1. STANDARD EVALUATIONS ║
║ • Clean accuracy (baseline) ║
║ • FGSM accuracy (weak attack) ║
║ • PGD-20 with restarts (strong attack) ║
║ • AutoAttack (state-of-the-art) ║
║ ║
║ 2. EPSILON RANGES (L∞ normalized to [0,1]) ║
║ • MNIST: ε = 0.3 ║
║ • CIFAR-10: ε = 8/255 ≈ 0.031 ║
║ • ImageNet: ε = 4/255 ≈ 0.016 ║
║ ║
║ 3. AVOID COMMON PITFALLS ║
║ • Don't rely on weak attacks ║
║ • Use adaptive attacks for defense evaluation ║
║ • Report worst-case across multiple attacks ║
║ • Include certified accuracy if applicable ║
║ ║
║ 4. BENCHMARKS ║
║ • RobustBench: robustbench.github.io ║
║ • AutoAttack: standardized evaluation ║
║ ║
╚════════════════════════════════════════════════════════════════╝
"""
print(guidelines)
evaluate_robustness()
Exercises
Exercise 1: Implement Square Attack
Implement the query-efficient Square Attack:
class SquareAttack:
# Black-box attack using only model outputs
# No gradients needed!
pass
Exercise 2: TRADES vs PGD Training
Compare TRADES and standard PGD adversarial training:
- Train models with both methods
- Compare clean vs robust accuracy tradeoff
- Evaluate with AutoAttack
Exercise 3: Certified Smoothing
Implement and evaluate randomized smoothing:
- Train a smoothed classifier
- Compute certified radii
- Plot certified accuracy vs radius