Hyperparameter Tuning
The Search Space Challenge
Hyperparameters control how a model learns:

| Category | Examples |
|---|---|
| Architecture | Layers, hidden sizes, attention heads |
| Optimization | Learning rate, batch size, optimizer |
| Regularization | Dropout, weight decay, augmentation |
| Training | Epochs, warmup steps, gradient clipping |
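In code, one training run bundles a value from each category into a single configuration. A minimal sketch (the parameter names are illustrative, not tied to any library):

example_config = {
    'num_layers': 4,          # architecture
    'hidden_size': 256,       # architecture
    'learning_rate': 3e-4,    # optimization
    'batch_size': 32,         # optimization
    'dropout': 0.1,           # regularization
    'weight_decay': 0.01,     # regularization
    'epochs': 20,             # training
    'warmup_steps': 500       # training
}

Tuning means searching this joint space for the configuration with the best validation metric. The rest of the section uses the following setup: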
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from typing import Dict, Any, Optional, Callable, List, Tuple
from dataclasses import dataclass
import json
import hashlib
from abc import ABC, abstractmethod
torch.manual_seed(42)
Search Strategies
Grid Search
class GridSearch:
"""
Exhaustive search over a grid of hyperparameters.
Pros: Simple, reproducible, complete coverage
Cons: Exponential cost, ignores parameter importance
"""
def __init__(self, param_grid: Dict[str, List]):
"""
Args:
param_grid: {param_name: [value1, value2, ...]}
"""
self.param_grid = param_grid
self.results = []
def generate_configs(self):
"""Generate all combinations."""
import itertools
keys = list(self.param_grid.keys())
values = list(self.param_grid.values())
for combination in itertools.product(*values):
yield dict(zip(keys, combination))
def search(
self,
objective_fn: Callable[[Dict], float],
maximize: bool = True
) -> Dict:
"""
Run grid search.
Args:
objective_fn: Function that takes config, returns score
maximize: Whether to maximize or minimize
"""
best_config = None
best_score = float('-inf') if maximize else float('inf')
for config in self.generate_configs():
score = objective_fn(config)
self.results.append({'config': config, 'score': score})
if (maximize and score > best_score) or \
(not maximize and score < best_score):
best_score = score
best_config = config
return {'best_config': best_config, 'best_score': best_score}
# Example usage
def train_model(config: Dict) -> float:
"""Simulated training function."""
# In practice: train model and return validation metric
lr = config['learning_rate']
dropout = config['dropout']
# Simulated score (replace with actual training)
score = -((lr - 0.001) ** 2) - ((dropout - 0.3) ** 2) + 0.5
return score + np.random.normal(0, 0.01)
grid_search = GridSearch({
'learning_rate': [0.0001, 0.001, 0.01],
'dropout': [0.1, 0.3, 0.5],
'batch_size': [16, 32, 64]
})
# result = grid_search.search(train_model)
# print(f"Best config: {result['best_config']}")
Random Search
class RandomSearch:
"""
Random search over hyperparameter space.
Often more efficient than grid search!
(Bergstra & Bengio, 2012)
"""
def __init__(self, param_distributions: Dict[str, Any]):
"""
Args:
param_distributions: {param_name: distribution}
- List: uniform choice
- Tuple (low, high): uniform continuous
- Tuple (low, high, 'log'): log-uniform
"""
self.param_distributions = param_distributions
def sample_config(self) -> Dict:
"""Sample a random configuration."""
config = {}
for param, dist in self.param_distributions.items():
if isinstance(dist, list):
# Categorical
config[param] = np.random.choice(dist)
elif isinstance(dist, tuple):
if len(dist) == 3 and dist[2] == 'log':
# Log-uniform
low, high = np.log(dist[0]), np.log(dist[1])
config[param] = np.exp(np.random.uniform(low, high))
else:
# Uniform
config[param] = np.random.uniform(dist[0], dist[1])
return config
def search(
self,
objective_fn: Callable[[Dict], float],
n_trials: int = 100,
maximize: bool = True
) -> Dict:
"""Run random search."""
best_config = None
best_score = float('-inf') if maximize else float('inf')
results = []
for _ in range(n_trials):
config = self.sample_config()
score = objective_fn(config)
results.append({'config': config, 'score': score})
if (maximize and score > best_score) or \
(not maximize and score < best_score):
best_score = score
best_config = config
return {
'best_config': best_config,
'best_score': best_score,
'all_results': results
}
random_search = RandomSearch({
'learning_rate': (1e-5, 1e-2, 'log'), # Log-uniform
'dropout': (0.0, 0.5), # Uniform
'hidden_size': [64, 128, 256, 512], # Categorical
'num_layers': [1, 2, 3, 4] # Categorical
})
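A usage sketch with the same simulated train_model objective as above (the extra hidden_size and num_layers keys are simply ignored by that toy objective); seeding NumPy makes the sampled configurations reproducible:

np.random.seed(42)
result = random_search.search(train_model, n_trials=50)
print(f"Best config: {result['best_config']}")
print(f"Best score: {result['best_score']:.4f}")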
Bayesian Optimization
class GaussianProcess:
"""
Simple Gaussian Process for Bayesian Optimization.
Uses RBF kernel for surrogate modeling.
"""
def __init__(self, length_scale: float = 1.0, noise: float = 1e-5):
self.length_scale = length_scale
self.noise = noise
self.X_train = None
self.y_train = None
def rbf_kernel(self, X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
"""Radial Basis Function kernel."""
sqdist = np.sum(X1**2, 1).reshape(-1, 1) + \
np.sum(X2**2, 1) - 2 * np.dot(X1, X2.T)
return np.exp(-0.5 / self.length_scale**2 * sqdist)
def fit(self, X: np.ndarray, y: np.ndarray):
"""Fit GP to observations."""
self.X_train = X
self.y_train = y
# Compute kernel matrix
K = self.rbf_kernel(X, X)
K += self.noise * np.eye(len(X))
self.L = np.linalg.cholesky(K)
self.alpha = np.linalg.solve(self.L.T, np.linalg.solve(self.L, y))
def predict(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Predict mean and variance at new points."""
K_star = self.rbf_kernel(self.X_train, X)
K_ss = self.rbf_kernel(X, X)
# Mean prediction
mean = K_star.T @ self.alpha
# Variance prediction
v = np.linalg.solve(self.L, K_star)
var = np.diag(K_ss) - np.sum(v**2, axis=0)
return mean, var
class BayesianOptimization:
"""
Bayesian Optimization with Gaussian Process surrogate.
Key idea:
1. Build probabilistic model of objective
2. Use acquisition function to decide where to sample
3. Update model with new observation
4. Repeat
"""
def __init__(
self,
param_bounds: Dict[str, Tuple[float, float]],
acquisition: str = 'ei' # 'ei' or 'ucb'
):
self.param_bounds = param_bounds
self.param_names = list(param_bounds.keys())
self.acquisition = acquisition
self.gp = GaussianProcess()
self.X_observed = []
self.y_observed = []
def _normalize(self, config: Dict) -> np.ndarray:
"""Normalize config to [0, 1]^d."""
x = []
for name in self.param_names:
low, high = self.param_bounds[name]
val = config[name]
x.append((val - low) / (high - low))
return np.array(x)
def _denormalize(self, x: np.ndarray) -> Dict:
"""Convert [0, 1]^d back to config."""
config = {}
for i, name in enumerate(self.param_names):
low, high = self.param_bounds[name]
config[name] = low + x[i] * (high - low)
return config
def acquisition_function(self, X: np.ndarray) -> np.ndarray:
"""Compute acquisition function values."""
mean, var = self.gp.predict(X)
std = np.sqrt(var + 1e-8)
if self.acquisition == 'ei':
# Expected Improvement
best_y = max(self.y_observed)
z = (mean - best_y) / std
ei = std * (z * self._norm_cdf(z) + self._norm_pdf(z))
return ei
else:
# Upper Confidence Bound
return mean + 2.0 * std
    def _norm_cdf(self, x):
        # NumPy has no erf; apply math.erf elementwise instead
        from math import erf
        return 0.5 * (1 + np.vectorize(erf)(x / np.sqrt(2)))
def _norm_pdf(self, x):
return np.exp(-0.5 * x**2) / np.sqrt(2 * np.pi)
def suggest(self) -> Dict:
"""Suggest next point to evaluate."""
if len(self.X_observed) < 5:
# Random exploration for initial points
x = np.random.uniform(0, 1, len(self.param_names))
else:
# Optimize acquisition function
best_acq = float('-inf')
best_x = None
            # Random search over candidate points to maximize the acquisition
for _ in range(1000):
x = np.random.uniform(0, 1, len(self.param_names))
acq = self.acquisition_function(x.reshape(1, -1))[0]
if acq > best_acq:
best_acq = acq
best_x = x
x = best_x
return self._denormalize(x)
def observe(self, config: Dict, value: float):
"""Record an observation."""
x = self._normalize(config)
self.X_observed.append(x)
self.y_observed.append(value)
if len(self.X_observed) > 1:
X = np.array(self.X_observed)
y = np.array(self.y_observed)
self.gp.fit(X, y)
def optimize(
self,
objective_fn: Callable[[Dict], float],
n_iterations: int = 50
) -> Dict:
"""Run Bayesian optimization."""
for i in range(n_iterations):
config = self.suggest()
value = objective_fn(config)
self.observe(config, value)
if (i + 1) % 10 == 0:
best_idx = np.argmax(self.y_observed)
print(f"Iteration {i+1}: Best = {self.y_observed[best_idx]:.4f}")
best_idx = np.argmax(self.y_observed)
return {
'best_config': self._denormalize(np.array(self.X_observed[best_idx])),
'best_score': self.y_observed[best_idx]
}
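A usage sketch reusing the simulated train_model objective. Note that this minimal implementation handles only continuous, bounded parameters, so categorical choices such as hidden_size would have to be tuned separately; the run is left commented out in the same style as the earlier examples:

bayes_opt = BayesianOptimization(
    param_bounds={'learning_rate': (1e-4, 1e-2), 'dropout': (0.0, 0.5)},
    acquisition='ei'
)
# result = bayes_opt.optimize(train_model, n_iterations=30)
# print(f"Best config: {result['best_config']}")
# print(f"Best score: {result['best_score']:.4f}")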
Optuna Integration
# Note: Requires `pip install optuna`
class OptunaOptimizer:
"""
Optuna-based hyperparameter optimization.
Features:
- Pruning unpromising trials
- Multiple samplers (TPE, CMA-ES, etc.)
- Distributed optimization
"""
def __init__(
self,
study_name: str = "optimization",
sampler: str = "tpe",
pruner: str = "median"
):
self.study_name = study_name
self.sampler_name = sampler
self.pruner_name = pruner
def create_objective(
self,
train_fn: Callable,
param_config: Dict[str, Dict]
):
"""
Create Optuna objective function.
Args:
train_fn: Training function that returns metric
param_config: {
'param_name': {
'type': 'float' | 'int' | 'categorical',
'low': ...,
'high': ...,
'log': True/False,
'choices': [...] # for categorical
}
}
"""
def objective(trial):
# Sample hyperparameters
config = {}
for name, cfg in param_config.items():
if cfg['type'] == 'float':
config[name] = trial.suggest_float(
name, cfg['low'], cfg['high'],
log=cfg.get('log', False)
)
elif cfg['type'] == 'int':
config[name] = trial.suggest_int(
name, cfg['low'], cfg['high'],
log=cfg.get('log', False)
)
elif cfg['type'] == 'categorical':
config[name] = trial.suggest_categorical(
name, cfg['choices']
)
# Train and get metric
metric = train_fn(config, trial)
return metric
return objective
def optimize(
self,
objective: Callable,
n_trials: int = 100,
direction: str = "maximize",
n_jobs: int = 1
):
"""Run optimization."""
try:
import optuna
# Create sampler
if self.sampler_name == "tpe":
sampler = optuna.samplers.TPESampler()
elif self.sampler_name == "cmaes":
sampler = optuna.samplers.CmaEsSampler()
else:
sampler = optuna.samplers.RandomSampler()
# Create pruner
if self.pruner_name == "median":
pruner = optuna.pruners.MedianPruner()
elif self.pruner_name == "hyperband":
pruner = optuna.pruners.HyperbandPruner()
else:
pruner = None
study = optuna.create_study(
study_name=self.study_name,
direction=direction,
sampler=sampler,
pruner=pruner
)
study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)
return {
'best_params': study.best_params,
'best_value': study.best_value,
'study': study
}
except ImportError:
print("Install optuna: pip install optuna")
return None
# Example: Optuna with PyTorch
def create_model(config: Dict) -> nn.Module:
"""Create model from config."""
return nn.Sequential(
nn.Linear(784, config['hidden_size']),
nn.ReLU(),
nn.Dropout(config['dropout']),
nn.Linear(config['hidden_size'], 10)
)
def train_with_optuna(config: Dict, trial) -> float:
"""Training function with Optuna pruning."""
# In practice, this would be your full training loop
# Example: report intermediate values for pruning
for epoch in range(config.get('epochs', 10)):
# Train one epoch...
val_acc = np.random.random() # Simulated
# Report to Optuna
trial.report(val_acc, epoch)
        # Prune if the trial is not promising
        if trial.should_prune():
            import optuna
            raise optuna.TrialPruned()
return val_acc
# Configuration
optuna_config = {
'learning_rate': {'type': 'float', 'low': 1e-5, 'high': 1e-1, 'log': True},
'hidden_size': {'type': 'categorical', 'choices': [128, 256, 512, 1024]},
'dropout': {'type': 'float', 'low': 0.0, 'high': 0.5},
'batch_size': {'type': 'categorical', 'choices': [16, 32, 64, 128]}
}
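A sketch wiring these pieces together, assuming optuna is installed; train_with_optuna and optuna_config are the toy examples defined above, so replace them with your real training loop and search space:

optuna_optimizer = OptunaOptimizer(study_name="mlp_tuning", sampler="tpe", pruner="median")
objective = optuna_optimizer.create_objective(train_with_optuna, optuna_config)
# result = optuna_optimizer.optimize(objective, n_trials=50, direction="maximize")
# if result is not None:
#     print(f"Best params: {result['best_params']}")
#     print(f"Best value: {result['best_value']:.4f}")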
Ray Tune Integration
class RayTuneOptimizer:
"""
Ray Tune for distributed hyperparameter optimization.
Features:
- Distributed training
- Population-based training
- Integration with various schedulers
"""
@staticmethod
def create_trainable(model_fn, train_loader, val_loader):
"""Create Ray Tune trainable function."""
def trainable(config):
# Build model
model = model_fn(config)
# Training loop with reporting
for epoch in range(config.get('epochs', 10)):
# Train...
train_loss = 0.0 # Compute actual loss
# Validate...
val_loss = 0.0 # Compute actual loss
val_acc = 0.0 # Compute actual acc
# Report to Ray Tune
# tune.report(loss=val_loss, accuracy=val_acc)
return {'accuracy': val_acc}
return trainable
@staticmethod
def run_tune(
trainable,
config_space: Dict,
num_samples: int = 10,
scheduler: str = "asha",
resources_per_trial: Dict = None
):
"""
Run Ray Tune optimization.
Schedulers:
- 'asha': Async Successive Halving
- 'pbt': Population Based Training
- 'bohb': Bayesian + Hyperband
"""
try:
from ray import tune
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
# Create scheduler
if scheduler == "asha":
tune_scheduler = ASHAScheduler(
metric="accuracy",
mode="max",
max_t=100,
grace_period=10,
reduction_factor=3
)
            elif scheduler == "pbt":
                tune_scheduler = PopulationBasedTraining(
                    time_attr="training_iteration",
                    metric="accuracy",
                    mode="max",
                    perturbation_interval=5,
                    hyperparam_mutations=config_space
                )
            else:
                # Fall back to Ray Tune's default FIFO execution
                tune_scheduler = None
# Run
analysis = tune.run(
trainable,
config=config_space,
num_samples=num_samples,
scheduler=tune_scheduler,
resources_per_trial=resources_per_trial or {"cpu": 2, "gpu": 0.5}
)
return analysis.best_config, analysis.best_trial.last_result
except ImportError:
print("Install ray[tune]: pip install 'ray[tune]'")
return None, None
# Ray Tune config example
ray_config = {
# tune.loguniform, tune.choice, etc.
# "lr": tune.loguniform(1e-5, 1e-1),
# "hidden": tune.choice([128, 256, 512]),
# "dropout": tune.uniform(0.0, 0.5),
}
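A sketch of how the pieces connect, kept commented out because it needs ray[tune] installed plus real model_fn / train_loader / val_loader objects (all placeholders here); the search space itself must be built with Ray Tune's own samplers:

# from ray import tune
#
# search_space = {
#     "lr": tune.loguniform(1e-5, 1e-1),
#     "hidden": tune.choice([128, 256, 512]),
#     "dropout": tune.uniform(0.0, 0.5),
# }
# trainable = RayTuneOptimizer.create_trainable(model_fn, train_loader, val_loader)
# best_config, best_result = RayTuneOptimizer.run_tune(
#     trainable, search_space, num_samples=20, scheduler="asha"
# )
# print(best_config)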
Population-Based Training
class PopulationBasedTraining:
"""
Population-Based Training (PBT).
Combines parallel training with hyperparameter adaptation.
Key ideas:
1. Train population of models in parallel
2. Periodically exploit (copy best) and explore (mutate)
3. Adapts hyperparameters during training
"""
def __init__(
self,
population_size: int = 10,
exploit_fraction: float = 0.2,
explore_fraction: float = 0.2
):
self.population_size = population_size
self.exploit_fraction = exploit_fraction
self.explore_fraction = explore_fraction
def create_population(
self,
config_space: Dict[str, Tuple]
) -> List[Dict]:
"""Initialize random population."""
population = []
for _ in range(self.population_size):
config = {}
for name, (low, high) in config_space.items():
config[name] = np.random.uniform(low, high)
population.append(config)
return population
def exploit_and_explore(
self,
population: List[Dict],
scores: List[float],
config_space: Dict[str, Tuple]
) -> List[Dict]:
"""PBT exploitation and exploration step."""
n = len(population)
sorted_idx = np.argsort(scores)[::-1] # Best first
n_exploit = int(n * self.exploit_fraction)
n_explore = int(n * self.explore_fraction)
new_population = [population[i].copy() for i in sorted_idx]
# Bottom performers copy top performers
for i in range(n - n_exploit, n):
# Exploit: copy from top
source_idx = np.random.randint(0, n_exploit)
new_population[i] = population[sorted_idx[source_idx]].copy()
# Explore: perturb hyperparameters
for name, (low, high) in config_space.items():
if np.random.random() < self.explore_fraction:
# Perturb by factor of 0.8 or 1.2
factor = np.random.choice([0.8, 1.2])
new_val = new_population[i][name] * factor
new_population[i][name] = np.clip(new_val, low, high)
return [new_population[i] for i in np.argsort(sorted_idx)]
def run(
self,
train_fn: Callable,
config_space: Dict[str, Tuple],
n_iterations: int = 100,
exploit_interval: int = 10
) -> Dict:
"""Run PBT optimization."""
population = self.create_population(config_space)
# Track best
best_score = float('-inf')
best_config = None
for iteration in range(n_iterations):
# Train all members for one step
scores = [train_fn(config) for config in population]
# Track best
max_idx = np.argmax(scores)
if scores[max_idx] > best_score:
best_score = scores[max_idx]
best_config = population[max_idx].copy()
# Exploit and explore periodically
if (iteration + 1) % exploit_interval == 0:
population = self.exploit_and_explore(
population, scores, config_space
)
if (iteration + 1) % 20 == 0:
print(f"Iteration {iteration+1}: Best = {best_score:.4f}")
return {'best_config': best_config, 'best_score': best_score}
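A usage sketch with a simulated per-step score (in real PBT each population member keeps its own model and optimizer state and trains for a few steps per call; the hypothetical pbt_step below just scores the hyperparameters directly, peaking at lr=0.01 and momentum=0.9):

def pbt_step(config):
    # Simulated training step: higher score near lr=1e-2, momentum=0.9
    return (-(np.log10(config['lr']) + 2) ** 2
            - (config['momentum'] - 0.9) ** 2
            + np.random.normal(0, 0.05))

pbt = PopulationBasedTraining(population_size=8)
# result = pbt.run(
#     pbt_step,
#     config_space={'lr': (1e-4, 1e-1), 'momentum': (0.5, 0.99)},
#     n_iterations=40,
#     exploit_interval=10
# )
# print(f"Best config: {result['best_config']}")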
Successive Halving / Hyperband
class SuccessiveHalving:
"""
Successive Halving for efficient hyperparameter search.
Idea: Start many configurations, prune poor performers early.
"""
def __init__(
self,
max_budget: int = 100, # Max epochs/iterations
eta: int = 3 # Reduction factor
):
self.max_budget = max_budget
self.eta = eta
def run(
self,
objective_fn: Callable[[Dict, int], float],
configs: List[Dict]
) -> Dict:
"""
Run successive halving.
Args:
objective_fn: Takes (config, budget) returns metric
configs: List of configurations to evaluate
"""
n = len(configs)
budget = 1
while n > 1 and budget <= self.max_budget:
# Evaluate all remaining configs with current budget
scores = [(cfg, objective_fn(cfg, budget)) for cfg in configs]
# Sort by score (descending)
scores.sort(key=lambda x: x[1], reverse=True)
# Keep top 1/eta
n = max(1, int(n / self.eta))
configs = [cfg for cfg, _ in scores[:n]]
# Increase budget
budget *= self.eta
print(f"Budget {budget}: {n} configs remaining, best = {scores[0][1]:.4f}")
return {
'best_config': configs[0],
'best_score': objective_fn(configs[0], self.max_budget)
}
class Hyperband:
"""
Hyperband: Combines successive halving with random search.
Runs multiple brackets of successive halving with different
budget/config tradeoffs.
"""
def __init__(
self,
max_budget: int = 81,
eta: int = 3
):
self.max_budget = max_budget
self.eta = eta
# Compute number of brackets
self.s_max = int(np.log(max_budget) / np.log(eta))
def run(
self,
config_sampler: Callable[[], Dict],
objective_fn: Callable[[Dict, int], float]
) -> Dict:
"""Run Hyperband optimization."""
best_config = None
best_score = float('-inf')
for s in range(self.s_max, -1, -1):
# Number of configs in this bracket
n = int(np.ceil(
(self.s_max + 1) / (s + 1) * self.eta ** s
))
# Initial budget per config
r = self.max_budget * self.eta ** (-s)
print(f"\nBracket s={s}: n={n} configs, initial budget={r:.0f}")
# Sample configs
configs = [config_sampler() for _ in range(n)]
# Successive halving within bracket
for i in range(s + 1):
n_i = int(n * self.eta ** (-i))
r_i = int(r * self.eta ** i)
# Evaluate
scores = [(cfg, objective_fn(cfg, r_i)) for cfg in configs]
scores.sort(key=lambda x: x[1], reverse=True)
# Track best
if scores[0][1] > best_score:
best_score = scores[0][1]
best_config = scores[0][0]
# Keep top 1/eta for next round
k = max(1, int(n_i / self.eta))
configs = [cfg for cfg, _ in scores[:k]]
print(f" Round {i}: budget={r_i}, {len(configs)} configs, best={scores[0][1]:.4f}")
return {'best_config': best_config, 'best_score': best_score}
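A sketch driving Hyperband with a random sampler and a simulated budget-aware objective whose noise shrinks as the budget grows (both functions are stand-ins for real training):

def sample_config():
    # Log-uniform learning rate, uniform dropout
    return {
        'learning_rate': float(np.exp(np.random.uniform(np.log(1e-5), np.log(1e-1)))),
        'dropout': float(np.random.uniform(0.0, 0.5))
    }

def budgeted_objective(config, budget):
    # True quality of the config, plus noise that decreases with budget
    quality = (-(np.log10(config['learning_rate']) + 3) ** 2
               - (config['dropout'] - 0.3) ** 2)
    return quality + np.random.normal(0, 1.0 / np.sqrt(max(budget, 1)))

hyperband = Hyperband(max_budget=81, eta=3)
# result = hyperband.run(sample_config, budgeted_objective)
# print(f"Best config: {result['best_config']}")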
Best Practices
def tuning_best_practices():
"""Guidelines for hyperparameter tuning."""
tips = """
╔════════════════════════════════════════════════════════════════════╗
║ HYPERPARAMETER TUNING BEST PRACTICES ║
╠════════════════════════════════════════════════════════════════════╣
║ ║
║ 1. START SIMPLE ║
║ • Begin with random search (often 60-80% as good as Bayesian) ║
║ • Tune most important params first (LR > architecture) ║
║ • Use log-scale for learning rates, regularization ║
║ ║
║ 2. REDUCE COMPUTE WASTE ║
║ • Use early stopping / pruning ║
║ • Start with smaller models/datasets for initial search ║
║ • Transfer good configs between similar tasks ║
║ ║
║ 3. SEARCH SPACE DESIGN ║
║ • Log scale for: learning rate, weight decay, dropout ║
║ • Linear scale for: batch size, hidden size, num layers ║
║ • Don't search over correlated parameters ║
║ ║
║ 4. VALIDATION STRATEGY ║
║ • Use held-out validation set (not test!) ║
║ • Multiple seeds for noisy objectives ║
║ • Consider cross-validation for small datasets ║
║ ║
║ 5. KNOW WHEN TO STOP ║
║ • Diminishing returns after ~50-100 trials typically ║
║ • If validation improving but test not, you're overfitting ║
║ • Compare to strong baselines, not just other configs ║
║ ║
╠════════════════════════════════════════════════════════════════════╣
║ IMPORTANCE RANKING ║
╠════════════════════════════════════════════════════════════════════╣
║ ║
║ HIGH IMPACT: ║
║ • Learning rate (most important!) ║
║ • Learning rate schedule ║
║ • Batch size ║
║ ║
║ MEDIUM IMPACT: ║
║ • Network depth/width ║
║ • Dropout / regularization ║
║ • Optimizer choice ║
║ ║
║ LOWER IMPACT: ║
║ • Activation functions ║
║ • Weight initialization ║
║ • Optimizer hyperparameters (momentum, etc.) ║
║ ║
╚════════════════════════════════════════════════════════════════════╝
"""
print(tips)
tuning_best_practices()
Exercises
Exercise 1: Implement BOHB
Combine Bayesian optimization with Hyperband:
# Use TPE sampler for config generation
# Use Hyperband for early stopping
# Compare to vanilla Hyperband
Exercise 2: Multi-Objective Tuning
Optimize for both accuracy and efficiency:
# Pareto frontier of accuracy vs FLOPs
# Use NSGA-II or similar
Exercise 3: Transfer Learning for HPO
Use past experiments to warm-start new searches:
# Train meta-model on historical configs
# Use as prior for new tasks