Performance Optimization
Learn how to write C code that runs at maximum speed. This guide covers CPU architecture awareness, memory hierarchy optimization, and profiling techniques.
Understanding Performance
Where Time Goes
Modern programs spend most of their time waiting (the sketch after this list puts a number on the first item):
- Memory access: 100+ cycles for a cache miss
- Branch misprediction: 15-20 cycle penalty
- System calls: thousands of cycles
- I/O: millions of cycles
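As a rough illustration of the cache-miss cost, the sketch below, which is our addition rather than part of the original guide, sums a large array once in sequential order and once in shuffled order; the array size, shuffle, and timing code are arbitrary choices, but on typical hardware the shuffled pass runs several times slower because nearly every access misses the cache.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define COUNT (1 << 24)  /* 16M ints, far larger than typical caches */

static double seconds(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main(void) {
    int *data = malloc(COUNT * sizeof *data);
    size_t *idx = malloc(COUNT * sizeof *idx);
    if (!data || !idx) return 1;

    for (size_t i = 0; i < COUNT; i++) { data[i] = 1; idx[i] = i; }

    /* Crude shuffle of the index array - good enough for a demo */
    srand(42);
    for (size_t i = COUNT - 1; i > 0; i--) {
        size_t j = (size_t)rand() % (i + 1);
        size_t t = idx[i]; idx[i] = idx[j]; idx[j] = t;
    }

    long sum = 0;
    double t0 = seconds();
    for (size_t i = 0; i < COUNT; i++) sum += data[i];       /* sequential: mostly cache hits */
    double t1 = seconds();
    for (size_t i = 0; i < COUNT; i++) sum += data[idx[i]];  /* shuffled: mostly cache misses */
    double t2 = seconds();

    printf("sequential %.3f s, shuffled %.3f s (sum=%ld)\n", t1 - t0, t2 - t1, sum);
    free(data);
    free(idx);
    return 0;
}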
Profiling Tools
- perf
- gprof
- Valgrind
# Record performance data
perf record -g ./my_program
# Analyze results
perf report
# Statistical overview
perf stat ./my_program
# Sample output:
# Performance counter stats for './my_program':
# 1,234.56 msec task-clock
# 4,567,890,123 cycles
# 3,456,789,012 instructions # 0.76 IPC
# 12,345,678 cache-references
# 123,456 cache-misses # 1.0% of all cache refs
# Compile with profiling
gcc -pg -O2 program.c -o program
# Run the program (generates gmon.out)
./program
# Analyze
gprof program gmon.out > analysis.txt
# Flat profile shows where time is spent
# Call graph shows who calls what
# Cache and branch simulation
valgrind --tool=cachegrind --cache-sim=yes --branch-sim=yes ./program
# Annotate source with cache info
cg_annotate cachegrind.out.*
# Shows:
# - Cache hit rates
# - Branch prediction rates
# - Per-line cache behavior
Memory Hierarchy Optimization
// BAD: Column-major access (cache unfriendly)
int matrix_sum_bad(int matrix[N][N]) {
int sum = 0;
for (int j = 0; j < N; j++) { // Column first
for (int i = 0; i < N; i++) { // Row second
sum += matrix[i][j]; // Jumps N ints each access
}
}
return sum;
}
// GOOD: Row-major access (cache friendly)
int matrix_sum_good(int matrix[N][N]) {
int sum = 0;
for (int i = 0; i < N; i++) { // Row first
for (int j = 0; j < N; j++) { // Column second
sum += matrix[i][j]; // Sequential access
}
}
return sum;
}
// Performance difference: often an order of magnitude on large matrices
// BETTER: Blocking for cache locality
#define BLOCK 64
void matrix_multiply_blocked(double *A, double *B, double *C, int n) {
for (int i = 0; i < n; i += BLOCK) {
for (int j = 0; j < n; j += BLOCK) {
for (int k = 0; k < n; k += BLOCK) {
// Multiply block
for (int ii = i; ii < i + BLOCK && ii < n; ii++) {
for (int jj = j; jj < j + BLOCK && jj < n; jj++) {
double sum = C[ii*n + jj];
for (int kk = k; kk < k + BLOCK && kk < n; kk++) {
sum += A[ii*n + kk] * B[kk*n + jj];
}
C[ii*n + jj] = sum;
}
}
}
}
}
}
Data Structure Layout
// BAD: Poor cache utilization (Array of Structs)
struct Particle_AoS {
float x, y, z; // Position
float vx, vy, vz; // Velocity
float mass;
int type;
char name[32]; // Only used for debugging
};
struct Particle_AoS particles[1000000];
// When updating positions, we load name[] into cache too
void update_positions_aos(void) {
for (int i = 0; i < 1000000; i++) {
particles[i].x += particles[i].vx; // Lots of cache misses
particles[i].y += particles[i].vy;
particles[i].z += particles[i].vz;
}
}
// GOOD: Cache-friendly (Struct of Arrays)
struct ParticleSystem {
float *x, *y, *z; // Positions
float *vx, *vy, *vz; // Velocities
float *mass;
int *type;
size_t count;
};
// Sequential access, no wasted cache space
void update_positions_soa(struct ParticleSystem *ps) {
for (size_t i = 0; i < ps->count; i++) {
ps->x[i] += ps->vx[i]; // Sequential access
ps->y[i] += ps->vy[i];
ps->z[i] += ps->vz[i];
}
}
// Often 2-4x faster for large datasets
// BEST: Hybrid approach - group hot data together
struct ParticleHot {
float x, y, z;
float vx, vy, vz;
};
struct ParticleCold {
float mass;
int type;
char name[32];
};
struct ParticleHot hot_data[1000000]; // Updated every frame
struct ParticleCold cold_data[1000000]; // Rarely accessed
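To show the payoff of the split, here is a minimal sketch (the function name is ours) of the per-frame update: only the small ParticleHot records stream through the cache, and cold_data is never touched.
// Sketch: per-frame update touches only the hot array
void update_positions_hot(size_t count) {
    for (size_t i = 0; i < count; i++) {
        hot_data[i].x += hot_data[i].vx;
        hot_data[i].y += hot_data[i].vy;
        hot_data[i].z += hot_data[i].vz;
    }
}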
Branch Prediction
// BAD: Unpredictable branches (counters are file-scope so the fragment compiles)
long positive_count, negative_count;
void process_bad(int *data, int n) {
for (int i = 0; i < n; i++) {
if (data[i] > 0) { // 50% taken - unpredictable
positive_count++;
} else {
negative_count++;
}
}
}
// GOOD: Branchless code
void process_branchless(int *data, int n) {
for (int i = 0; i < n; i++) {
int is_positive = data[i] > 0;
positive_count += is_positive;
negative_count += !is_positive;
}
}
// BETTER: Sort the data first if possible
// After sorting, the branch inside process_bad() becomes predictable
int compare(const void *a, const void *b) {
return (*(const int *)a > *(const int *)b) - (*(const int *)a < *(const int *)b);
}
qsort(data, n, sizeof(int), compare); // qsort from <stdlib.h>
process_bad(data, n); // Now branches are predictable!
// Branchless min/max
int branchless_min(int a, int b) {
return b ^ ((a ^ b) & -(a < b));
}
int branchless_max(int a, int b) {
return a ^ ((a ^ b) & -(a < b));
}
// Branchless absolute value
int branchless_abs(int x) {
int mask = x >> 31;
return (x ^ mask) - mask;
}
// Conditional move using compiler hints
int conditional_value(int condition, int true_val, int false_val) {
// GCC might generate CMOV instruction
return condition ? true_val : false_val;
}
SIMD Vectorization
#include <immintrin.h> // For AVX/SSE intrinsics
// Scalar version
void add_arrays_scalar(float *a, float *b, float *c, int n) {
for (int i = 0; i < n; i++) {
c[i] = a[i] + b[i];
}
}
// SIMD version (AVX - 8 floats at once)
void add_arrays_simd(float *a, float *b, float *c, int n) {
int i;
for (i = 0; i <= n - 8; i += 8) {
__m256 va = _mm256_loadu_ps(&a[i]);
__m256 vb = _mm256_loadu_ps(&b[i]);
__m256 vc = _mm256_add_ps(va, vb);
_mm256_storeu_ps(&c[i], vc);
}
// Handle remainder
for (; i < n; i++) {
c[i] = a[i] + b[i];
}
}
// Dot product with SIMD
float dot_product_simd(float *a, float *b, int n) {
__m256 sum = _mm256_setzero_ps();
int i;
for (i = 0; i <= n - 8; i += 8) {
__m256 va = _mm256_loadu_ps(&a[i]);
__m256 vb = _mm256_loadu_ps(&b[i]);
sum = _mm256_fmadd_ps(va, vb, sum); // Fused multiply-add
}
// Horizontal sum
__m128 hi = _mm256_extractf128_ps(sum, 1);
__m128 lo = _mm256_castps256_ps128(sum);
lo = _mm_add_ps(lo, hi);
lo = _mm_hadd_ps(lo, lo);
lo = _mm_hadd_ps(lo, lo);
float result = _mm_cvtss_f32(lo);
// Handle remainder
for (; i < n; i++) {
result += a[i] * b[i];
}
return result;
}
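A practical note we are adding here: the intrinsics above only compile with AVX/FMA enabled (for example gcc -O3 -mavx -mfma), and if the binary must also run on CPUs without AVX, GCC and Clang's __builtin_cpu_supports() can pick a path at run time, as in this sketch (add_arrays is our wrapper name, not part of the original).
// Runtime dispatch sketch: use the SIMD path only where the CPU supports it
void add_arrays(float *a, float *b, float *c, int n) {
    if (__builtin_cpu_supports("avx")) {
        add_arrays_simd(a, b, c, n);    // AVX path defined above
    } else {
        add_arrays_scalar(a, b, c, n);  // Portable fallback
    }
}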
Compiler Optimization Hints
// Alignment hints
void process_aligned(float * __restrict__ data, int n) {
data = __builtin_assume_aligned(data, 32);
// Compiler now knows data is 32-byte aligned
for (int i = 0; i < n; i++) {
data[i] *= 2.0f;
}
}
// Restrict pointers - no aliasing
void vector_add(float * __restrict__ a,
float * __restrict__ b,
float * __restrict__ c,
int n) {
// Compiler knows a, b, c don't overlap
for (int i = 0; i < n; i++) {
c[i] = a[i] + b[i];
}
}
// Branch prediction hints
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
int process_with_hints(int *data, int n) {
for (int i = 0; i < n; i++) {
if (unlikely(data[i] < 0)) {
return -1; // Error path - rarely taken
}
// Normal processing...
}
return 0;
}
// Force inlining
static inline __attribute__((always_inline))
int fast_abs(int x) {
int mask = x >> 31;
return (x ^ mask) - mask;
}
// Prevent inlining (for debugging)
__attribute__((noinline))
void debug_function(void) {
// ...
}
// Hot/cold function hints
__attribute__((hot))
void frequently_called(void) {
// Compiler optimizes more aggressively
}
__attribute__((cold))
void error_handler(void) {
// Compiler optimizes for size
}
// Prefetch hints
void process_with_prefetch(float *data, int n) {
for (int i = 0; i < n; i++) {
// Prefetch data we'll need 64 iterations later
__builtin_prefetch(&data[i + 64], 0, 3);
data[i] = sqrtf(data[i]); // sqrtf from <math.h>
}
}
Memory Allocation Optimization
// Avoid frequent malloc/free
// BAD: Allocate in loop
void process_bad(int n) {
for (int i = 0; i < n; i++) {
char *buf = malloc(1024); // Slow!
do_work(buf);
free(buf);
}
}
// GOOD: Allocate once
void process_good(int n) {
char *buf = malloc(1024);
for (int i = 0; i < n; i++) {
do_work(buf);
}
free(buf);
}
// BETTER: Arena allocator for bulk operations
typedef struct {
char *memory;
size_t used;
size_t capacity;
} Arena;
void *arena_alloc(Arena *a, size_t size) {
size = (size + 7) & ~7; // Align
if (a->used + size > a->capacity) return NULL;
void *ptr = a->memory + a->used;
a->used += size;
return ptr;
}
void arena_reset(Arena *a) {
a->used = 0; // O(1) "free" everything
}
// Stack allocation for small buffers
void process_stack(int n) {
for (int i = 0; i < n; i++) {
char buf[1024]; // Stack allocation - very fast
do_work(buf);
}
}
// Avoid false sharing in multithreaded code
struct __attribute__((aligned(64))) ThreadData {
long counter;
char padding[56]; // Ensure each counter on separate cache line
};
struct ThreadData counters[NUM_THREADS];
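For completeness, one possible way to create and drive the Arena above is sketched next; arena_create, arena_destroy, and process_frames are hypothetical helpers of ours, and do_work is the same placeholder used earlier in this section.
#include <stdlib.h>

// Hypothetical helpers for the Arena defined above
Arena arena_create(size_t capacity) {
    Arena a = { malloc(capacity), 0, capacity };
    return a;
}

void arena_destroy(Arena *a) {
    free(a->memory);
    a->memory = NULL;
    a->used = a->capacity = 0;
}

// Per-frame usage: many cheap bump allocations, one O(1) reset
void process_frames(int frames) {
    Arena a = arena_create(1 << 20); // 1 MiB scratch space (arbitrary size)
    for (int f = 0; f < frames; f++) {
        char *scratch = arena_alloc(&a, 4096);
        if (scratch) do_work(scratch);
        arena_reset(&a); // replaces many free() calls
    }
    arena_destroy(&a);
}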
Loop Optimizations
// Loop unrolling
void sum_unrolled(int *arr, int n, long *result) {
long sum = 0;
int i;
// Process 4 elements per iteration
for (i = 0; i <= n - 4; i += 4) {
sum += arr[i];
sum += arr[i + 1];
sum += arr[i + 2];
sum += arr[i + 3];
}
// Handle remainder
for (; i < n; i++) {
sum += arr[i];
}
*result = sum;
}
// Loop fusion - combine loops
// BAD: Two loops, two cache passes
void process_separate(float *a, float *b, int n) {
for (int i = 0; i < n; i++) a[i] *= 2;
for (int i = 0; i < n; i++) b[i] *= 3;
}
// GOOD: One loop, one cache pass
void process_fused(float *a, float *b, int n) {
for (int i = 0; i < n; i++) {
a[i] *= 2;
b[i] *= 3;
}
}
// Loop interchange for cache locality
// BAD: Column-major access
void init_bad(int m[N][N]) {
for (int j = 0; j < N; j++)
for (int i = 0; i < N; i++)
m[i][j] = 0;
}
// GOOD: Row-major access
void init_good(int m[N][N]) {
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
m[i][j] = 0;
}
// Loop invariant motion (manual)
// BAD: Recompute every iteration
for (int i = 0; i < n; i++) {
result[i] = data[i] * sin(angle) * scale;
}
// GOOD: Compute once
double factor = sin(angle) * scale;
for (int i = 0; i < n; i++) {
result[i] = data[i] * factor;
}
String and Memory Operations
// Use optimized library functions
#include <string.h>
// memcpy is highly optimized - use it
void copy_data(char *dst, char *src, size_t n) {
memcpy(dst, src, n); // Uses SIMD, non-temporal stores, etc.
}
// For known small sizes, inline (SSE intrinsics from <immintrin.h>)
static inline void copy_16(char *dst, char *src) {
__m128i v = _mm_loadu_si128((const __m128i*)src);
_mm_storeu_si128((__m128i*)dst, v);
}
// Avoid strlen in loops
// BAD
for (int i = 0; i < strlen(str); i++) { // O(n²)!
process(str[i]);
}
// GOOD
size_t len = strlen(str);
for (size_t i = 0; i < len; i++) {
process(str[i]);
}
// Or even better
for (char *p = str; *p; p++) {
process(*p);
}
// Use memset for initialization
int arr[1000];
memset(arr, 0, sizeof(arr)); // Faster than loop
Micro-benchmarking
#include <stdint.h>
#include <stdio.h>
#include <time.h>
// High-resolution timing (rdtsc is x86-specific)
static inline uint64_t rdtsc(void) {
unsigned int lo, hi;
__asm__ volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return ((uint64_t)hi << 32) | lo;
}
// Or use clock_gettime
double get_time(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ts.tv_sec + ts.tv_nsec * 1e-9;
}
// Benchmark function
void benchmark(const char *name, void (*func)(void), int iterations) {
// Warmup
for (int i = 0; i < 100; i++) func();
double start = get_time();
for (int i = 0; i < iterations; i++) {
func();
}
double end = get_time();
double per_call = (end - start) / iterations * 1e9;
printf("%s: %.2f ns per call\n", name, per_call);
}
// Prevent dead code elimination
void escape(void *p) {
__asm__ volatile("" : : "g"(p) : "memory");
}
void clobber(void) {
__asm__ volatile("" : : : "memory");
}
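A possible way to wire these pieces together, with scale_buffer and its buffer size as our own placeholders: time a small kernel with benchmark() and hand its output to escape() so the optimizer cannot delete the measured work.
// Illustrative usage of benchmark() and escape()
static float buffer[4096];

static void scale_buffer(void) {
    for (int i = 0; i < 4096; i++) buffer[i] *= 1.0001f;
    escape(buffer); // the result "escapes", so the loop is not optimized away
}

int main(void) {
    benchmark("scale_buffer", scale_buffer, 100000);
    return 0;
}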
Checklist
- Profile First: Never optimize without measuring
- Cache Friendly: Sequential access, avoid cache misses
- Reduce Branches: Predictable or branchless code
- Vectorize: Use SIMD for data-parallel work
- Reduce Allocations: Pool, arena, or stack allocate
- Use restrict: Help the compiler optimize pointer code
Next Up
Embedded Systems
Write C for resource-constrained systems