Performance Optimization

Learn how to write C code that runs at maximum speed. This guide covers CPU architecture awareness, memory hierarchy optimization, and profiling techniques.

Understanding Performance

Modern programs spend most of their time waiting rather than computing:
  • Memory access: 100+ cycles for cache miss
  • Branch misprediction: 15-20 cycles penalty
  • System calls: Thousands of cycles
  • I/O: Millions of cycles
The key insight: optimize memory access patterns first.
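
To see the cost of access patterns directly, a minimal sketch like the one below times a sequential pass versus a strided pass over the same buffer. The 16 MiB size, the stride of 16 ints (one cache line), and the timing helper are illustrative assumptions, not part of any tooling described here.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N (4 * 1024 * 1024)   // 16 MiB of ints - far larger than typical caches

static double now_sec(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main(void) {
    int *a = malloc(N * sizeof(int));
    long long sum = 0;
    for (int i = 0; i < N; i++) a[i] = i;             // Touch every page

    double t0 = now_sec();
    for (int i = 0; i < N; i++) sum += a[i];          // Sequential: ~1 miss per cache line
    double t1 = now_sec();
    for (int s = 0; s < 16; s++)                      // Same number of reads, but each
        for (int i = s; i < N; i += 16) sum += a[i];  // jumps a full cache line: ~1 miss per read
    double t2 = now_sec();

    printf("sequential %.2f ms, strided %.2f ms (sum=%lld)\n",
           (t1 - t0) * 1e3, (t2 - t1) * 1e3, sum);
    free(a);
    return 0;
}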

Profiling Tools

# Record performance data
perf record -g ./my_program

# Analyze results
perf report

# Statistical overview
perf stat ./my_program

# Sample output:
# Performance counter stats for './my_program':
#       1,234.56 msec task-clock
#    4,567,890,123 cycles
#    3,456,789,012 instructions      # 0.76 IPC
#       12,345,678 cache-references
#          123,456 cache-misses      # 1.0% of all cache refs

Memory Hierarchy Optimization

// BAD: Column-major access (cache unfriendly)
void matrix_sum_bad(int matrix[N][N]) {
    int sum = 0;
    for (int j = 0; j < N; j++) {        // Column first
        for (int i = 0; i < N; i++) {    // Row second
            sum += matrix[i][j];          // Jumps N ints each access
        }
    }
}

// GOOD: Row-major access (cache friendly)
void matrix_sum_good(int matrix[N][N]) {
    int sum = 0;
    for (int i = 0; i < N; i++) {        // Row first
        for (int j = 0; j < N; j++) {    // Column second
            sum += matrix[i][j];          // Sequential access
        }
    }
}
// Performance difference: 10-50x on large matrices!

// BETTER: Blocking for cache locality
#define BLOCK 64   // Tile size - tune to the target cache sizes

void matrix_multiply_blocked(double *A, double *B, double *C, int n) {
    for (int i = 0; i < n; i += BLOCK) {
        for (int j = 0; j < n; j += BLOCK) {
            for (int k = 0; k < n; k += BLOCK) {
                // Multiply block
                for (int ii = i; ii < i + BLOCK && ii < n; ii++) {
                    for (int jj = j; jj < j + BLOCK && jj < n; jj++) {
                        double sum = C[ii*n + jj];
                        for (int kk = k; kk < k + BLOCK && kk < n; kk++) {
                            sum += A[ii*n + kk] * B[kk*n + jj];
                        }
                        C[ii*n + jj] = sum;
                    }
                }
            }
        }
    }
}

Data Structure Layout

// BAD: Poor cache utilization (Array of Structs)
struct Particle_AoS {
    float x, y, z;        // Position
    float vx, vy, vz;     // Velocity
    float mass;
    int type;
    char name[32];        // Only used for debugging
};

struct Particle_AoS particles[1000000];

// When updating positions, we load name[] into cache too
void update_positions_aos(void) {
    for (int i = 0; i < 1000000; i++) {
        particles[i].x += particles[i].vx;  // Lots of cache misses
        particles[i].y += particles[i].vy;
        particles[i].z += particles[i].vz;
    }
}

// GOOD: Cache-friendly (Struct of Arrays)
struct ParticleSystem {
    float *x, *y, *z;     // Positions
    float *vx, *vy, *vz;  // Velocities
    float *mass;
    int *type;
    size_t count;
};

// Sequential access, no wasted cache space
void update_positions_soa(struct ParticleSystem *ps) {
    for (size_t i = 0; i < ps->count; i++) {
        ps->x[i] += ps->vx[i];  // Sequential access
        ps->y[i] += ps->vy[i];
        ps->z[i] += ps->vz[i];
    }
}
// Often 2-4x faster for large datasets
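
The SoA layout needs each array allocated as its own block; nothing above does that, so here is a minimal init sketch (particle_system_init is a hypothetical helper, requires <stdlib.h>, and error handling is collapsed into a single check):

int particle_system_init(struct ParticleSystem *ps, size_t count) {
    ps->count = count;
    ps->x    = malloc(count * sizeof(float));
    ps->y    = malloc(count * sizeof(float));
    ps->z    = malloc(count * sizeof(float));
    ps->vx   = malloc(count * sizeof(float));
    ps->vy   = malloc(count * sizeof(float));
    ps->vz   = malloc(count * sizeof(float));
    ps->mass = malloc(count * sizeof(float));
    ps->type = malloc(count * sizeof(int));
    // Report failure if any allocation came back NULL (a real version would free the rest)
    return (ps->x && ps->y && ps->z && ps->vx && ps->vy && ps->vz &&
            ps->mass && ps->type) ? 0 : -1;
}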

// BEST: Hybrid approach - group hot data together
struct ParticleHot {
    float x, y, z;
    float vx, vy, vz;
};

struct ParticleCold {
    float mass;
    int type;
    char name[32];
};

struct ParticleHot hot_data[1000000];   // Updated every frame
struct ParticleCold cold_data[1000000]; // Rarely accessed

Branch Prediction

// Counters updated by the examples below
static long positive_count, negative_count;

// BAD: Unpredictable branches
void process_bad(int *data, int n) {
    for (int i = 0; i < n; i++) {
        if (data[i] > 0) {    // ~50% taken on random data - unpredictable
            positive_count++;
        } else {
            negative_count++;
        }
    }
}

// GOOD: Branchless code
void process_branchless(int *data, int n) {
    for (int i = 0; i < n; i++) {
        int is_positive = data[i] > 0;
        positive_count += is_positive;
        negative_count += !is_positive;
    }
}

// BETTER: Sort data first if possible
// After sorting, branches become predictable
qsort(data, n, sizeof(int), compare);
process_bad(data, n);  // Now branches are predictable!
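
The compare callback used above is not defined in this snippet; a minimal integer comparator for qsort would be (sketch):

// Integer comparator assumed by the qsort call above
int compare(const void *pa, const void *pb) {
    int a = *(const int *)pa;
    int b = *(const int *)pb;
    return (a > b) - (a < b);   // Avoids the overflow that "return a - b" can cause
}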

// Branchless min/max
int branchless_min(int a, int b) {
    return b ^ ((a ^ b) & -(a < b));
}

int branchless_max(int a, int b) {
    return a ^ ((a ^ b) & -(a < b));
}

// Branchless absolute value
int branchless_abs(int x) {
    int mask = x >> 31;
    return (x ^ mask) - mask;
}

// Conditional move using compiler hints
int conditional_value(int condition, int true_val, int false_val) {
    // GCC might generate CMOV instruction
    return condition ? true_val : false_val;
}

SIMD Vectorization

#include <immintrin.h>  // AVX/SSE/FMA intrinsics (build with -mavx -mfma or -march=native)

// Scalar version
void add_arrays_scalar(float *a, float *b, float *c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// SIMD version (AVX - 8 floats at once)
void add_arrays_simd(float *a, float *b, float *c, int n) {
    int i;
    for (i = 0; i <= n - 8; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);
        __m256 vb = _mm256_loadu_ps(&b[i]);
        __m256 vc = _mm256_add_ps(va, vb);
        _mm256_storeu_ps(&c[i], vc);
    }
    
    // Handle remainder
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// Dot product with SIMD
float dot_product_simd(float *a, float *b, int n) {
    __m256 sum = _mm256_setzero_ps();
    int i;
    
    for (i = 0; i <= n - 8; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);
        __m256 vb = _mm256_loadu_ps(&b[i]);
        sum = _mm256_fmadd_ps(va, vb, sum);  // Fused multiply-add
    }
    
    // Horizontal sum
    __m128 hi = _mm256_extractf128_ps(sum, 1);
    __m128 lo = _mm256_castps256_ps128(sum);
    lo = _mm_add_ps(lo, hi);
    lo = _mm_hadd_ps(lo, lo);
    lo = _mm_hadd_ps(lo, lo);
    
    float result = _mm_cvtss_f32(lo);
    
    // Handle remainder
    for (; i < n; i++) {
        result += a[i] * b[i];
    }
    
    return result;
}

Compiler Optimization Hints

// Alignment hints
void process_aligned(float * __restrict__ data, int n) {
    // Promise the compiler that data points to 32-byte-aligned memory
    float *p = __builtin_assume_aligned(data, 32);
    for (int i = 0; i < n; i++) {
        p[i] *= 2.0f;
    }
}

// Restrict pointers - no aliasing
void vector_add(float * __restrict__ a, 
                float * __restrict__ b,
                float * __restrict__ c, 
                int n) {
    // Compiler knows a, b, c don't overlap
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// Branch prediction hints
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

int process_with_hints(int *data, int n) {
    for (int i = 0; i < n; i++) {
        if (unlikely(data[i] < 0)) {
            return -1;  // Error path - rarely taken
        }
        // Normal processing...
    }
    return 0;
}

// Force inlining
static inline __attribute__((always_inline)) 
int fast_abs(int x) {
    int mask = x >> 31;
    return (x ^ mask) - mask;
}

// Prevent inlining (for debugging)
__attribute__((noinline))
void debug_function(void) {
    // ...
}

// Hot/cold function hints
__attribute__((hot))
void frequently_called(void) {
    // Compiler optimizes more aggressively
}

__attribute__((cold))
void error_handler(void) {
    // Compiler optimizes for size
}

// Prefetch hints (sqrtf comes from <math.h>)
void process_with_prefetch(float *data, int n) {
    for (int i = 0; i < n; i++) {
        // Prefetch the element needed ~64 iterations from now
        // (prefetching past the end of the array is harmless)
        __builtin_prefetch(&data[i + 64], 0, 3);
        data[i] = sqrtf(data[i]);
    }
}

Memory Allocation Optimization

// Avoid frequent malloc/free
// BAD: Allocate in loop
void process_bad(int n) {
    for (int i = 0; i < n; i++) {
        char *buf = malloc(1024);  // Slow!
        do_work(buf);
        free(buf);
    }
}

// GOOD: Allocate once
void process_good(int n) {
    char *buf = malloc(1024);
    for (int i = 0; i < n; i++) {
        do_work(buf);
    }
    free(buf);
}

// BETTER: Arena allocator for bulk operations
typedef struct {
    char *memory;
    size_t used;
    size_t capacity;
} Arena;

void *arena_alloc(Arena *a, size_t size) {
    size = (size + 7) & ~(size_t)7;  // Round size up to 8-byte alignment
    if (a->used + size > a->capacity) return NULL;
    void *ptr = a->memory + a->used;
    a->used += size;
    return ptr;
}

void arena_reset(Arena *a) {
    a->used = 0;  // O(1) "free" everything
}
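
A sketch of how the arena might be used: back it with one large allocation, carve per-frame scratch space out of it, and reclaim everything at once (the 1 MiB capacity and per-frame size are arbitrary; do_work is the placeholder from the examples above):

void run_frames(int frames) {
    Arena a = { .memory = malloc(1 << 20), .used = 0, .capacity = 1 << 20 };

    for (int f = 0; f < frames; f++) {
        char *scratch = arena_alloc(&a, 4096);
        if (scratch) do_work(scratch);
        arena_reset(&a);                 // Reclaim all per-frame allocations in O(1)
    }
    free(a.memory);
}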

// Stack allocation for small buffers
void process_stack(int n) {
    for (int i = 0; i < n; i++) {
        char buf[1024];  // Stack allocation - very fast
        do_work(buf);
    }
}

// Avoid false sharing in multithreaded code
struct __attribute__((aligned(64))) ThreadData {
    long counter;
    char padding[56];  // Ensure each counter on separate cache line
};

struct ThreadData counters[NUM_THREADS];
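
A sketch of how the padded counters might be used with POSIX threads: each worker touches only its own slot, and because every slot starts on its own 64-byte cache line, the writes never fight over a line (the iteration count is arbitrary):

#include <pthread.h>

void *worker(void *arg) {
    struct ThreadData *mine = arg;       // One padded slot per thread
    for (long i = 0; i < 100000000; i++) {
        mine->counter++;                 // No false sharing with other threads' counters
    }
    return NULL;
}

// Launched per thread, e.g.: pthread_create(&tid[t], NULL, worker, &counters[t]);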

Loop Optimizations

// Loop unrolling
void sum_unrolled(int *arr, int n, long *result) {
    long sum = 0;
    int i;
    
    // Process 4 elements per iteration
    for (i = 0; i <= n - 4; i += 4) {
        sum += arr[i];
        sum += arr[i + 1];
        sum += arr[i + 2];
        sum += arr[i + 3];
    }
    
    // Handle remainder
    for (; i < n; i++) {
        sum += arr[i];
    }
    
    *result = sum;
}

// Loop fusion - combine loops
// BAD: Two loops, two cache passes
void process_separate(float *a, float *b, int n) {
    for (int i = 0; i < n; i++) a[i] *= 2;
    for (int i = 0; i < n; i++) b[i] *= 3;
}

// GOOD: One loop, one cache pass
void process_fused(float *a, float *b, int n) {
    for (int i = 0; i < n; i++) {
        a[i] *= 2;
        b[i] *= 3;
    }
}

// Loop interchange for cache locality
// BAD: Column-major access
void init_bad(int m[N][N]) {
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            m[i][j] = 0;
}

// GOOD: Row-major access
void init_good(int m[N][N]) {
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            m[i][j] = 0;
}

// Loop invariant motion (manual)
// BAD: Recompute every iteration
for (int i = 0; i < n; i++) {
    result[i] = data[i] * sin(angle) * scale;
}

// GOOD: Compute once
double factor = sin(angle) * scale;
for (int i = 0; i < n; i++) {
    result[i] = data[i] * factor;
}

String and Memory Operations

// Use optimized library functions
#include <string.h>

// memcpy is highly optimized - use it
void copy_data(char *dst, char *src, size_t n) {
    memcpy(dst, src, n);  // Uses SIMD, non-temporal stores, etc.
}

// For known small sizes, inline
static inline void copy_16(char *dst, char *src) {
    __m128i v = _mm_loadu_si128((const __m128i*)src);
    _mm_storeu_si128((__m128i*)dst, v);
}

// Avoid strlen in loops
// BAD
for (int i = 0; i < strlen(str); i++) {  // O(n²)!
    process(str[i]);
}

// GOOD
size_t len = strlen(str);
for (size_t i = 0; i < len; i++) {
    process(str[i]);
}

// Or even better
for (char *p = str; *p; p++) {
    process(*p);
}

// Use memset for initialization
int arr[1000];
memset(arr, 0, sizeof(arr));  // Faster than loop

Micro-benchmarking

#include <stdint.h>
#include <stdio.h>
#include <time.h>

// High-resolution timing via the cycle counter (x86/x86-64 only)
static inline uint64_t rdtsc(void) {
    unsigned int lo, hi;
    __asm__ volatile ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}

// Or use clock_gettime
double get_time(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}

// Benchmark function
void benchmark(const char *name, void (*func)(void), int iterations) {
    // Warmup
    for (int i = 0; i < 100; i++) func();
    
    double start = get_time();
    for (int i = 0; i < iterations; i++) {
        func();
    }
    double end = get_time();
    
    double per_call = (end - start) / iterations * 1e9;
    printf("%s: %.2f ns per call\n", name, per_call);
}

// Prevent dead code elimination
void escape(void *p) {
    __asm__ volatile("" : : "g"(p) : "memory");
}

void clobber(void) {
    __asm__ volatile("" : : : "memory");
}
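
Putting the pieces together, a hedged example of the harness in use: a tiny kernel is timed with benchmark(), and escape() keeps the compiler from deleting the work (buffer size and iteration count are arbitrary):

static float buf[1024];

static void scale_kernel(void) {
    for (int i = 0; i < 1024; i++) buf[i] = buf[i] * 1.01f + 0.5f;
    escape(buf);   // The result is "observed", so the loop cannot be optimized away
}

int main(void) {
    benchmark("scale 1024 floats", scale_kernel, 1000000);
    return 0;
}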

Checklist

  • Profile first: never optimize without measuring
  • Cache friendly: sequential access, avoid cache misses
  • Reduce branches: use predictable or branchless code
  • Vectorize: use SIMD for data-parallel work
  • Reduce allocations: pool, arena, or stack allocate
  • Use restrict: help the compiler optimize pointer code

Next Up

Embedded Systems

Write C for resource-constrained systems