Performance Optimization in Go

Go is designed for performance, but writing fast Go code requires understanding the runtime, memory model, and profiling tools. This chapter covers practical techniques for optimizing Go applications.

Profiling with pprof

Go’s built-in profiler helps identify performance bottlenecks.

CPU Profiling

import (
    "log"
    "os"
    "runtime/pprof"
)

func main() {
    // Create CPU profile
    f, err := os.Create("cpu.prof")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()
    
    if err := pprof.StartCPUProfile(f); err != nil {
        log.Fatal(err)
    }
    defer pprof.StopCPUProfile()
    
    // Your application code
    runApplication()
}

Memory Profiling

import (
    "log"
    "os"
    "runtime"
    "runtime/pprof"
)

func main() {
    // At the end of your program
    f, err := os.Create("mem.prof")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()
    
    runtime.GC() // Get up-to-date statistics
    if err := pprof.WriteHeapProfile(f); err != nil {
        log.Fatal(err)
    }
}

HTTP pprof Server

import (
    "log"
    "net/http"

    _ "net/http/pprof" // Blank import registers /debug/pprof handlers on the default mux
)

func main() {
    // pprof endpoints automatically registered
    go func() {
        log.Println(http.ListenAndServe("localhost:6060", nil))
    }()
    
    // Your application
    runApplication()
}

Access profiles at:
  • http://localhost:6060/debug/pprof/profile - CPU profile
  • http://localhost:6060/debug/pprof/heap - Memory profile
  • http://localhost:6060/debug/pprof/goroutine - Goroutine stacks
  • http://localhost:6060/debug/pprof/block - Blocking profile
  • http://localhost:6060/debug/pprof/mutex - Mutex contention
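
You can also point go tool pprof directly at a live endpoint; for example, this fetches and opens a 30-second CPU profile from the server above:

# Fetch a live CPU profile
go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30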

Analyzing Profiles

# Interactive analysis
go tool pprof cpu.prof

# Top functions by CPU
(pprof) top10

# View function details
(pprof) list myFunction

# Generate flame graph
go tool pprof -http=:8080 cpu.prof

# Compare profiles
go tool pprof -base old.prof new.prof

Benchmarking

Writing Benchmarks

func BenchmarkFibonacci(b *testing.B) {
    for i := 0; i < b.N; i++ {
        Fibonacci(20)
    }
}

func BenchmarkFibonacciParallel(b *testing.B) {
    b.RunParallel(func(pb *testing.PB) {
        for pb.Next() {
            Fibonacci(20)
        }
    })
}

// Benchmark with different inputs
func BenchmarkSort(b *testing.B) {
    sizes := []int{100, 1000, 10000}
    for _, size := range sizes {
        b.Run(fmt.Sprintf("size-%d", size), func(b *testing.B) {
            data := generateRandomSlice(size)
            buf := make([]int, len(data))
            b.ResetTimer() // Don't count setup
            for i := 0; i < b.N; i++ {
                copy(buf, data) // Re-copy: otherwise every iteration after the first sorts already-sorted data
                sort.Ints(buf)
            }
        })
    }
}

Memory Benchmarks

var buf []byte // Package-level sink: a discarded make() may be stack-allocated and report 0 allocs

func BenchmarkAllocations(b *testing.B) {
    b.ReportAllocs() // Report memory allocations
    for i := 0; i < b.N; i++ {
        buf = make([]byte, 1024)
    }
}

Running Benchmarks

# Run all benchmarks
go test -bench=.

# Run specific benchmark
go test -bench=BenchmarkFibonacci

# Include memory stats
go test -bench=. -benchmem

# Run for specific duration
go test -bench=. -benchtime=5s

# Compare benchmarks
go install golang.org/x/perf/cmd/benchstat@latest
go test -bench=. -count=10 > old.txt
# Make changes
go test -bench=. -count=10 > new.txt
benchstat old.txt new.txt

Memory Optimization

Understanding Escape Analysis

// Stack allocation (fast)
func stackAlloc() int {
    x := 42  // Stays on stack
    return x
}

// Heap allocation (slower, GC pressure)
func heapAlloc() *int {
    x := 42   // Escapes to heap
    return &x // Pointer escapes function
}

// Check escape analysis
// go build -gcflags="-m" ./...
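
To see the difference empirically, a pair of allocation-reporting benchmarks works well. This is a minimal sketch reusing the two functions above; the package-level sink is our addition, there to keep the compiler from optimizing the escaping allocation away:

var sink *int // Package-level sink keeps the result live

func BenchmarkStackAlloc(b *testing.B) {
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        _ = stackAlloc() // Expect 0 allocs/op
    }
}

func BenchmarkHeapAlloc(b *testing.B) {
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        sink = heapAlloc() // Expect 1 alloc/op: x escapes
    }
}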

Reducing Allocations

// ❌ Bad: Allocates on each call
func processItems(items []Item) []Result {
    results := make([]Result, 0)  // Allocates, may grow
    for _, item := range items {
        results = append(results, process(item))
    }
    return results
}

// ✅ Good: Pre-allocate
func processItems(items []Item) []Result {
    results := make([]Result, 0, len(items))  // Pre-allocate capacity
    for _, item := range items {
        results = append(results, process(item))
    }
    return results
}

// ✅ Better: Reuse with sync.Pool
var resultPool = sync.Pool{
    New: func() interface{} {
        return make([]Result, 0, 100)
    },
}

func processItems(items []Item) []Result {
    results := resultPool.Get().([]Result)
    results = results[:0] // Reset length, keep capacity
    
    for _, item := range items {
        results = append(results, process(item))
    }
    
    // Copy to return: the pooled slice must never leak to callers
    out := make([]Result, len(results))
    copy(out, results)
    
    resultPool.Put(results) // Put after appending, so a grown slice is what gets pooled
    return out
}

String Concatenation

// ❌ Bad: O(n²) allocations
func buildString(parts []string) string {
    result := ""
    for _, part := range parts {
        result += part  // Creates new string each time
    }
    return result
}

// ✅ Good: strings.Builder
func buildString(parts []string) string {
    var builder strings.Builder
    builder.Grow(estimateSize(parts))  // Pre-allocate
    for _, part := range parts {
        builder.WriteString(part)
    }
    return builder.String()
}

// ✅ Also good for simple cases
func buildString(parts []string) string {
    return strings.Join(parts, "")
}

Struct Field Alignment

// ❌ Bad: 24 bytes (with padding)
type BadStruct struct {
    a bool    // 1 byte + 7 padding
    b int64   // 8 bytes
    c bool    // 1 byte + 7 padding
}

// ✅ Good: 16 bytes (minimal padding)
type GoodStruct struct {
    b int64   // 8 bytes
    a bool    // 1 byte
    c bool    // 1 byte + 6 padding
}

// Check with the fieldalignment analyzer (not a built-in go vet check):
//   go install golang.org/x/tools/go/analysis/passes/fieldalignment/cmd/fieldalignment@latest
//   fieldalignment ./...
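
To double-check a layout on your platform, print the sizes with unsafe.Sizeof; the values below assume a 64-bit platform:

import (
    "fmt"
    "unsafe"
)

func main() {
    fmt.Println(unsafe.Sizeof(BadStruct{}))  // 24
    fmt.Println(unsafe.Sizeof(GoodStruct{})) // 16
}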

Concurrency Optimization

Goroutine Pool

type Pool struct {
    work chan func()
    wg   sync.WaitGroup
}

func NewPool(size int) *Pool {
    p := &Pool{
        work: make(chan func(), size*2),
    }
    
    for i := 0; i < size; i++ {
        go p.worker()
    }
    
    return p
}

func (p *Pool) worker() {
    for fn := range p.work {
        fn()
        p.wg.Done()
    }
}

func (p *Pool) Submit(fn func()) {
    p.wg.Add(1)
    p.work <- fn
}

func (p *Pool) Wait() {
    p.wg.Wait()
}

func (p *Pool) Close() {
    close(p.work)
}
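
A usage sketch for the pool above; the worker count and task body are illustrative:

func main() {
    pool := NewPool(8) // 8 workers; size to your workload
    
    for i := 0; i < 100; i++ {
        i := i // Capture loop variable (unnecessary from Go 1.22 onward)
        pool.Submit(func() {
            fmt.Printf("processed task %d\n", i)
        })
    }
    
    pool.Wait()  // Block until all submitted tasks finish
    pool.Close() // Then stop the workers
}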

Reducing Lock Contention

// ❌ Bad: Single lock, high contention
type Cache struct {
    mu    sync.RWMutex
    items map[string]interface{}
}

// ✅ Good: Sharded cache
type ShardedCache struct {
    shards    [256]*shard
    shardMask uint8
}

type shard struct {
    mu    sync.RWMutex
    items map[string]interface{}
}

func NewShardedCache() *ShardedCache {
    c := &ShardedCache{shardMask: 255}
    for i := range c.shards {
        c.shards[i] = &shard{items: make(map[string]interface{})}
    }
    return c
}

func (c *ShardedCache) getShard(key string) *shard {
    hash := fnv32(key)
    return c.shards[hash&uint32(c.shardMask)]
}

func (c *ShardedCache) Get(key string) (interface{}, bool) {
    shard := c.getShard(key)
    shard.mu.RLock()
    defer shard.mu.RUnlock()
    val, ok := shard.items[key]
    return val, ok
}

func (c *ShardedCache) Set(key string, value interface{}) {
    shard := c.getShard(key)
    shard.mu.Lock()
    defer shard.mu.Unlock()
    shard.items[key] = value
}
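
The fnv32 helper used by getShard isn’t defined above; a minimal version on top of the standard hash/fnv package could be:

import "hash/fnv"

func fnv32(key string) uint32 {
    h := fnv.New32a()
    h.Write([]byte(key)) // fnv’s Write never returns an error
    return h.Sum32()
}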

Atomic Operations

import "sync/atomic"

// ❌ Mutex for simple counters
type Counter struct {
    mu    sync.Mutex
    value int64
}

func (c *Counter) Increment() {
    c.mu.Lock()
    c.value++
    c.mu.Unlock()
}

// ✅ Atomic for simple counters
type AtomicCounter struct {
    value atomic.Int64
}

func (c *AtomicCounter) Increment() {
    c.value.Add(1)
}

func (c *AtomicCounter) Value() int64 {
    return c.value.Load()
}
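
Note that atomic.Int64 was added in Go 1.19; on earlier versions the same counter uses the function-based API:

// Pre-Go 1.19 equivalent
type LegacyCounter struct {
    value int64 // Access only via sync/atomic functions
}

func (c *LegacyCounter) Increment() {
    atomic.AddInt64(&c.value, 1)
}

func (c *LegacyCounter) Value() int64 {
    return atomic.LoadInt64(&c.value)
}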

I/O Optimization

Buffered I/O

// ❌ Bad: Unbuffered writes
func writeLines(filename string, lines []string) error {
    f, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer f.Close()
    
    for _, line := range lines {
        f.WriteString(line + "\n")  // Many small writes
    }
    return nil
}

// ✅ Good: Buffered writes
func writeLines(filename string, lines []string) error {
    f, err := os.Create(filename)
    if err != nil {
        return err
    }
    defer f.Close()
    
    w := bufio.NewWriter(f)
    
    for _, line := range lines {
        w.WriteString(line)
        w.WriteByte('\n')
    }
    return w.Flush() // Flush explicitly so buffered-write errors are reported
}
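
The reading side benefits from buffering just as much; a sketch using bufio.Scanner (readLines is our name, not part of the example above):

func readLines(filename string) ([]string, error) {
    f, err := os.Open(filename)
    if err != nil {
        return nil, err
    }
    defer f.Close()
    
    var lines []string
    sc := bufio.NewScanner(f) // Buffered reads; default max line length is 64 KiB
    for sc.Scan() {
        lines = append(lines, sc.Text())
    }
    return lines, sc.Err()
}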

Connection Pooling

// HTTP client with connection pooling
var httpClient = &http.Client{
    Timeout: 30 * time.Second,
    Transport: &http.Transport{
        MaxIdleConns:        100,
        MaxIdleConnsPerHost: 10,
        IdleConnTimeout:     90 * time.Second,
        DisableCompression:  false,
    },
}

// Reuse client across requests
func fetchURL(url string) ([]byte, error) {
    resp, err := httpClient.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    // Reading the body to EOF lets the transport return the connection to the pool
    return io.ReadAll(resp.Body)
}

JSON Optimization

Standard Library Tips

// Pre-allocate buffer for encoding
var bufferPool = sync.Pool{
    New: func() interface{} {
        return new(bytes.Buffer)
    },
}

func encodeJSON(v interface{}) ([]byte, error) {
    buf := bufferPool.Get().(*bytes.Buffer)
    buf.Reset()
    defer bufferPool.Put(buf)
    
    encoder := json.NewEncoder(buf) // Note: Encode appends a trailing newline, unlike json.Marshal
    if err := encoder.Encode(v); err != nil {
        return nil, err
    }
    
    result := make([]byte, buf.Len())
    copy(result, buf.Bytes())
    return result, nil
}

Using Faster JSON Libraries

import jsoniter "github.com/json-iterator/go"

var json = jsoniter.ConfigCompatibleWithStandardLibrary

// Drop-in replacement for encoding/json
func parseJSON(data []byte) (*User, error) {
    var user User
    err := json.Unmarshal(data, &user)
    return &user, err
}

Code Generation

//go:generate easyjson -all user.go

// easyjson generates fast marshaling code
type User struct {
    ID   int    `json:"id"`
    Name string `json:"name"`
}

// easyjson writes the generated methods to user_easyjson.go:
// func (u *User) MarshalJSON() ([]byte, error)
// func (u *User) UnmarshalJSON(data []byte) error

Compiler Optimizations

Inlining

// Small functions get inlined automatically
func add(a, b int) int {
    return a + b
}

// Check inlining decisions
// go build -gcflags="-m" ./...

// Prevent inlining (useful when benchmarking or debugging)
//go:noinline
func doNotInline() {}

Bounds Check Elimination

// ❌ Bounds check on each access
func sum(s []int) int {
    total := 0
    for i := 0; i < len(s); i++ {
        total += s[i]  // Bounds check
    }
    return total
}

// ✅ BCE with hint
func sum(s []int) int {
    if len(s) == 0 {
        return 0 // Guard: the hint below would panic on an empty slice
    }
    total := 0
    _ = s[len(s)-1]  // Hint: proves indices 0..len(s)-1 are in bounds
    for i := 0; i < len(s); i++ {
        total += s[i]  // No bounds check needed
    }
    return total
}

// ✅ Range loop (compiler optimizes)
func sum(s []int) int {
    total := 0
    for _, v := range s {
        total += v  // Optimized
    }
    return total
}

Common Anti-Patterns

Defer in Hot Loops

// ❌ Bad: defer overhead in loop
func processFiles(files []string) error {
    for _, file := range files {
        f, err := os.Open(file)
        if err != nil {
            return err
        }
        defer f.Close()  // Deferred until function returns, not loop iteration!
        // Also: all files stay open!
        process(f)
    }
    return nil
}

// ✅ Good: Close explicitly or use helper
func processFiles(files []string) error {
    for _, file := range files {
        if err := processFile(file); err != nil {
            return err
        }
    }
    return nil
}

func processFile(file string) error {
    f, err := os.Open(file)
    if err != nil {
        return err
    }
    defer f.Close()  // Now correctly scoped
    return process(f)
}

Interface Conversions

// ❌ Bad: Interface conversion in hot path
func processItems(items []interface{}) {
    for _, item := range items {
        if s, ok := item.(string); ok {  // Type assertion overhead
            processString(s)
        }
    }
}

// ✅ Good: Use concrete types or generics
func processStrings(items []string) {
    for _, item := range items {
        processString(item)
    }
}
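
Since Go 1.18, generics are a third option when the element type varies per call site but is known statically; a minimal sketch (processAll is a hypothetical name):

func processAll[T any](items []T, fn func(T)) {
    for _, item := range items {
        fn(item) // No runtime type assertions; instantiated per concrete type
    }
}

// Usage: processAll(names, processString) where names is []string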

Profiling Checklist

  1. Identify hotspots with CPU profiling
  2. Check memory allocations with heap profiling
  3. Find goroutine leaks with goroutine profiling
  4. Detect lock contention with mutex profiling
  5. Analyze blocking with block profiling

# Comprehensive profiling
go test -bench=. -cpuprofile=cpu.prof -memprofile=mem.prof -blockprofile=block.prof

# Trace for detailed analysis
go test -trace=trace.out
go tool trace trace.out

Interview Questions

How would you detect a memory leak in a Go program?
  1. Use pprof heap profiles to see where allocations accumulate
  2. Check goroutine count over time (runtime.NumGoroutine())
  3. Monitor process memory with external tools
  4. Look for growing maps, slices, or channels
  5. Check for goroutines blocked forever

What is escape analysis?
Escape analysis determines whether a variable can stay on the stack or must escape to the heap. Stack allocation is faster and doesn’t require garbage collection. Use go build -gcflags="-m" to see escape decisions.

How can you reduce GC pressure?
  • Reduce allocations (pre-allocate slices, use sync.Pool)
  • Avoid creating many short-lived objects
  • Use value types instead of pointers when possible
  • Batch operations to amortize allocation cost
  • Consider GOGC tuning for specific workloads

When should you use sync.Pool?
Use sync.Pool for:
  • Frequently allocated/deallocated objects
  • Objects with predictable lifecycle
  • Buffers, temporary structs, connection wrappers
Don’t use it for long-lived objects or when object state matters.

Summary

Technique            When to Use
CPU Profiling        Identify slow functions
Memory Profiling     Find allocation hotspots
Benchmarking         Measure and compare performance
sync.Pool            Reduce GC pressure for temp objects
Sharding             Reduce lock contention
Buffered I/O         Reduce system calls
Pre-allocation       Avoid slice/map growth
Atomic Operations    Simple concurrent counters