Kernel Data Structures
The Linux kernel uses specialized data structures optimized for performance, concurrency, and cache efficiency. Understanding these is essential for reading kernel source code and for systems programming interviews.
Interview Frequency: High (especially for infrastructure roles)
Key Topics: Linked lists, RB-trees, RCU, slab allocator
Time to Master: 8-10 hours
Kernel Linked Lists
The kernel’s linked list implementation is one of the most elegant pieces of C code you’ll encounter.
Traditional vs Kernel-Style Linked Lists
┌─────────────────────────────────────────────────────────────────────────────┐
│ TRADITIONAL VS KERNEL LINKED LIST │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ TRADITIONAL (data embedded in node): │
│ │
│ struct node { │
│ int data; │
│ struct node *next; │
│ struct node *prev; │
│ }; │
│ │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ data: 1 │───→│ data: 2 │───→│ data: 3 │ │
│ │ next ────│ │ next ────│ │ next:NULL│ │
│ │ prev:NULL│←───│ prev ────│←───│ prev ────│ │
│ └──────────┘ └──────────┘ └──────────┘ │
│ │
│ KERNEL-STYLE (node embedded in data): │
│ │
│ struct list_head { │
│ struct list_head *next; │
│ struct list_head *prev; │
│ }; │
│ │
│ struct my_data { │
│ int value; │
│ char name[32]; │
│ struct list_head list; // embed the link │
│ }; │
│ │
│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │
│ │ value: 1 │ │ value: 2 │ │ value: 3 │ │
│ │ name: "first" │ │ name: "second" │ │ name: "third" │ │
│ │ list.next ──────│───→│ list.next ──────│───→│ list.next ──────│─→(head)│
│ │ list.prev ←─────│←───│ list.prev ←─────│←───│ list.prev ←─────│ │
│ └─────────────────┘ └─────────────────┘ └─────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
Why Kernel-Style is Better
Type-agnostic: The same list_head works for any struct
No extra memory allocation: The node is part of the data
Multiple lists: One struct can be on several lists at once (see the sketch below)
Cache friendly: Data and links live together
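The "multiple lists" point is worth making concrete: because the links are embedded, one object can sit on several lists simultaneously just by embedding several list_head members. A minimal sketch (the struct and list names here are illustrative, not kernel symbols):

struct task_like {
    int pid;
    struct list_head run_list;    // membership in a hypothetical run queue
    struct list_head wait_list;   // membership in a hypothetical wait queue
};

LIST_HEAD(run_queue);
LIST_HEAD(wait_queue);

static void enqueue_both(struct task_like *t)
{
    // The same object is now reachable from two independent lists
    list_add_tail(&t->run_list, &run_queue);
    list_add_tail(&t->wait_list, &wait_queue);
}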
The Magic: container_of Macro
// include/linux/kernel.h
#define container_of(ptr, type, member) ({                         \
        const typeof(((type *)0)->member) *__mptr = (ptr);         \
        (type *)((char *)__mptr - offsetof(type, member)); })

// Example usage:
struct my_data {
    int value;
    struct list_head list;
};

// Given a list_head pointer, get the containing struct
struct list_head *pos;
struct my_data *data = container_of(pos, struct my_data, list);
List Operations
#include <linux/list.h>

// Initialize a list head
LIST_HEAD(my_list);              // Static initialization
// or
struct list_head my_list;
INIT_LIST_HEAD(&my_list);

// Add to list
struct my_data *new_item = kmalloc(sizeof(*new_item), GFP_KERNEL);
new_item->value = 42;
list_add(&new_item->list, &my_list);        // Add to front
list_add_tail(&new_item->list, &my_list);   // Add to back

// Delete from list
list_del(&item->list);

// Iterate over list
struct my_data *entry;
list_for_each_entry(entry, &my_list, list) {
    printk("Value: %d\n", entry->value);
}

// Safe iteration (allows deletion during iteration)
struct my_data *tmp;
list_for_each_entry_safe(entry, tmp, &my_list, list) {
    if (entry->value < 0) {
        list_del(&entry->list);
        kfree(entry);
    }
}

// Check if list is empty
if (list_empty(&my_list)) { ... }

// Move entire list
LIST_HEAD(new_list);
list_splice(&my_list, &new_list);
Red-Black Trees
Used throughout the kernel for fast ordered access: process scheduling, memory management, etc.
RB-Tree Properties
Every node is red or black
Root is black
All leaves (NULL) are black
Red node’s children are black
All paths from the root to a leaf contain the same number of black nodes
Together these properties guarantee that the longest root-to-leaf path is at most twice the shortest, so lookup, insert, and delete are all O(log n).
Kernel RB-Tree API
#include <linux/rbtree.h>

struct my_node {
    struct rb_node node;    // Embedded link; rb_entry() (container_of) recovers the struct
    int key;
    int value;
};

// Tree root
struct rb_root my_tree = RB_ROOT;
// Insert (you must implement the descent/search yourself)
void my_insert(struct rb_root *root, struct my_node *new)
{
    struct rb_node **link = &root->rb_node;
    struct rb_node *parent = NULL;
    struct my_node *entry;

    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct my_node, node);
        if (new->key < entry->key)
            link = &parent->rb_left;
        else
            link = &parent->rb_right;
    }

    rb_link_node(&new->node, parent, link);
    rb_insert_color(&new->node, root);
}
// Search
struct my_node *my_search(struct rb_root *root, int key)
{
    struct rb_node *node = root->rb_node;

    while (node) {
        struct my_node *entry = rb_entry(node, struct my_node, node);

        if (key < entry->key)
            node = node->rb_left;
        else if (key > entry->key)
            node = node->rb_right;
        else
            return entry;
    }
    return NULL;
}

// Iterate in order
struct rb_node *node;
for (node = rb_first(&my_tree); node; node = rb_next(node)) {
    struct my_node *entry = rb_entry(node, struct my_node, node);
    printk("Key: %d, Value: %d\n", entry->key, entry->value);
}
RB-Tree Use Cases in Kernel
Subsystem           Use
CFS scheduler       Task ordering by vruntime
Memory management   VMA ordering by address
epoll               File descriptor management
ext4                Extent tree
Radix Trees and XArray
For sparse arrays with integer keys (like page cache).
XArray (Modern Replacement for Radix Tree)
#include <linux/xarray.h>

DEFINE_XARRAY(my_xa);

// Store value at index
void *old = xa_store(&my_xa, index, pointer, GFP_KERNEL);

// Load value at index
void *value = xa_load(&my_xa, index);

// Erase entry
xa_erase(&my_xa, index);

// Iterate
unsigned long index;
void *entry;
xa_for_each(&my_xa, index, entry) {
    printk("Index: %lu, Entry: %px\n", index, entry);
}
Use Cases
Page cache: Map a file offset to a page
Process ID allocation: Map a PID to a task_struct (see the xa_alloc() sketch below)
Inode cache: Map an inode number to an inode
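The ID-allocation use case maps naturally onto xa_alloc(), which finds a free index for you. A hedged sketch (the function names and the id_space array are illustrative; real PID management has more machinery around it):

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(id_space);   // XA_FLAGS_ALLOC enables xa_alloc()

int register_object(void *obj, u32 *out_id)
{
    // Find a free index in [0, U32_MAX] and store obj there
    return xa_alloc(&id_space, out_id, obj, xa_limit_32b, GFP_KERNEL);
}

void *lookup_object(u32 id)
{
    return xa_load(&id_space, id);
}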
Hash Tables
For O(1) lookups when ordering isn’t needed.
Kernel Hash Table API
#include <linux/hashtable.h>

// Define a hash table with 2^10 = 1024 buckets
DEFINE_HASHTABLE(my_hash, 10);
struct my_entry {
    int key;
    int value;
    struct hlist_node node;
};

// Add entry
struct my_entry *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
entry->key = 42;
entry->value = 100;
hash_add(my_hash, &entry->node, entry->key);

// Lookup (only walks the bucket that search_key hashes to)
struct my_entry *my_lookup(int search_key)
{
    struct my_entry *found;

    hash_for_each_possible(my_hash, found, node, search_key) {
        if (found->key == search_key)
            return found;
    }
    return NULL;
}

// Delete
hash_del(&entry->node);

// Iterate all entries
int bkt;
struct my_entry *cur;
hash_for_each(my_hash, bkt, cur, node) {
    printk("Key: %d, Value: %d\n", cur->key, cur->value);
}
Per-CPU Variables
Avoid cache bouncing by giving each CPU its own copy.
#include <linux/percpu.h>

// Static per-CPU variable
DEFINE_PER_CPU(int, my_counter);

// Access (get_cpu_var yields an lvalue of this CPU's copy and disables preemption)
get_cpu_var(my_counter)++;
put_cpu_var(my_counter);             // Re-enables preemption

// Or use this_cpu operations (preferred; they handle preemption for you)
this_cpu_inc(my_counter);
this_cpu_add(my_counter, 5);
int sum = this_cpu_read(my_counter);

// Dynamic per-CPU allocation
int __percpu *ptr = alloc_percpu(int);
*per_cpu_ptr(ptr, cpu) = value;
free_percpu(ptr);

// Sum across all CPUs
int total = 0;
int cpu;
for_each_possible_cpu(cpu)
    total += *per_cpu_ptr(ptr, cpu);
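For a statically defined per-CPU variable, each CPU's copy is read with per_cpu(). A small sketch of the common counter pattern (the packets_seen name is illustrative):

DEFINE_PER_CPU(unsigned long, packets_seen);

static inline void count_packet(void)
{
    this_cpu_inc(packets_seen);              // Hot path: no lock, stays in the local cache line
}

unsigned long total_packets(void)
{
    unsigned long sum = 0;
    int cpu;

    for_each_possible_cpu(cpu)
        sum += per_cpu(packets_seen, cpu);   // Slow path: add up every CPU's copy
    return sum;
}

The total is only approximate while other CPUs keep counting, which is exactly the trade-off per-CPU counters accept.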
Why Per-CPU Matters
┌─────────────────────────────────────────────────────────────────────────────┐
│ SHARED VS PER-CPU VARIABLE │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ SHARED VARIABLE (cache bouncing problem): │
│ │
│ CPU 0 writes → invalidates cache line on CPU 1, 2, 3 │
│ CPU 1 writes → invalidates cache line on CPU 0, 2, 3 │
│ CPU 2 writes → ... │
│ │
│ Cache line bounces between CPUs = ~100+ cycles per access │
│ │
│ ──────────────────────────────────────────────────────────────── │
│ │
│ PER-CPU VARIABLE (no bouncing): │
│ │
│ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │
│ │ CPU 0 │ │ CPU 1 │ │ CPU 2 │ │ CPU 3 │ │
│ │counter │ │counter │ │counter │ │counter │ │
│ │ 5 │ │ 3 │ │ 7 │ │ 2 │ │
│ └────────┘ └────────┘ └────────┘ └────────┘ │
│ │
│ Each CPU has its own copy = always in local cache │
│ To get total: sum all per-CPU values │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
RCU (Read-Copy-Update)
The most important synchronization mechanism for read-heavy kernel data structures.
RCU Concept
┌─────────────────────────────────────────────────────────────────────────────┐
│ RCU UPDATE PROCESS │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ Initial state: │
│ ┌─────────┐ ┌─────────────┐ │
│ │ Head │────→│ Old Data │ │
│ └─────────┘ └─────────────┘ │
│ ↑ ↑ │
│ Writers Readers (may be reading) │
│ │
│ Step 1: Allocate new, copy data, modify copy │
│ ┌─────────┐ ┌─────────────┐ │
│ │ Head │────→│ Old Data │ (readers still using) │
│ └─────────┘ └─────────────┘ │
│ ┌─────────────┐ │
│ │ New Data │ (prepared, not linked) │
│ └─────────────┘ │
│ │
│ Step 2: Update pointer atomically │
│ ┌─────────┐ ┌─────────────┐ │
│ │ Head │──┐ │ Old Data │ (existing readers still valid!) │
│ └─────────┘ │ └─────────────┘ │
│ │ ┌─────────────┐ │
│ └─→│ New Data │ (new readers see this) │
│ └─────────────┘ │
│ │
│ Step 3: Wait for grace period (all readers finish) │
│ synchronize_rcu() blocks until no reader can have old pointer │
│ │
│ Step 4: Free old data safely │
│ ┌─────────────┐ │
│ ┌─────────┐────→│ New Data │ │
│ └─────────┘ └─────────────┘ │
│ Old data freed (no readers could be using it) │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
RCU API
#include <linux/rcupdate.h>

struct my_data {
    int value;
    struct rcu_head rcu;    // For deferred freeing
};

struct my_data __rcu *global_ptr;

// Reader side (no blocking allowed!)
rcu_read_lock();
struct my_data *ptr = rcu_dereference(global_ptr);
if (ptr)
    printk("Value: %d\n", ptr->value);
rcu_read_unlock();

// Writer side (updates serialized by my_lock)
struct my_data *new = kmalloc(sizeof(*new), GFP_KERNEL);
new->value = 42;
struct my_data *old = rcu_dereference_protected(global_ptr,
                                                lockdep_is_held(&my_lock));
rcu_assign_pointer(global_ptr, new);

// Wait for all pre-existing readers, then free
synchronize_rcu();
kfree(old);

// Or use call_rcu() for asynchronous freeing
void my_free(struct rcu_head *head)
{
    struct my_data *ptr = container_of(head, struct my_data, rcu);
    kfree(ptr);
}
call_rcu(&old->rcu, my_free);
Why RCU is Important
Zero-overhead reads: No atomics or memory barriers on the read path
Scalability: Readers never block each other
Used everywhere: Routing tables, file descriptor tables, the module list; RCU also pairs naturally with the list API, as sketched below
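The kernel list API has _rcu variants for exactly this combination. A minimal sketch with an illustrative struct (writer-side updates are assumed to be serialized by a separate lock, not shown):

struct item {
    int value;
    struct list_head list;
    struct rcu_head rcu;
};

LIST_HEAD(items);

// Reader: lockless traversal
void show_items(void)
{
    struct item *it;

    rcu_read_lock();
    list_for_each_entry_rcu(it, &items, list)
        printk("value=%d\n", it->value);
    rcu_read_unlock();
}

// Writer: unlink, then free after a grace period
void remove_item(struct item *it)
{
    list_del_rcu(&it->list);    // Readers already traversing still see a valid node
    kfree_rcu(it, rcu);         // Freed only after all pre-existing readers finish
}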
Memory Allocators
kmalloc - Small Object Allocation
#include <linux/slab.h>

// Basic allocation
void *ptr = kmalloc(size, GFP_KERNEL);
kfree(ptr);

// Zero-initialized
void *ptr = kzalloc(size, GFP_KERNEL);

// Array allocation
void *arr = kmalloc_array(n, sizeof(type), GFP_KERNEL);
void *arr = kcalloc(n, sizeof(type), GFP_KERNEL);   // zeroed
GFP Flags
Flag          Meaning                              When to Use
GFP_KERNEL    May sleep; normal allocation         Most kernel code
GFP_ATOMIC    Cannot sleep; may dip into reserves  Interrupt context
GFP_NOWAIT    Cannot sleep; fails quickly          When you can retry
GFP_USER      For user-space requests              mmap, etc.
GFP_DMA       DMA-capable memory                   Device drivers
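To make the GFP_KERNEL/GFP_ATOMIC distinction concrete: an interrupt handler must not sleep, so it allocates with GFP_ATOMIC and has to cope with failure. A hedged sketch (my_event and queue_event() are hypothetical):

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
    struct my_event *ev = kmalloc(sizeof(*ev), GFP_ATOMIC);   // Must not sleep here

    if (!ev)
        return IRQ_HANDLED;    // Atomic allocations can fail; handle it

    ev->timestamp = jiffies;
    queue_event(ev);           // Hypothetical helper that defers the real work
    return IRQ_HANDLED;
}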
vmalloc - Large Allocations
#include <linux/vmalloc.h>

// For large allocations (backing pages need not be physically contiguous)
void *ptr = vmalloc(large_size);
vfree(ptr);

// With zeroing
void *ptr = vzalloc(large_size);
kmalloc vs vmalloc
Aspect            kmalloc          vmalloc
Physical memory   Contiguous       May be non-contiguous
Size limit        ~4 MB            Limited by virtual address space
Performance       Faster           Slower (TLB overhead)
Use case          Small objects    Large buffers, module loading
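When an allocation might be either small or large, modern kernel code often uses kvmalloc(), which tries kmalloc() first and falls back to vmalloc(); kvfree() releases either kind. A short sketch:

void *buf = kvmalloc(size, GFP_KERNEL);
if (!buf)
    return -ENOMEM;
/* ... use buf ... */
kvfree(buf);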
Slab Allocator
For frequently allocated objects of the same size:
// Create cache
struct kmem_cache *my_cache = kmem_cache_create(
    "my_objects",               // Name (for debugging)
    sizeof(struct my_obj),      // Object size
    0,                          // Alignment (0 = default)
    SLAB_HWCACHE_ALIGN,         // Flags
    NULL                        // Constructor
);

// Allocate from cache
struct my_obj *obj = kmem_cache_alloc(my_cache, GFP_KERNEL);

// Free to cache
kmem_cache_free(my_cache, obj);

// Destroy cache
kmem_cache_destroy(my_cache);
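The constructor argument (NULL above) is called once per object when a new slab is populated, so objects come out of the cache pre-initialized. A sketch with illustrative names (my_obj is assumed to embed a spinlock_t named lock):

static void my_obj_ctor(void *p)
{
    struct my_obj *obj = p;

    memset(obj, 0, sizeof(*obj));
    spin_lock_init(&obj->lock);   // Assumption: my_obj has a spinlock_t lock member
}

my_cache = kmem_cache_create("my_objects", sizeof(struct my_obj),
                             0, SLAB_HWCACHE_ALIGN, my_obj_ctor);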
Slab Allocator Internals
┌─────────────────────────────────────────────────────────────────────────────┐
│ SLAB ALLOCATOR STRUCTURE │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ kmem_cache ("task_struct" cache) │
│ ┌────────────────────────────────────────────────────────────────────┐ │
│ │ name: "task_struct" │ │
│ │ object_size: 2688 bytes │ │
│ │ objects_per_slab: 12 │ │
│ │ │ │
│ │ Per-CPU cache (fast path): │ │
│ │ ┌─────────────────────────────────────────────────────────────┐ │ │
│ │ │ CPU 0: [obj][obj][obj][free][free] │ │ │
│ │ │ CPU 1: [obj][obj][free][free][free] │ │ │
│ │ │ CPU 2: [free][free][free][free][free] │ │ │
│ │ └─────────────────────────────────────────────────────────────┘ │ │
│ │ │ │
│ │ Partial slabs (shared): │ │
│ │ ┌──────────────────────┐ ┌──────────────────────┐ │ │
│ │ │ Slab 1: 8/12 used │ │ Slab 2: 3/12 used │ │ │
│ │ └──────────────────────┘ └──────────────────────┘ │ │
│ │ │ │
│ │ Full slabs: │ │
│ │ ┌──────────────────────┐ │ │
│ │ │ Slab 3: 12/12 used │ │ │
│ │ └──────────────────────┘ │ │
│ └────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
Lab Exercises
Lab 1: Implement Kernel-Style Linked List
Objective: Use kernel-style linked lists in user space.
// list.c - User-space implementation of a kernel-style list
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

// Minimal kernel-style list implementation
struct list_head {
    struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name)

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

#define list_entry(ptr, type, member) container_of(ptr, type, member)

#define list_for_each_entry(pos, head, member)                       \
    for (pos = list_entry((head)->next, typeof(*pos), member);       \
         &pos->member != (head);                                     \
         pos = list_entry(pos->member.next, typeof(*pos), member))

static inline void INIT_LIST_HEAD(struct list_head *list) {
    list->next = list->prev = list;
}

static inline void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next) {
    next->prev = new;
    new->next = next;
    new->prev = prev;
    prev->next = new;
}

static inline void list_add_tail(struct list_head *new,
                                 struct list_head *head) {
    __list_add(new, head->prev, head);
}

// Example struct with embedded list
struct process {
    int pid;
    char name[32];
    struct list_head list;
};

LIST_HEAD(process_list);

int main(void) {
    // Create processes
    for (int i = 0; i < 5; i++) {
        struct process *p = malloc(sizeof(*p));
        p->pid = 100 + i;
        snprintf(p->name, sizeof(p->name), "process_%d", i);
        list_add_tail(&p->list, &process_list);
    }

    // Iterate
    struct process *p;
    list_for_each_entry(p, &process_list, list) {
        printf("PID: %d, Name: %s\n", p->pid, p->name);
    }
    return 0;
}
Lab 2: Explore Slab Caches
Objective: Analyze kernel memory allocation.
# View all slab caches
cat /proc/slabinfo | head -20

# Detailed slab stats
sudo slabtop

# Memory usage by cache (active objects x object size)
cat /proc/slabinfo | awk 'NR>2 {print $1, $2*$4}' | sort -nk2 | tail -20

# Find the task_struct cache
cat /proc/slabinfo | grep task_struct

# Per-cache allocation call sites (SLUB sysfs; may require slub_debug)
sudo cat /sys/kernel/slab/*/alloc_calls | head -50
Lab 3: Simulate RCU-Style Updates
Objective: Understand RCU behavior.
// rcu_sim.c - Simple RCU-like simulation
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <stdatomic.h>
#include <unistd.h>

struct data {
    int value;
};

atomic_int grace_period;
struct data *_Atomic global_ptr;

void *reader(void *arg) {
    int tid = *(int *)arg;

    for (int i = 0; i < 1000000; i++) {
        // Simulate rcu_read_lock (just observe the grace period)
        int gp = atomic_load(&grace_period);
        (void)gp;

        // Read the shared pointer
        struct data *ptr = atomic_load(&global_ptr);
        if (ptr) {
            // Use the data
            volatile int v = ptr->value;
            (void)v;
        }
        // Simulate rcu_read_unlock
    }
    printf("Reader %d done\n", tid);
    return NULL;
}

void *writer(void *arg) {
    (void)arg;
    for (int i = 0; i < 100; i++) {
        // Allocate and initialize the new copy
        struct data *new = malloc(sizeof(*new));
        new->value = i;

        // Publish the new pointer atomically
        struct data *old = atomic_exchange(&global_ptr, new);

        // Wait for a "grace period" (simplified)
        atomic_fetch_add(&grace_period, 1);
        usleep(1000);   // Real RCU waits until all pre-existing readers finish

        // Free the old copy
        free(old);
    }
    printf("Writer done\n");
    return NULL;
}

int main(void) {
    struct data *init = malloc(sizeof(*init));
    init->value = 0;
    global_ptr = init;

    pthread_t readers[4], writer_t;
    int tids[4] = { 0, 1, 2, 3 };

    for (int i = 0; i < 4; i++)
        pthread_create(&readers[i], NULL, reader, &tids[i]);
    pthread_create(&writer_t, NULL, writer, NULL);

    for (int i = 0; i < 4; i++)
        pthread_join(readers[i], NULL);
    pthread_join(writer_t, NULL);
    return 0;
}
Interview Questions
Q1: Explain how container_of works
Answer: container_of(ptr, type, member) returns a pointer to the containing structure, given a pointer to one of its members.
How it works:
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))
offsetof(type, member) gives the byte offset from the start of the struct to the member
(char *)ptr treats the member pointer as a byte pointer
Subtracting the offset yields the address where the struct starts
The cast turns that address back into a struct pointer
Example:
struct person {
    char name[32];            // offset 0
    int age;                  // offset 32
    struct list_head list;    // offset 40 (pointers are 8-byte aligned on 64-bit)
};

// Given &person.list at address 0x1028,
// container_of returns 0x1028 - 40 = 0x1000 (the start of the struct)
Why important: Enables type-agnostic, reusable data structures in C.
Q2: When would you use RCU vs a spinlock?
Answer: Use RCU when:
Reads vastly outnumber writes
Read-side performance is critical
Can tolerate slightly stale data on reads
Updates can use copy-and-replace semantics
Use a spinlock when:
Read/write ratio is balanced
Need to modify data in place
Can’t tolerate stale reads
Critical section is very short
Performance comparison (rough orders of magnitude):
Operation            RCU            Spinlock
Read (uncontended)   ~1 cycle       ~10 cycles
Read (contended)     ~1 cycle       ~100+ cycles
Write                ~100+ cycles   ~10 cycles
Real examples:
Routing table: RCU (millions of lookups, rare updates)
Memory allocator: Spinlock (frequent alloc/free)
Q3: Why does the kernel use slab allocation?
Answer: Problems with raw page allocation:
Internal fragmentation (small objects waste pages)
Slow initialization (must init objects each time)
Cache unfriendly (random placement)
Slab benefits:
Reduced fragmentation: Groups same-size objects together
Cache coloring: Staggers objects across cache lines
Object caching: Keeps freed objects initialized
Per-CPU caches: Reduce lock contention
Debugging support: Red zones, poisoning
Example impact:
Creating process: task_struct allocated instantly from slab
Without slab: Would need page allocation + initialization
Measured speedup: 10-100x for hot objects
Q4: Explain per-CPU variables and their benefits
Answer: What per-CPU variables are:
Each CPU gets its own copy of the variable
No synchronization needed for CPU-local access
Benefits:
No cache bouncing: The variable stays in the local CPU's cache
No locking needed: Each CPU has exclusive access to its copy
Better scalability: Performance scales with the CPU count
Implementation:
// Data layout in memory:
CPU 0: [counter_0][padding...][other_vars_0]
CPU 1: [counter_1][padding...][other_vars_1]
CPU 2: [counter_2][padding...][other_vars_2]
Use cases:
Counters (stats, metrics)
Caches (per-CPU object pools)
State (current process, interrupt flags)
Trade-off: Uses more memory (N copies) in exchange for better performance.
Key Takeaways
Kernel lists: The embedded list_head pattern enables type-agnostic, efficient linked lists
RCU: Read-Copy-Update provides scalable synchronization for read-mostly data
Slab allocator: Object caching and per-CPU pools make frequent allocations cheap
Per-CPU variables: Eliminate cache bouncing for frequently updated data
Next: Process Subsystem Deep Dive →