Skip to main content

System Calls & POSIX

System calls are the gateway between your program and the operating system kernel. Let’s look at how they work.

Figure: the system call path from user space to the kernel.

User Space vs Kernel Space

┌─────────────────────────────────────────────────────────────────────────────┐
│                           USER SPACE                                         │
│                                                                              │
│  ┌─────────┐  ┌─────────┐  ┌─────────┐                                      │
│  │  Your   │  │  libc   │  │ Other   │                                      │
│  │ Program │  │ (glibc) │  │ Libraries│                                     │
│  └────┬────┘  └────┬────┘  └─────────┘                                      │
│       │           │                                                          │
│       │   printf()│ → write() wrapper                                       │
│       └───────────┼──────────────────────────────────────────────────────┐  │
│                   │                                                      │  │
├───────────────────┴──────────────────────────────────────────────────────┼──┤
│                          SYSTEM CALL INTERFACE                            │  │
│                                                                          │  │
│         syscall(SYS_write, fd, buf, count)                              │  │
│                              │                                           │  │
├──────────────────────────────┼───────────────────────────────────────────┴──┤
│                              ▼                                              │
│                         KERNEL SPACE                                        │
│                                                                              │
│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐                      │
│  │   Scheduler  │  │  Filesystem  │  │   Memory     │                      │
│  │              │  │   (VFS)      │  │  Management  │                      │
│  └──────────────┘  └──────────────┘  └──────────────┘                      │
│                                                                              │
│  ┌──────────────────────────────────────────────────────────────────────┐  │
│  │                        Hardware Drivers                               │  │
│  └──────────────────────────────────────────────────────────────────────┘  │
│                                                                              │
└─────────────────────────────────────────────────────────────────────────────┘
User Mode vs Kernel Mode Transition

The Transition Steps

  1. User Mode (Ring 3): Your program runs with limited privileges. It cannot access hardware directly.
  2. Library Call: You call printf(). The C library (libc) formats the string and calls write().
  3. System Call: The write() wrapper puts arguments in CPU registers (e.g., rax=1 for write) and executes a special instruction (syscall on x86-64).
  4. Mode Switch: The CPU switches to Kernel Mode (Ring 0) and jumps to a predefined kernel entry point.
  5. Kernel Execution: The kernel validates arguments, checks permissions, and performs the operation (e.g., writing to the terminal buffer).
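  6. Return: The kernel executes sysret (or iret on some paths), switching the CPU back to User Mode. The result comes back in rax: the number of bytes written, or a negative error code that the libc wrapper converts into a return value of -1 with errno set.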

Making System Calls

Via libc Wrappers

#include <unistd.h>
#include <fcntl.h>

/*
 * Copy the first chunk (up to 1024 bytes) of file.txt to standard output
 * using the libc wrappers around open/read/write/close.
 *
 * Returns 0 on success and 1 if any of the calls fails.
 */
int main(void) {
    // These are libc wrapper functions, not raw syscalls
    int fd = open("file.txt", O_RDONLY);
    if (fd == -1) {
        return 1;  // open failed; errno holds the reason
    }

    char buffer[1024];
    ssize_t bytes = read(fd, buffer, sizeof(buffer));
    if (bytes == -1) {
        // Never pass -1 on to write(): it would be converted to a huge size_t
        close(fd);
        return 1;
    }

    // write() may accept fewer bytes than requested, so loop until done
    ssize_t written = 0;
    while (written < bytes) {
        ssize_t n = write(STDOUT_FILENO, buffer + written,
                          (size_t)(bytes - written));
        if (n == -1) {
            close(fd);
            return 1;
        }
        written += n;
    }

    close(fd);
    return 0;
}

Direct System Calls

#include <unistd.h>
#include <sys/syscall.h>

/*
 * Write a message to standard output twice without the libc write()
 * wrapper: once through the generic syscall() function and once (on
 * x86-64 only) with a hand-written `syscall` instruction.
 *
 * Returns 0 on success and 1 if either write fails.
 */
int main(void) {
    // Direct syscall (bypasses the libc write() wrapper)
    static const char msg[] = "Hello, kernel!\n";
    // Derive the length from the array instead of hard-coding it;
    // the trailing NUL is not written.
    const long len = (long)(sizeof msg - 1);

    // SYS_write = 1 on x86-64 Linux
    // syscall() is variadic, so pass the count as a full-width long
    if (syscall(SYS_write, STDOUT_FILENO, msg, len) == -1) {
        return 1;
    }

#if defined(__x86_64__)
    // Inline assembly (x86-64)
    // Register convention: rax=syscall#, rdi=arg1, rsi=arg2, rdx=arg3
    // Operands are widened to long so the full 64-bit registers are set.
    // The syscall instruction itself clobbers rcx and r11.
    long ret;
    __asm__ volatile (
        "syscall"
        : "=a" (ret)
        : "a" ((long)SYS_write), "D" ((long)STDOUT_FILENO), "S" (msg), "d" (len)
        : "rcx", "r11", "memory"
    );

    // A raw syscall reports failure as a negative errno value in rax
    if (ret < 0) {
        return 1;
    }
#endif

    return 0;
}

Error Handling

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>

/*
 * Demonstrate errno-based error reporting by opening a path that is
 * expected not to exist.
 *
 * Returns 1 after reporting the error when open() fails, 0 otherwise.
 */
int main(void) {
    int fd = open("/nonexistent", O_RDONLY);

    if (fd == -1) {
        // errno is set by the system call. Save it right away: later
        // library calls (fprintf, strerror, ...) are allowed to change it.
        int saved_errno = errno;

        fprintf(stderr, "Error code: %d\n", saved_errno);
        fprintf(stderr, "Error message: %s\n", strerror(saved_errno));

        errno = saved_errno;  // perror() reads errno, so restore it first
        perror("open");  // Prints: "open: No such file or directory"

        // Common errno values
        switch (saved_errno) {
            case ENOENT: printf("File not found\n"); break;
            case EACCES: printf("Permission denied\n"); break;
            case EEXIST: printf("File exists\n"); break;
            case EINTR:  printf("Interrupted\n"); break;
            case EINVAL: printf("Invalid argument\n"); break;
            case ENOMEM: printf("Out of memory\n"); break;
            case ENOSPC: printf("No space left\n"); break;
            default:     break;  // Other codes were already reported above
        }

        return 1;
    }

    close(fd);
    return 0;
}

// Robust error-handling wrapper
/*
 * open() wrapper that transparently retries when the call is interrupted
 * by a signal (EINTR).
 *
 * Returns the new file descriptor, or -1 with errno set for any failure
 * other than EINTR.
 */
int safe_open(const char *path, int flags) {
    for (;;) {
        int fd = open(path, flags);

        // Stop on success, or on any failure that is not an interrupt
        if (fd != -1 || errno != EINTR) {
            return fd;
        }
    }
}

// Robust read (handles partial reads and interrupts)
/*
 * Read up to `count` bytes from `fd` into `buf`, continuing after short
 * reads and retrying when read() is interrupted (EINTR).
 *
 * Returns the number of bytes stored in `buf`, which is less than `count`
 * only if end-of-file was reached, or -1 if read() fails with any error
 * other than EINTR.
 */
ssize_t safe_read(int fd, void *buf, size_t count) {
    char *cursor = buf;
    size_t remaining = count;

    while (remaining != 0) {
        ssize_t got = read(fd, cursor, remaining);

        if (got == 0) {
            break;  // EOF
        }

        if (got == -1) {
            if (errno != EINTR) {
                return -1;  // Real error
            }
            continue;  // Interrupted: retry the same request
        }

        cursor += got;
        remaining -= (size_t)got;
    }

    return (ssize_t)(count - remaining);
}

File Descriptors

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>

/*
 * Tour of basic file-descriptor operations: open, dup, dup2, fcntl and
 * fstat on file.txt (created if missing).
 *
 * Returns 0 on success and 1 if the file cannot be opened; failures of
 * the later calls are reported with perror() and the demo continues.
 */
int main(void) {
    // Standard file descriptors
    // 0 = stdin, 1 = stdout, 2 = stderr

    // Open returns lowest available fd
    int fd = open("file.txt", O_RDWR | O_CREAT, 0644);
    if (fd == -1) {
        perror("open");
        return 1;
    }
    printf("Opened fd: %d\n", fd);  // Usually 3

    // Duplicate file descriptor
    int fd2 = dup(fd);  // fd2 points to same file
    if (fd2 == -1) {
        perror("dup");
    }

    // Duplicate to specific number
    int fd3 = dup2(fd, 10);  // fd 10 now points to same file
    if (fd3 == -1) {
        perror("dup2");
    }

    // Get/set file status flags
    // (F_GETFL/F_SETFL; the per-descriptor flags use F_GETFD/F_SETFD)
    int flags = fcntl(fd, F_GETFL);
    if (flags == -1 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
        perror("fcntl");
    }

    // Get file info
    struct stat st;
    if (fstat(fd, &st) == -1) {
        perror("fstat");
    } else {
        // off_t and mode_t have platform-dependent widths, so cast to a
        // type that matches the format specifier
        printf("Size: %lld bytes\n", (long long)st.st_size);
        printf("Mode: %o\n", (unsigned int)st.st_mode);
        printf("Is regular file: %d\n", S_ISREG(st.st_mode));
        printf("Is directory: %d\n", S_ISDIR(st.st_mode));
    }

    close(fd);
    if (fd2 != -1) {
        close(fd2);
    }
    if (fd3 != -1) {
        close(fd3);
    }

    return 0;
}

Process Information

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <pwd.h>

/*
 * Print identity information about the current process: process and
 * parent IDs, user and group IDs, the passwd entry of the real user,
 * the working directory and the host name.
 *
 * Always returns 0; lookups that fail are reported on stderr.
 */
int main(void) {
    // pid_t, uid_t and gid_t are integer types of unspecified width and
    // signedness, so cast to match the format specifier
    printf("PID:  %ld\n", (long)getpid());
    printf("PPID: %ld\n", (long)getppid());
    printf("UID:  %lu\n", (unsigned long)getuid());
    printf("EUID: %lu\n", (unsigned long)geteuid());
    printf("GID:  %lu\n", (unsigned long)getgid());

    // Get username
    // getpwuid() returns NULL when the UID has no passwd entry
    struct passwd *pw = getpwuid(getuid());
    if (pw != NULL) {
        printf("User: %s\n", pw->pw_name);
        printf("Home: %s\n", pw->pw_dir);
    } else {
        fprintf(stderr, "getpwuid: no passwd entry for UID %lu\n",
                (unsigned long)getuid());
    }

    // Current working directory
    char cwd[1024];
    if (getcwd(cwd, sizeof(cwd))) {
        printf("CWD:  %s\n", cwd);
    } else {
        perror("getcwd");
    }

    // Hostname
    char hostname[256];
    if (gethostname(hostname, sizeof(hostname)) == 0) {
        // A truncated name is not guaranteed to be NUL-terminated
        hostname[sizeof(hostname) - 1] = '\0';
        printf("Host: %s\n", hostname);
    } else {
        perror("gethostname");
    }

    return 0;
}

Environment Variables

#include <stdio.h>
#include <stdlib.h>

extern char **environ;  // Global environment

/*
 * Demonstrate reading, setting, removing and enumerating environment
 * variables. The third parameter, envp, is a common extension that
 * receives the same list as `environ` at program start.
 *
 * Always returns 0; failed updates are reported with perror().
 */
int main(int argc, char *argv[], char *envp[]) {
    // Not used in this example
    (void)argc;
    (void)argv;
    (void)envp;

    // Get environment variable
    const char *path = getenv("PATH");
    if (path) {
        printf("PATH: %s\n", path);
    }

    // Set environment variable
    if (setenv("MY_VAR", "my_value", 1) != 0) {  // 1 = overwrite
        perror("setenv");
    }

    // Alternative: putenv() does not copy its argument; the string itself
    // becomes part of the environment. Give it writable storage with
    // static lifetime rather than a string literal.
    static char another_var[] = "ANOTHER_VAR=another_value";
    if (putenv(another_var) != 0) {
        perror("putenv");
    }

    // Remove environment variable
    if (unsetenv("MY_VAR") != 0) {
        perror("unsetenv");
    }

    // Iterate all environment variables
    printf("\n=== All Environment Variables ===\n");
    for (char **env = environ; *env; env++) {
        printf("%s\n", *env);
    }

    // Or using the envp parameter
    // for (int i = 0; envp[i]; i++) { ... }

    return 0;
}

Time and Date

#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>

/*
 * Tour of the time APIs: epoch time, broken-down local and UTC time,
 * strftime formatting, gettimeofday, the monotonic clock and the
 * sleep family.
 *
 * Returns 0 on success and 1 if the current time cannot be converted
 * to broken-down form.
 */
int main(void) {
    // Current time (seconds since epoch)
    time_t now = time(NULL);
    // time_t has an unspecified width, so cast to match the specifier
    printf("Epoch time: %lld\n", (long long)now);

    // Use the re-entrant variants with caller-owned storage. localtime()
    // and gmtime() may return pointers to the same static object, so the
    // second call could overwrite the result of the first.
    struct tm local;
    struct tm utc;
    if (localtime_r(&now, &local) == NULL || gmtime_r(&now, &utc) == NULL) {
        perror("time conversion");
        return 1;
    }

    // Human-readable
    printf("Local: %04d-%02d-%02d %02d:%02d:%02d\n",
           local.tm_year + 1900, local.tm_mon + 1, local.tm_mday,
           local.tm_hour, local.tm_min, local.tm_sec);

    // UTC
    printf("UTC:   %04d-%02d-%02d %02d:%02d:%02d\n",
           utc.tm_year + 1900, utc.tm_mon + 1, utc.tm_mday,
           utc.tm_hour, utc.tm_min, utc.tm_sec);

    // Formatted string
    // strftime() returns 0 when the result does not fit the buffer
    char buf[100];
    if (strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S %Z", &local) != 0) {
        printf("Formatted: %s\n", buf);
    }

    // High-resolution time
    struct timeval tv;
    if (gettimeofday(&tv, NULL) == 0) {
        printf("Microseconds: %lld.%06ld\n",
               (long long)tv.tv_sec, (long)tv.tv_usec);
    } else {
        perror("gettimeofday");
    }

    // Monotonic clock (for measuring intervals)
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
        printf("Monotonic: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
    } else {
        perror("clock_gettime");
    }

    // Sleep
    sleep(1);           // Seconds
    usleep(100000);     // Microseconds
    nanosleep(&(struct timespec){ .tv_sec = 0, .tv_nsec = 100000000L },
              NULL);    // 100ms

    return 0;
}

// Measuring execution time
/*
 * Call `func` once and print how long the call took, in seconds, as
 * measured with the monotonic clock.
 */
void measure_time(void (*func)(void)) {
    struct timespec before;
    struct timespec after;

    clock_gettime(CLOCK_MONOTONIC, &before);
    func();
    clock_gettime(CLOCK_MONOTONIC, &after);

    // Whole seconds plus the nanosecond remainder, which may be negative
    double whole_seconds = after.tv_sec - before.tv_sec;
    double fraction = (after.tv_nsec - before.tv_nsec) / 1e9;

    printf("Elapsed: %.6f seconds\n", whole_seconds + fraction);
}

Resource Limits

#include <stdio.h>
#include <sys/resource.h>

/* Print one limit divided by `unit`, or "unlimited" for RLIM_INFINITY. */
static void print_rlim(rlim_t value, rlim_t unit) {
    if (value == RLIM_INFINITY) {
        printf("unlimited");
    } else {
        // rlim_t is an unsigned type of unspecified width
        printf("%llu", (unsigned long long)(value / unit));
    }
}

/*
 * Query several resource limits, lower the soft address-space limit to
 * 100 MB, and print the CPU time and peak memory used so far.
 *
 * Always returns 0; calls that fail are reported with perror().
 */
int main(void) {
    struct rlimit rl;

    // Get limits
    if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
        printf("Max open files: ");
        print_rlim(rl.rlim_cur, 1);
        printf(" (soft), ");
        print_rlim(rl.rlim_max, 1);
        printf(" (hard)\n");
    } else {
        perror("getrlimit(RLIMIT_NOFILE)");
    }

    if (getrlimit(RLIMIT_STACK, &rl) == 0) {
        printf("Stack size: ");
        print_rlim(rl.rlim_cur, 1024);
        printf(" KB (soft), ");
        print_rlim(rl.rlim_max, 1024);
        printf(" KB (hard)\n");
    } else {
        perror("getrlimit(RLIMIT_STACK)");
    }

    if (getrlimit(RLIMIT_AS, &rl) == 0) {
        printf("Address space: ");
        print_rlim(rl.rlim_cur, (rlim_t)1024 * 1024);
        printf(" MB (soft)\n");

        // Set limits
        // Only the soft limit changes, and it may not exceed the hard limit
        rlim_t wanted = (rlim_t)100 * 1024 * 1024;  // 100 MB
        if (rl.rlim_max != RLIM_INFINITY && wanted > rl.rlim_max) {
            wanted = rl.rlim_max;
        }
        rl.rlim_cur = wanted;
        if (setrlimit(RLIMIT_AS, &rl) != 0) {
            perror("setrlimit(RLIMIT_AS)");
        }
    } else {
        perror("getrlimit(RLIMIT_AS)");
    }

    // Resource usage
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) == 0) {
        printf("User time: %lld.%06ld s\n",
               (long long)usage.ru_utime.tv_sec, (long)usage.ru_utime.tv_usec);
        printf("System time: %lld.%06ld s\n",
               (long long)usage.ru_stime.tv_sec, (long)usage.ru_stime.tv_usec);
        printf("Max RSS: %ld KB\n", usage.ru_maxrss);
    } else {
        perror("getrusage");
    }

    return 0;
}

POSIX Portability

#include <stdio.h>
#include <unistd.h>

/*
 * Report the POSIX version the headers advertise, then print a set of
 * run-time limits obtained from sysconf() and from pathconf() on "/".
 */
int main(void) {
    // Check POSIX features
#ifdef _POSIX_VERSION
    printf("POSIX version: %ld\n", _POSIX_VERSION);
#endif

    // Runtime capability checks, printed in table order
    static const struct {
        int name;
        const char *label;
    } system_limits[] = {
        { _SC_PAGESIZE,         "Page size" },
        { _SC_NPROCESSORS_ONLN, "CPUs online" },
        { _SC_OPEN_MAX,         "Max open files" },
        { _SC_CLK_TCK,          "Clock ticks/sec" },
    };

    for (size_t i = 0; i < sizeof system_limits / sizeof system_limits[0]; i++) {
        long value = sysconf(system_limits[i].name);
        printf("%s: %ld\n", system_limits[i].label, value);
    }

    // Path configuration, queried for the root directory
    static const struct {
        int name;
        const char *label;
    } path_limits[] = {
        { _PC_NAME_MAX, "Max filename length" },
        { _PC_PATH_MAX, "Max path length" },
    };

    for (size_t i = 0; i < sizeof path_limits / sizeof path_limits[0]; i++) {
        long value = pathconf("/", path_limits[i].name);
        printf("%s: %ld\n", path_limits[i].label, value);
    }

    return 0;
}

Common System Call Reference

| Category | System Calls |
| --- | --- |
| File I/O | open, close, read, write, lseek, pread, pwrite |
| File Info | stat, fstat, lstat, access, chmod, chown |
| Directories | mkdir, rmdir, chdir, getcwd, opendir, readdir |
| Processes | fork, exec*, wait, waitpid, exit, _exit |
| Signals | kill, sigaction, sigprocmask, pause, sigsuspend |
| Memory | mmap, munmap, mprotect, brk, sbrk |
| IPC | pipe, socketpair, shmget, semget, msgget |
| Network | socket, bind, listen, accept, connect, send, recv |
| Time | time, gettimeofday, clock_gettime, nanosleep |
| Misc | ioctl, fcntl, dup, dup2, select, poll, epoll |

Exercises

1. System Info Tool

Build a tool that prints comprehensive system information (CPU, memory, disk, network).

2. Safe Wrapper Library

Create a library of safe wrappers for common system calls with proper error handling and EINTR retry.

3. Syscall Tracer

Use ptrace to build a simple strace-like tool.

4. Resource Monitor

Build a tool that monitors a process’s resource usage over time.

Next Up

Concurrency

Process and thread programming