Code Optimization Techniques
Learn how to write faster, more efficient C code. Understand compiler optimizations, algorithmic improvements, cache optimization, and profiling. Make your programs run faster without sacrificing readability or correctness.
The Optimization Mindset
Premature optimization is the root of all evil. First, write correct code. Then profile to find bottlenecks. Optimize only hot paths. Use compiler optimizations. Most gains come from better algorithms, not micro-optimizations. Always measure before and after. Readability matters more than minor speed improvements.
/* Optimization process */
/* 1. Write correct, clear code */
/* 2. Profile to find bottlenecks */
/* 3. Optimize bottlenecks */
/* 4. Measure improvement */
/* 5. Repeat if needed */
/* Rules of optimization */
/*
1. Don't optimize prematurely
2. Profile before optimizing
3. Optimize algorithms first
4. Let compiler optimize when possible
5. Cache is king
6. Branch prediction matters
7. Measure everything
8. Document optimizations
*/
/* Compiler optimization levels */
/* gcc -O0 = No optimization (default, fast compile) */
/* gcc -O1 = Basic optimization */
/* gcc -O2 = Recommended optimization */
/* gcc -O3 = Aggressive optimization */
/* gcc -Os = Optimize for size */
/* gcc -Ofast = Maximum speed (may break standards) */
/* Example: Compile with optimization */
/* gcc -O2 -march=native program.c -o program */
/* -O2 = Good balance of speed and compile time */
/* -march=native = Use CPU-specific instructions */
/* Profile before optimizing */
/* Compile with profiling info */
/* gcc -pg program.c -o program */
/* ./program */
/* gprof program gmon.out > profile.txt */
/* Or use perf (Linux) */
/* perf record ./program */
/* perf report */
/* Or use Valgrind Callgrind */
/* valgrind --tool=callgrind ./program */
/* kcachegrind callgrind.out.* */
/* Time your code */
#include <time.h>
clock_t start = clock();
/* Code to measure */
clock_t end = clock();
double cpu_time = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("Time: %f seconds\n", cpu_time);
/* Or use gettimeofday for wall time */
#include <sys/time.h>
struct timeval start, end;
gettimeofday(&start, NULL);
/* Code to measure */
gettimeofday(&end, NULL);
long seconds = end.tv_sec - start.tv_sec;
long micros = end.tv_usec - start.tv_usec;
double elapsed = seconds + micros / 1e6;
printf("Time: %f seconds\n", elapsed);
Algorithmic Optimization
The biggest performance gains come from better algorithms. O(n²) to O(n log n) makes huge difference. Choose right data structures. Avoid unnecessary work. This matters far more than micro-optimizations. Learn algorithm complexity.
/* SLOW: O(n²) bubble sort */
/* Intentionally naive O(n^2) bubble sort — kept as the "slow" teaching example. */
void bubble_sort_slow(int arr[], int n) {
    /* After each outer pass the largest remaining element has bubbled to
       the end, so every inner scan can stop one element earlier. */
    for (int pass = 0; pass < n - 1; pass++) {
        for (int idx = 0; idx + 1 < n - pass; idx++) {
            if (arr[idx] > arr[idx + 1]) {
                int swap = arr[idx];
                arr[idx] = arr[idx + 1];
                arr[idx + 1] = swap;
            }
        }
    }
}
/* FASTER: O(n log n) quicksort */
#include <stdlib.h>
/*
 * qsort comparator for ints.
 * BUGFIX: the classic `*(int*)a - *(int*)b` idiom is wrong — the
 * subtraction overflows (undefined behavior) when the operands are far
 * apart, e.g. INT_MIN vs. any positive value. Returning the sign of a
 * comparison is always safe.
 */
int compare(const void *a, const void *b) {
    int x = *(const int *)a;
    int y = *(const int *)b;
    return (x > y) - (x < y); /* yields -1, 0, or +1; never overflows */
}
/* Sort an int array ascending using the C library qsort — O(n log n). */
void quick_sort_fast(int arr[], int n) {
    qsort(arr, n, sizeof(int), compare);
}
/* For n=10000: bubble sort ~100,000,000 operations */
/* quicksort ~130,000 operations */
/* 769x faster! */
/* SLOW: Linear search O(n) */
/* Scan arr[0..n-1] front to back; return the first index holding target,
   or -1 when target is absent. O(n). */
int linear_search(const int arr[], int n, int target) {
    int idx = 0;
    while (idx < n) {
        if (arr[idx] == target) {
            return idx;
        }
        idx++;
    }
    return -1;
}
/* FASTER: Binary search O(log n) - array must be sorted */
/* Binary search on an ascending sorted array; returns an index of target
   or -1 if absent. O(log n). The midpoint is computed as lo + (hi - lo)/2
   to avoid the classic (lo + hi) overflow pitfall. */
int binary_search(const int arr[], int n, int target) {
    int lo = 0;
    int hi = n - 1;
    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;
        if (arr[mid] < target) {
            lo = mid + 1;
        } else if (arr[mid] > target) {
            hi = mid - 1;
        } else {
            return mid;
        }
    }
    return -1;
}
/* For n=1,000,000: linear search ~500,000 comparisons (average) */
/* binary search ~20 comparisons */
/* 25,000x faster! */
/* SLOW: Repeated computation */
/* Naive doubly-recursive Fibonacci — the "slow" teaching example.
   Exponential O(2^n) work; deliberately left unoptimized. */
int fibonacci_slow(int n) {
    return n <= 1 ? n : fibonacci_slow(n - 1) + fibonacci_slow(n - 2);
}
/* O(2^n) - exponential! */
/* FASTER: Memoization */
/* Memoization table: fib_memo[i] caches fibonacci(i) once computed.
   0 doubles as the "not computed" marker, which is safe because the only
   Fibonacci value equal to 0 (n == 0) is served by the n <= 1 base case. */
#define FIB_MEMO_SIZE 100
int fib_memo[FIB_MEMO_SIZE] = {0};
/*
 * Memoized Fibonacci: O(n) time instead of O(2^n).
 * BUGFIX: the original indexed fib_memo[n] without a range check, so
 * n >= 100 wrote out of bounds (undefined behavior). Out-of-range n now
 * returns -1. (int overflows past fib(46) anyway, so large n was never
 * meaningful here.)
 */
int fibonacci_fast(int n) {
    if (n <= 1) return n;
    if (n >= FIB_MEMO_SIZE) return -1; /* guard against out-of-bounds write */
    if (fib_memo[n] != 0) {
        return fib_memo[n];
    }
    fib_memo[n] = fibonacci_fast(n - 1) + fibonacci_fast(n - 2);
    return fib_memo[n];
}
/* O(n) - linear! */
/* SLOW: Creating many small allocations */
for (int i = 0; i < 1000; i++) {
char *str = malloc(10);
/* Use str */
free(str);
}
/* FASTER: Single allocation */
char buffer[1000][10];
for (int i = 0; i < 1000; i++) {
/* Use buffer[i] */
}
/* SLOW: String concatenation in loop */
char result[10000] = "";
for (int i = 0; i < 100; i++) {
strcat(result, "item"); /* O(n) each time */
}
/* Total: O(n²) */
/* FASTER: Track position */
char result[10000];
char *pos = result;
for (int i = 0; i < 100; i++) {
strcpy(pos, "item");
pos += 4; /* length of "item" */
}
*pos = '\0';
/* Total: O(n) */
/* Choose right data structure */
/* Need frequent insertion/deletion in the middle (with a handle to the node)? Use linked list */
/* Need random access? Use array */
/* Need fast lookup? Use hash table */
/* Need sorted data? Use binary search tree or sorted array */
/* Avoid unnecessary work */
/* SLOW: Computing inside loop */
for (int i = 0; i < n; i++) {
int limit = compute_expensive_limit(); /* Called n times */
if (i < limit) {
/* ... */
}
}
/* FASTER: Compute once outside loop */
int limit = compute_expensive_limit(); /* Called once */
for (int i = 0; i < n; i++) {
if (i < limit) {
/* ... */
}
}
Memory and Cache Optimization
Modern CPUs are fast, but memory is slow. Cache misses cost hundreds of cycles. Access memory sequentially for best cache performance. Keep data structures small and compact. Avoid pointer chasing. Locality matters enormously.
/* Cache-friendly code */
/* SLOW: Column-major access (cache unfriendly) */
int matrix[1000][1000];
for (int col = 0; col < 1000; col++) {
for (int row = 0; row < 1000; row++) {
matrix[row][col] = 0; /* Jumps around memory */
}
}
/* FASTER: Row-major access (cache friendly) */
for (int row = 0; row < 1000; row++) {
for (int col = 0; col < 1000; col++) {
matrix[row][col] = 0; /* Sequential access */
}
}
/* Can be 10x faster due to cache hits */
/* SLOW: Linked list (poor cache locality) */
struct Node {
int data;
struct Node *next; /* Pointer chase = cache miss */
};
/* FASTER: Array (excellent cache locality) */
int data[1000]; /* All in contiguous memory */
/* Structure packing */
/* SLOW: Poor layout (24 bytes on 64-bit) */
struct BadLayout {
char c; /* 1 byte + 7 bytes padding */
double d; /* 8 bytes */
int i; /* 4 bytes + 4 bytes padding */
};
/* FASTER: Good layout (16 bytes on 64-bit) */
struct GoodLayout {
double d; /* 8 bytes */
int i; /* 4 bytes */
char c; /* 1 byte + 3 bytes padding */
};
/* 33% smaller! */
/* Array of structures vs. structure of arrays */
/* SLOW: Array of structures (poor cache use) */
struct Particle {
float x, y, z; /* 12 bytes */
float vx, vy, vz; /* 12 bytes */
float mass; /* 4 bytes */
int id; /* 4 bytes */
}; /* Total: 32 bytes */
struct Particle particles[10000];
/* Update positions only - loads unnecessary data */
for (int i = 0; i < 10000; i++) {
particles[i].x += particles[i].vx;
particles[i].y += particles[i].vy;
particles[i].z += particles[i].vz;
}
/* FASTER: Structure of arrays (better cache use) */
struct ParticleSystem {
float x[10000], y[10000], z[10000];
float vx[10000], vy[10000], vz[10000];
float mass[10000];
int id[10000];
};
struct ParticleSystem system;
/* Update positions - only loads position data */
for (int i = 0; i < 10000; i++) {
system.x[i] += system.vx[i];
system.y[i] += system.vy[i];
system.z[i] += system.vz[i];
}
/* Can be 2-4x faster */
/* Memory alignment */
/* Aligned access is faster */
/* Align to power of 2: 4, 8, 16, 32, 64 bytes */
/* Manual alignment */
#include <stdlib.h>
/* Aligned allocation */
void *aligned_alloc(size_t alignment, size_t size);
/* Example: 64-byte alignment (cache line) */
float *data = aligned_alloc(64, 1000 * sizeof(float));
/* GCC attribute */
struct __attribute__((aligned(64))) CacheLineAligned {
int data[16];
};
/* Prefetching (advanced) */
/* Manual prefetch hint (GCC) */
__builtin_prefetch(&data[i + 64], 0, 3);
/* Prefetch data that will be needed soon */
/* Avoid false sharing (multithreading) */
/* BAD: False sharing (threads fight over cache line) */
struct SharedData {
int counter1; /* Used by thread 1 */
int counter2; /* Used by thread 2 */
}; /* Both in same cache line - slow! */
/* GOOD: Padding to separate cache lines */
struct PaddedData {
int counter1;
char padding[60]; /* Force to different cache lines */
int counter2;
};
Loop Optimization
Loops are often hotspots. Move invariant code outside. Unroll for fewer branches. Fuse similar loops. Minimize operations inside loop body. Let compiler do the work when possible - modern compilers are smart.
/* Loop invariant code motion */
/* SLOW: Recomputing inside loop */
for (int i = 0; i < n; i++) {
result[i] = array[i] * (x + y + z); /* (x+y+z) computed every time */
}
/* FASTER: Compute once outside */
int factor = x + y + z;
for (int i = 0; i < n; i++) {
result[i] = array[i] * factor;
}
/* Loop unrolling */
/* SLOW: Many iterations, many branches */
for (int i = 0; i < 1000; i++) {
sum += array[i];
}
/* FASTER: Loop unrolling */
for (int i = 0; i < 1000; i += 4) {
sum += array[i];
sum += array[i + 1];
sum += array[i + 2];
sum += array[i + 3];
}
/* Fewer branches, better pipelining */
/* Compiler can do this with -O2 or -O3 */
/* Loop fusion */
/* SLOW: Two separate loops */
for (int i = 0; i < n; i++) {
a[i] = b[i] + c[i];
}
for (int i = 0; i < n; i++) {
d[i] = a[i] * 2;
}
/* FASTER: Single loop */
for (int i = 0; i < n; i++) {
a[i] = b[i] + c[i];
d[i] = a[i] * 2;
}
/* Better cache locality */
/* Strength reduction */
/* SLOW: Expensive operation in loop */
for (int i = 0; i < n; i++) {
result[i] = array[i] * i; /* Multiplication */
}
/* FASTER: Use addition instead */
int temp = 0;
for (int i = 0; i < n; i++) {
result[i] = array[i] * temp;
temp += array[i]; /* Addition is faster */
}
/* Actually, modern CPUs make multiplication fast */
/* This optimization often not worth it - let compiler decide */
/* Minimize function calls in loops */
/* SLOW */
for (int i = 0; i < n; i++) {
process(array[i]); /* Function call overhead */
}
/* FASTER: Inline function */
static inline void process(int x) {
/* Small function body */
}
/* Or use compiler hint */
__attribute__((always_inline))
void process(int x) {
/* ... */
}
/* Loop tiling (blocking) for cache */
/* SLOW: Cache unfriendly for large matrices */
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
for (int k = 0; k < N; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
/* FASTER: Blocked (tiled) for cache */
#define BLOCK 64
for (int i = 0; i < N; i += BLOCK) {
    for (int j = 0; j < N; j += BLOCK) {
        for (int k = 0; k < N; k += BLOCK) {
            /* Process one BLOCK x BLOCK sub-matrix so the working set
               stays resident in cache. */
            for (int ii = i; ii < i + BLOCK && ii < N; ii++) {
                for (int jj = j; jj < j + BLOCK && jj < N; jj++) {
                    /* BUGFIX: the bound must be k + BLOCK, not BLOCK —
                       `kk < BLOCK` skipped every k-tile after the first,
                       producing an incorrect product. */
                    for (int kk = k; kk < k + BLOCK && kk < N; kk++) {
                        C[ii][jj] += A[ii][kk] * B[kk][jj];
                    }
                }
            }
        }
    }
}
/* Can be 10x faster for large matrices */
Branch Optimization & CPU-Friendly Code
Modern CPUs use branch prediction. Mispredicted branches are expensive (10-20 cycles). Make branches predictable. Eliminate branches when possible. Use branchless code for simple cases. Profile to find mispredictions.
/* Branch prediction */
/* PREDICTABLE: Same outcome most of the time */
for (int i = 0; i < 1000000; i++) {
if (array[i] > 0) { /* Usually true or usually false */
/* ... */
}
}
/* CPU predicts well */
/* UNPREDICTABLE: Random outcomes */
for (int i = 0; i < 1000000; i++) {
if (array[i] % 2 == 0) { /* 50/50 random */
/* ... */
}
}
/* CPU mispredicts often - slower */
/* Branchless code */
/* SLOW: Branch for min/max */
/* Return the smaller of a and b. A plain conditional is fine here —
   modern compilers compile this to a branchless cmov anyway. */
int min(int a, int b) {
    return (a < b) ? a : b;
}
/* FASTER: Branchless (on some CPUs) */
/*
 * Branchless min.
 * BUGFIX: the classic `b + ((a - b) & ((a - b) >> 31))` form invokes
 * undefined behavior when a - b overflows (e.g. INT_MAX vs. INT_MIN) and
 * relies on implementation-defined right-shift of negative values.
 * This variant is fully defined: (a < b) evaluates to 0 or 1, so mask is
 * 0 or all-ones, selecting a or b without a branch.
 */
int min_branchless(int a, int b) {
    int mask = -(a < b);           /* all-ones if a < b, else 0 */
    return (a & mask) | (b & ~mask);
}
/* Actually, modern compilers optimize if to cmov instruction */
/* So simple if is fine - compiler handles it */
/* Branchless abs */
/*
 * Branchless absolute value for int.
 * mask is all-ones (-1) when x is negative, 0 otherwise; (x + mask) ^ mask
 * then equals ~(x - 1) == -x for negative x, and leaves x unchanged otherwise.
 * Caveats: right-shifting a negative value is implementation-defined
 * (arithmetic shift on all mainstream compilers), and x == INT_MIN
 * overflows — its absolute value is not representable in int.
 */
int abs_branchless(int x) {
int mask = x >> 31;
return (x + mask) ^ mask;
}
/* Eliminate branches */
/* SLOW: Multiple conditions */
if (x == 1) {
result = a;
} else if (x == 2) {
result = b;
} else if (x == 3) {
result = c;
} else {
result = d;
}
/* FASTER: Lookup table */
int lookup[] = {d, a, b, c}; /* index 0 = default, 1 = a, 2 = b, 3 = c */
/* BUGFIX: `lookup[x % 4]` disagreed with the if/else chain above for x
   outside 1..3 (e.g. x == 5 selected a instead of the default, and a
   negative x indexed out of bounds). Clamp out-of-range x to the default
   slot so both versions behave identically. */
result = lookup[(x >= 1 && x <= 3) ? x : 0];
/* Function pointer tables */
/* SLOW: Switch on function selection */
/* Dispatch `data` to the handler matching `type`; unrecognized types
   fall through and are ignored, exactly like a switch with no default. */
void process(int type, int data) {
    if (type == TYPE_A) {
        process_a(data);
    } else if (type == TYPE_B) {
        process_b(data);
    } else if (type == TYPE_C) {
        process_c(data);
    }
}
/* FASTER: Function pointer table */
/* Function pointer table indexed by type: replaces the switch dispatch. */
typedef void (*process_func)(int);
process_func processors[] = {
    process_a,
    process_b,
    process_c
};
/*
 * Table-driven dispatch.
 * BUGFIX: unlike the switch version (which silently ignored unknown
 * types), a raw `processors[type]` lookup indexes past the array for an
 * out-of-range type — undefined behavior. Guard the index explicitly.
 */
void process_fast(int type, int data) {
    if (type >= 0 && type < (int)(sizeof processors / sizeof processors[0])) {
        processors[type](data);
    }
}
/* Likely/unlikely hints (GCC) */
/* Tell compiler which branch is likely */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
if (unlikely(ptr == NULL)) {
/* Error path - rare */
return -1;
}
/* Normal path - common */
/* ... */
/* Profile-guided optimization (PGO) */
/* 1. Compile with profiling */
/* gcc -fprofile-generate program.c -o program */
/* 2. Run program with typical workload */
/* ./program */
/* 3. Recompile with profile data */
/* gcc -fprofile-use program.c -o program */
/* Compiler uses actual branch statistics for optimization */
Compiler Optimization Tricks
/* Help the compiler optimize */
/* Use restrict keyword */
/* Copy n ints from src to dest. `restrict` promises the compiler the two
   regions never overlap, which unlocks vectorization of the copy loop. */
void copy(int * restrict dest, const int * restrict src, size_t n) {
    const int *from = src;
    int *to = dest;
    while (n--) {
        *to++ = *from++;
    }
}
/* Tells compiler dest and src don't overlap - enables optimizations */
/* Use const */
/* Placeholder illustrating a const-qualified input pointer: the compiler
   may assume *data is never written through this pointer, enabling
   additional optimization. (Body intentionally empty in this example.) */
void process(const int *data, int n) {
/* Compiler knows data isn't modified */
}
/* Use static for file-scope functions */
/* File-private helper: `static` gives internal linkage, so the compiler
   sees every call site and can inline or drop it freely.
   (Body intentionally empty in this example.) */
static void helper(void) {
/* Compiler can inline or optimize knowing it's not called externally */
}
/* Use inline for small functions */
/* Square of x. static inline: small enough that the call itself should
   vanish under any optimization level. */
static inline int square(int x) {
    const int value = x;
    return value * value;
}
/* Link-time optimization (LTO) */
/* gcc -flto -O2 file1.c file2.c -o program */
/* Optimizes across multiple source files */
/* Compiler intrinsics */
/* Use SIMD instructions */
#include <immintrin.h> /* x86 SSE/AVX */
/* Example: Add 4 floats at once */
__m128 a = _mm_load_ps(array1);
__m128 b = _mm_load_ps(array2);
__m128 c = _mm_add_ps(a, b);
_mm_store_ps(result, c);
/* Auto-vectorization */
/* gcc -O3 -march=native -ftree-vectorize */
/* Simple loops can auto-vectorize */
/*
 * c[i] = a[i] + b[i] for i in [0, n).
 * const marks the inputs read-only; restrict promises the three buffers
 * never alias — exactly the guarantee the compiler needs to auto-vectorize
 * this loop into SIMD code under -O3 -march=native.
 */
void add_arrays(const float * restrict a, const float * restrict b,
                float * restrict c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}
/* With -O3 -march=native, compiler generates SIMD code */
/* Check if vectorized */
/* gcc -O3 -march=native -ftree-vectorize -fopt-info-vec */
/* Optimization pragmas */
/* Unroll loop */
#pragma GCC unroll 4
for (int i = 0; i < n; i++) {
/* ... */
}
/* Vectorize loop */
#pragma GCC ivdep /* Ignore vector dependencies */
for (int i = 0; i < n; i++) {
a[i] = b[i] + c[i];
}
/* Summary: Let compiler do the work */
/* - Use -O2 or -O3 */
/* - Use -march=native */
/* - Enable warnings: -Wall -Wextra */
/* - Profile and optimize only bottlenecks */
/* - Modern compilers are very smart! */
Summary & What's Next
Optimization Key Points:
- ✅ Profile first, optimize second
- ✅ Algorithm choice matters most
- ✅ Cache locality is critical
- ✅ Sequential access beats random access
- ✅ Use compiler optimizations (-O2, -O3)
- ✅ Avoid premature optimization
- ✅ Measure everything
- ✅ Readability matters