C Programming: Low-Level Mastery
Advanced Topics

Inline Assembly

Master inline assembly for performance-critical code and low-level operations. Learn GCC's extended asm syntax, operand constraints, clobbers, the volatile keyword, and when assembly is necessary versus when compiler intrinsics suffice.

Understanding Inline Assembly

Inline assembly embeds assembly instructions directly in C code. Useful for: accessing CPU instructions unavailable in C, optimizing critical sections, interfacing with hardware. Compiler-specific and architecture-dependent. Use sparingly - modern compilers optimize well. GCC uses extended asm syntax with constraints.

C
/* GCC inline assembly syntax */

/* Basic asm (statements only) */
asm("nop");  /* No operation */

/* Extended asm (with operands) */
asm("movl %1, %0"
    : "=r" (output)      /* Outputs */
    : "r" (input)        /* Inputs */
    : "memory"           /* Clobbers */
);

/* Complete example: Add two numbers */
int add_asm(int a, int b) {
    int result = a;
    
    asm("addl %1, %0"
        : "+r" (result)      /* Output: read-write register, starts as a */
        : "r" (b)            /* Input: b in a register */
        : "cc"               /* addl modifies the flags */
    );
    
    return result;
}

/* When to use inline assembly */
/*
   Use for:
   - CPU-specific instructions (CPUID, RDTSC)
   - Atomic operations (before C11)
   - Memory barriers
   - Bit manipulation tricks
   - Performance-critical loops
   
   Don't use for:
   - Simple arithmetic (the compiler does it better)
   - Code that must remain portable
   - Anything not profiled and proven necessary
   - Operations where an intrinsic already exists
*/

/* Volatile asm (don't optimize away) */
asm volatile("nop");  /* Always executed */

/* Example: Read timestamp counter */
unsigned long long rdtsc(void) {
    unsigned int lo, hi;
    
    asm volatile("rdtsc"
        : "=a" (lo), "=d" (hi)
    );
    
    return ((unsigned long long)hi << 32) | lo;
}

/* Syntax breakdown */
/*
   asm [volatile] (
       "assembly code"
       : output operands
       : input operands
       : clobbered registers
   );
   
   %0, %1, %2 = operand references (by position)
   %= = a number unique to each asm instance (useful for local labels)
*/
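
/* A minimal sketch of %= in action (assumes x86/x86-64 and GCC or Clang):
   the label gets a unique suffix, so the asm can be inlined many times
   without duplicate-label errors. clamp_nonneg is a name made up here. */
static inline int clamp_nonneg(int x) {
    asm("testl %0, %0\n\t"       /* Set flags from x */
        "jns .Lkeep%=\n\t"       /* Skip the clear when x >= 0 */
        "xorl %0, %0\n"          /* x was negative: zero it */
        ".Lkeep%=:"
        : "+r" (x)
        :
        : "cc");
    return x;
}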

Operand Constraints

Constraints tell compiler where to put operands: registers, memory, immediates. Understanding constraints is key to effective inline assembly. Common constraints: r (register), m (memory), i (immediate), a/b/c/d (specific registers).

C
/* Common constraints */

/* "r" - any register */
int a, b;
asm("addl %1, %0" : "=r" (a) : "r" (b));

/* "m" - memory location */
int mem_var;
asm("movl %1, %0" : "=m" (mem_var) : "r" (42));

/* "i" - immediate constant */
asm("addl $5, %0" : "+r" (a) : "i" (5));

/* Specific registers (x86) */
/* "a" = eax/rax */
/* "b" = ebx/rbx */
/* "c" = ecx/rcx */
/* "d" = edx/rdx */

unsigned int cpuid_info(unsigned int info_type) {
    unsigned int eax, ebx, ecx, edx;
    
    asm volatile("cpuid"
        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
        : "a" (info_type)
    );
    
    return eax;
}

/* Constraint modifiers */

/* "=" - write-only output */
asm("movl $42, %0" : "=r" (a));

/* "+" - read-write */
asm("addl $5, %0" : "+r" (a));  /* a = a + 5 */

/* "&" - early clobber (output before inputs read) */
asm("..." : "=&r" (output) : "r" (input));

/* Matching constraints */
asm("incl %0" : "+r" (a));  /* Increment a */

/* Or explicitly: */
asm("incl %0" : "=r" (a) : "0" (a));  /* %0 = input */

/* Multiple operands */
void swap_asm(int *a, int *b) {
    int temp;
    
    asm("movl (%1), %0\n\t"    /* temp = *a */
        "movl (%2), %%eax\n\t"  /* eax = *b */
        "movl %%eax, (%1)\n\t"  /* *a = eax */
        "movl %0, (%2)"           /* *b = temp */
        : "=&r" (temp)
        : "r" (a), "r" (b)
        : "eax", "memory"
    );
}

/* %% for literal % */
asm("movl %%eax, %0" : "=r" (a));  /* %% becomes % */

/* Common constraint combinations */

/* Register or memory */
asm("..." : "=rm" (output));

/* Register or immediate (as an input) */
asm("..." : : "ri" (input));

/* Specific register (eax) or memory */
asm("..." : "=am" (output));

/* Constraint examples by architecture */

/* x86/x86-64 */
"a" /* eax, rax */
"b" /* ebx, rbx */
"c" /* ecx, rcx */
"d" /* edx, rdx */
"S" /* esi, rsi */
"D" /* edi, rdi */
"r" /* any general register */
"m" /* memory */
"i" /* immediate */
"n" /* immediate known at compile time */
"g" /* register, memory, or immediate */

/* ARM */
"r" /* general register */
"w" /* VFP floating point register */
"m" /* memory */
"I" /* immediate 0-255 */
"J" /* immediate -255-0 */

Clobber List and Side Effects

Clobber list tells compiler which registers/memory assembly modifies. Prevents compiler from assuming values preserved. Essential for correctness. Common clobbers: "cc" (flags), "memory" (any memory), register names. Missing clobbers cause subtle bugs.

C
/* Clobber list syntax */
asm("..."
    : outputs
    : inputs
    : "clobbers"
);

/* "cc" - condition codes (flags register) */
asm("addl %1, %0" : "=r" (a) : "r" (b) : "cc");
/* Addition affects flags */

/* "memory" - arbitrary memory modified */
asm("movl $0, (%0)" : : "r" (ptr) : "memory");
/* Tells compiler memory changed */

/* Specific registers */
asm("..." : : : "eax", "edx");
/* eax and edx clobbered */

/* Complete example: Atomic compare-and-swap */
int atomic_cas(int *ptr, int old_val, int new_val) {
    int result;
    
    asm volatile(
        "lock cmpxchgl %2, %1"
        : "=a" (result), "+m" (*ptr)
        : "r" (new_val), "a" (old_val)
        : "cc", "memory"
    );
    
    return result;
}

/* Why clobbers matter */

/* BAD: Missing clobbers */
void bad_asm(void) {
    int x = 10;
    
    asm("movl $42, %%eax");  /* No clobber */
    
    /* Compiler assumes eax unchanged */
    /* May place variables in eax - BUG! */
}

/* GOOD: Proper clobbers */
void good_asm(void) {
    int x = 10;
    
    asm("movl $42, %%eax"
        : /* no outputs */
        : /* no inputs */
        : "eax");  /* Tell compiler eax modified */
}

/* Memory clobber example */
void memory_clobber_example(void) {
    int *ptr = malloc(sizeof(int));
    *ptr = 100;
    
    /* Without a memory clobber */
    asm("movl $42, (%0)" : : "r" (ptr));
    /* Compiler may still assume *ptr == 100 and reuse the stale value */
    
    /* With memory clobber */
    asm("movl $42, (%0)" : : "r" (ptr) : "memory");
    /* Compiler knows memory changed */
    
    int value = *ptr;  /* Correctly reads 42 */
}

/* Volatile and memory */

/* Non-volatile extended asm: can be deleted if its outputs are never used */
unsigned int lo, hi;
asm("rdtsc" : "=a" (lo), "=d" (hi));   /* Removed when lo/hi are unused */

/* Volatile: always emitted, even if its outputs are unused */
asm volatile("rdtsc" : "=a" (lo), "=d" (hi));

/* Note: asm with no output operands (such as asm("nop")) is implicitly volatile */

/* Compiler barrier */
void memory_barrier(void) {
    asm volatile("" : : : "memory");
    /* Stops the compiler from reordering memory accesses across this
       point; it is not a CPU fence (see mfence/sfence/lfence later) */
}
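
/* Hedged sketch of where a compiler barrier matters: a writer publishes data,
   then sets a flag that a spin-waiting reader polls. Names are illustrative;
   on x86 the compiler barrier alone keeps the two stores in order, weaker
   architectures also need a CPU fence, and new code should prefer C11 atomics. */
int shared_data;
volatile int data_ready;

void publish(int value) {
    shared_data = value;
    memory_barrier();   /* Keep the data store before the flag store */
    data_ready = 1;
}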

/* Common clobber patterns */

/* Basic arithmetic (affects flags) */
asm("addl %1, %0" : "+r" (a) : "r" (b) : "cc");

/* Memory write */
asm("movl $0, %0" : "=m" (var) : : "memory");

/* Multiple registers */
asm("..."
    : /* outputs */
    : /* inputs */
    : "eax", "ebx", "ecx", "edx", "cc", "memory"
);

/* All caller-saved registers (32-bit x86) */
asm("call my_function"
    : /* outputs */
    : /* inputs */
    : "eax", "ecx", "edx", "cc", "memory"
);
/* Note: calling a function from inline asm also needs care with stack alignment */

/* Inline assembly best practices for clobbers */

/* 1. Always specify affected flags */
/* 2. Always specify "memory" if memory touched */
/* 3. List all modified registers */
/* 4. Use volatile for side effects */
/* 5. Test thoroughly */

/* Example: Complete atomic operation */
int atomic_add(int *ptr, int value) {
    int result;
    
    asm volatile(
        "lock xaddl %0, %1"
        : "=r" (result), "+m" (*ptr)
        : "0" (value)
        : "cc", "memory"
    );
    
    return result;  /* Previous value of *ptr (fetch-and-add) */
}

Practical Examples

Real-world inline assembly: CPU feature detection, performance counting, atomic operations, bit manipulation, system calls. These examples show when assembly provides value over pure C.

C
/* Example 1: CPUID (x86 CPU information) */
typedef struct {
    unsigned int eax, ebx, ecx, edx;
} CPUIDResult;

CPUIDResult cpuid(unsigned int leaf) {
    CPUIDResult result;
    
    asm volatile("cpuid"
        : "=a" (result.eax),
          "=b" (result.ebx),
          "=c" (result.ecx),
          "=d" (result.edx)
        : "a" (leaf)
    );
    
    return result;
}

int has_sse2(void) {
    CPUIDResult r = cpuid(1);
    return (r.edx >> 26) & 1;  /* SSE2 bit */
}

/* Example 2: Read Time Stamp Counter */
static inline unsigned long long rdtsc_inline(void) {
    unsigned int lo, hi;
    
    asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
    
    return ((unsigned long long)hi << 32) | lo;
}

void benchmark_code(void) {
    unsigned long long start = rdtsc_inline();
    
    /* Code to measure */
    
    unsigned long long end = rdtsc_inline();
    printf("Cycles: %llu\n", end - start);
}

/* Example 3: Atomic operations (pre-C11) */
int atomic_increment(int *ptr) {
    int result = 1;
    
    asm volatile(
        "lock xaddl %0, %1"
        : "+r" (result), "+m" (*ptr)
        : : "cc", "memory"
    );
    
    return result + 1;  /* Return new value */
}

/* Example 4: Bit manipulation (count leading zeros; result undefined for x == 0, as with BSR) */
int count_leading_zeros(unsigned int x) {
    int result;
    
    asm("bsrl %1, %0\n\t"
        "xorl $31, %0"
        : "=r" (result)
        : "r" (x)
        : "cc"
    );
    
    return result;
}

/* Example 5: Memory fence */
void mfence(void) {
    asm volatile("mfence" : : : "memory");
}

void sfence(void) {
    asm volatile("sfence" : : : "memory");
}

void lfence(void) {
    asm volatile("lfence" : : : "memory");
}

/* Example 6: Fast division by constant (multiply by 0xAAAAAAAB, the rounded 2^33/3, keep the high bits) */
unsigned int div_by_3(unsigned int x) {
    unsigned int result;
    
    asm("movl %1, %%eax\n\t"
        "movl $0xAAAAAAAB, %%edx\n\t"
        "mull %%edx\n\t"
        "shrl $1, %%edx\n\t"
        "movl %%edx, %0"
        : "=r" (result)
        : "r" (x)
        : "eax", "edx", "cc"
    );
    
    return result;
}

/* Example 7: Endianness swap */
unsigned int bswap(unsigned int x) {
    unsigned int result;
    
    asm("bswap %0"
        : "=r" (result)
        : "0" (x)
    );
    
    return result;
}

/* Example 8: Pause instruction (spinlock) */
static inline void cpu_pause(void) {
    asm volatile("pause" : : : "memory");
}

void spin_wait(volatile int *flag) {
    while (*flag == 0) {
        cpu_pause();  /* Reduce power, improve performance */
    }
}

/* Example 9: Prefetch */
static inline void prefetch(const void *addr) {
    asm volatile("prefetcht0 %0" : : "m" (*(const char*)addr));
}

void process_array(int *arr, int size) {
    for (int i = 0; i < size; i++) {
        if (i + 64 < size) {
            prefetch(&arr[i + 64]);  /* Prefetch future data */
        }
        /* Process arr[i] */
    }
}

/* Example 10: System call (Linux x86-64) */
long syscall_write(int fd, const void *buf, size_t count) {
    long ret;
    
    asm volatile(
        "movq $1, %%rax\n\t"      /* syscall number */
        "movq %1, %%rdi\n\t"      /* fd */
        "movq %2, %%rsi\n\t"      /* buf */
        "movq %3, %%rdx\n\t"      /* count */
        "syscall\n\t"
        "movq %%rax, %0"
        : "=r" (ret)
        : "r" ((long)fd), "r" ((long)buf), "r" ((long)count)
        : "rax", "rdi", "rsi", "rdx", "rcx", "r11", "memory", "cc"
    );
    
    return ret;
}
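
/* Usage sketch for the raw wrapper above (Linux x86-64 only); the string and
   length are just illustrative values. */
void syscall_write_demo(void) {
    syscall_write(1, "hello\n", 6);   /* fd 1 = stdout */
}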

Alternatives and Best Practices

Before using inline assembly, consider the alternatives: compiler intrinsics, C11 atomics, library functions. Assembly is a last resort. Test on the target CPU. Document thoroughly. Provide a C fallback. Profile to verify the benefit.

C
/* Alternative 1: Compiler intrinsics */

/* Instead of inline asm: */
int clz_asm(unsigned int x) {
    int result;
    asm("bsrl %1, %0; xorl $31, %0" : "=r" (result) : "r" (x));
    return result;
}

/* Use intrinsic: */
int clz_intrinsic(unsigned int x) {
    return __builtin_clz(x);  /* GCC/Clang builtin */
}

/* Common intrinsics */
__builtin_popcount(x);        /* Count set bits */
__builtin_ctz(x);             /* Count trailing zeros */
__builtin_clz(x);             /* Count leading zeros */
__builtin_bswap32(x);         /* Byte swap */
__builtin_prefetch(addr);     /* Prefetch */

/* Alternative 2: C11 atomics */

/* Instead of inline asm atomics: */
int atomic_load_asm(int *ptr) {
    int result;
    asm volatile("movl %1, %0" : "=r" (result) : "m" (*ptr) : "memory");
    return result;
}

/* Use C11: */
#include <stdatomic.h>

int atomic_load_c11(atomic_int *ptr) {
    return atomic_load(ptr);
}
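
/* Hedged sketch: the portable C11 replacement for the "lock xaddl" fetch-and-add
   written in assembly earlier; atomic_fetch_add returns the previous value. */
int atomic_fetch_add_c11(atomic_int *ptr, int value) {
    return atomic_fetch_add(ptr, value);
}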

/* Alternative 3: Library functions */

/* Instead of asm for common operations */
#include <x86intrin.h>  /* Intel intrinsics */

unsigned long long rdtsc_intrinsic(void) {
    return __rdtsc();
}

void cpu_pause_intrinsic(void) {
    _mm_pause();
}

/* Best practices */

/* 1. Document CPU requirements */
#if defined(__x86_64__) && defined(__SSE2__)
    /* SSE2 assembly here */
#else
    /* Fallback implementation */
#endif

/* 2. Provide C fallback */
static inline int popcount(unsigned int x) {
#ifdef __GNUC__
    return __builtin_popcount(x);
#else
    /* Pure C fallback */
    int count = 0;
    while (x) {
        count += x & 1;
        x >>= 1;
    }
    return count;
#endif
}

/* 3. Isolate in separate functions */
/* Don't mix asm and complex C logic */

/* BAD: Complex function with asm */
int complex_function(int a, int b) {
    /* Lots of C code */
    asm("..."); /* Assembly buried in middle */
    /* More C code */
}

/* GOOD: Separate asm function */
static inline int asm_operation(int x) {
    int result;
    asm("..." : "=r" (result) : "r" (x));
    return result;
}

int complex_function(int a, int b) {
    /* C code */
    int temp = asm_operation(a);
    /* More C code */
}

/* 4. Profile before and after */
void benchmark_comparison(void) {
    unsigned long long start, end;
    
    /* C version */
    start = rdtsc();
    /* C implementation */
    end = rdtsc();
    printf("C version: %llu cycles\n", end - start);
    
    /* Asm version */
    start = rdtsc();
    /* Asm implementation */
    end = rdtsc();
    printf("Asm version: %llu cycles\n", end - start);
}

/* 5. Comment assembly */
asm volatile(
    "movl %1, %%eax\n\t"   /* Load a into eax */
    "addl %2, %%eax\n\t"   /* Add b to eax */
    "movl %%eax, %0"         /* Store result */
    : "=r" (result)
    : "r" (a), "r" (b)
    : "eax", "cc"
);

/* 6. Test on target architecture */
/* Asm is architecture-specific */
/* Test on x86, ARM, etc. as needed */

/* 7. Check compiler output */
/* gcc -S file.c */
/* Verify compiler doesn't already generate optimal code */

/* 8. Use only when necessary */
/*
   When to use inline asm:
   - Accessing special CPU instructions
   - Proved performance benefit (profiled!)
   - No intrinsic available
   - Critical low-level code
   
   When NOT to use:
   - Portability needed
   - Unproven benefit
   - Intrinsic exists
   - Compiler already optimizes well
*/

/* Example: Complete with all best practices */
#if defined(__x86_64__) && defined(__SSE2__)
/**
 * Fast memory copy using SSE2
 * @param dest Destination (16-byte aligned)
 * @param src Source (16-byte aligned)
 * @param size Size in bytes (nonzero multiple of 16)
 */
static inline void memcpy_sse2(void *dest, const void *src, size_t size) {
    asm volatile(
        "1:\n\t"
        "movdqa (%1), %%xmm0\n\t"
        "movdqa %%xmm0, (%0)\n\t"
        "addq $16, %0\n\t"
        "addq $16, %1\n\t"
        "subq $16, %2\n\t"
        "jnz 1b"
        : "+r" (dest), "+r" (src), "+r" (size)
        : : "xmm0", "memory", "cc"
    );
}
#else
/* Fallback to standard memcpy */
static inline void memcpy_sse2(void *dest, const void *src, size_t size) {
    memcpy(dest, src, size);
}
#endif
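
/* Usage sketch for memcpy_sse2 (assumes C11 aligned_alloc is available): both
   buffers are 16-byte aligned and the size is a nonzero multiple of 16. */
#include <stdlib.h>
#include <string.h>

void memcpy_sse2_demo(void) {
    char *src = aligned_alloc(16, 64);
    char *dst = aligned_alloc(16, 64);
    
    if (src && dst) {
        memset(src, 'x', 64);      /* Give the source defined contents */
        memcpy_sse2(dst, src, 64); /* Copy 64 bytes with the SSE2 loop */
    }
    
    free(src);
    free(dst);
}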

Summary & What's Next

Key Takeaways:

  • ✅ Inline asm embeds assembly in C
  • ✅ GCC extended asm: outputs, inputs, clobbers
  • ✅ Use constraints to specify operand locations
  • ✅ Clobber list tells compiler what's modified
  • ✅ Volatile prevents optimization
  • ✅ Architecture and compiler specific
  • ✅ Prefer intrinsics when available
  • ✅ Profile to verify performance benefit

What's Next?

Let's learn about linking and creating libraries!