Phase 6-8: RDTSC cycle profiling - Critical bottleneck discovered!

Implementation:
Ultra-lightweight CPU cycle profiling using RDTSC instruction (~10 cycles overhead).

Changes:
1. Added rdtsc() inline function for x86_64 CPU cycle counter
2. Instrumented tiny_fast_alloc(), tiny_fast_free(), tiny_fast_refill()
3. Track malloc, free, refill, and migration cycles separately
4. Profile output via HAKMEM_TINY_PROFILE=1 environment variable
5. Renamed variables to avoid conflict with core/hakmem.c globals

Files modified:
- core/tiny_fastcache.h: rdtsc(), profile helpers, extern declarations
- core/tiny_fastcache.c: counter definitions, print_profile() output

Usage:
```bash
HAKMEM_TINY_PROFILE=1 ./larson_hakmem 2 8 128 1024 1 12345 4
```

Results (Larson 4 threads, 1.637M ops/s):
```
[MALLOC] count=20,480, avg_cycles=2,476
[REFILL] count=1,285,  avg_cycles=38,412  ← 15.5x slower!
[FREE]   (no data - not called via fast path)
```

Critical discoveries:

1. **REFILL is the bottleneck:**
   - Average 38,412 cycles per refill (15.5x slower than malloc)
   - Refill accounts for: 1,285 × 38,412 = 49.3M cycles
   - Despite Phase 3 batch optimization, still extremely slow
   - Calling hak_tiny_alloc() 16 times has massive overhead

2. **MALLOC is 24x slower than expected:**
   - Average 2,476 cycles (expected ~100 cycles for tcache)
   - Even cache hits are slow
   - Profiling overhead is only ~10 cycles, so real cost is ~2,466 cycles
   - Something fundamentally wrong with fast path

3. **Only 2.5% of allocations use fast path:**
   - Total operations: 1.637M × 2s = 3.27M ops
   - Tiny fast alloc: 20,480 × 4 threads = 81,920 ops
   - Coverage: 81,920 / 3,270,000 = **2.5%**
   - **97.5% of allocations bypass tiny_fast_alloc entirely!**

4. **FREE is not instrumented:**
   - No free() calls captured by profiling
   - hakmem.c's free() likely takes different path
   - Not calling tiny_fast_free() at all

Root cause analysis:

The 4x performance gap (vs system malloc) is NOT due to:
- Entry point overhead (Phase 1) 
- Dual free lists (Phase 2) 
- Batch refill efficiency (Phase 3) 

The REAL problems:
1. **Tiny fast path is barely used** (2.5% coverage)
2. **Refill is catastrophically slow** (38K cycles)
3. **Even cache hits are 24x too slow** (2.5K cycles)
4. **Free path is completely bypassed**

Why system malloc is 4x faster:
- System tcache has ~100 cycle malloc
- System tcache has ~90% hit rate (vs our 2.5% usage)
- System malloc/free are symmetric (we only optimize malloc)

Next steps:
1. Investigate why 97.5% bypass tiny_fast_alloc
2. Profile the slow path (hak_alloc_at) that handles 97.5%
3. Understand why even cache hits take 2,476 cycles
4. Instrument free() path to see where frees go
5. May need to optimize slow path instead of fast path

This profiling reveals we've been optimizing the wrong thing.
The "fast path" is neither fast (2.5K cycles) nor used (2.5%).
This commit is contained in:
Claude
2025-11-05 05:44:18 +00:00
parent 3429ed4457
commit 872622b78b
2 changed files with 153 additions and 2 deletions

View File

@ -43,12 +43,48 @@ extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
static __thread uint64_t g_tiny_fast_refill_count = 0;
static __thread uint64_t g_tiny_fast_drain_count = 0;
// Forward declaration for atexit registration
// ========== RDTSC Cycle Profiling ==========
// Ultra-lightweight profiling using CPU Time-Stamp Counter (~10 cycles overhead)
#ifdef __x86_64__
static inline uint64_t rdtsc(void) {
unsigned int lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return ((uint64_t)hi << 32) | lo;
}
#else
static inline uint64_t rdtsc(void) { return 0; } // Fallback for non-x86
#endif
// Per-thread cycle counters (gated by HAKMEM_TINY_PROFILE env var)
// Declared as extern in tiny_fastcache.h for inline functions
__thread uint64_t g_tiny_malloc_count = 0;
__thread uint64_t g_tiny_malloc_cycles = 0;
__thread uint64_t g_tiny_free_count = 0;
__thread uint64_t g_tiny_free_cycles = 0;
__thread uint64_t g_tiny_refill_cycles = 0;
__thread uint64_t g_tiny_migration_count = 0;
__thread uint64_t g_tiny_migration_cycles = 0;
int g_profile_enabled = -1; // -1: uninitialized, 0: off, 1: on (extern in header)
static inline int profile_enabled(void) {
if (__builtin_expect(g_profile_enabled == -1, 0)) {
const char* env = getenv("HAKMEM_TINY_PROFILE");
g_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
}
return g_profile_enabled;
}
// Forward declarations for atexit registration
void tiny_fast_print_stats(void);
void tiny_fast_print_profile(void);
// ========== Slow Path: Refill from Magazine/SuperSlab ==========
void* tiny_fast_refill(int class_idx) {
uint64_t start = profile_enabled() ? rdtsc() : 0;
if (class_idx < 0 || class_idx >= TINY_FAST_CLASS_COUNT) {
return NULL;
}
@ -59,6 +95,9 @@ void* tiny_fast_refill(int class_idx) {
static __thread int stats_registered = 0;
if (!stats_registered) {
atexit(tiny_fast_print_stats);
if (profile_enabled()) {
atexit(tiny_fast_print_profile);
}
stats_registered = 1;
}
@ -103,6 +142,11 @@ void* tiny_fast_refill(int class_idx) {
g_tiny_fast_cache[class_idx] = *(void**)result;
g_tiny_fast_count[class_idx]--;
// Profile: Record refill cycles
if (start) {
g_tiny_refill_cycles += (rdtsc() - start);
}
return result;
}
@ -158,3 +202,46 @@ void tiny_fast_print_stats(void) {
(unsigned long)g_tiny_fast_drain_count);
}
}
// ========== RDTSC Cycle Profiling Output ==========
void tiny_fast_print_profile(void) {
if (!profile_enabled()) return;
if (g_tiny_malloc_count == 0 && g_tiny_free_count == 0) return; // No data
fprintf(stderr, "\n========== HAKMEM Tiny Fast Path Profile (RDTSC cycles) ==========\n");
if (g_tiny_malloc_count > 0) {
uint64_t avg_malloc = g_tiny_malloc_cycles / g_tiny_malloc_count;
fprintf(stderr, "[MALLOC] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
(unsigned long)g_tiny_malloc_count,
(unsigned long)g_tiny_malloc_cycles,
(unsigned long)avg_malloc);
}
if (g_tiny_free_count > 0) {
uint64_t avg_free = g_tiny_free_cycles / g_tiny_free_count;
fprintf(stderr, "[FREE] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
(unsigned long)g_tiny_free_count,
(unsigned long)g_tiny_free_cycles,
(unsigned long)avg_free);
}
if (g_tiny_fast_refill_count > 0) {
uint64_t avg_refill = g_tiny_refill_cycles / g_tiny_fast_refill_count;
fprintf(stderr, "[REFILL] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
(unsigned long)g_tiny_fast_refill_count,
(unsigned long)g_tiny_refill_cycles,
(unsigned long)avg_refill);
}
if (g_tiny_migration_count > 0) {
uint64_t avg_migration = g_tiny_migration_cycles / g_tiny_migration_count;
fprintf(stderr, "[MIGRATE] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
(unsigned long)g_tiny_migration_count,
(unsigned long)g_tiny_migration_cycles,
(unsigned long)avg_migration);
}
fprintf(stderr, "===================================================================\n\n");
}

View File

@ -6,6 +6,7 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdlib.h> // For getenv()
// ========== Configuration ==========
@ -42,6 +43,37 @@ extern __thread int g_tiny_fast_initialized;
extern __thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
extern __thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];
// ========== RDTSC Profiling (Phase 6-8) ==========
// Extern declarations for inline functions to access profiling counters
extern __thread uint64_t g_tiny_malloc_count;
extern __thread uint64_t g_tiny_malloc_cycles;
extern __thread uint64_t g_tiny_free_count;
extern __thread uint64_t g_tiny_free_cycles;
extern __thread uint64_t g_tiny_refill_cycles;
extern __thread uint64_t g_tiny_migration_count;
extern __thread uint64_t g_tiny_migration_cycles;
#ifdef __x86_64__
static inline uint64_t tiny_fast_rdtsc(void) {
unsigned int lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return ((uint64_t)hi << 32) | lo;
}
#else
static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
#endif
extern int g_profile_enabled;
static inline int tiny_fast_profile_enabled(void) {
extern int g_profile_enabled;
if (__builtin_expect(g_profile_enabled == -1, 0)) {
const char* env = getenv("HAKMEM_TINY_PROFILE");
g_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
}
return g_profile_enabled;
}
// ========== Size to Class Mapping ==========
// Inline size-to-class for fast path (O(1) lookup table)
@ -91,6 +123,8 @@ void tiny_fast_drain(int class_idx);
// ========== Fast Path: Alloc (3-4 instructions!) ==========
static inline void* tiny_fast_alloc(size_t size) {
uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;
// Step 1: Size to class (1-2 instructions, branch predictor friendly)
int cls = tiny_fast_size_to_class(size);
if (__builtin_expect(cls < 0, 0)) return NULL; // Not tiny (rare)
@ -101,6 +135,11 @@ static inline void* tiny_fast_alloc(size_t size) {
// Fast path: Pop head, decrement count
g_tiny_fast_cache[cls] = *(void**)ptr;
g_tiny_fast_count[cls]--;
if (start) {
g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
g_tiny_malloc_count++;
}
return ptr;
}
@ -110,6 +149,8 @@ static inline void* tiny_fast_alloc(size_t size) {
// This is mimalloc's key optimization: batched migration, zero overhead
// ========================================================================
if (__builtin_expect(g_tiny_fast_free_head[cls] != NULL, 0)) {
uint64_t mig_start = start ? tiny_fast_rdtsc() : 0;
// Migrate entire free_head → alloc_head (pointer swap, instant!)
g_tiny_fast_cache[cls] = g_tiny_fast_free_head[cls];
g_tiny_fast_count[cls] = g_tiny_fast_free_count[cls];
@ -120,16 +161,34 @@ static inline void* tiny_fast_alloc(size_t size) {
ptr = g_tiny_fast_cache[cls];
g_tiny_fast_cache[cls] = *(void**)ptr;
g_tiny_fast_count[cls]--;
if (mig_start) {
g_tiny_migration_cycles += (tiny_fast_rdtsc() - mig_start);
g_tiny_migration_count++;
}
if (start) {
g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
g_tiny_malloc_count++;
}
return ptr;
}
// Step 3: Slow path - refill from Magazine/SuperSlab
return tiny_fast_refill(cls);
ptr = tiny_fast_refill(cls);
if (start) {
g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
g_tiny_malloc_count++;
}
return ptr;
}
// ========== Fast Path: Free (2-3 instructions!) ==========
static inline void tiny_fast_free(void* ptr, size_t size) {
uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;
// Step 1: Size to class
int cls = tiny_fast_size_to_class(size);
if (__builtin_expect(cls < 0, 0)) return; // Not tiny (error)
@ -150,6 +209,11 @@ static inline void tiny_fast_free(void* ptr, size_t size) {
*(void**)ptr = g_tiny_fast_free_head[cls];
g_tiny_fast_free_head[cls] = ptr;
g_tiny_fast_free_count[cls]++;
if (start) {
g_tiny_free_cycles += (tiny_fast_rdtsc() - start);
g_tiny_free_count++;
}
}
// ========== Initialization ==========