diff --git a/core/hakmem.c b/core/hakmem.c
index 71ce5a9b..f91b61ad 100644
--- a/core/hakmem.c
+++ b/core/hakmem.c
@@ -1247,7 +1247,16 @@ void* realloc(void* ptr, size_t size) {
 #else
 // malloc wrapper - intercepts system malloc() calls
 
+// Debug counters for malloc routing (Phase 6-6 analysis)
+__thread uint64_t g_malloc_total_calls = 0;
+__thread uint64_t g_malloc_tiny_size_match = 0;
+__thread uint64_t g_malloc_fast_path_tried = 0;
+__thread uint64_t g_malloc_fast_path_null = 0;
+__thread uint64_t g_malloc_slow_path = 0;
+
 void* malloc(size_t size) {
+    g_malloc_total_calls++;
+
     // ========================================================================
     // Phase 6-5: ULTRA-FAST PATH FIRST (mimalloc/tcache style)
     // Inspired by research: tcache has 0 branches, mimalloc has 1-2 branches
@@ -1256,6 +1265,7 @@ void* malloc(size_t size) {
 #ifdef HAKMEM_TINY_FAST_PATH
     // Branch 1: Size check (predicted taken for tiny allocations)
     if (__builtin_expect(size <= TINY_FAST_THRESHOLD, 1)) {
+        g_malloc_tiny_size_match++;
         extern void* tiny_fast_alloc(size_t);
         extern void tiny_fast_init(void);
         extern __thread int g_tiny_fast_initialized;
@@ -1263,10 +1273,12 @@ void* malloc(size_t size) {
         // Branch 2: Initialization check (predicted taken after first call)
         if (__builtin_expect(g_tiny_fast_initialized, 1)) {
             // Branch 3: Cache hit check (predicted taken ~90% of time)
+            g_malloc_fast_path_tried++;
             void* ptr = tiny_fast_alloc(size);
             if (__builtin_expect(ptr != NULL, 1)) {
                 return ptr;  // ✅ FAST PATH: 3 branches total (vs tcache's 0, mimalloc's 1-2)
             }
+            g_malloc_fast_path_null++;
             // Cache miss: fall through to slow path refill
         } else {
             // Cold path: initialize once per thread (rare)
@@ -1279,6 +1291,7 @@ void* malloc(size_t size) {
     // ========================================================================
     // SLOW PATH: All guards moved here (only executed on fast path miss)
     // ========================================================================
+    g_malloc_slow_path++;
 
     // Recursion guard: if we're inside the allocator already, fall back to libc
     if (g_hakmem_lock_depth > 0) {
diff --git a/core/tiny_fastcache.c b/core/tiny_fastcache.c
index 50ebd974..ff3ba050 100644
--- a/core/tiny_fastcache.c
+++ b/core/tiny_fastcache.c
@@ -66,6 +66,12 @@ __thread uint64_t g_tiny_refill_cycles = 0;
 __thread uint64_t g_tiny_migration_count = 0;
 __thread uint64_t g_tiny_migration_cycles = 0;
 
+// Refill failure tracking
+static __thread uint64_t g_refill_success_count = 0;
+static __thread uint64_t g_refill_partial_count = 0;  // Some blocks allocated
+static __thread uint64_t g_refill_fail_count = 0;     // Zero blocks allocated
+static __thread uint64_t g_refill_total_blocks = 0;   // Total blocks actually allocated
+
 int g_profile_enabled = -1;  // -1: uninitialized, 0: off, 1: on (extern in header)
 
 static inline int profile_enabled(void) {
@@ -124,7 +130,16 @@ void* tiny_fast_refill(int class_idx) {
         batch[count++] = ptr;
     }
 
-    if (count == 0) return NULL;  // Complete failure
+    // Track refill results
+    if (count == 0) {
+        g_refill_fail_count++;
+        return NULL;  // Complete failure
+    } else if (count < TINY_FAST_REFILL_BATCH) {
+        g_refill_partial_count++;
+    } else {
+        g_refill_success_count++;
+    }
+    g_refill_total_blocks += count;
 
     // Step 2: Link all blocks into freelist in one pass (batch linking)
     // This is the key optimization: N individual pushes → 1 batch link
@@ -205,12 +220,38 @@ void tiny_fast_print_stats(void) {
 
 // ========== RDTSC Cycle Profiling Output ==========
 
+// External routing counters from hakmem.c
+extern __thread uint64_t g_malloc_total_calls;
+extern __thread uint64_t g_malloc_tiny_size_match;
+extern __thread uint64_t g_malloc_fast_path_tried;
+extern __thread uint64_t g_malloc_fast_path_null;
+extern __thread uint64_t g_malloc_slow_path;
+
 void tiny_fast_print_profile(void) {
     if (!profile_enabled()) return;
     if (g_tiny_malloc_count == 0 && g_tiny_free_count == 0) return;  // No data
 
     fprintf(stderr, "\n========== HAKMEM Tiny Fast Path Profile (RDTSC cycles) ==========\n");
 
+    // Routing statistics first
+    if (g_malloc_total_calls > 0) {
+        fprintf(stderr, "\n[ROUTING]\n");
+        fprintf(stderr, "  Total malloc() calls: %lu\n", (unsigned long)g_malloc_total_calls);
+        fprintf(stderr, "  Size <= %d (tiny range): %lu (%.1f%%)\n",
+                TINY_FAST_THRESHOLD,
+                (unsigned long)g_malloc_tiny_size_match,
+                100.0 * g_malloc_tiny_size_match / g_malloc_total_calls);
+        fprintf(stderr, "  Fast path tried: %lu (%.1f%%)\n",
+                (unsigned long)g_malloc_fast_path_tried,
+                100.0 * g_malloc_fast_path_tried / g_malloc_total_calls);
+        fprintf(stderr, "  Fast path returned NULL: %lu (%.1f%% of tried)\n",
+                (unsigned long)g_malloc_fast_path_null,
+                g_malloc_fast_path_tried > 0 ? 100.0 * g_malloc_fast_path_null / g_malloc_fast_path_tried : 0);
+        fprintf(stderr, "  Slow path entered: %lu (%.1f%%)\n\n",
+                (unsigned long)g_malloc_slow_path,
+                100.0 * g_malloc_slow_path / g_malloc_total_calls);
+    }
+
     if (g_tiny_malloc_count > 0) {
         uint64_t avg_malloc = g_tiny_malloc_cycles / g_tiny_malloc_count;
         fprintf(stderr, "[MALLOC] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
@@ -233,6 +274,20 @@ void tiny_fast_print_profile(void) {
                 (unsigned long)g_tiny_fast_refill_count,
                 (unsigned long)g_tiny_refill_cycles,
                 (unsigned long)avg_refill);
+
+        // Refill success/failure breakdown
+        fprintf(stderr, "[REFILL SUCCESS] count=%lu (%.1f%%) - full batch\n",
+                (unsigned long)g_refill_success_count,
+                100.0 * g_refill_success_count / g_tiny_fast_refill_count);
+        fprintf(stderr, "[REFILL PARTIAL] count=%lu (%.1f%%) - some blocks\n",
+                (unsigned long)g_refill_partial_count,
+                100.0 * g_refill_partial_count / g_tiny_fast_refill_count);
+        fprintf(stderr, "[REFILL FAIL] count=%lu (%.1f%%) - zero blocks\n",
+                (unsigned long)g_refill_fail_count,
+                100.0 * g_refill_fail_count / g_tiny_fast_refill_count);
+        fprintf(stderr, "[REFILL AVG BLOCKS] %.1f per refill (target=%d)\n",
+                (double)g_refill_total_blocks / g_tiny_fast_refill_count,
+                TINY_FAST_REFILL_BATCH);
     }
 
     if (g_tiny_migration_count > 0) {
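
A minimal way to exercise the new counters, offered as a sketch rather than part of the patch: the program below assumes the HAKMEM allocator is linked in (so malloc() resolves to the wrapper above), that profiling has been enabled through the existing g_profile_enabled switch, and that a 32-byte allocation falls under TINY_FAST_THRESHOLD. The file name and loop count are illustrative only.

/* tiny_counter_smoke.c - hypothetical smoke test, not part of this patch */
#include <stdlib.h>

/* Provided by core/tiny_fastcache.c; prints the [ROUTING] and [REFILL ...]
 * sections added above when profiling is enabled. */
extern void tiny_fast_print_profile(void);

int main(void) {
    /* Drive the tiny fast path: allocate and free small blocks repeatedly
     * so the per-thread routing and refill counters accumulate totals. */
    for (int i = 0; i < 100000; i++) {
        void* p = malloc(32);   /* assumed to be <= TINY_FAST_THRESHOLD */
        free(p);
    }
    tiny_fast_print_profile();
    return 0;
}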