diff --git a/core/tiny_fastcache.c b/core/tiny_fastcache.c
index 813201e8..50ebd974 100644
--- a/core/tiny_fastcache.c
+++ b/core/tiny_fastcache.c
@@ -43,12 +43,48 @@ extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
 static __thread uint64_t g_tiny_fast_refill_count = 0;
 static __thread uint64_t g_tiny_fast_drain_count = 0;
 
-// Forward declaration for atexit registration
+// ========== RDTSC Cycle Profiling ==========
+// Ultra-lightweight profiling using the CPU Time-Stamp Counter (~10 cycles overhead)
+
+#ifdef __x86_64__
+static inline uint64_t rdtsc(void) {
+    unsigned int lo, hi;
+    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+    return ((uint64_t)hi << 32) | lo;
+}
+#else
+static inline uint64_t rdtsc(void) { return 0; }  // Fallback for non-x86
+#endif
+
+// Per-thread cycle counters (gated by the HAKMEM_TINY_PROFILE env var)
+// Declared as extern in tiny_fastcache.h for the inline fast-path functions
+__thread uint64_t g_tiny_malloc_count = 0;
+__thread uint64_t g_tiny_malloc_cycles = 0;
+__thread uint64_t g_tiny_free_count = 0;
+__thread uint64_t g_tiny_free_cycles = 0;
+__thread uint64_t g_tiny_refill_cycles = 0;
+__thread uint64_t g_tiny_migration_count = 0;
+__thread uint64_t g_tiny_migration_cycles = 0;
+
+int g_profile_enabled = -1;  // -1: uninitialized, 0: off, 1: on (extern in header)
+
+static inline int profile_enabled(void) {
+    if (__builtin_expect(g_profile_enabled == -1, 0)) {
+        const char* env = getenv("HAKMEM_TINY_PROFILE");
+        g_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
+    }
+    return g_profile_enabled;
+}
+
+// Forward declarations for atexit registration
 void tiny_fast_print_stats(void);
+void tiny_fast_print_profile(void);
 
 // ========== Slow Path: Refill from Magazine/SuperSlab ==========
 
 void* tiny_fast_refill(int class_idx) {
+    uint64_t start = profile_enabled() ? rdtsc() : 0;
+
     if (class_idx < 0 || class_idx >= TINY_FAST_CLASS_COUNT) {
         return NULL;
     }
@@ -59,6 +95,9 @@ void* tiny_fast_refill(int class_idx) {
     static __thread int stats_registered = 0;
     if (!stats_registered) {
         atexit(tiny_fast_print_stats);
+        if (profile_enabled()) {
+            atexit(tiny_fast_print_profile);
+        }
         stats_registered = 1;
     }
 
@@ -103,6 +142,11 @@ void* tiny_fast_refill(int class_idx) {
     g_tiny_fast_cache[class_idx] = *(void**)result;
     g_tiny_fast_count[class_idx]--;
 
+    // Profile: Record refill cycles
+    if (start) {
+        g_tiny_refill_cycles += (rdtsc() - start);
+    }
+
     return result;
 }
 
@@ -158,3 +202,46 @@ void tiny_fast_print_stats(void) {
                (unsigned long)g_tiny_fast_drain_count);
     }
 }
+
+// ========== RDTSC Cycle Profiling Output ==========
+
+void tiny_fast_print_profile(void) {
+    if (!profile_enabled()) return;
+    if (g_tiny_malloc_count == 0 && g_tiny_free_count == 0) return;  // No data
+
+    fprintf(stderr, "\n========== HAKMEM Tiny Fast Path Profile (RDTSC cycles) ==========\n");
+
+    if (g_tiny_malloc_count > 0) {
+        uint64_t avg_malloc = g_tiny_malloc_cycles / g_tiny_malloc_count;
+        fprintf(stderr, "[MALLOC]  count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
+                (unsigned long)g_tiny_malloc_count,
+                (unsigned long)g_tiny_malloc_cycles,
+                (unsigned long)avg_malloc);
+    }
+
+    if (g_tiny_free_count > 0) {
+        uint64_t avg_free = g_tiny_free_cycles / g_tiny_free_count;
+        fprintf(stderr, "[FREE]    count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
+                (unsigned long)g_tiny_free_count,
+                (unsigned long)g_tiny_free_cycles,
+                (unsigned long)avg_free);
+    }
+
+    if (g_tiny_fast_refill_count > 0) {
+        uint64_t avg_refill = g_tiny_refill_cycles / g_tiny_fast_refill_count;
+        fprintf(stderr, "[REFILL]  count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
+                (unsigned long)g_tiny_fast_refill_count,
+                (unsigned long)g_tiny_refill_cycles,
+                (unsigned long)avg_refill);
+    }
+
+    if (g_tiny_migration_count > 0) {
+        uint64_t avg_migration = g_tiny_migration_cycles / g_tiny_migration_count;
+        fprintf(stderr, "[MIGRATE] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
+                (unsigned long)g_tiny_migration_count,
+                (unsigned long)g_tiny_migration_cycles,
+                (unsigned long)avg_migration);
+    }
+
+    fprintf(stderr, "===================================================================\n\n");
+}
diff --git a/core/tiny_fastcache.h b/core/tiny_fastcache.h
index 8e21768a..96e76164 100644
--- a/core/tiny_fastcache.h
+++ b/core/tiny_fastcache.h
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include <stdlib.h>  // For getenv()
 
 // ========== Configuration ==========
 
@@ -42,6 +43,37 @@ extern __thread int g_tiny_fast_initialized;
 extern __thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
 extern __thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];
 
+// ========== RDTSC Profiling (Phase 6-8) ==========
+// Extern declarations so the inline fast-path functions can update the counters
+
+extern __thread uint64_t g_tiny_malloc_count;
+extern __thread uint64_t g_tiny_malloc_cycles;
+extern __thread uint64_t g_tiny_free_count;
+extern __thread uint64_t g_tiny_free_cycles;
+extern __thread uint64_t g_tiny_refill_cycles;
+extern __thread uint64_t g_tiny_migration_count;
+extern __thread uint64_t g_tiny_migration_cycles;
+
+#ifdef __x86_64__
+static inline uint64_t tiny_fast_rdtsc(void) {
+    unsigned int lo, hi;
+    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+    return ((uint64_t)hi << 32) | lo;
+}
+#else
+static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
+#endif
+
+extern int g_profile_enabled;
+static inline int tiny_fast_profile_enabled(void) {
+    extern int g_profile_enabled;
+    if (__builtin_expect(g_profile_enabled == -1, 0)) {
+        const char* env = getenv("HAKMEM_TINY_PROFILE");
+        g_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
+    }
+    return g_profile_enabled;
+}
+
 // ========== Size to Class Mapping ==========
 // Inline size-to-class for fast path (O(1) lookup table)
 
@@ -91,6 +123,8 @@ void tiny_fast_drain(int class_idx);
 // ========== Fast Path: Alloc (3-4 instructions!) ==========
 
 static inline void* tiny_fast_alloc(size_t size) {
+    uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;
+
     // Step 1: Size to class (1-2 instructions, branch predictor friendly)
     int cls = tiny_fast_size_to_class(size);
     if (__builtin_expect(cls < 0, 0)) return NULL;  // Not tiny (rare)
@@ -101,6 +135,11 @@ static inline void* tiny_fast_alloc(size_t size) {
         // Fast path: Pop head, decrement count
         g_tiny_fast_cache[cls] = *(void**)ptr;
         g_tiny_fast_count[cls]--;
+
+        if (start) {
+            g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
+            g_tiny_malloc_count++;
+        }
         return ptr;
     }
 
@@ -110,6 +149,8 @@ static inline void* tiny_fast_alloc(size_t size) {
     // This is mimalloc's key optimization: batched migration, zero overhead
     // ========================================================================
     if (__builtin_expect(g_tiny_fast_free_head[cls] != NULL, 0)) {
+        uint64_t mig_start = start ? tiny_fast_rdtsc() : 0;
+
         // Migrate entire free_head → alloc_head (pointer swap, instant!)
         g_tiny_fast_cache[cls] = g_tiny_fast_free_head[cls];
         g_tiny_fast_count[cls] = g_tiny_fast_free_count[cls];
@@ -120,16 +161,34 @@ static inline void* tiny_fast_alloc(size_t size) {
         ptr = g_tiny_fast_cache[cls];
         g_tiny_fast_cache[cls] = *(void**)ptr;
         g_tiny_fast_count[cls]--;
+
+        if (mig_start) {
+            g_tiny_migration_cycles += (tiny_fast_rdtsc() - mig_start);
+            g_tiny_migration_count++;
+        }
+
+        if (start) {
+            g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
+            g_tiny_malloc_count++;
+        }
         return ptr;
     }
 
     // Step 3: Slow path - refill from Magazine/SuperSlab
-    return tiny_fast_refill(cls);
+    ptr = tiny_fast_refill(cls);
+
+    if (start) {
+        g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
+        g_tiny_malloc_count++;
+    }
+    return ptr;
 }
 
 // ========== Fast Path: Free (2-3 instructions!) ==========
 
 static inline void tiny_fast_free(void* ptr, size_t size) {
+    uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;
+
     // Step 1: Size to class
     int cls = tiny_fast_size_to_class(size);
     if (__builtin_expect(cls < 0, 0)) return;  // Not tiny (error)
@@ -150,6 +209,11 @@ static inline void tiny_fast_free(void* ptr, size_t size) {
     *(void**)ptr = g_tiny_fast_free_head[cls];
     g_tiny_fast_free_head[cls] = ptr;
     g_tiny_fast_free_count[cls]++;
+
+    if (start) {
+        g_tiny_free_cycles += (tiny_fast_rdtsc() - start);
+        g_tiny_free_count++;
+    }
 }
 
 // ========== Initialization ==========
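A minimal way to exercise this patch is sketched below as a hypothetical smoke test; the file name, the 64-byte size, and the build/link step are assumptions and not part of this change. The idea: hammer tiny_fast_alloc()/tiny_fast_free() in a loop, run with HAKMEM_TINY_PROFILE=1, and let the atexit() hook registered by the first refill print the per-thread cycle report on exit.

// test_tiny_profile.c -- hypothetical smoke test, not part of this patch.
// Assumes 64 bytes maps to a tiny class and the full HAKMEM allocator
// (Magazine/SuperSlab backends) is linked in, with core/ on the include path.
#include <stdio.h>
#include "tiny_fastcache.h"

int main(void) {
    for (int i = 0; i < 1000000; i++) {
        void* p = tiny_fast_alloc(64);   // first call refills, later calls hit the fast path
        if (p) tiny_fast_free(p, 64);    // size-keyed free back to the thread-local cache
    }
    // With HAKMEM_TINY_PROFILE=1 set, tiny_fast_print_profile() was registered
    // via atexit() during the first refill, so the cycle report prints at exit.
    puts("done");
    return 0;
}

Run it as, for example, HAKMEM_TINY_PROFILE=1 ./test_tiny_profile. With the variable unset, profile_enabled() latches 0 after the first check and the fast path only pays a cached load plus a predictable branch per call.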