diff --git a/core/tiny_alloc_fast.inc.h b/core/tiny_alloc_fast.inc.h
index b9cf1ea9..34d06109 100644
--- a/core/tiny_alloc_fast.inc.h
+++ b/core/tiny_alloc_fast.inc.h
@@ -48,6 +48,52 @@ extern int hak_tiny_size_to_class(size_t size);
 #define HAK_RET_ALLOC(cls, ptr) return (ptr)
 #endif
 
+// ========== RDTSC Profiling (lightweight) ==========
+#ifdef __x86_64__
+static inline uint64_t tiny_fast_rdtsc(void) {
+    unsigned int lo, hi;
+    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
+    return ((uint64_t)hi << 32) | lo;
+}
+#else
+static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
+#endif
+
+// Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1)
+static __thread uint64_t g_tiny_alloc_hits = 0;
+static __thread uint64_t g_tiny_alloc_cycles = 0;
+static __thread uint64_t g_tiny_refill_calls = 0;
+static __thread uint64_t g_tiny_refill_cycles = 0;
+static int g_tiny_profile_enabled = -1;  // -1: uninitialized
+
+static inline int tiny_profile_enabled(void) {
+    if (__builtin_expect(g_tiny_profile_enabled == -1, 0)) {
+        const char* env = getenv("HAKMEM_TINY_PROFILE");
+        g_tiny_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
+    }
+    return g_tiny_profile_enabled;
+}
+
+// Print profiling results at exit
+static void tiny_fast_print_profile(void) __attribute__((destructor));
+static void tiny_fast_print_profile(void) {
+    if (!tiny_profile_enabled()) return;
+    if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) return;
+
+    fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
+    if (g_tiny_alloc_hits > 0) {
+        fprintf(stderr, "[ALLOC HIT] count=%lu, avg_cycles=%lu\n",
+                (unsigned long)g_tiny_alloc_hits,
+                (unsigned long)(g_tiny_alloc_cycles / g_tiny_alloc_hits));
+    }
+    if (g_tiny_refill_calls > 0) {
+        fprintf(stderr, "[REFILL] count=%lu, avg_cycles=%lu\n",
+                (unsigned long)g_tiny_refill_calls,
+                (unsigned long)(g_tiny_refill_cycles / g_tiny_refill_calls));
+    }
+    fprintf(stderr, "===================================================\n\n");
+}
+
 // ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========
 
 // Allocation fast path (inline for zero-cost)
@@ -65,9 +111,8 @@ extern int hak_tiny_size_to_class(size_t size);
 //
 // Expected: 3-4 instructions on hit (1 load, 1 test, 1 load, 1 store)
 static inline void* tiny_alloc_fast_pop(int class_idx) {
-    // Optional one-shot front-path diag (env: HAKMEM_TINY_FRONT_DIAG=1)
-    do { static int en=-1, once=0; if (en==-1){ const char* s=getenv("HAKMEM_TINY_FRONT_DIAG"); en=(s&&*s&&*s!='0')?1:0; }
-         if (en && !once){ fprintf(stderr, "[FRONT] tiny_alloc_fast_pop active (class=%d)\n", class_idx); once=1; } } while(0);
+    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
+
     // Box Boundary: pop the head of the TLS freelist
     // Ownership: TLS-local, so no ownership check is needed (same-thread guarantee)
     void* head = g_tls_sll_head[class_idx];
@@ -85,6 +130,10 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
     g_free_via_tls_sll[class_idx]++;
 #endif
 
+    if (start) {
+        g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
+        g_tiny_alloc_hits++;
+    }
     return head;
 }
 
@@ -106,13 +155,12 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
 // - Smaller count (8-16): better for diverse workloads, faster warmup
 // - Larger count (64-128): better for homogeneous workloads, fewer refills
 static inline int tiny_alloc_fast_refill(int class_idx) {
-    // Optional one-shot diag (env)
-    do { static int en=-1, once=0; if (en==-1){ const char* s=getenv("HAKMEM_TINY_FRONT_DIAG"); en=(s&&*s&&*s!='0')?1:0; }
-         if (en && !once){ fprintf(stderr, "[FRONT] tiny_alloc_fast_refill enter (class=%d)\n", class_idx); once=1; } } while(0);
+    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
+
     // Tunable refill count (cached in TLS for performance)
     static __thread int s_refill_count = 0;
     if (__builtin_expect(s_refill_count == 0, 0)) {
-        int def = 128;  // Phase 1 Quick Win: 32 → 128 (reduce refill overhead)
+        int def = 16;   // Default: 16 (smaller = less overhead per refill)
         char* env = getenv("HAKMEM_TINY_REFILL_COUNT");
         int v = (env ? atoi(env) : def);
 
@@ -133,6 +181,11 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
 
     // Note: g_rf_hit_slab counter is incremented inside sll_refill_small_from_ss()
    int refilled = sll_refill_small_from_ss(class_idx, s_refill_count);
+    if (start) {
+        g_tiny_refill_cycles += (tiny_fast_rdtsc() - start);
+        g_tiny_refill_calls++;
+    }
+
     return refilled;
 }
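
Usage note (not part of the patch): profiling is off by default and is gated by the HAKMEM_TINY_PROFILE environment variable introduced above; the refill batch size can still be overridden with HAKMEM_TINY_REFILL_COUNT. A typical invocation looks like the line below, where the binary name and the count of 32 are purely illustrative:

    HAKMEM_TINY_PROFILE=1 HAKMEM_TINY_REFILL_COUNT=32 ./bench_tiny_alloc

At process exit the destructor prints the reporting thread's allocation-hit and refill counts, with average RDTSC cycles per operation, to stderr.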