Add RDTSC profiling - Identify refill bottleneck
Profiling Results: - Fast path: 143 cycles (10.4% of time) ✅ Good - Refill: 19,624 cycles (89.6% of time) 🚨 Bottleneck! Refill is 137x slower than fast path and dominates total cost. Only happens 6.3% of the time but takes 90% of execution time. Next: Optimize sll_refill_small_from_ss() backend.
This commit is contained in:
@ -48,6 +48,52 @@ extern int hak_tiny_size_to_class(size_t size);
|
||||
#define HAK_RET_ALLOC(cls, ptr) return (ptr)
|
||||
#endif
|
||||
|
||||
// ========== RDTSC Profiling (lightweight) ==========
|
||||
#ifdef __x86_64__
|
||||
static inline uint64_t tiny_fast_rdtsc(void) {
|
||||
unsigned int lo, hi;
|
||||
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||
return ((uint64_t)hi << 32) | lo;
|
||||
}
|
||||
#else
|
||||
static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
|
||||
#endif
|
||||
|
||||
// Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1)
|
||||
static __thread uint64_t g_tiny_alloc_hits = 0;
|
||||
static __thread uint64_t g_tiny_alloc_cycles = 0;
|
||||
static __thread uint64_t g_tiny_refill_calls = 0;
|
||||
static __thread uint64_t g_tiny_refill_cycles = 0;
|
||||
static int g_tiny_profile_enabled = -1; // -1: uninitialized
|
||||
|
||||
static inline int tiny_profile_enabled(void) {
|
||||
if (__builtin_expect(g_tiny_profile_enabled == -1, 0)) {
|
||||
const char* env = getenv("HAKMEM_TINY_PROFILE");
|
||||
g_tiny_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
|
||||
}
|
||||
return g_tiny_profile_enabled;
|
||||
}
|
||||
|
||||
// Print profiling results at exit
|
||||
static void tiny_fast_print_profile(void) __attribute__((destructor));
|
||||
static void tiny_fast_print_profile(void) {
|
||||
if (!tiny_profile_enabled()) return;
|
||||
if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) return;
|
||||
|
||||
fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
|
||||
if (g_tiny_alloc_hits > 0) {
|
||||
fprintf(stderr, "[ALLOC HIT] count=%lu, avg_cycles=%lu\n",
|
||||
(unsigned long)g_tiny_alloc_hits,
|
||||
(unsigned long)(g_tiny_alloc_cycles / g_tiny_alloc_hits));
|
||||
}
|
||||
if (g_tiny_refill_calls > 0) {
|
||||
fprintf(stderr, "[REFILL] count=%lu, avg_cycles=%lu\n",
|
||||
(unsigned long)g_tiny_refill_calls,
|
||||
(unsigned long)(g_tiny_refill_cycles / g_tiny_refill_calls));
|
||||
}
|
||||
fprintf(stderr, "===================================================\n\n");
|
||||
}
|
||||
|
||||
// ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========
|
||||
|
||||
// Allocation fast path (inline for zero-cost)
|
||||
@ -65,9 +111,8 @@ extern int hak_tiny_size_to_class(size_t size);
|
||||
//
|
||||
// Expected: 3-4 instructions on hit (1 load, 1 test, 1 load, 1 store)
|
||||
static inline void* tiny_alloc_fast_pop(int class_idx) {
|
||||
// Optional one-shot front-path diag (env: HAKMEM_TINY_FRONT_DIAG=1)
|
||||
do { static int en=-1, once=0; if (en==-1){ const char* s=getenv("HAKMEM_TINY_FRONT_DIAG"); en=(s&&*s&&*s!='0')?1:0; }
|
||||
if (en && !once){ fprintf(stderr, "[FRONT] tiny_alloc_fast_pop active (class=%d)\n", class_idx); once=1; } } while(0);
|
||||
uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
|
||||
|
||||
// Box Boundary: TLS freelist の先頭を pop
|
||||
// Ownership: TLS なので所有権チェック不要(同一スレッド保証)
|
||||
void* head = g_tls_sll_head[class_idx];
|
||||
@ -85,6 +130,10 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
|
||||
g_free_via_tls_sll[class_idx]++;
|
||||
#endif
|
||||
|
||||
if (start) {
|
||||
g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
|
||||
g_tiny_alloc_hits++;
|
||||
}
|
||||
return head;
|
||||
}
|
||||
|
||||
@ -106,13 +155,12 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
|
||||
// - Smaller count (8-16): better for diverse workloads, faster warmup
|
||||
// - Larger count (64-128): better for homogeneous workloads, fewer refills
|
||||
static inline int tiny_alloc_fast_refill(int class_idx) {
|
||||
// Optional one-shot diag (env)
|
||||
do { static int en=-1, once=0; if (en==-1){ const char* s=getenv("HAKMEM_TINY_FRONT_DIAG"); en=(s&&*s&&*s!='0')?1:0; }
|
||||
if (en && !once){ fprintf(stderr, "[FRONT] tiny_alloc_fast_refill enter (class=%d)\n", class_idx); once=1; } } while(0);
|
||||
uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
|
||||
|
||||
// Tunable refill count (cached in TLS for performance)
|
||||
static __thread int s_refill_count = 0;
|
||||
if (__builtin_expect(s_refill_count == 0, 0)) {
|
||||
int def = 128; // Phase 1 Quick Win: 32 → 128 (reduce refill overhead)
|
||||
int def = 16; // Default: 16 (smaller = less overhead per refill)
|
||||
char* env = getenv("HAKMEM_TINY_REFILL_COUNT");
|
||||
int v = (env ? atoi(env) : def);
|
||||
|
||||
@ -133,6 +181,11 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
|
||||
// Note: g_rf_hit_slab counter is incremented inside sll_refill_small_from_ss()
|
||||
int refilled = sll_refill_small_from_ss(class_idx, s_refill_count);
|
||||
|
||||
if (start) {
|
||||
g_tiny_refill_cycles += (tiny_fast_rdtsc() - start);
|
||||
g_tiny_refill_calls++;
|
||||
}
|
||||
|
||||
return refilled;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user