Add malloc routing analysis and refill success tracking

### Changes:
- **Routing Counters**: Added per-thread counters in hakmem.c (see the funnel sketch after this list) to track:
  - g_malloc_total_calls: Total malloc() invocations
  - g_malloc_tiny_size_match: Calls within tiny size range (<=128B)
  - g_malloc_fast_path_tried: Calls that attempted fast path
  - g_malloc_fast_path_null: Fast path returned NULL
  - g_malloc_slow_path: Calls routed to slow path

- **Refill Success Tracking**: Added counters in tiny_fastcache.c:
  - g_refill_success_count: Full batch (16 blocks)
  - g_refill_partial_count: Partial batch (<16 blocks)
  - g_refill_fail_count: Zero blocks allocated
  - g_refill_total_blocks: Total blocks across all refills

- **Profile Output Enhanced**: tiny_fast_print_profile() now shows:
  - Routing statistics (which path each allocation takes)
  - Refill success/failure breakdown
  - Average blocks per refill
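
As an aid to reading the routing counters, here is a minimal sketch (not part of the commit) of the funnel invariants implied by where each counter is incremented in malloc(); the helper name is hypothetical:

```c
#include <assert.h>
#include <stdint.h>

/* Per-thread routing counters as declared in hakmem.c. */
extern __thread uint64_t g_malloc_total_calls;
extern __thread uint64_t g_malloc_tiny_size_match;
extern __thread uint64_t g_malloc_fast_path_tried;
extern __thread uint64_t g_malloc_fast_path_null;
extern __thread uint64_t g_malloc_slow_path;

/* Funnel invariants: each stage is a subset of the one above it. */
static void routing_sanity_check(void) {
    assert(g_malloc_tiny_size_match <= g_malloc_total_calls);
    assert(g_malloc_fast_path_tried <= g_malloc_tiny_size_match);
    assert(g_malloc_fast_path_null  <= g_malloc_fast_path_tried);
    /* Modulo the once-per-thread init detour, every call that does not
     * return from the fast path should show up in g_malloc_slow_path:
     * slow_path ~= total - (tried - null). */
}
```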

### Key Findings:
✅ Fast path routing: ~100% success (20,479/20,480 calls per thread)
✅ Refill success: 100% (1,285 refills, all 16 blocks each; cross-checked below)
⚠️ Performance: still only 1.68M ops/s vs system malloc's 8.06M (20.8%)
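
A quick arithmetic cross-check of these two findings (illustrative, using only the numbers above and the 16-block batch size from the commit):

```c
#include <stdio.h>

int main(void) {
    /* Numbers from the findings above; batch size 16 per the commit. */
    const unsigned long refills = 1285, batch = 16, allocs = 20480;
    unsigned long supplied = refills * batch;  /* = 20,560 blocks */
    printf("blocks supplied: %lu vs allocations: %lu\n", supplied, allocs);
    /* 20,560 >= 20,480: full batches alone cover the workload, which is
     * consistent with zero partial or failed refills in the run. */
    return 0;
}
```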

**Root Cause Confirmed**:
- NOT a routing problem (100% reach fast path)
- NOT a refill failure (100% success)
- IS a structural performance issue (2,418 cycles avg for malloc)

**Bottlenecks Identified**:
1. Fast path cache hits: ~2,418 cycles (vs tcache ~100 cycles)
2. Refill operations: ~39,938 cycles (expensive but infrequent)
3. Overall throughput: 4.8x slower than system malloc
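
As a back-of-envelope consistency check (illustrative; the ~3 GHz clock is an assumption, not a figure from the analysis):

```c
#include <stdio.h>

int main(void) {
    const double clock_hz   = 3.0e9;   /* assumed core clock, machine-dependent */
    const double avg_cycles = 2418.0;  /* measured avg malloc cost from the profile */
    /* Cycle cost alone bounds throughput near the measured 1.68M ops/s,
     * so the avg-cycle figure and the 4.8x gap are mutually consistent. */
    printf("malloc/s bound: %.2fM (measured: 1.68M, system: 8.06M)\n",
           clock_hz / avg_cycles / 1e6);
    return 0;
}
```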

**Next Steps** (per LARSON_PERFORMANCE_ANALYSIS_2025_11_05.md):
- Option B: Refill efficiency (batch allocation from SuperSlab)
- Option C: Ultra-fast path redesign (tcache-equivalent; a sketch follows below)
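
For scale, a minimal sketch of what a "tcache-equivalent" hit path could look like; the TLS freelist array and class count are hypothetical, not the commit's design:

```c
#include <stddef.h>

#define TINY_NUM_CLASSES 8  /* hypothetical class count */

/* Hypothetical per-thread freelist heads; next pointers live in the blocks. */
static __thread void* g_tiny_fast_head[TINY_NUM_CLASSES];

/* Hit path: one predicted branch, one load, one store - roughly the
 * ~100-cycle tcache baseline the analysis compares against. */
static inline void* ultra_fast_alloc(int class_idx) {
    void* head = g_tiny_fast_head[class_idx];
    if (__builtin_expect(head != NULL, 1)) {
        g_tiny_fast_head[class_idx] = *(void**)head;  /* pop the first block */
        return head;
    }
    return NULL;  /* caller falls back to refill / slow path */
}
```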

Related: LARSON_PERFORMANCE_ANALYSIS_2025_11_05.md
commit 31af3eab27 (parent 872622b78b)
Author: Claude
Date: 2025-11-05 05:56:02 +00:00
2 changed files with 69 additions and 1 deletion

**hakmem.c**

```diff
@@ -1247,7 +1247,16 @@ void* realloc(void* ptr, size_t size) {
 #else
 // malloc wrapper - intercepts system malloc() calls
+
+// Debug counters for malloc routing (Phase 6-6 analysis)
+__thread uint64_t g_malloc_total_calls = 0;
+__thread uint64_t g_malloc_tiny_size_match = 0;
+__thread uint64_t g_malloc_fast_path_tried = 0;
+__thread uint64_t g_malloc_fast_path_null = 0;
+__thread uint64_t g_malloc_slow_path = 0;
+
 void* malloc(size_t size) {
+    g_malloc_total_calls++;
     // ========================================================================
     // Phase 6-5: ULTRA-FAST PATH FIRST (mimalloc/tcache style)
     // Inspired by research: tcache has 0 branches, mimalloc has 1-2 branches
@@ -1256,6 +1265,7 @@ void* malloc(size_t size) {
 #ifdef HAKMEM_TINY_FAST_PATH
     // Branch 1: Size check (predicted taken for tiny allocations)
     if (__builtin_expect(size <= TINY_FAST_THRESHOLD, 1)) {
+        g_malloc_tiny_size_match++;
         extern void* tiny_fast_alloc(size_t);
         extern void tiny_fast_init(void);
         extern __thread int g_tiny_fast_initialized;
@@ -1263,10 +1273,12 @@ void* malloc(size_t size) {
         // Branch 2: Initialization check (predicted taken after first call)
         if (__builtin_expect(g_tiny_fast_initialized, 1)) {
             // Branch 3: Cache hit check (predicted taken ~90% of time)
+            g_malloc_fast_path_tried++;
             void* ptr = tiny_fast_alloc(size);
             if (__builtin_expect(ptr != NULL, 1)) {
                 return ptr;  // ✅ FAST PATH: 3 branches total (vs tcache's 0, mimalloc's 1-2)
             }
+            g_malloc_fast_path_null++;
             // Cache miss: fall through to slow path refill
         } else {
             // Cold path: initialize once per thread (rare)
@@ -1279,6 +1291,7 @@ void* malloc(size_t size) {
     // ========================================================================
     // SLOW PATH: All guards moved here (only executed on fast path miss)
     // ========================================================================
+    g_malloc_slow_path++;
     // Recursion guard: if we're inside the allocator already, fall back to libc
     if (g_hakmem_lock_depth > 0) {
```

**tiny_fastcache.c**

```diff
@@ -66,6 +66,12 @@ __thread uint64_t g_tiny_refill_cycles = 0;
 __thread uint64_t g_tiny_migration_count = 0;
 __thread uint64_t g_tiny_migration_cycles = 0;
+
+// Refill failure tracking
+static __thread uint64_t g_refill_success_count = 0;
+static __thread uint64_t g_refill_partial_count = 0;  // Some blocks allocated
+static __thread uint64_t g_refill_fail_count = 0;     // Zero blocks allocated
+static __thread uint64_t g_refill_total_blocks = 0;   // Total blocks actually allocated
 
 int g_profile_enabled = -1;  // -1: uninitialized, 0: off, 1: on (extern in header)
 
 static inline int profile_enabled(void) {
@@ -124,7 +130,16 @@ void* tiny_fast_refill(int class_idx) {
         batch[count++] = ptr;
     }
-    if (count == 0) return NULL;  // Complete failure
+    // Track refill results
+    if (count == 0) {
+        g_refill_fail_count++;
+        return NULL;  // Complete failure
+    } else if (count < TINY_FAST_REFILL_BATCH) {
+        g_refill_partial_count++;
+    } else {
+        g_refill_success_count++;
+    }
+    g_refill_total_blocks += count;
 
     // Step 2: Link all blocks into freelist in one pass (batch linking)
     // This is the key optimization: N individual pushes → 1 batch link
@@ -205,12 +220,38 @@ void tiny_fast_print_stats(void) {
 // ========== RDTSC Cycle Profiling Output ==========
 
+// External routing counters from hakmem.c
+extern __thread uint64_t g_malloc_total_calls;
+extern __thread uint64_t g_malloc_tiny_size_match;
+extern __thread uint64_t g_malloc_fast_path_tried;
+extern __thread uint64_t g_malloc_fast_path_null;
+extern __thread uint64_t g_malloc_slow_path;
+
 void tiny_fast_print_profile(void) {
     if (!profile_enabled()) return;
     if (g_tiny_malloc_count == 0 && g_tiny_free_count == 0) return;  // No data
 
     fprintf(stderr, "\n========== HAKMEM Tiny Fast Path Profile (RDTSC cycles) ==========\n");
+
+    // Routing statistics first
+    if (g_malloc_total_calls > 0) {
+        fprintf(stderr, "\n[ROUTING]\n");
+        fprintf(stderr, "  Total malloc() calls: %lu\n", (unsigned long)g_malloc_total_calls);
+        fprintf(stderr, "  Size <= %d (tiny range): %lu (%.1f%%)\n",
+                TINY_FAST_THRESHOLD,
+                (unsigned long)g_malloc_tiny_size_match,
+                100.0 * g_malloc_tiny_size_match / g_malloc_total_calls);
+        fprintf(stderr, "  Fast path tried: %lu (%.1f%%)\n",
+                (unsigned long)g_malloc_fast_path_tried,
+                100.0 * g_malloc_fast_path_tried / g_malloc_total_calls);
+        fprintf(stderr, "  Fast path returned NULL: %lu (%.1f%% of tried)\n",
+                (unsigned long)g_malloc_fast_path_null,
+                g_malloc_fast_path_tried > 0 ? 100.0 * g_malloc_fast_path_null / g_malloc_fast_path_tried : 0);
+        fprintf(stderr, "  Slow path entered: %lu (%.1f%%)\n\n",
+                (unsigned long)g_malloc_slow_path,
+                100.0 * g_malloc_slow_path / g_malloc_total_calls);
+    }
+
     if (g_tiny_malloc_count > 0) {
         uint64_t avg_malloc = g_tiny_malloc_cycles / g_tiny_malloc_count;
         fprintf(stderr, "[MALLOC] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
@@ -233,6 +274,20 @@ void tiny_fast_print_profile(void) {
                 (unsigned long)g_tiny_fast_refill_count,
                 (unsigned long)g_tiny_refill_cycles,
                 (unsigned long)avg_refill);
+        // Refill success/failure breakdown
+        fprintf(stderr, "[REFILL SUCCESS] count=%lu (%.1f%%) - full batch\n",
+                (unsigned long)g_refill_success_count,
+                100.0 * g_refill_success_count / g_tiny_fast_refill_count);
+        fprintf(stderr, "[REFILL PARTIAL] count=%lu (%.1f%%) - some blocks\n",
+                (unsigned long)g_refill_partial_count,
+                100.0 * g_refill_partial_count / g_tiny_fast_refill_count);
+        fprintf(stderr, "[REFILL FAIL] count=%lu (%.1f%%) - zero blocks\n",
+                (unsigned long)g_refill_fail_count,
+                100.0 * g_refill_fail_count / g_tiny_fast_refill_count);
+        fprintf(stderr, "[REFILL AVG BLOCKS] %.1f per refill (target=%d)\n",
+                (double)g_refill_total_blocks / g_tiny_fast_refill_count,
+                TINY_FAST_REFILL_BATCH);
     }
 
     if (g_tiny_migration_count > 0) {
```