diff --git a/core/box/bench_fast_box.c b/core/box/bench_fast_box.c
index 689b5ee6..db79845a 100644
--- a/core/box/bench_fast_box.c
+++ b/core/box/bench_fast_box.c
@@ -120,25 +120,33 @@ int bench_fast_init(void) {
     // Set guard to prevent recursion during initialization
     bench_fast_init_in_progress = 1;
 
-    // Phase 8-Step2: Prewarm Unified Cache (initialize before benchmark)
-    // This enables PGO builds to remove lazy init checks in hot paths
-    #ifdef USE_HAKMEM
-    extern void unified_cache_init(void);
-    unified_cache_init();
-    fprintf(stderr, "[BENCH_FAST] Unified Cache prewarmed\n");
-    #endif
+    // Phase 8 Root Cause Fix: REMOVED unified_cache_init() call
+    // Reason: BenchFast uses TLS SLL directly, NOT Unified Cache
+    // The prewarm was a design misunderstanding - BenchFast has its own allocation strategy
+    // Calling unified_cache_init() created 16KB mmap allocations that crashed when freed
+    // in BenchFast mode (header misclassification bug)
 
     fprintf(stderr, "[BENCH_FAST] Starting preallocation...\n");
 
+    // Layer 0 Root Cause Fix: Limit prealloc to actual TLS SLL capacity
+    // Problem: Old code preallocated 50,000 blocks/class, but TLS SLL capacity is 128 (adaptive sizing)
+    // The "lost" blocks (beyond capacity) caused heap corruption
+    // Analysis: sll_cap_for_class() returns "desired" capacity (2048), but adaptive sizing
+    //           limits actual capacity to 128 at runtime. We must use the actual limit.
+    // Solution: Hard-code to 128 blocks/class (observed actual capacity from runtime output)
+    extern const size_t g_tiny_class_sizes[];
+
     int total = 0;
-    const int PREALLOC_COUNT = 50000;  // Per class (300,000 total for C2-C7)
+    const uint32_t ACTUAL_TLS_SLL_CAPACITY = 128;  // Observed actual capacity (adaptive sizing limit)
 
     // Preallocate C2-C7 (32B-1024B, skip C0/C1 - too small, rarely used)
     for (int cls = 2; cls <= 7; cls++) {
-        fprintf(stderr, "[BENCH_FAST] Preallocating C%d (%zu bytes): %d blocks...\n",
-                cls, g_tiny_class_sizes[cls], PREALLOC_COUNT);
+        uint32_t capacity = ACTUAL_TLS_SLL_CAPACITY;
 
-        for (int i = 0; i < PREALLOC_COUNT; i++) {
+        fprintf(stderr, "[BENCH_FAST] Preallocating C%d (%zu bytes): %u blocks (actual TLS SLL capacity)...\n",
+                cls, g_tiny_class_sizes[cls], capacity);
+
+        for (int i = 0; i < (int)capacity; i++) {
             // Use normal allocator (hak_alloc_at) - recursion safe here
             size_t size = g_tiny_class_sizes[cls];
 #ifdef HAKMEM_TINY_HEADER_CLASSIDX
@@ -149,8 +157,8 @@ int bench_fast_init(void) {
             void* ptr = hak_alloc_at(size, "bench_fast_init");
 
             if (!ptr) {
-                fprintf(stderr, "[BENCH_FAST] Failed to preallocate C%d at %d/%d\n",
-                        cls, i, PREALLOC_COUNT);
+                fprintf(stderr, "[BENCH_FAST] Failed to preallocate C%d at %d/%u\n",
+                        cls, i, capacity);
                 fprintf(stderr, "[BENCH_FAST] Total preallocated: %d blocks\n", total);
                 return total;
             }
@@ -181,10 +189,10 @@ int bench_fast_init(void) {
 
             total++;
 
-            // Progress indicator every 10,000 blocks
-            if ((i + 1) % 10000 == 0) {
-                fprintf(stderr, "[BENCH_FAST] C%d: %d/%d blocks...\n",
-                        cls, i + 1, PREALLOC_COUNT);
+            // Progress indicator (only for large capacities)
+            if (capacity >= 500 && (i + 1) % 500 == 0) {
+                fprintf(stderr, "[BENCH_FAST] C%d: %d/%u blocks...\n",
+                        cls, i + 1, capacity);
             }
         }
 
diff --git a/core/box/bench_fast_box.h b/core/box/bench_fast_box.h
index fa0870cc..f6192a99 100644
--- a/core/box/bench_fast_box.h
+++ b/core/box/bench_fast_box.h
@@ -9,6 +9,46 @@
 //
 // Enable with: HAKMEM_BENCH_FAST_MODE=1
 // Expected: +65-100% performance (15.7M → 25-30M ops/s)
+//
+// ============================================================================
+// Box Contract (Phase 8 Root Cause Fix)
+// ============================================================================
+//
+// BenchFast Box uses TLS SLL allocation strategy, NOT Unified Cache.
+// This is a critical design decision that affects all BenchFast code.
+//
+// Scope Separation:
+//   1. WORKLOAD allocations (measured):
+//      - User malloc/free calls in benchmark loop
+//      - Contract: ALL are Tiny (size <= 1024B)
+//      - Path: bench_fast_alloc() → bench_fast_free()
+//      - Strategy: TLS SLL (g_tls_sll[])
+//
+//   2. INFRASTRUCTURE allocations (not measured):
+//      - Benchmark metadata (slots[] array in bench_random_mixed.c)
+//      - Cache arrays (if any infrastructure needs allocation)
+//      - Contract: Bypass HAKMEM entirely (use __libc_calloc/__libc_free)
+//      - Path: __libc_calloc() → __libc_free()
+//
+// Preconditions:
+//   - bench_fast_init() called before workload
+//   - Infrastructure uses __libc_* directly (NO mixing with HAKMEM paths)
+//
+// Guarantees:
+//   - Workload: Ultra-fast (6-8 instructions alloc, 3-5 instructions free)
+//   - Infrastructure: Isolated (no interference with BenchFast paths)
+//   - No path crossing (enforced by using different allocation functions)
+//
+// Contract Violation Example (Phase 8 Bug):
+//   ❌ bench_fast_init() called unified_cache_init()
+//   ❌ unified_cache_init() used calloc() (went through HAKMEM wrapper)
+//   ❌ 16KB allocation went through mmap path (not Tiny)
+//   ❌ Later free() misclassified it as Tiny → CRASH
+//
+//   ✅ Fixed: Removed unified_cache_init() call (BenchFast uses TLS SLL, not UC)
+//   ✅ Defensive: unified_cache_init() now uses __libc_calloc (infrastructure isolation)
+//
+// ============================================================================
 
 #ifndef HAK_BOX_BENCH_FAST_H
 #define HAK_BOX_BENCH_FAST_H
diff --git a/core/front/tiny_unified_cache.c b/core/front/tiny_unified_cache.c
index 2d8c9f75..1a354e14 100644
--- a/core/front/tiny_unified_cache.c
+++ b/core/front/tiny_unified_cache.c
@@ -58,12 +58,17 @@ int unified_cache_enabled(void) {
 void unified_cache_init(void) {
     if (!unified_cache_enabled()) return;
 
+    // Layer 2 Defensive Fix: Use __libc_calloc for infrastructure allocations
+    // Rationale: Cache arrays are infrastructure (not workload), bypass HAKMEM entirely
+    // This prevents interaction with BenchFast mode and ensures clean separation
+    extern void* __libc_calloc(size_t, size_t);
+
     // Initialize all classes (C0-C7)
     for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
         if (g_unified_cache[cls].slots != NULL) continue;  // Already initialized
 
         size_t cap = unified_capacity(cls);
-        g_unified_cache[cls].slots = (void**)calloc(cap, sizeof(void*));
+        g_unified_cache[cls].slots = (void**)__libc_calloc(cap, sizeof(void*));
 
         if (!g_unified_cache[cls].slots) {
 #if !HAKMEM_BUILD_RELEASE
@@ -95,10 +100,13 @@ void unified_cache_shutdown(void) {
 
     // TODO: Drain caches to SuperSlab before shutdown (prevent leak)
 
+    // Layer 2 Defensive Fix: Use __libc_free (symmetric with __libc_calloc in init)
+    extern void __libc_free(void*);
+
     // Free cache buffers
     for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
         if (g_unified_cache[cls].slots) {
-            free(g_unified_cache[cls].slots);
+            __libc_free(g_unified_cache[cls].slots);
             g_unified_cache[cls].slots = NULL;
         }
     }
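
The Layer 0 fix amounts to a general rule for bounded freelists: never preallocate past the capacity the list actually enforces at runtime, and never orphan a block the list refuses. The sketch below illustrates that shape under assumed names; tls_sll_t, tls_sll_push(), and its full-on-push signal are hypothetical stand-ins, not HAKMEM's real TLS SLL API, and malloc() stands in for hak_alloc_at().

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    enum { ACTUAL_CAPACITY = 128 };   /* observed adaptive-sizing limit, per the diff */

    /* Hypothetical intrusive singly-linked freelist (stand-in for a TLS SLL). */
    typedef struct { void* head; uint32_t count; uint32_t cap; } tls_sll_t;

    bool tls_sll_push(tls_sll_t* sll, void* block) {
        if (sll->count >= sll->cap) return false;  /* full: caller must handle the block */
        *(void**)block = sll->head;                /* link through the block's first word */
        sll->head = block;
        sll->count++;
        return true;
    }

    int prealloc_class(tls_sll_t* sll, size_t block_size) {
        int pushed = 0;
        for (int i = 0; i < ACTUAL_CAPACITY; i++) {
            void* p = malloc(block_size);          /* stand-in for hak_alloc_at() */
            if (!p) break;
            if (!tls_sll_push(sll, p)) { free(p); break; }  /* never leak past capacity */
            pushed++;
        }
        return pushed;
    }

A caller would initialize the list with its runtime cap, e.g. tls_sll_t sll = { NULL, 0, ACTUAL_CAPACITY }; maintaining that bound is exactly the invariant the old 50,000-block loop violated.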
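The crash called out in the Box Contract follows a general pattern: a free path that classifies pointers by reading an inline header will misread any pointer that did not come from that allocator. Below is a minimal sketch of the failure shape; hdr_class_of() and TINY_MAGIC are hypothetical illustrations, not HAKMEM's actual header layout.

    #include <stdint.h>

    #define TINY_MAGIC 0xA0u  /* hypothetical tag written into the byte before a Tiny block */

    /* Returns the size class if the header byte looks like a Tiny header, else -1.
     * For a pointer that came from mmap (e.g. the 16KB Unified Cache arrays), the
     * byte at p[-1] is arbitrary -- or unmapped, if p sits at a page boundary. */
    int hdr_class_of(const void* p) {
        uint8_t h = ((const uint8_t*)p)[-1];
        return (h & 0xF0u) == TINY_MAGIC ? (int)(h & 0x0Fu) : -1;
    }

If a fast free path skips even this check and assumes every pointer is Tiny, a 16KB mmap allocation gets pushed onto a Tiny freelist, which is the corruption the Phase 8 fix removes at the source.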
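Finally, the Layer 2 change relies on glibc exporting its internal allocator entry points. Here is a minimal sketch of the isolation pattern, assuming glibc (__libc_calloc and __libc_free are glibc-internal symbols and non-portable; infra_table and the infra_* functions are hypothetical names, not HAKMEM's):

    #include <stddef.h>

    extern void* __libc_calloc(size_t nmemb, size_t size);
    extern void  __libc_free(void* ptr);

    static void** infra_table = NULL;

    /* Infrastructure allocation: bypasses any malloc/free interposition, so it
     * can never be routed through (or misclassified by) BenchFast paths. */
    int infra_init(size_t slots) {
        if (infra_table) return 0;  /* idempotent, like unified_cache_init() */
        infra_table = (void**)__libc_calloc(slots, sizeof(void*));
        return infra_table ? 0 : -1;
    }

    void infra_shutdown(void) {
        __libc_free(infra_table);   /* symmetric with __libc_calloc */
        infra_table = NULL;
    }

The symmetry matters: memory obtained from __libc_calloc must go back through __libc_free, never through an interposed free(), which is why the shutdown path changed together with init.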