diff --git a/core/box/bench_fast_box.c b/core/box/bench_fast_box.c
index 689b5ee6..db79845a 100644
--- a/core/box/bench_fast_box.c
+++ b/core/box/bench_fast_box.c
@@ -120,25 +120,33 @@ int bench_fast_init(void) {
     // Set guard to prevent recursion during initialization
     bench_fast_init_in_progress = 1;
 
-    // Phase 8-Step2: Prewarm Unified Cache (initialize before benchmark)
-    // This enables PGO builds to remove lazy init checks in hot paths
-    #ifdef USE_HAKMEM
-    extern void unified_cache_init(void);
-    unified_cache_init();
-    fprintf(stderr, "[BENCH_FAST] Unified Cache prewarmed\n");
-    #endif
+    // Phase 8 Root Cause Fix: REMOVED unified_cache_init() call
+    // Reason: BenchFast uses TLS SLL directly, NOT Unified Cache
+    // The prewarm was a design misunderstanding - BenchFast has its own allocation strategy
+    // Calling unified_cache_init() created 16KB mmap allocations that crashed when freed
+    // in BenchFast mode (header misclassification bug)
 
     fprintf(stderr, "[BENCH_FAST] Starting preallocation...\n");
 
+    // Layer 0 Root Cause Fix: Limit prealloc to actual TLS SLL capacity
+    // Problem: Old code preallocated 50,000 blocks/class, but TLS SLL capacity is 128 (adaptive sizing)
+    // The "lost" blocks (beyond capacity) caused heap corruption
+    // Analysis: sll_cap_for_class() returns "desired" capacity (2048), but adaptive sizing
+    //           limits actual capacity to 128 at runtime. We must use the actual limit.
+    // Solution: Hard-code to 128 blocks/class (observed actual capacity from runtime output)
+    extern const size_t g_tiny_class_sizes[];
+
     int total = 0;
-    const int PREALLOC_COUNT = 50000;  // Per class (300,000 total for C2-C7)
+    const uint32_t ACTUAL_TLS_SLL_CAPACITY = 128;  // Observed actual capacity (adaptive sizing limit)
 
     // Preallocate C2-C7 (32B-1024B, skip C0/C1 - too small, rarely used)
     for (int cls = 2; cls <= 7; cls++) {
-        fprintf(stderr, "[BENCH_FAST] Preallocating C%d (%zu bytes): %d blocks...\n",
-                cls, g_tiny_class_sizes[cls], PREALLOC_COUNT);
+        uint32_t capacity = ACTUAL_TLS_SLL_CAPACITY;
 
-        for (int i = 0; i < PREALLOC_COUNT; i++) {
+        fprintf(stderr, "[BENCH_FAST] Preallocating C%d (%zu bytes): %u blocks (actual TLS SLL capacity)...\n",
+                cls, g_tiny_class_sizes[cls], capacity);
+
+        for (int i = 0; i < (int)capacity; i++) {
             // Use normal allocator (hak_alloc_at) - recursion safe here
             size_t size = g_tiny_class_sizes[cls];
 #ifdef HAKMEM_TINY_HEADER_CLASSIDX
@@ -149,8 +157,8 @@ int bench_fast_init(void) {
             void* ptr = hak_alloc_at(size, "bench_fast_init");
 
             if (!ptr) {
-                fprintf(stderr, "[BENCH_FAST] Failed to preallocate C%d at %d/%d\n",
-                        cls, i, PREALLOC_COUNT);
+                fprintf(stderr, "[BENCH_FAST] Failed to preallocate C%d at %d/%u\n",
+                        cls, i, capacity);
                 fprintf(stderr, "[BENCH_FAST] Total preallocated: %d blocks\n", total);
                 return total;
             }
@@ -181,10 +189,10 @@ int bench_fast_init(void) {
 
             total++;
 
-            // Progress indicator every 10,000 blocks
-            if ((i + 1) % 10000 == 0) {
-                fprintf(stderr, "[BENCH_FAST] C%d: %d/%d blocks...\n",
-                        cls, i + 1, PREALLOC_COUNT);
+            // Progress indicator (only for large capacities)
+            if (capacity >= 500 && (i + 1) % 500 == 0) {
+                fprintf(stderr, "[BENCH_FAST] C%d: %d/%u blocks...\n",
+                        cls, i + 1, capacity);
             }
         }
 
diff --git a/core/box/bench_fast_box.h b/core/box/bench_fast_box.h
index fa0870cc..f6192a99 100644
--- a/core/box/bench_fast_box.h
+++ b/core/box/bench_fast_box.h
@@ -9,6 +9,46 @@
 //
 // Enable with: HAKMEM_BENCH_FAST_MODE=1
 // Expected: +65-100% performance (15.7M → 25-30M ops/s)
+//
+// ============================================================================
+// Box Contract (Phase 8 Root Cause Fix)
+// ============================================================================
+//
+// BenchFast Box uses TLS SLL allocation strategy, NOT Unified Cache.
+// This is a critical design decision that affects all BenchFast code.
+//
+// Scope Separation:
+//   1. WORKLOAD allocations (measured):
+//      - User malloc/free calls in benchmark loop
+//      - Contract: ALL are Tiny (size <= 1024B)
+//      - Path: bench_fast_alloc() → bench_fast_free()
+//      - Strategy: TLS SLL (g_tls_sll[])
+//
+//   2. INFRASTRUCTURE allocations (not measured):
+//      - Benchmark metadata (slots[] array in bench_random_mixed.c)
+//      - Cache arrays (if any infrastructure needs allocation)
+//      - Contract: Bypass HAKMEM entirely (use __libc_calloc/__libc_free)
+//      - Path: __libc_calloc() → __libc_free()
+//
+// Preconditions:
+//   - bench_fast_init() called before workload
+//   - Infrastructure uses __libc_* directly (NO mixing with HAKMEM paths)
+//
+// Guarantees:
+//   - Workload: Ultra-fast (6-8 instructions alloc, 3-5 instructions free)
+//   - Infrastructure: Isolated (no interference with BenchFast paths)
+//   - No path crossing (enforced by using different allocation functions)
+//
+// Contract Violation Example (Phase 8 Bug):
+//   ❌ bench_fast_init() called unified_cache_init()
+//   ❌ unified_cache_init() used calloc() (went through HAKMEM wrapper)
+//   ❌ 16KB allocation went through mmap path (not Tiny)
+//   ❌ Later free() misclassified it as Tiny → CRASH
+//
+//   ✅ Fixed: Removed unified_cache_init() call (BenchFast uses TLS SLL, not UC)
+//   ✅ Defensive: unified_cache_init() now uses __libc_calloc (infrastructure isolation)
+//
+// ============================================================================
 
 #ifndef HAK_BOX_BENCH_FAST_H
 #define HAK_BOX_BENCH_FAST_H
diff --git a/core/front/tiny_unified_cache.c b/core/front/tiny_unified_cache.c
index 2d8c9f75..1a354e14 100644
--- a/core/front/tiny_unified_cache.c
+++ b/core/front/tiny_unified_cache.c
@@ -58,12 +58,17 @@ int unified_cache_enabled(void) {
 void unified_cache_init(void) {
     if (!unified_cache_enabled()) return;
 
+    // Layer 2 Defensive Fix: Use __libc_calloc for infrastructure allocations
+    // Rationale: Cache arrays are infrastructure (not workload), bypass HAKMEM entirely
+    // This prevents interaction with BenchFast mode and ensures clean separation
+    extern void* __libc_calloc(size_t, size_t);
+
     // Initialize all classes (C0-C7)
     for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
         if (g_unified_cache[cls].slots != NULL) continue;  // Already initialized
 
         size_t cap = unified_capacity(cls);
-        g_unified_cache[cls].slots = (void**)calloc(cap, sizeof(void*));
+        g_unified_cache[cls].slots = (void**)__libc_calloc(cap, sizeof(void*));
 
         if (!g_unified_cache[cls].slots) {
 #if !HAKMEM_BUILD_RELEASE
@@ -95,10 +100,13 @@ void unified_cache_shutdown(void) {
 
     // TODO: Drain caches to SuperSlab before shutdown (prevent leak)
 
+    // Layer 2 Defensive Fix: Use __libc_free (symmetric with __libc_calloc in init)
+    extern void __libc_free(void*);
+
     // Free cache buffers
     for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
         if (g_unified_cache[cls].slots) {
-            free(g_unified_cache[cls].slots);
+            __libc_free(g_unified_cache[cls].slots);
             g_unified_cache[cls].slots = NULL;
         }
     }
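
The Layer 0 fix amounts to a general rule for bounded freelists: never preallocate past the capacity the list actually enforces at runtime, and never orphan a block the list refuses. The sketch below illustrates that shape under assumed names; tls_sll_t, tls_sll_push(), and its full-on-push signal are hypothetical stand-ins, not HAKMEM's real TLS SLL API, and malloc() stands in for hak_alloc_at().

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    enum { ACTUAL_CAPACITY = 128 };   /* observed adaptive-sizing limit, per the diff */

    /* Hypothetical intrusive singly-linked freelist (stand-in for a TLS SLL). */
    typedef struct { void* head; uint32_t count; uint32_t cap; } tls_sll_t;

    bool tls_sll_push(tls_sll_t* sll, void* block) {
        if (sll->count >= sll->cap) return false;  /* full: caller must handle the block */
        *(void**)block = sll->head;                /* link through the block's first word */
        sll->head = block;
        sll->count++;
        return true;
    }

    int prealloc_class(tls_sll_t* sll, size_t block_size) {
        int pushed = 0;
        for (int i = 0; i < ACTUAL_CAPACITY; i++) {
            void* p = malloc(block_size);          /* stand-in for hak_alloc_at() */
            if (!p) break;
            if (!tls_sll_push(sll, p)) { free(p); break; }  /* never leak past capacity */
            pushed++;
        }
        return pushed;
    }

A caller would initialize the list with its runtime cap, e.g. tls_sll_t sll = { NULL, 0, ACTUAL_CAPACITY }; maintaining that bound is exactly the invariant the old 50,000-block loop violated.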
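The crash called out in the Box Contract follows a general pattern: a free path that classifies pointers by reading an inline header will misread any pointer that did not come from that allocator. Below is a minimal sketch of the failure shape; hdr_class_of() and TINY_MAGIC are hypothetical illustrations, not HAKMEM's actual header layout.

    #include <stdint.h>

    #define TINY_MAGIC 0xA0u  /* hypothetical tag written into the byte before a Tiny block */

    /* Returns the size class if the header byte looks like a Tiny header, else -1.
     * For a pointer that came from mmap (e.g. the 16KB Unified Cache arrays), the
     * byte at p[-1] is arbitrary -- or unmapped, if p sits at a page boundary. */
    int hdr_class_of(const void* p) {
        uint8_t h = ((const uint8_t*)p)[-1];
        return (h & 0xF0u) == TINY_MAGIC ? (int)(h & 0x0Fu) : -1;
    }

If a fast free path skips even this check and assumes every pointer is Tiny, a 16KB mmap allocation gets pushed onto a Tiny freelist, which is the corruption the Phase 8 fix removes at the source.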
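Finally, the Layer 2 change relies on glibc exporting its internal allocator entry points. Here is a minimal sketch of the isolation pattern, assuming glibc (__libc_calloc and __libc_free are glibc-internal symbols and non-portable; infra_table and the infra_* functions are hypothetical names, not HAKMEM's):

    #include <stddef.h>

    extern void* __libc_calloc(size_t nmemb, size_t size);
    extern void  __libc_free(void* ptr);

    static void** infra_table = NULL;

    /* Infrastructure allocation: bypasses any malloc/free interposition, so it
     * can never be routed through (or misclassified by) BenchFast paths. */
    int infra_init(size_t slots) {
        if (infra_table) return 0;  /* idempotent, like unified_cache_init() */
        infra_table = (void**)__libc_calloc(slots, sizeof(void*));
        return infra_table ? 0 : -1;
    }

    void infra_shutdown(void) {
        __libc_free(infra_table);   /* symmetric with __libc_calloc */
        infra_table = NULL;
    }

The symmetry matters: memory obtained from __libc_calloc must go back through __libc_free, never through an interposed free(), which is why the shutdown path changed together with init.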