Phase 8-TLS-Fix: BenchFast crash root cause fixes
Two critical bugs fixed:

1. TLS→Atomic guard (cross-thread safety):
   - Changed `__thread int bench_fast_init_in_progress` to `atomic_int`
   - Root cause: pthread_once() creates threads with fresh TLS (= 0)
   - Guard must protect entire process, not just the calling thread
   - Box Contract: Observable state across all threads

2. Direct header write (P3 optimization bypass):
   - bench_fast_alloc() now writes the header directly: 0xa0 | class_idx
   - Root cause: P3 optimization skips header writes by default
   - BenchFast REQUIRES headers for free routing (0xa0-0xa7 magic)
   - Box Contract: BenchFast always writes headers

Result:
- Normal mode: 16.3M ops/s (working)
- BenchFast mode: no crash (pool exhaustion expected with 128 blocks/class)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
// External Tiny infrastructure (defined in hakmem_tiny.c)
|
||||
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
|
||||
@@ -19,9 +20,11 @@ extern const size_t g_tiny_class_sizes[];
|
||||
#include "../hakmem.h"
|
||||
|
||||
// Guard: Disable BenchFast during initialization to avoid recursion
|
||||
// NOTE: Defined here and declared extern in bench_fast_box.h so that
|
||||
// malloc/free wrappers can also see it and skip BenchFast during init.
|
||||
__thread int bench_fast_init_in_progress = 0;
|
||||
// Phase 8-TLS-Fix: Changed from __thread to atomic_int
|
||||
// Root Cause: pthread_once() creates new threads with fresh TLS (= 0),
|
||||
// breaking the guard. Atomic variable works across ALL threads.
|
||||
// Box Contract: Guard must protect entire process, not just calling thread.
|
||||
atomic_int g_bench_fast_init_in_progress = 0;
|
||||
|
||||
// BenchFast alloc - Minimal path (POP-ONLY, NO REFILL)
|
||||
// Flow:
|
||||
@@ -30,8 +33,8 @@ __thread int bench_fast_init_in_progress = 0;
|
||||
// 3. Write header + return (2-3 instructions)
|
||||
// NOTE: No refill! Pool must be preallocated via bench_fast_init()
|
||||
void* bench_fast_alloc(size_t size) {
|
||||
// Guard: Avoid recursion during init phase
|
||||
if (__builtin_expect(bench_fast_init_in_progress, 0)) {
|
||||
// Guard: Avoid recursion during init phase (atomic for cross-thread safety)
|
||||
if (__builtin_expect(atomic_load(&g_bench_fast_init_in_progress), 0)) {
|
||||
// Initialization in progress - use normal allocator to avoid recursion
|
||||
return hak_alloc_at(size, "bench_fast_alloc_init");
|
||||
}
|
||||
@@ -65,8 +68,12 @@ void* bench_fast_alloc(size_t size) {
|
||||
}
|
||||
|
||||
// 4. Write header + return USER pointer (2-3 instructions)
|
||||
// Phase 8-P3-Fix: Write header DIRECTLY (bypass tiny_region_id_write_header)
|
||||
// Reason: P3 optimization skips header writes by default (class_map mode)
|
||||
// But BenchFast REQUIRES headers for free routing (0xa0-0xa7 magic)
|
||||
// Contract: BenchFast always writes headers, regardless of P3 optimization
|
||||
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
|
||||
tiny_region_id_write_header(base, class_idx); // Write 1-byte header (BASE first!)
|
||||
*(uint8_t*)base = (uint8_t)(0xa0 | (class_idx & 0x0f)); // Direct header write
|
||||
return (void*)((char*)base + 1); // Return USER pointer
|
||||
#else
|
||||
return base; // No header mode - return BASE directly
|
||||
@@ -117,8 +124,8 @@ int bench_fast_init(void) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Set guard to prevent recursion during initialization
|
||||
bench_fast_init_in_progress = 1;
|
||||
// Set guard to prevent recursion during initialization (atomic for cross-thread safety)
|
||||
atomic_store(&g_bench_fast_init_in_progress, 1);
|
||||
|
||||
// Phase 8 Root Cause Fix: REMOVED unified_cache_init() call
|
||||
// Reason: BenchFast uses TLS SLL directly, NOT Unified Cache
|
||||
@@ -209,7 +216,7 @@ int bench_fast_init(void) {
|
||||
}
|
||||
|
||||
// Clear guard - initialization complete, BenchFast mode can now be used
|
||||
bench_fast_init_in_progress = 0;
|
||||
atomic_store(&g_bench_fast_init_in_progress, 0);
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
@@ -56,6 +56,7 @@
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
// BenchFast mode enabled (ENV cached at first call)
|
||||
// Returns: 1 if enabled, 0 if disabled
|
||||
@@ -73,7 +74,8 @@ static inline int bench_fast_enabled(void) {
|
||||
}
|
||||
|
||||
// Exposed init guard so wrappers can avoid BenchFast during preallocation
|
||||
extern __thread int bench_fast_init_in_progress;
|
||||
// Phase 8-TLS-Fix: Changed from __thread to atomic_int (works across ALL threads)
|
||||
extern atomic_int g_bench_fast_init_in_progress;
|
||||
|
||||
// BenchFast alloc (Tiny-only, no safety checks)
|
||||
// Preconditions: size <= 1024 (Tiny range)
|
||||
|
||||
@@ -57,7 +57,8 @@ void* malloc(size_t size) {
|
||||
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
||||
// WARNING: Bypasses ALL safety checks - benchmark only!
|
||||
// IMPORTANT: Do NOT use BenchFast during preallocation/init to avoid recursion.
|
||||
if (__builtin_expect(!bench_fast_init_in_progress && bench_fast_enabled(), 0)) {
|
||||
// Phase 8-TLS-Fix: Use atomic_load for cross-thread safety
|
||||
if (__builtin_expect(!atomic_load(&g_bench_fast_init_in_progress) && bench_fast_enabled(), 0)) {
|
||||
if (size <= 1024) { // Tiny range
|
||||
return bench_fast_alloc(size);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user