Phase 8-TLS-Fix: BenchFast crash root cause fixes
Two critical bugs fixed:

1. TLS→Atomic guard (cross-thread safety):
   - Changed `__thread int bench_fast_init_in_progress` to `atomic_int g_bench_fast_init_in_progress`
   - Root cause: pthread_once() creates threads with fresh TLS (= 0)
   - Guard must protect the entire process, not just the calling thread
   - Box Contract: Observable state across all threads

2. Direct header write (P3 optimization bypass):
   - bench_fast_alloc() now writes the header directly: 0xa0 | class_idx
   - Root cause: P3 optimization skips header writes by default
   - BenchFast REQUIRES headers for free routing (0xa0-0xa7 magic)
   - Box Contract: BenchFast always writes headers

Result:
   - Normal mode: 16.3M ops/s (working)
   - BenchFast mode: No crash (pool exhaustion expected with 128 blocks/class)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@ -9,6 +9,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <stdatomic.h>
|
||||||
|
|
||||||
// External Tiny infrastructure (defined in hakmem_tiny.c)
|
// External Tiny infrastructure (defined in hakmem_tiny.c)
|
||||||
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
|
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
|
||||||
@ -19,9 +20,11 @@ extern const size_t g_tiny_class_sizes[];
|
|||||||
#include "../hakmem.h"
|
#include "../hakmem.h"
|
||||||
|
|
||||||
// Guard: Disable BenchFast during initialization to avoid recursion
|
// Guard: Disable BenchFast during initialization to avoid recursion
|
||||||
// NOTE: Defined here and declared extern in bench_fast_box.h so that
|
// Phase 8-TLS-Fix: Changed from __thread to atomic_int
|
||||||
// malloc/free wrappers can also see it and skip BenchFast during init.
|
// Root Cause: pthread_once() creates new threads with fresh TLS (= 0),
|
||||||
__thread int bench_fast_init_in_progress = 0;
|
// breaking the guard. Atomic variable works across ALL threads.
|
||||||
|
// Box Contract: Guard must protect entire process, not just calling thread.
|
||||||
|
atomic_int g_bench_fast_init_in_progress = 0;
|
||||||
|
|
||||||
// BenchFast alloc - Minimal path (POP-ONLY, NO REFILL)
|
// BenchFast alloc - Minimal path (POP-ONLY, NO REFILL)
|
||||||
// Flow:
|
// Flow:
|
||||||
@ -30,8 +33,8 @@ __thread int bench_fast_init_in_progress = 0;
|
|||||||
// 3. Write header + return (2-3 instructions)
|
// 3. Write header + return (2-3 instructions)
|
||||||
// NOTE: No refill! Pool must be preallocated via bench_fast_init()
|
// NOTE: No refill! Pool must be preallocated via bench_fast_init()
|
||||||
void* bench_fast_alloc(size_t size) {
|
void* bench_fast_alloc(size_t size) {
|
||||||
// Guard: Avoid recursion during init phase
|
// Guard: Avoid recursion during init phase (atomic for cross-thread safety)
|
||||||
if (__builtin_expect(bench_fast_init_in_progress, 0)) {
|
if (__builtin_expect(atomic_load(&g_bench_fast_init_in_progress), 0)) {
|
||||||
// Initialization in progress - use normal allocator to avoid recursion
|
// Initialization in progress - use normal allocator to avoid recursion
|
||||||
return hak_alloc_at(size, "bench_fast_alloc_init");
|
return hak_alloc_at(size, "bench_fast_alloc_init");
|
||||||
}
|
}
|
||||||
@ -65,8 +68,12 @@ void* bench_fast_alloc(size_t size) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 4. Write header + return USER pointer (2-3 instructions)
|
// 4. Write header + return USER pointer (2-3 instructions)
|
||||||
|
// Phase 8-P3-Fix: Write header DIRECTLY (bypass tiny_region_id_write_header)
|
||||||
|
// Reason: P3 optimization skips header writes by default (class_map mode)
|
||||||
|
// But BenchFast REQUIRES headers for free routing (0xa0-0xa7 magic)
|
||||||
|
// Contract: BenchFast always writes headers, regardless of P3 optimization
|
||||||
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
|
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
|
||||||
tiny_region_id_write_header(base, class_idx); // Write 1-byte header (BASE first!)
|
*(uint8_t*)base = (uint8_t)(0xa0 | (class_idx & 0x0f)); // Direct header write
|
||||||
return (void*)((char*)base + 1); // Return USER pointer
|
return (void*)((char*)base + 1); // Return USER pointer
|
||||||
#else
|
#else
|
||||||
return base; // No header mode - return BASE directly
|
return base; // No header mode - return BASE directly
|
||||||
@ -117,8 +124,8 @@ int bench_fast_init(void) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set guard to prevent recursion during initialization
|
// Set guard to prevent recursion during initialization (atomic for cross-thread safety)
|
||||||
bench_fast_init_in_progress = 1;
|
atomic_store(&g_bench_fast_init_in_progress, 1);
|
||||||
|
|
||||||
// Phase 8 Root Cause Fix: REMOVED unified_cache_init() call
|
// Phase 8 Root Cause Fix: REMOVED unified_cache_init() call
|
||||||
// Reason: BenchFast uses TLS SLL directly, NOT Unified Cache
|
// Reason: BenchFast uses TLS SLL directly, NOT Unified Cache
|
||||||
@ -209,7 +216,7 @@ int bench_fast_init(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Clear guard - initialization complete, BenchFast mode can now be used
|
// Clear guard - initialization complete, BenchFast mode can now be used
|
||||||
bench_fast_init_in_progress = 0;
|
atomic_store(&g_bench_fast_init_in_progress, 0);
|
||||||
|
|
||||||
return total;
|
return total;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -56,6 +56,7 @@
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <stdatomic.h>
|
||||||
|
|
||||||
// BenchFast mode enabled (ENV cached at first call)
|
// BenchFast mode enabled (ENV cached at first call)
|
||||||
// Returns: 1 if enabled, 0 if disabled
|
// Returns: 1 if enabled, 0 if disabled
|
||||||
@ -73,7 +74,8 @@ static inline int bench_fast_enabled(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Exposed init guard so wrappers can avoid BenchFast during preallocation
|
// Exposed init guard so wrappers can avoid BenchFast during preallocation
|
||||||
extern __thread int bench_fast_init_in_progress;
|
// Phase 8-TLS-Fix: Changed from __thread to atomic_int (works across ALL threads)
|
||||||
|
extern atomic_int g_bench_fast_init_in_progress;
|
||||||
|
|
||||||
// BenchFast alloc (Tiny-only, no safety checks)
|
// BenchFast alloc (Tiny-only, no safety checks)
|
||||||
// Preconditions: size <= 1024 (Tiny range)
|
// Preconditions: size <= 1024 (Tiny range)
|
||||||
|
|||||||
@ -57,7 +57,8 @@ void* malloc(size_t size) {
|
|||||||
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
||||||
// WARNING: Bypasses ALL safety checks - benchmark only!
|
// WARNING: Bypasses ALL safety checks - benchmark only!
|
||||||
// IMPORTANT: Do NOT use BenchFast during preallocation/init to avoid recursion.
|
// IMPORTANT: Do NOT use BenchFast during preallocation/init to avoid recursion.
|
||||||
if (__builtin_expect(!bench_fast_init_in_progress && bench_fast_enabled(), 0)) {
|
// Phase 8-TLS-Fix: Use atomic_load for cross-thread safety
|
||||||
|
if (__builtin_expect(!atomic_load(&g_bench_fast_init_in_progress) && bench_fast_enabled(), 0)) {
|
||||||
if (size <= 1024) { // Tiny range
|
if (size <= 1024) { // Tiny range
|
||||||
return bench_fast_alloc(size);
|
return bench_fast_alloc(size);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user