Phase 8-TLS-Fix: BenchFast crash root cause fixes
Two critical bugs fixed:

1. TLS→Atomic guard (cross-thread safety):
   - Changed `__thread int bench_fast_init_in_progress` to `atomic_int`
   - Root cause: pthread_once() creates threads with fresh TLS (= 0)
   - Guard must protect entire process, not just the calling thread
   - Box Contract: Observable state across all threads

2. Direct header write (P3 optimization bypass):
   - bench_fast_alloc() now writes the header directly: 0xa0 | class_idx
   - Root cause: P3 optimization skips header writes by default
   - BenchFast REQUIRES headers for free routing (0xa0-0xa7 magic)
   - Box Contract: BenchFast always writes headers

Result:
- Normal mode: 16.3M ops/s (working)
- BenchFast mode: no crash (pool exhaustion expected with 128 blocks/class)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
// External Tiny infrastructure (defined in hakmem_tiny.c)
|
||||
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
|
||||
@@ -19,9 +20,11 @@ extern const size_t g_tiny_class_sizes[];
|
||||
#include "../hakmem.h"
|
||||
|
||||
// Guard: Disable BenchFast during initialization to avoid recursion
|
||||
// NOTE: Defined here and declared extern in bench_fast_box.h so that
|
||||
// malloc/free wrappers can also see it and skip BenchFast during init.
|
||||
__thread int bench_fast_init_in_progress = 0;
|
||||
// Phase 8-TLS-Fix: Changed from __thread to atomic_int
|
||||
// Root Cause: pthread_once() creates new threads with fresh TLS (= 0),
|
||||
// breaking the guard. Atomic variable works across ALL threads.
|
||||
// Box Contract: Guard must protect entire process, not just calling thread.
|
||||
atomic_int g_bench_fast_init_in_progress = 0;
|
||||
|
||||
// BenchFast alloc - Minimal path (POP-ONLY, NO REFILL)
|
||||
// Flow:
|
||||
@@ -30,8 +33,8 @@ __thread int bench_fast_init_in_progress = 0;
|
||||
// 3. Write header + return (2-3 instructions)
|
||||
// NOTE: No refill! Pool must be preallocated via bench_fast_init()
|
||||
void* bench_fast_alloc(size_t size) {
|
||||
// Guard: Avoid recursion during init phase
|
||||
if (__builtin_expect(bench_fast_init_in_progress, 0)) {
|
||||
// Guard: Avoid recursion during init phase (atomic for cross-thread safety)
|
||||
if (__builtin_expect(atomic_load(&g_bench_fast_init_in_progress), 0)) {
|
||||
// Initialization in progress - use normal allocator to avoid recursion
|
||||
return hak_alloc_at(size, "bench_fast_alloc_init");
|
||||
}
|
||||
@@ -65,8 +68,12 @@ void* bench_fast_alloc(size_t size) {
|
||||
}
|
||||
|
||||
// 4. Write header + return USER pointer (2-3 instructions)
|
||||
// Phase 8-P3-Fix: Write header DIRECTLY (bypass tiny_region_id_write_header)
|
||||
// Reason: P3 optimization skips header writes by default (class_map mode)
|
||||
// But BenchFast REQUIRES headers for free routing (0xa0-0xa7 magic)
|
||||
// Contract: BenchFast always writes headers, regardless of P3 optimization
|
||||
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
|
||||
tiny_region_id_write_header(base, class_idx); // Write 1-byte header (BASE first!)
|
||||
*(uint8_t*)base = (uint8_t)(0xa0 | (class_idx & 0x0f)); // Direct header write
|
||||
return (void*)((char*)base + 1); // Return USER pointer
|
||||
#else
|
||||
return base; // No header mode - return BASE directly
|
||||
@@ -117,8 +124,8 @@ int bench_fast_init(void) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Set guard to prevent recursion during initialization
|
||||
bench_fast_init_in_progress = 1;
|
||||
// Set guard to prevent recursion during initialization (atomic for cross-thread safety)
|
||||
atomic_store(&g_bench_fast_init_in_progress, 1);
|
||||
|
||||
// Phase 8 Root Cause Fix: REMOVED unified_cache_init() call
|
||||
// Reason: BenchFast uses TLS SLL directly, NOT Unified Cache
|
||||
@@ -209,7 +216,7 @@ int bench_fast_init(void) {
|
||||
}
|
||||
|
||||
// Clear guard - initialization complete, BenchFast mode can now be used
|
||||
bench_fast_init_in_progress = 0;
|
||||
atomic_store(&g_bench_fast_init_in_progress, 0);
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
@@ -56,6 +56,7 @@
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
// BenchFast mode enabled (ENV cached at first call)
|
||||
// Returns: 1 if enabled, 0 if disabled
|
||||
@@ -73,7 +74,8 @@ static inline int bench_fast_enabled(void) {
|
||||
}
|
||||
|
||||
// Exposed init guard so wrappers can avoid BenchFast during preallocation
|
||||
extern __thread int bench_fast_init_in_progress;
|
||||
// Phase 8-TLS-Fix: Changed from __thread to atomic_int (works across ALL threads)
|
||||
extern atomic_int g_bench_fast_init_in_progress;
|
||||
|
||||
// BenchFast alloc (Tiny-only, no safety checks)
|
||||
// Preconditions: size <= 1024 (Tiny range)
|
||||
|
||||
@@ -57,7 +57,8 @@ void* malloc(size_t size) {
|
||||
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
||||
// WARNING: Bypasses ALL safety checks - benchmark only!
|
||||
// IMPORTANT: Do NOT use BenchFast during preallocation/init to avoid recursion.
|
||||
if (__builtin_expect(!bench_fast_init_in_progress && bench_fast_enabled(), 0)) {
|
||||
// Phase 8-TLS-Fix: Use atomic_load for cross-thread safety
|
||||
if (__builtin_expect(!atomic_load(&g_bench_fast_init_in_progress) && bench_fast_enabled(), 0)) {
|
||||
if (size <= 1024) { // Tiny range
|
||||
return bench_fast_alloc(size);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user