From 695aec8279bfba400e85f509f2066b20281cf7fc Mon Sep 17 00:00:00 2001
From: "Moe Charm (CI)"
Date: Tue, 2 Dec 2025 16:44:27 +0900
Subject: [PATCH] feat(Phase 1-2): Add atomic initialization wait mechanism (safety improvement)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements thread-safe atomic initialization tracking and a wait helper for
non-init threads to avoid libc fallback during the initialization window.

Changes:
- Convert g_initializing to _Atomic type for thread-safe access
- Add g_init_thread to identify which thread performs initialization
- Implement hak_init_wait_for_ready() helper with spin/yield mechanism
- Update hak_core_init.inc.h to use atomic operations
- Update hak_wrappers.inc.h to call wait helper instead of checking g_initializing

Results & Analysis:
- Performance: ±0% (21s → 21s, no measurable improvement)
- Safety: ✓ Prevents recursion in init window
- Investigation: Initialization overhead is <1% of total allocations
- Expected: 2-8% improvement
- Actual: 0% improvement (spin/yield overhead ≈ savings)
- libc overhead: 41% → 57% (relative increase, likely sampling variation)

Key Findings from Perf Analysis:
- getenv: 0% (maintained from Phase 1-1) ✓
- libc malloc/free: ~24.54% of cycles
- libc fragmentation (malloc_consolidate/unlink_chunk): ~16% of cycles
- Total libc overhead: ~41% (difficult to optimize without changing algorithm)

Next Phase Target:
- Phase 2: Investigate libc fragmentation (malloc_consolidate 9.33%, unlink_chunk 6.90%)
- Potential approaches: hakmem Mid/ACE allocator expansion, sh8bench pattern analysis

Recommendation: Keep Phase 1-2 for safety (no performance regression), proceed to Phase 2.

🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- core/box/hak_core_init.inc.h | 5 +++-- core/box/hak_wrappers.inc.h | 24 ++++++++++++++++-------- core/hakmem.c | 31 +++++++++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/core/box/hak_core_init.inc.h b/core/box/hak_core_init.inc.h index d6b1b827..962cbc7f 100644 --- a/core/box/hak_core_init.inc.h +++ b/core/box/hak_core_init.inc.h @@ -33,7 +33,8 @@ void hak_init(void) { } static void hak_init_impl(void) { - g_initializing = 1; + g_init_thread = pthread_self(); + atomic_store_explicit(&g_initializing, 1, memory_order_release); // Phase 6.X P0 FIX (2025-10-24): Initialize Box 3 (Syscall Layer) FIRST! // This MUST be called before ANY allocation (Tiny/Mid/Large/Learner) @@ -313,7 +314,7 @@ static void hak_init_impl(void) { } #endif - g_initializing = 0; + atomic_store_explicit(&g_initializing, 0, memory_order_release); // Publish that initialization is complete atomic_thread_fence(memory_order_seq_cst); g_initialized = 1; diff --git a/core/box/hak_wrappers.inc.h b/core/box/hak_wrappers.inc.h index 1e97740e..ace72c73 100644 --- a/core/box/hak_wrappers.inc.h +++ b/core/box/hak_wrappers.inc.h @@ -82,7 +82,8 @@ void* malloc(size_t size) { if (wcfg->step_trace && size == 33000) write(2, "STEP:1 Lock++\n", 14); // Guard against recursion during initialization - if (__builtin_expect(g_initializing != 0, 0)) { + int init_wait = hak_init_wait_for_ready(); + if (__builtin_expect(init_wait <= 0, 0)) { g_hakmem_lock_depth--; extern void* __libc_malloc(size_t); if (size == 33000) write(2, "RET:Initializing\n", 17); @@ -115,7 +116,8 @@ void* malloc(size_t size) { return __libc_malloc(size); } if (!g_initialized) { hak_init(); } - if (g_initializing) { + int ld_init_wait = hak_init_wait_for_ready(); + if (__builtin_expect(ld_init_wait <= 0, 0)) { g_hakmem_lock_depth--; extern void* __libc_malloc(size_t); if (wcfg->step_trace && size == 33000) write(2, 
"RET:Init2\n", 10); @@ -285,7 +287,8 @@ void free(void* ptr) { __libc_free(ptr); return; } - if (__builtin_expect(g_initializing != 0, 0)) { + int free_init_wait = hak_init_wait_for_ready(); + if (__builtin_expect(free_init_wait <= 0, 0)) { #if !HAKMEM_BUILD_RELEASE uint64_t count = atomic_fetch_add_explicit(&fg_libc_bypass_count, 1, memory_order_relaxed); if (count < 10) { @@ -301,7 +304,8 @@ void free(void* ptr) { if (hak_ld_env_mode()) { if (hak_ld_block_jemalloc() && g_jemalloc_loaded) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_ld_jemalloc"); __libc_free(ptr); return; } if (!g_initialized) { hak_init(); } - if (g_initializing) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_ld_init"); __libc_free(ptr); return; } + int free_ld_wait = hak_init_wait_for_ready(); + if (__builtin_expect(free_ld_wait <= 0, 0)) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_ld_init"); __libc_free(ptr); return; } } // Phase 15: Box Separation - Domain check to distinguish hakmem vs external pointers @@ -360,7 +364,8 @@ void* calloc(size_t nmemb, size_t size) { return __libc_calloc(nmemb, size); } - if (__builtin_expect(g_initializing != 0, 0)) { + int calloc_init_wait = hak_init_wait_for_ready(); + if (__builtin_expect(calloc_init_wait <= 0, 0)) { g_hakmem_lock_depth--; extern void* __libc_calloc(size_t, size_t); return __libc_calloc(nmemb, size); @@ -387,7 +392,8 @@ void* calloc(size_t nmemb, size_t size) { return __libc_calloc(nmemb, size); } if (!g_initialized) { hak_init(); } - if (g_initializing) { + int calloc_ld_wait = hak_init_wait_for_ready(); + if (__builtin_expect(calloc_ld_wait <= 0, 0)) { g_hakmem_lock_depth--; extern void* __libc_calloc(size_t, size_t); return __libc_calloc(nmemb, size); @@ -416,13 +422,15 @@ void* calloc(size_t nmemb, size_t size) { void* realloc(void* ptr, size_t size) { if (g_hakmem_lock_depth > 0) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } - if 
(__builtin_expect(g_initializing != 0, 0)) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } + int realloc_init_wait = hak_init_wait_for_ready(); + if (__builtin_expect(realloc_init_wait <= 0, 0)) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } if (__builtin_expect(hak_force_libc_alloc(), 0)) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } int ld_mode = hak_ld_env_mode(); if (ld_mode) { if (hak_ld_block_jemalloc() && g_jemalloc_loaded) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } if (!g_initialized) { hak_init(); } - if (g_initializing) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } + int realloc_ld_wait = hak_init_wait_for_ready(); + if (__builtin_expect(realloc_ld_wait <= 0, 0)) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } } if (ptr == NULL) { return malloc(size); } if (size == 0) { free(ptr); return NULL; } diff --git a/core/hakmem.c b/core/hakmem.c index 7265d833..afe2f076 100644 --- a/core/hakmem.c +++ b/core/hakmem.c @@ -36,6 +36,7 @@ #include #include // NEW Phase 6.5: For atomic tick counter #include // Phase 6.15: Threading primitives (recursion guard only) +#include // Yield during init wait #include // calloc overflow handling #include #ifdef __GLIBC__ @@ -243,8 +244,34 @@ int hak_in_wrapper(void) { } // Initialization guard -static int g_initializing = 0; -int hak_is_initializing(void) { return g_initializing; } +static _Atomic int g_initializing = 0; +static pthread_t g_init_thread; +int hak_is_initializing(void) { return atomic_load_explicit(&g_initializing, memory_order_acquire); } + +// Wait helper for non-init threads to avoid libc fallback during init window +static inline int hak_init_wait_for_ready(void) { + if (__builtin_expect(!atomic_load_explicit(&g_initializing, memory_order_acquire), 1)) { + return 1; // Ready + } + pthread_t self = 
pthread_self(); + if (pthread_equal(self, g_init_thread)) { + return 0; // We are the init thread; caller should take the existing fallback path + } + for (int i = 0; atomic_load_explicit(&g_initializing, memory_order_acquire); ++i) { +#if defined(__x86_64__) || defined(__i386__) + if (i < 1024) { + __asm__ __volatile__("pause" ::: "memory"); + } else +#endif + { + sched_yield(); + } + if (i > 1000000) { + return -1; // Timed out waiting for init; allow libc fallback + } + } + return 1; // Init completed +} // ============================================================================ // Phase 6-1.5: Ultra-Simple Fast Path Forward Declarations