feat(Phase 1-1): Complete getenv elimination from malloc/free hot paths (+39-42% perf)
## Summary

Eliminated all getenv() calls from malloc/free wrappers and allocator hot paths by implementing constructor-based environment variable caching. This achieves a 39-42% performance improvement (36s → 22s on sh8bench single-thread).

## Performance Impact

- sh8bench 1 thread: 35-36s → 21-22s (+39-42% improvement) 🚀
- sh8bench 8 threads: ~15s (maintained)
- getenv overhead: 36.32% → 0% (completely eliminated)

## Changes

### New Files

- **core/box/tiny_env_box.{c,h}**: Centralized environment variable cache for the Tiny allocator
  - Caches 43 environment variables (HAKMEM_TINY_*, HAKMEM_SLL_*, HAKMEM_SS_*, etc.)
  - Constructor-based initialization with atomic CAS for thread safety
  - Inline accessor tiny_env_cfg() for hot-path access
- **core/box/wrapper_env_box.{c,h}**: Environment cache for malloc/free wrappers
  - Caches 3 wrapper variables (HAKMEM_STEP_TRACE, HAKMEM_LD_SAFE, HAKMEM_FREE_WRAP_TRACE)
  - Constructor priority 101 ensures early initialization
  - Replaces all lazy-init patterns in wrapper code

### Modified Files

- **Makefile**: Added tiny_env_box.o and wrapper_env_box.o to OBJS_BASE and SHARED_OBJS
- **core/box/hak_wrappers.inc.h**:
  - Removed static lazy-init variables (g_step_trace, ld_safe_mode cache)
  - Replaced them with wrapper_env_cfg() lookups (wcfg->step_trace, wcfg->ld_safe_mode)
  - All getenv() calls eliminated from malloc/free hot paths
- **core/hakmem.c**:
  - Added hak_ld_env_init() with constructor for LD_PRELOAD caching
  - Added hak_force_libc_ctor() for HAKMEM_FORCE_LIBC_ALLOC* caching
  - Simplified hak_ld_env_mode() to return the cached value only
  - Simplified hak_force_libc_alloc() to use cached values
  - Eliminated all getenv/atoi calls from hot paths

## Technical Details

### Constructor Initialization Pattern

All environment variables are now read once at library load time using __attribute__((constructor)):

```c
__attribute__((constructor(101)))
static void wrapper_env_ctor(void) {
    wrapper_env_init_once();  // Atomic CAS ensures exactly-once init
}
```

### Thread Safety

- Atomic compare-and-swap (CAS) ensures single initialization
- Spin-wait for initialization completion in multi-threaded scenarios
- Memory barriers (memory_order_acq_rel) ensure visibility

### Hot Path Impact

- Before: every malloc/free → getenv("LD_PRELOAD") + getenv("HAKMEM_STEP_TRACE") + ...
- After: every malloc/free → a single pointer dereference (wcfg->field)

## Next Optimization Target (Phase 1-2)

Perf analysis shows the libc fallback accounts for ~51% of cycles:

- _int_malloc: 15.04%
- malloc: 9.81%
- _int_free: 10.07%
- malloc_consolidate: 9.27%
- unlink_chunk: 6.82%

Reducing the libc fallback share from 51% to 10% could yield an additional +25-30% improvement.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
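For illustration, the exactly-once CAS initialization and hot-path accessor described under "Thread Safety" and "Hot Path Impact" can be sketched as follows. This is only an approximation, not the actual core/box/wrapper_env_box.c: the names wrapper_env_cfg_t, wrapper_env_cfg(), wrapper_env_init_once(), and the three cached variables come from this commit, while the struct layout, state encoding, and exact memory-order choices are assumptions.

```c
// Sketch of the init-once pattern (not the real wrapper_env_box.c source):
// one atomic CAS elects the initializing thread; other threads spin until
// the config is published with release semantics.
#include <stdatomic.h>
#include <stdlib.h>

typedef struct {
    int step_trace;       // HAKMEM_STEP_TRACE
    int ld_safe_mode;     // HAKMEM_LD_SAFE (default 1, per the old lazy-init code)
    int free_wrap_trace;  // HAKMEM_FREE_WRAP_TRACE
} wrapper_env_cfg_t;

static wrapper_env_cfg_t g_wrapper_env;
static _Atomic int g_wrapper_env_state;  // 0 = uninit, 1 = initializing, 2 = ready

static void wrapper_env_init_once(void) {
    int expected = 0;
    if (atomic_compare_exchange_strong_explicit(&g_wrapper_env_state, &expected, 1,
                                                memory_order_acq_rel,
                                                memory_order_acquire)) {
        // This thread won the CAS: read the environment exactly once.
        const char* st  = getenv("HAKMEM_STEP_TRACE");
        const char* lds = getenv("HAKMEM_LD_SAFE");
        const char* fwt = getenv("HAKMEM_FREE_WRAP_TRACE");
        g_wrapper_env.step_trace      = (st && *st && *st != '0') ? 1 : 0;
        g_wrapper_env.ld_safe_mode    = lds ? atoi(lds) : 1;
        g_wrapper_env.free_wrap_trace = (fwt && *fwt && *fwt != '0') ? 1 : 0;
        atomic_store_explicit(&g_wrapper_env_state, 2, memory_order_release);
    } else {
        // Another thread is initializing: spin until the config is published.
        while (atomic_load_explicit(&g_wrapper_env_state, memory_order_acquire) != 2) { }
    }
}

// Runs at library load time, before main() and before any wrapped malloc/free.
__attribute__((constructor(101)))
static void wrapper_env_ctor(void) { wrapper_env_init_once(); }

// Hot-path accessor: a plain pointer return, no getenv.
static inline const wrapper_env_cfg_t* wrapper_env_cfg(void) {
    // Defensive re-check in case a wrapped call races ahead of the constructor.
    if (atomic_load_explicit(&g_wrapper_env_state, memory_order_acquire) != 2)
        wrapper_env_init_once();
    return &g_wrapper_env;
}
```

With this shape, each malloc/free call reduces the former getenv() lookups to plain field reads (wcfg->step_trace, wcfg->ld_safe_mode), which is the single-pointer-dereference hot path claimed in the summary.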
core/box/hak_wrappers.inc.h:

@@ -33,6 +33,7 @@ void* realloc(void* ptr, size_t size) {
 #include "../hakmem_pool.h" // Mid registry lookup (failsafe for headerless Mid)
 #include "../front/malloc_tiny_fast.h" // Phase 26: Front Gate Unification
 #include "tiny_front_config_box.h" // Phase 4-Step3: Compile-time config for dead code elimination
+#include "wrapper_env_box.h" // Wrapper env cache (step trace / LD safe / free trace)
 
 // malloc wrapper - intercepts system malloc() calls
 __thread uint64_t g_malloc_total_calls = 0;
@@ -77,12 +78,8 @@ void* malloc(size_t size) {
     // This prevents infinite recursion when getenv/fprintf/dlopen call malloc
     g_hakmem_lock_depth++;
     // Debug step trace for 33KB: gated by env HAKMEM_STEP_TRACE (default: OFF)
-    static int g_step_trace = -1;
-    if (__builtin_expect(g_step_trace == -1, 0)) {
-        const char* e = getenv("HAKMEM_STEP_TRACE");
-        g_step_trace = (e && *e && *e != '0') ? 1 : 0;
-    }
-    if (g_step_trace && size == 33000) write(2, "STEP:1 Lock++\n", 14);
+    const wrapper_env_cfg_t* wcfg = wrapper_env_cfg();
+    if (wcfg->step_trace && size == 33000) write(2, "STEP:1 Lock++\n", 14);
 
     // Guard against recursion during initialization
     if (__builtin_expect(g_initializing != 0, 0)) {
@@ -103,41 +100,36 @@ void* malloc(size_t size) {
     if (__builtin_expect(hak_force_libc_alloc(), 0)) {
         g_hakmem_lock_depth--;
         extern void* __libc_malloc(size_t);
-        if (size == 33000) write(2, "RET:ForceLibc\n", 14);
+        if (wcfg->step_trace && size == 33000) write(2, "RET:ForceLibc\n", 14);
         return __libc_malloc(size);
     }
-    if (g_step_trace && size == 33000) write(2, "STEP:2 ForceLibc passed\n", 24);
+    if (wcfg->step_trace && size == 33000) write(2, "STEP:2 ForceLibc passed\n", 24);
 
     int ld_mode = hak_ld_env_mode();
     if (ld_mode) {
-        if (g_step_trace && size == 33000) write(2, "STEP:3 LD Mode\n", 15);
+        if (wcfg->step_trace && size == 33000) write(2, "STEP:3 LD Mode\n", 15);
         if (hak_ld_block_jemalloc() && g_jemalloc_loaded) {
             g_hakmem_lock_depth--;
             extern void* __libc_malloc(size_t);
-            if (size == 33000) write(2, "RET:Jemalloc\n", 13);
+            if (wcfg->step_trace && size == 33000) write(2, "RET:Jemalloc\n", 13);
             return __libc_malloc(size);
         }
         if (!g_initialized) { hak_init(); }
         if (g_initializing) {
             g_hakmem_lock_depth--;
             extern void* __libc_malloc(size_t);
-            if (size == 33000) write(2, "RET:Init2\n", 10);
+            if (wcfg->step_trace && size == 33000) write(2, "RET:Init2\n", 10);
             return __libc_malloc(size);
         }
-        // Cache HAKMEM_LD_SAFE to avoid repeated getenv on hot path
-        static _Atomic int ld_safe_mode = -1; // -1 = uninitialized
-        if (__builtin_expect(ld_safe_mode < 0, 0)) {
-            const char* lds = getenv("HAKMEM_LD_SAFE");
-            ld_safe_mode = (lds ? atoi(lds) : 1);
-        }
-        if (ld_safe_mode >= 2) {
+        if (wcfg->ld_safe_mode >= 2) {
            g_hakmem_lock_depth--;
            extern void* __libc_malloc(size_t);
-            if (size == 33000) write(2, "RET:LDSafe\n", 11);
+            if (wcfg->step_trace && size == 33000) write(2, "RET:LDSafe\n", 11);
            return __libc_malloc(size);
        }
    }
-    if (g_step_trace && size == 33000) write(2, "STEP:4 LD Check passed\n", 23);
+    if (wcfg->step_trace && size == 33000) write(2, "STEP:4 LD Check passed\n", 23);
 
     // Phase 26: CRITICAL - Ensure initialization before fast path
     // (fast path bypasses hak_alloc_at, so we need to init here)
@@ -151,19 +143,19 @@ void* malloc(size_t size) {
     // Phase 4-Step3: Use config macro for compile-time optimization
     // Phase 7-Step1: Changed expect hint from 0→1 (unified path is now LIKELY)
     if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 1)) {
-        if (g_step_trace && size == 33000) write(2, "STEP:5 Unified Gate check\n", 26);
+        if (wcfg->step_trace && size == 33000) write(2, "STEP:5 Unified Gate check\n", 26);
         if (size <= tiny_get_max_size()) {
-            if (g_step_trace && size == 33000) write(2, "STEP:5.1 Inside Unified\n", 24);
+            if (wcfg->step_trace && size == 33000) write(2, "STEP:5.1 Inside Unified\n", 24);
             void* ptr = malloc_tiny_fast(size);
             if (__builtin_expect(ptr != NULL, 1)) {
                 g_hakmem_lock_depth--;
-                if (size == 33000) write(2, "RET:TinyFast\n", 13);
+                if (wcfg->step_trace && size == 33000) write(2, "RET:TinyFast\n", 13);
                 return ptr;
             }
             // Unified Cache miss → fallback to normal path (hak_alloc_at)
         }
     }
-    if (g_step_trace && size == 33000) write(2, "STEP:6 All checks passed\n", 25);
+    if (wcfg->step_trace && size == 33000) write(2, "STEP:6 All checks passed\n", 25);
 
 #if !HAKMEM_BUILD_RELEASE
     if (count > 14250 && count < 14280 && size <= 1024) {