hakmem/core/box/hak_wrappers.inc.h
Moe Charm (CI) 3daf75e57f Phase 5-Step2: Mid Free Route Box (+28.9x free perf, 1.53x faster than system)
Fix critical 19x free() slowdown in Mid MT allocator (1KB-8KB range).

Root Cause:
- Mid MT registers chunks in MidGlobalRegistry
- Free path searches Pool's mid_desc registry (different registry!)
- Result: 100% lookup failure → 4x cascading lookups → libc fallback

Solution (Box Pattern):
- Created core/box/mid_free_route_box.h
- Try Mid MT registry BEFORE classify_ptr() in free()
- Direct route to mid_mt_free() if found
- Fall through to existing path if not found
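
A minimal sketch of the route (illustrative only; mid_registry_lookup() is a
stand-in for the actual MidGlobalRegistry query, not a real function name):

    // in free(), before classify_ptr():
    if (mid_free_route_try(ptr)) return;

    // core/box/mid_free_route_box.h -- rough shape of the box
    static inline int mid_free_route_try(void* ptr) {
        if (mid_registry_lookup(ptr)) {  // chunk registered by Mid MT?
            mid_mt_free(ptr);            // direct route, no cascading lookups
            return 1;                    // handled
        }
        return 0;                        // not Mid MT-owned: existing free path
    }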

Performance Results (bench_mid_mt_gap, 1KB-8KB allocs):
- Before: 1.49 M ops/s (19x slower than system malloc)
- After:  41.0 M ops/s (+28.9x improvement)
- vs System malloc: 1.53x faster (41.0 vs 26.8 M ops/s)

Files:
- core/box/mid_free_route_box.h (NEW) - Mid Free Route Box
- core/box/hak_wrappers.inc.h - Add mid_free_route_try() call
- core/hakmem_mid_mt.h - Fix mid_get_min_size() (1024 not 2048)
- bench_mid_mt_gap.c (NEW) - Targeted 1KB-8KB benchmark
- Makefile - Add bench_mid_mt_gap targets

Box Pattern: Single responsibility, clear contract, testable, minimal change

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 14:18:20 +09:00

// hak_wrappers.inc.h — malloc/free/calloc/realloc wrappers (LD_PRELOAD-aware)
#ifndef HAK_WRAPPERS_INC_H
#define HAK_WRAPPERS_INC_H
#ifdef HAKMEM_FORCE_LIBC_ALLOC_BUILD
// Sanitizer/diagnostic builds: bypass hakmem allocator completely.
void* malloc(size_t size) {
    extern void* __libc_malloc(size_t);
    return __libc_malloc(size);
}
void free(void* ptr) {
    if (!ptr) return;
    extern void __libc_free(void*);
    __libc_free(ptr);
}
void* calloc(size_t nmemb, size_t size) {
    extern void* __libc_calloc(size_t, size_t);
    return __libc_calloc(nmemb, size);
}
void* realloc(void* ptr, size_t size) {
    extern void* __libc_realloc(void*, size_t);
    return __libc_realloc(ptr, size);
}
#else
#include "../ptr_trace.h" // Debug: pointer trace immediate dump on libc fallback
#include "front_gate_classifier.h" // Box FG: pointer classification (header/reg)
#include "../front/malloc_tiny_fast.h" // Phase 26: Front Gate Unification
#include "tiny_front_config_box.h" // Phase 4-Step3: Compile-time config for dead code elimination
#include "mid_free_route_box.h" // Phase 5-Step2: Mid MT free routing fix
// malloc wrapper - intercepts system malloc() calls
__thread uint64_t g_malloc_total_calls = 0;
__thread uint64_t g_malloc_tiny_size_match = 0;
__thread uint64_t g_malloc_fast_path_tried = 0;
__thread uint64_t g_malloc_fast_path_null = 0;
__thread uint64_t g_malloc_slow_path = 0;
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
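// Per-thread Tiny free-list state (one TinyTLSSLL entry per Tiny size class)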
// CRITICAL FIX (BUG #10): Use cached g_jemalloc_loaded instead of calling hak_jemalloc_loaded()
// The function call version triggers infinite recursion: malloc → hak_jemalloc_loaded → dlopen → malloc
extern int g_jemalloc_loaded; // Cached during hak_init_impl(), defined in hakmem.c
// Global malloc call counter for debugging (exposed for validation code)
// Defined here, accessed from tls_sll_box.h for corruption detection
_Atomic uint64_t malloc_count = 0;
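// malloc() wrapper flow: BenchFast (bench-only) → recursion/init guards →
// force-libc / LD_PRELOAD guards → Front Gate Tiny fast path → hak_alloc_at()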
void* malloc(size_t size) {
    uint64_t count = atomic_fetch_add(&malloc_count, 1);
    // Phase 20-2: BenchFast mode (structural ceiling measurement)
    // WARNING: Bypasses ALL safety checks - benchmark only!
    // IMPORTANT: Do NOT use BenchFast during preallocation/init to avoid recursion.
    if (__builtin_expect(!bench_fast_init_in_progress && bench_fast_enabled(), 0)) {
        if (size <= 1024) { // Tiny range
            return bench_fast_alloc(size);
        }
        // Fallback to normal path for large allocations
    }
    // DEBUG BAILOUT DISABLED - Testing full path
    // if (__builtin_expect(count >= 14270 && count <= 14285, 0)) {
    //     extern void* __libc_malloc(size_t);
    //     fprintf(stderr, "[MALLOC_WRAPPER] count=%lu size=%zu - BAILOUT TO LIBC!\n", count, size);
    //     fflush(stderr);
    //     return __libc_malloc(size);
    // }
    // CRITICAL FIX (BUG #7): Increment lock depth FIRST, before ANY libc calls
    // This prevents infinite recursion when getenv/fprintf/dlopen call malloc
    g_hakmem_lock_depth++;
    // Guard against recursion during initialization
    if (__builtin_expect(g_initializing != 0, 0)) {
        g_hakmem_lock_depth--;
        extern void* __libc_malloc(size_t);
        return __libc_malloc(size);
    }
    // Now safe to call getenv/fprintf/dlopen (will use __libc_malloc if needed)
    extern int g_sfc_debug;
    static _Atomic int debug_count = 0;
    if (__builtin_expect(g_sfc_debug, 0) && debug_count < 100) {
        int n = atomic_fetch_add(&debug_count, 1);
        if (n < 20) fprintf(stderr, "[SFC_DEBUG] malloc(%zu)\n", size);
    }
    if (__builtin_expect(hak_force_libc_alloc(), 0)) {
        g_hakmem_lock_depth--;
        extern void* __libc_malloc(size_t);
        return __libc_malloc(size);
    }
    int ld_mode = hak_ld_env_mode();
    if (ld_mode) {
        if (hak_ld_block_jemalloc() && g_jemalloc_loaded) {
            g_hakmem_lock_depth--;
            extern void* __libc_malloc(size_t);
            return __libc_malloc(size);
        }
        if (!g_initialized) { hak_init(); }
        if (g_initializing) {
            g_hakmem_lock_depth--;
            extern void* __libc_malloc(size_t);
            return __libc_malloc(size);
        }
        // Cache HAKMEM_LD_SAFE to avoid repeated getenv on hot path
        static _Atomic int ld_safe_mode = -1; // -1 = uninitialized
        if (__builtin_expect(ld_safe_mode < 0, 0)) {
            const char* lds = getenv("HAKMEM_LD_SAFE");
            ld_safe_mode = (lds ? atoi(lds) : 1);
        }
        if (ld_safe_mode >= 2 || size > TINY_MAX_SIZE) {
            g_hakmem_lock_depth--;
            extern void* __libc_malloc(size_t);
            return __libc_malloc(size);
        }
    }
    // Phase 26: CRITICAL - Ensure initialization before fast path
    // (fast path bypasses hak_alloc_at, so we need to init here)
    if (!g_initialized) hak_init();
    // Phase 26: Front Gate Unification (Tiny fast path)
    // Placed AFTER all safety checks (lock depth, initializing, LD_SAFE, jemalloc)
    // Bypasses: hak_alloc_at routing (236 lines) + wrapper diagnostics + tiny overhead
    // Target: +10-15% performance (11.35M → 12.5-13.5M ops/s)
    // ENV: HAKMEM_FRONT_GATE_UNIFIED=1 to enable (default: OFF)
    // Phase 4-Step3: Use config macro for compile-time optimization
    if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 0)) {
        if (size <= tiny_get_max_size()) {
            void* ptr = malloc_tiny_fast(size);
            if (__builtin_expect(ptr != NULL, 1)) {
                g_hakmem_lock_depth--;
                return ptr;
            }
            // Unified Cache miss → fallback to normal path (hak_alloc_at)
        }
    }
#if !HAKMEM_BUILD_RELEASE
    if (count > 14250 && count < 14280 && size <= 1024) {
        fprintf(stderr, "[MALLOC_WRAPPER] count=%lu calling hak_alloc_at\n", count);
        fflush(stderr);
    }
#endif
    void* ptr = hak_alloc_at(size, HAK_CALLSITE());
#if !HAKMEM_BUILD_RELEASE
    if (count > 14250 && count < 14280 && size <= 1024) {
        fprintf(stderr, "[MALLOC_WRAPPER] count=%lu hak_alloc_at returned %p\n", count, ptr);
        fflush(stderr);
    }
#endif
    g_hakmem_lock_depth--;
    return ptr;
}
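// free() wrapper flow: BenchFast (bench-only) → Front Gate Tiny fast free → Mid Free Route Box →
// classify_ptr() ownership check → lock-depth/init/libc guards → 1-byte header check → hak_free_at()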
void free(void* ptr) {
    atomic_fetch_add_explicit(&g_free_wrapper_calls, 1, memory_order_relaxed);
    if (!ptr) return;
    // Phase 20-2: BenchFast mode (structural ceiling measurement)
    // WARNING: Bypasses ALL safety checks - benchmark only!
    if (__builtin_expect(bench_fast_enabled(), 0)) {
        // Trust header magic to identify Tiny allocations
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
        uint8_t header = *((uint8_t*)ptr - 1);
        if ((header & 0xf0) == 0xa0) { // Tiny header magic (0xa0-0xa7)
            bench_fast_free(ptr);
            return;
        }
#endif
        // Fallback to normal path for non-Tiny or no-header mode
    }
    // Phase 26: Front Gate Unification (Tiny free fast path)
    // Placed AFTER BenchFast check, BEFORE expensive classify_ptr()
    // Bypasses: hak_free_at routing + wrapper overhead + classification
    // Target: +10-15% performance (pairs with malloc_tiny_fast)
    // ENV: HAKMEM_FRONT_GATE_UNIFIED=1 to enable (default: OFF)
    // Phase 4-Step3: Use config macro for compile-time optimization
    if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 0)) {
        int freed = free_tiny_fast(ptr);
        if (__builtin_expect(freed, 1)) {
            return; // Success (pushed to Unified Cache)
        }
        // Unified Cache full OR invalid header → fallback to normal path
    }
    // Optional entry trace, gated by HAKMEM_FREE_WRAP_TRACE (env check cached after first call)
    do {
        static int on = -1;
        if (on == -1) {
            const char* e = getenv("HAKMEM_FREE_WRAP_TRACE");
            on = (e && *e && *e != '0') ? 1 : 0;
        }
        if (on) {
            fprintf(stderr, "[WRAP_FREE_ENTER] ptr=%p depth=%d init=%d\n", ptr, g_hakmem_lock_depth, g_initializing);
        }
    } while (0);
#if !HAKMEM_BUILD_RELEASE
    // Debug safety: guard obviously invalid tiny integers to avoid libc crash and collect trace
    if ((uintptr_t)ptr < 4096) {
        ptr_trace_dump_now("wrap_small_ptr");
        fprintf(stderr, "[FREE_SMALL_PTR] ignore ptr=%p (likely header-corruption sentinel)\n", ptr);
        return;
    }
#endif
    // Phase 5-Step2: Mid Free Route Box (BEFORE classify_ptr)
    // Quick fix for 19x free() slowdown: Try Mid MT registry first
    // If found, route directly to mid_mt_free() and return
    if (mid_free_route_try(ptr)) return;
    // Classify pointer BEFORE early libc fallbacks to avoid misrouting Tiny pointers
    // This is safe: classifier uses header probe and registry; does not allocate.
    int is_hakmem_owned = 0;
    {
        ptr_classification_t c = classify_ptr(ptr);
        switch (c.kind) {
        case PTR_KIND_TINY_HEADER:
        case PTR_KIND_TINY_HEADERLESS:
        case PTR_KIND_POOL_TLS:
        case PTR_KIND_MID_LARGE: // FIX: Include Mid-Large (mmap/ACE) pointers
            is_hakmem_owned = 1; break;
        default: break;
        }
    }
    if (is_hakmem_owned) {
        // Route to hak_free_at even if lock_depth>0; to suppress logging here, use only ptr_trace
        g_hakmem_lock_depth++;
        hak_free_at(ptr, 0, HAK_CALLSITE());
        g_hakmem_lock_depth--;
        return;
    }
    // Front Gate libc bypass detection (quiet in release)
    static _Atomic uint64_t fg_libc_bypass_count = 0;
    if (g_hakmem_lock_depth > 0) {
#if !HAKMEM_BUILD_RELEASE
        uint64_t count = atomic_fetch_add_explicit(&fg_libc_bypass_count, 1, memory_order_relaxed);
        if (count < 10) {
            fprintf(stderr, "[FG_LIBC_BYPASS] lockdepth=%d count=%llu ptr=%p\n", g_hakmem_lock_depth, (unsigned long long)count, ptr);
        }
#else
        (void)fg_libc_bypass_count;
#endif
        // Safety: If this is a HAKMEM-owned header allocation, free raw correctly
        do {
            void* raw = (char*)ptr - HEADER_SIZE;
            int safe_same_page = (((uintptr_t)ptr & 0xFFFu) >= HEADER_SIZE);
            if (!safe_same_page) {
                if (!hak_is_memory_readable(raw)) break;
            }
            AllocHeader* hdr = (AllocHeader*)raw;
            if (hdr->magic == HAKMEM_MAGIC) {
                // Dispatch based on allocation method
                if (hdr->method == ALLOC_METHOD_MALLOC) {
                    extern void __libc_free(void*);
                    ptr_trace_dump_now("wrap_libc_lockdepth_hak_hdr_malloc");
                    __libc_free(raw);
                    return;
                } else if (hdr->method == ALLOC_METHOD_MMAP) {
                    ptr_trace_dump_now("wrap_libc_lockdepth_hak_hdr_mmap");
                    hkm_sys_munmap(raw, hdr->size);
                    return;
                }
            }
        } while (0);
        // Unknown pointer or non-HAKMEM: fall back to libc free(ptr)
        extern void __libc_free(void*);
        ptr_trace_dump_now("wrap_libc_lockdepth");
        __libc_free(ptr);
        return;
    }
    if (__builtin_expect(g_initializing != 0, 0)) {
#if !HAKMEM_BUILD_RELEASE
        uint64_t count = atomic_fetch_add_explicit(&fg_libc_bypass_count, 1, memory_order_relaxed);
        if (count < 10) {
            fprintf(stderr, "[FG_LIBC_BYPASS] init=%d count=%llu ptr=%p\n", g_initializing, (unsigned long long)count, ptr);
        }
#endif
        extern void __libc_free(void*);
        ptr_trace_dump_now("wrap_libc_init");
        __libc_free(ptr);
        return;
    }
    if (__builtin_expect(hak_force_libc_alloc(), 0)) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_force"); __libc_free(ptr); return; }
    if (hak_ld_env_mode()) {
        if (hak_ld_block_jemalloc() && g_jemalloc_loaded) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_ld_jemalloc"); __libc_free(ptr); return; }
        if (!g_initialized) { hak_init(); }
        if (g_initializing) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_ld_init"); __libc_free(ptr); return; }
    }
    // Phase 15: Box Separation - Domain check to distinguish hakmem vs external pointers
    // CRITICAL: Prevent BenchMeta (slots[]) from entering CoreAlloc (hak_free_at)
    // Strategy: Check 1-byte header at ptr-1 for HEADER_MAGIC (0xa0/0xb0)
    // - If hakmem Tiny allocation → route to hak_free_at()
    // - Otherwise → delegate to __libc_free() (external/BenchMeta)
    //
    // Safety: Only check header if ptr is NOT page-aligned (ptr-1 is safe to read)
    uintptr_t offset_in_page = (uintptr_t)ptr & 0xFFF;
    if (offset_in_page > 0) {
        // Not page-aligned, safe to check ptr-1
        uint8_t header = *((uint8_t*)ptr - 1);
        if ((header & 0xF0) == 0xA0 || (header & 0xF0) == 0xB0) {
            // HEADER_MAGIC found (0xa0 or 0xb0) → hakmem Tiny allocation
            g_hakmem_lock_depth++;
            hak_free_at(ptr, 0, HAK_CALLSITE());
            g_hakmem_lock_depth--;
            return;
        }
        // No header magic → external pointer (BenchMeta, libc allocation, etc.)
        extern void __libc_free(void*);
        ptr_trace_dump_now("wrap_libc_external_nomag");
        __libc_free(ptr);
        return;
    }
    // Page-aligned pointer → cannot safely check header, use full classification
    // (This includes Pool/Mid/L25 allocations which may be page-aligned)
    g_hakmem_lock_depth++;
    hak_free_at(ptr, 0, HAK_CALLSITE());
    g_hakmem_lock_depth--;
}
void* calloc(size_t nmemb, size_t size) {
    // CRITICAL FIX (BUG #8): Increment lock depth FIRST, before ANY libc calls
    g_hakmem_lock_depth++;
    // Early check for recursion (lock depth already incremented by outer call)
    if (g_hakmem_lock_depth > 1) {
        g_hakmem_lock_depth--;
        extern void* __libc_calloc(size_t, size_t);
        return __libc_calloc(nmemb, size);
    }
    if (__builtin_expect(g_initializing != 0, 0)) {
        g_hakmem_lock_depth--;
        extern void* __libc_calloc(size_t, size_t);
        return __libc_calloc(nmemb, size);
    }
    // Overflow check
    if (size != 0 && nmemb > (SIZE_MAX / size)) {
        g_hakmem_lock_depth--;
        errno = ENOMEM;
        return NULL;
    }
    if (__builtin_expect(hak_force_libc_alloc(), 0)) {
        g_hakmem_lock_depth--;
        extern void* __libc_calloc(size_t, size_t);
        return __libc_calloc(nmemb, size);
    }
    int ld_mode = hak_ld_env_mode();
    if (ld_mode) {
        if (hak_ld_block_jemalloc() && g_jemalloc_loaded) {
            g_hakmem_lock_depth--;
            extern void* __libc_calloc(size_t, size_t);
            return __libc_calloc(nmemb, size);
        }
        if (!g_initialized) { hak_init(); }
        if (g_initializing) {
            g_hakmem_lock_depth--;
            extern void* __libc_calloc(size_t, size_t);
            return __libc_calloc(nmemb, size);
        }
        // Reuse cached ld_safe_mode from malloc (same static variable scope won't work, use inline function instead)
        // For now, duplicate the caching logic
        static _Atomic int ld_safe_mode_calloc = -1;
        if (__builtin_expect(ld_safe_mode_calloc < 0, 0)) {
            const char* lds = getenv("HAKMEM_LD_SAFE");
            ld_safe_mode_calloc = (lds ? atoi(lds) : 1);
        }
        size_t total = nmemb * size;
        if (ld_safe_mode_calloc >= 2 || total > TINY_MAX_SIZE) {
            g_hakmem_lock_depth--;
            extern void* __libc_calloc(size_t, size_t);
            return __libc_calloc(nmemb, size);
        }
    }
    size_t total_size = nmemb * size;
    void* ptr = hak_alloc_at(total_size, HAK_CALLSITE());
    if (ptr) { memset(ptr, 0, total_size); }
    g_hakmem_lock_depth--;
    return ptr;
}
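// realloc wrapper: guarded states defer to __libc_realloc; otherwise a new block is
// allocated with malloc(), the contents are copied, and the old block is freed.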
void* realloc(void* ptr, size_t size) {
    if (g_hakmem_lock_depth > 0) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
    if (__builtin_expect(g_initializing != 0, 0)) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
    if (__builtin_expect(hak_force_libc_alloc(), 0)) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
    int ld_mode = hak_ld_env_mode();
    if (ld_mode) {
        if (hak_ld_block_jemalloc() && g_jemalloc_loaded) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
        if (!g_initialized) { hak_init(); }
        if (g_initializing) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
    }
    if (ptr == NULL) { return malloc(size); }
    if (size == 0) { free(ptr); return NULL; }
    void* new_ptr = malloc(size);
    if (!new_ptr) return NULL;
    // NOTE: the old block's size is not tracked here, so `size` bytes are copied;
    // when growing, this can read past the end of the old allocation.
    memcpy(new_ptr, ptr, size);
    free(ptr);
    return new_ptr;
}
#endif // HAKMEM_FORCE_LIBC_ALLOC_BUILD
#endif // HAK_WRAPPERS_INC_H