hakmem/core/box/hak_core_init.inc.h

// hak_core_init.inc.h — Box: init/shutdown
#ifndef HAK_CORE_INIT_INC_H
#define HAK_CORE_INIT_INC_H

#include <signal.h>
#ifdef __GLIBC__
#include <execinfo.h>
#endif
#include "hakmem_phase7_config.h"  // Phase 7 Task 3

// Debug-only SIGSEGV handler (gated by HAKMEM_DEBUG_SEGV)
static void hakmem_sigsegv_handler(int sig) {
    (void)sig;
    const char* msg = "\n[HAKMEM] Segmentation Fault\n";
    (void)write(2, msg, 29);

#if !HAKMEM_BUILD_RELEASE
    // Dump Class 1 (16B) last push info for debugging
    tiny_debug_dump_last_push(1);
#endif

    // Restore default handler and re-raise
    signal(sig, SIG_DFL);
    raise(sig);
}

// Phase 7 Task 3: Pre-warm TLS cache helper
// Pre-allocate blocks to reduce first-allocation miss penalty
// Note: This function is defined later in hakmem.c after sll_refill_small_from_ss is available
// (moved out of header to avoid linkage issues)

static void hak_init_impl(void);
static pthread_once_t g_init_once = PTHREAD_ONCE_INIT;

void hak_init(void) {
    (void)pthread_once(&g_init_once, hak_init_impl);
}

static void hak_init_impl(void) {
    HAK_TRACE("[init_impl_enter]\n");
    g_init_thread = pthread_self();
    atomic_store_explicit(&g_initializing, 1, memory_order_release);

    // Phase 6.X P0 FIX (2025-10-24): Initialize Box 3 (Syscall Layer) FIRST!
    // This MUST be called before ANY allocation (Tiny/Mid/Large/Learner)
    // dlsym() initializes function pointers to real libc (bypasses LD_PRELOAD)
    hkm_syscall_init();
    HAK_TRACE("[init_impl_after_syscall_init]\n");

    // CRITICAL FIX (BUG #10): Pre-detect jemalloc ONCE during init, not on hot path!
    // This prevents infinite recursion: malloc → hak_jemalloc_loaded → dlopen → malloc → ...
    // We protect dlopen's internal malloc calls with g_hakmem_lock_depth
    extern int g_jemalloc_loaded;  // Declared in hakmem.c
    if (g_jemalloc_loaded < 0) {
        void* h = dlopen("libjemalloc.so.2", RTLD_NOLOAD | RTLD_NOW);
        if (!h) h = dlopen("libjemalloc.so.1", RTLD_NOLOAD | RTLD_NOW);
        g_jemalloc_loaded = (h != NULL) ? 1 : 0;
        if (h) dlclose(h);
        if (g_jemalloc_loaded) {
            HAKMEM_LOG("Detected jemalloc: will avoid interposing\n");
        }
    }
    HAK_TRACE("[init_impl_after_jemalloc_probe]\n");

    // Optional: one-shot SIGSEGV backtrace for early crash diagnosis
    do {
        const char* dbg = getenv("HAKMEM_DEBUG_SEGV");
        if (dbg && atoi(dbg) != 0) {
            struct sigaction sa; memset(&sa, 0, sizeof(sa));
            sa.sa_flags = SA_RESETHAND;
            sa.sa_handler = hakmem_sigsegv_handler;
            sigaction(SIGSEGV, &sa, NULL);
        }
    } while (0);

    // NEW Phase 6.11.1: Initialize debug timing
    hkm_timing_init();

    // NEW Phase 6.11.1: Initialize whale fast-path cache
    hkm_whale_init();

    // NEW Phase Hybrid: Initialize Mid Range MT allocator (8-32KB, mimalloc-style)

    // NEW Phase 6.8: Initialize configuration system (replaces init_free_policy + init_thp_policy)
    hak_config_init();

    // Phase 6.16: Initialize FrozenPolicy (SACS-3)
    hkm_policy_init();

    // Phase 6.15 P0.3: Configure EVO sampling from environment variable
    // HAKMEM_EVO_SAMPLE: 0=disabled (default), N=sample every 2^N calls
    // Example: HAKMEM_EVO_SAMPLE=10 → sample every 1024 calls
    //          HAKMEM_EVO_SAMPLE=16 → sample every 65536 calls
    char* evo_sample_str = getenv("HAKMEM_EVO_SAMPLE");
    if (evo_sample_str && atoi(evo_sample_str) > 0) {
        int freq = atoi(evo_sample_str);
        if (freq >= 64) {
            HAKMEM_LOG("Warning: HAKMEM_EVO_SAMPLE=%d too large, using 63\n", freq);
            freq = 63;
        }
        g_evo_sample_mask = (1ULL << freq) - 1;
        HAKMEM_LOG("EVO sampling enabled: every 2^%d = %llu calls\n",
                   freq, (unsigned long long)(g_evo_sample_mask + 1));
    } else {
        g_evo_sample_mask = 0;  // Disabled by default
        HAKMEM_LOG("EVO sampling disabled (HAKMEM_EVO_SAMPLE not set or 0)\n");
    }

#ifdef __linux__
    // Record baseline KPIs
    memset(g_latency_histogram, 0, sizeof(g_latency_histogram));
    g_latency_samples = 0;

    get_page_faults(&g_baseline_soft_pf, &g_baseline_hard_pf);
    g_baseline_rss_kb = get_rss_kb();

    HAKMEM_LOG("Baseline: soft_pf=%lu, hard_pf=%lu, rss=%lu KB\n",
           (unsigned long)g_baseline_soft_pf,
           (unsigned long)g_baseline_hard_pf,
           (unsigned long)g_baseline_rss_kb);
#endif

    HAKMEM_LOG("Initialized (PoC version)\n");
    HAKMEM_LOG("Sampling rate: 1/%d\n", SAMPLING_RATE);
    HAKMEM_LOG("Max sites: %d\n", MAX_SITES);

    // Build banner (one-shot)
    do {
        const char* bf = "UNKNOWN";
#ifdef HAKMEM_BUILD_RELEASE
        bf = "RELEASE";
#elif defined(HAKMEM_BUILD_DEBUG)
        bf = "DEBUG";
#endif
        HAKMEM_LOG("[Build] Flavor=%s  Flags: HEADER_CLASSIDX=%d, AGGRESSIVE_INLINE=%d, POOL_TLS_PHASE1=%d, POOL_TLS_PREWARM=%d\n",
                   bf,
#if HAKMEM_TINY_HEADER_CLASSIDX
                   1,
#else
                   0,
#endif
#ifdef HAKMEM_TINY_AGGRESSIVE_INLINE
                   1,
#else
                   0,
#endif
#ifdef HAKMEM_POOL_TLS_PHASE1
                   1,
#else
                   0,
#endif
#ifdef HAKMEM_POOL_TLS_PREWARM
                   1
#else
                   0
#endif
                   );
    } while (0);

    // Bench preset: Tiny-only (disable non-essential subsystems)
    {
        char* bt = getenv("HAKMEM_BENCH_TINY_ONLY");
        if (bt && atoi(bt) != 0) {
            g_bench_tiny_only = 1;
        }
    }

    // Under LD_PRELOAD, enforce safer defaults for Tiny path unless overridden
    {
        char* ldpre = getenv("LD_PRELOAD");
        if (ldpre && strstr(ldpre, "libhakmem.so")) {
            g_ldpreload_mode = 1;
            // Default LD-safe mode if not set: 1 (Tiny-only)
            char* lds = getenv("HAKMEM_LD_SAFE");
            if (lds) { /* NOP used in wrappers */ } else { setenv("HAKMEM_LD_SAFE", "1", 0); }
            if (!getenv("HAKMEM_TINY_TLS_SLL")) {
                setenv("HAKMEM_TINY_TLS_SLL", "0", 0);  // disable TLS SLL by default
            }
            if (!getenv("HAKMEM_TINY_USE_SUPERSLAB")) {
                setenv("HAKMEM_TINY_USE_SUPERSLAB", "0", 0);  // disable SuperSlab path by default
            }
        }
    }

    // Runtime safety toggle
    char* safe_free_env = getenv("HAKMEM_SAFE_FREE");
    if (safe_free_env && atoi(safe_free_env) != 0) {
        g_strict_free = 1;
        HAKMEM_LOG("Strict free safety enabled (HAKMEM_SAFE_FREE=1)\n");
    } else {
        // Heuristic: if loaded via LD_PRELOAD, enable strict free by default
        char* ldpre = getenv("LD_PRELOAD");
        if (ldpre && strstr(ldpre, "libhakmem.so")) {
            g_ldpreload_mode = 1;
            g_strict_free = 1;
            HAKMEM_LOG("Strict free safety auto-enabled under LD_PRELOAD\n");
        }
    }

    // Invalid free logging toggle (default off to avoid spam under LD_PRELOAD)
    char* invlog = getenv("HAKMEM_INVALID_FREE_LOG");
    if (invlog && atoi(invlog) != 0) {
        g_invalid_free_log = 1;
        HAKMEM_LOG("Invalid free logging enabled (HAKMEM_INVALID_FREE_LOG=1)\n");
    }

    // Phase 7.4: Cache HAKMEM_INVALID_FREE to eliminate 44% CPU overhead
    // Perf showed getenv() on hot path consumed 43.96% CPU time (26.41% strcmp + 17.55% getenv)
    char* inv = getenv("HAKMEM_INVALID_FREE");
    if (inv && strcmp(inv, "skip") == 0) {
        g_invalid_free_mode = 1;  // explicit opt-in to legacy skip mode
        HAKMEM_LOG("Invalid free mode: skip check (HAKMEM_INVALID_FREE=skip)\n");
    } else if (inv && strcmp(inv, "fallback") == 0) {
        g_invalid_free_mode = 0;  // fallback mode: route invalid frees to libc
        HAKMEM_LOG("Invalid free mode: fallback to libc (HAKMEM_INVALID_FREE=fallback)\n");
    } else {
        // Under LD_PRELOAD, prefer safety: default to fallback unless explicitly overridden
        char* ldpre = getenv("LD_PRELOAD");
        if (ldpre && strstr(ldpre, "libhakmem.so")) {
            g_ldpreload_mode = 1;
            g_invalid_free_mode = 0;
            HAKMEM_LOG("Invalid free mode: fallback to libc (auto under LD_PRELOAD)\n");
        } else {
            // Default: safety first (fallback), avoids routing unknown pointers into Tiny
            g_invalid_free_mode = 0;
            HAKMEM_LOG("Invalid free mode: fallback to libc (default)\n");
        }
    }

    // NEW Phase 6.8: Feature-gated initialization (check g_hakem_config flags)
    if (HAK_ENABLED_ALLOC(HAKMEM_FEATURE_POOL)) {
        hak_pool_init();
    }

    // NEW Phase 6.13: L2.5 LargePool (64KB-1MB allocations)
    hak_l25_pool_init();

    if (!g_bench_tiny_only && HAK_ENABLED_CACHE(HAKMEM_FEATURE_BIGCACHE)) {
        hak_bigcache_init();
        hak_bigcache_set_free_callback(bigcache_free_callback);
    }

    if (!g_bench_tiny_only && HAK_ENABLED_LEARNING(HAKMEM_FEATURE_ELO)) {
        hak_elo_init();
        // Phase 6.11.4 P0-2: Initialize cached strategy to default (strategy 0)
        atomic_store(&g_cached_strategy_id, 0);
    }

    if (!g_bench_tiny_only && HAK_ENABLED_MEMORY(HAKMEM_FEATURE_BATCH_MADVISE)) {
        hak_batch_init();
    }

    if (!g_bench_tiny_only && HAK_ENABLED_LEARNING(HAKMEM_FEATURE_EVOLUTION)) {
        hak_evo_init();
    }

    if (!g_bench_tiny_only) {
        // Phase 6.16: Initialize ACE stats (sampling) – default off
        hkm_ace_stats_init();
        // Phase 6.16: Initialize sampling profiler – default off
        hkm_prof_init();
        // Size histogram sampling (optional)
        hkm_size_hist_init();
    }

    if (!g_bench_tiny_only) {
        // Start CAP learner (optional, env-gated)
        hkm_learner_init();
    }

    // NEW Phase 6.10: Site Rules (MVP: always ON)
    // MT note: default disabled unless HAKMEM_SITE_RULES=1
    char* sr_env = getenv("HAKMEM_SITE_RULES");
    g_site_rules_enabled = (sr_env && atoi(sr_env) != 0);
    if (!g_bench_tiny_only && g_site_rules_enabled) {
        hak_site_rules_init();
    }

    // Phase 22: Tiny Pool initialization now LAZY (per-class on first use)
    // hak_tiny_init() moved to lazy_init_class() in hakmem_tiny_lazy_init.inc.h
    // OLD: hak_tiny_init(); (eager init of all 8 classes → 94.94% page faults)
    // NEW: Lazy init triggered by tiny_alloc_fast() → only used classes initialized

    // Env: optional Tiny flush on exit (memory efficiency evaluation)
    {
        char* tf = getenv("HAKMEM_TINY_FLUSH_ON_EXIT");
        if (tf && atoi(tf) != 0) {
            g_flush_tiny_on_exit = 1;
        }
        char* ud = getenv("HAKMEM_TINY_ULTRA_DEBUG");
        if (ud && atoi(ud) != 0) {
            g_ultra_debug_on_exit = 1;
        }
        // Register exit hook if any of the debug/flush toggles are on
        // or when path debug is requested.
        if (g_flush_tiny_on_exit || g_ultra_debug_on_exit || getenv("HAKMEM_TINY_PATH_DEBUG")) {
            atexit(hak_flush_tiny_exit);
        }
    }

    // NEW Phase ACE: Initialize Adaptive Control Engine
    hkm_ace_controller_init(&g_ace_controller);
    if (g_ace_controller.enabled) {
        hkm_ace_controller_start(&g_ace_controller);
        HAKMEM_LOG("ACE Learning Layer enabled and started\n");
    }

    // Phase 20-1: Aggressive TLS SLL + SuperSlab prewarming (ChatGPT strategy)
    // Box SS-HotPrewarm: ENV-controlled per-class prewarm with page fault reduction
#if HAKMEM_TINY_PREWARM_TLS
    #include "box/ss_hot_prewarm_box.h"
    int total_prewarmed = box_ss_hot_prewarm_all();
    HAKMEM_LOG("TLS cache pre-warmed: %d blocks total (Phase 20-1)\n", total_prewarmed);
    // After TLS prewarm, cascade some hot blocks into SFC to raise early hit rate
    {
        extern int g_sfc_enabled;
        if (g_sfc_enabled) {
            extern void sfc_cascade_from_tls_initial(void);
            sfc_cascade_from_tls_initial();
        }
    }
#endif

    atomic_store_explicit(&g_initializing, 0, memory_order_release);
    // Publish that initialization is complete
    atomic_thread_fence(memory_order_seq_cst);
    g_initialized = 1;
}

void hak_shutdown(void) {
    if (!g_initialized) return;

    // NEW Phase ACE: Shutdown Adaptive Control Engine FIRST (before other subsystems)
    hkm_ace_controller_destroy(&g_ace_controller);

    if (!g_bench_tiny_only) {
        HAKMEM_LOG("Shutting down...\n");
        hak_print_stats();
    }

    // NEW Phase 6.9: Shutdown L2 Pool
    if (!g_bench_tiny_only) hak_pool_shutdown();

    // NEW Phase 6.13: Shutdown L2.5 LargePool
    if (!g_bench_tiny_only) hak_l25_pool_shutdown();

    // NEW: Shutdown BigCache Box
    if (!g_bench_tiny_only) hak_bigcache_shutdown();

    // NEW Phase 6.2: Shutdown ELO Strategy Selection
    if (!g_bench_tiny_only) hak_elo_shutdown();

    // NEW Phase 6.3: Shutdown madvise Batching
    if (!g_bench_tiny_only) hak_batch_shutdown();

    // NEW Phase 6.10: Shutdown Site Rules
    if (!g_bench_tiny_only) hak_site_rules_shutdown();

    // NEW Phase 6.12: Print Tiny Pool statistics
    if (!g_bench_tiny_only) hak_tiny_print_stats();

    // NEW Phase 6.11.1: Print whale cache statistics
    if (!g_bench_tiny_only) {
        hkm_whale_dump_stats();
        // NEW Phase 6.11.1: Shutdown whale cache
        hkm_whale_shutdown();
    }

    // NEW Phase 6.11.1: Shutdown debug timing (must be last!)
    if (!g_bench_tiny_only) hkm_timing_shutdown();

    // Phase 6.16: Dump sampling profiler
    if (!g_bench_tiny_only) hkm_prof_shutdown();

    // Stop learner thread
    if (!g_bench_tiny_only) hkm_learner_shutdown();

    // Stop Tiny background components (e.g., Intelligence Engine)
    hak_tiny_shutdown();

    g_initialized = 0;
}


#endif // HAK_CORE_INIT_INC_H
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								// hak_core_init.inc.h — Box: init/shutdown
 								#ifndef HAK_CORE_INIT_INC_H
 								#define HAK_CORE_INIT_INC_H
-												Phase 6-2.4: Fix SuperSlab free SEGV: remove guess loop and add memory readability check; add registry atomic consistency (base as _Atomic uintptr_t with acq/rel); add debug toggles (SUPER_REG_DEBUG/REQTRACE); update CURRENT_TASK with results and next steps; capture suite results.

											
										
										
											2025-11-07 18:07:48 +09:00
+								#include <signal.h>
 								#ifdef __GLIBC__
 								#include <execinfo.h>
 								#endif
-												Phase 7 Task 3: Pre-warm TLS cache (+180-280% improvement!)

MAJOR SUCCESS: HAKMEM now achieves 85-92% of System malloc on tiny
allocations (128-512B) and BEATS System at 146% on 1024B allocations!

Performance Results:
- Random Mixed 128B: 21M → 59M ops/s (+181%) 🚀
- Random Mixed 256B: 19M → 70M ops/s (+268%) 🚀
- Random Mixed 512B: 21M → 68M ops/s (+224%) 🚀
- Random Mixed 1024B: 21M → 65M ops/s (+210%, 146% of System!) 🏆
- Larson 1T: 2.68M ops/s (stable, no regression)

Implementation:
1. Task 3a: Remove profiling overhead in release builds
   - Wrapped RDTSC calls in #if !HAKMEM_BUILD_RELEASE
   - Compiler can eliminate profiling code completely
   - Effect: +2% (2.68M → 2.73M Larson)

2. Task 3b: Simplify refill logic
   - Use constants from hakmem_build_flags.h
   - TLS cache already optimal
   - Effect: No regression

3. Task 3c: Pre-warm TLS cache (GAME CHANGER!)
   - Pre-allocate 16 blocks per class at init
   - Eliminates cold-start penalty
   - Effect: +180-280% improvement 🚀

Root Cause:
The bottleneck was cold-start, not the hot path! First allocation in
each class triggered a SuperSlab refill (100+ cycles). Pre-warming
eliminated this penalty, revealing Phase 7's true potential.

Files Modified:
- core/hakmem_tiny.c: Pre-warm function implementation
- core/box/hak_core_init.inc.h: Pre-warm initialization call
- core/tiny_alloc_fast.inc.h: Profiling overhead removal
- core/hakmem_phase7_config.h: Task 3 constants (NEW)
- core/hakmem_build_flags.h: Phase 7 feature flags
- Makefile: PREWARM_TLS flag, phase7 targets
- CLAUDE.md: Phase 7 success summary
- PHASE7_TASK3_RESULTS.md: Comprehensive results report (NEW)

Build:
make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 phase7-bench

🎉 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-08 12:54:52 +09:00
+								#include "hakmem_phase7_config.h"  // Phase 7 Task 3
-												Phase 6-2.4: Fix SuperSlab free SEGV: remove guess loop and add memory readability check; add registry atomic consistency (base as _Atomic uintptr_t with acq/rel); add debug toggles (SUPER_REG_DEBUG/REQTRACE); update CURRENT_TASK with results and next steps; capture suite results.

											
										
										
											2025-11-07 18:07:48 +09:00
 								// Debug-only SIGSEGV handler (gated by HAKMEM_DEBUG_SEGV)
 								static void hakmem_sigsegv_handler(int sig) {
 								    (void)sig;
-												Fix include order in hakmem.c - move hak_kpi_util.inc.h before hak_core_init.inc.h

Problem: hak_core_init.inc.h references KPI measurement variables
(g_latency_histogram, g_latency_samples, g_baseline_soft_pf, etc.)
but hakmem.c was including hak_kpi_util.inc.h AFTER hak_core_init.inc.h,
causing undefined reference errors.

Solution: Reorder includes so hak_kpi_util.inc.h (definition) comes
before hak_core_init.inc.h (usage).

Build result: ✅ Success (libhakmem.so 547KB, 0 errors)

Minor changes:
- Added extern __thread declarations for TLS SLL debug variables
- Added signal handler logging for debug_dump_last_push
- Improved hakmem_tiny.c structure for Phase 2 preparation

🤖 Generated with Claude Code + Task Agent

Co-Authored-By: Gemini <gemini@example.com>
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-03 13:28:44 +09:00
+								    const char* msg = "\n[HAKMEM] Segmentation Fault\n";
 								    (void)write(2, msg, 29);
 								#if !HAKMEM_BUILD_RELEASE
 								    // Dump Class 1 (16B) last push info for debugging
 								    tiny_debug_dump_last_push(1);
-												Phase 6-2.4: Fix SuperSlab free SEGV: remove guess loop and add memory readability check; add registry atomic consistency (base as _Atomic uintptr_t with acq/rel); add debug toggles (SUPER_REG_DEBUG/REQTRACE); update CURRENT_TASK with results and next steps; capture suite results.

											
										
										
											2025-11-07 18:07:48 +09:00
+								#endif
-												Fix include order in hakmem.c - move hak_kpi_util.inc.h before hak_core_init.inc.h

Problem: hak_core_init.inc.h references KPI measurement variables
(g_latency_histogram, g_latency_samples, g_baseline_soft_pf, etc.)
but hakmem.c was including hak_kpi_util.inc.h AFTER hak_core_init.inc.h,
causing undefined reference errors.

Solution: Reorder includes so hak_kpi_util.inc.h (definition) comes
before hak_core_init.inc.h (usage).

Build result: ✅ Success (libhakmem.so 547KB, 0 errors)

Minor changes:
- Added extern __thread declarations for TLS SLL debug variables
- Added signal handler logging for debug_dump_last_push
- Improved hakmem_tiny.c structure for Phase 2 preparation

🤖 Generated with Claude Code + Task Agent

Co-Authored-By: Gemini <gemini@example.com>
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-03 13:28:44 +09:00
 								    // Restore default handler and re-raise
 								    signal(sig, SIG_DFL);
 								    raise(sig);
-												Phase 6-2.4: Fix SuperSlab free SEGV: remove guess loop and add memory readability check; add registry atomic consistency (base as _Atomic uintptr_t with acq/rel); add debug toggles (SUPER_REG_DEBUG/REQTRACE); update CURRENT_TASK with results and next steps; capture suite results.

											
										
										
											2025-11-07 18:07:48 +09:00
+								}
-												Phase 7 Task 3: Pre-warm TLS cache (+180-280% improvement!)

MAJOR SUCCESS: HAKMEM now achieves 85-92% of System malloc on tiny
allocations (128-512B) and BEATS System at 146% on 1024B allocations!

Performance Results:
- Random Mixed 128B: 21M → 59M ops/s (+181%) 🚀
- Random Mixed 256B: 19M → 70M ops/s (+268%) 🚀
- Random Mixed 512B: 21M → 68M ops/s (+224%) 🚀
- Random Mixed 1024B: 21M → 65M ops/s (+210%, 146% of System!) 🏆
- Larson 1T: 2.68M ops/s (stable, no regression)

Implementation:
1. Task 3a: Remove profiling overhead in release builds
   - Wrapped RDTSC calls in #if !HAKMEM_BUILD_RELEASE
   - Compiler can eliminate profiling code completely
   - Effect: +2% (2.68M → 2.73M Larson)

2. Task 3b: Simplify refill logic
   - Use constants from hakmem_build_flags.h
   - TLS cache already optimal
   - Effect: No regression

3. Task 3c: Pre-warm TLS cache (GAME CHANGER!)
   - Pre-allocate 16 blocks per class at init
   - Eliminates cold-start penalty
   - Effect: +180-280% improvement 🚀

Root Cause:
The bottleneck was cold-start, not the hot path! First allocation in
each class triggered a SuperSlab refill (100+ cycles). Pre-warming
eliminated this penalty, revealing Phase 7's true potential.

Files Modified:
- core/hakmem_tiny.c: Pre-warm function implementation
- core/box/hak_core_init.inc.h: Pre-warm initialization call
- core/tiny_alloc_fast.inc.h: Profiling overhead removal
- core/hakmem_phase7_config.h: Task 3 constants (NEW)
- core/hakmem_build_flags.h: Phase 7 feature flags
- Makefile: PREWARM_TLS flag, phase7 targets
- CLAUDE.md: Phase 7 success summary
- PHASE7_TASK3_RESULTS.md: Comprehensive results report (NEW)

Build:
make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 phase7-bench

🎉 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-08 12:54:52 +09:00
+								// Phase 7 Task 3: Pre-warm TLS cache helper
 								// Pre-allocate blocks to reduce first-allocation miss penalty
 								// Note: This function is defined later in hakmem.c after sll_refill_small_from_ss is available
 								// (moved out of header to avoid linkage issues)
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								static void hak_init_impl(void);
 								static pthread_once_t g_init_once = PTHREAD_ONCE_INIT;
 								void hak_init(void) {
 								    (void)pthread_once(&g_init_once, hak_init_impl);
 								}
 								static void hak_init_impl(void) {
-												WIP: Add TLS SLL validation and SuperSlab registry fallback

ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue.
Current status: Partial mitigation, but root cause remains.

Changes Applied:
1. SuperSlab Registry Fallback (hakmem_super_registry.h)
   - Added legacy table probe when hash map lookup misses
   - Prevents NULL returns for valid SuperSlabs during initialization
   - Status: ✅ Works but may hide underlying registration issues

2. TLS SLL Push Validation (tls_sll_box.h)
   - Reject push if SuperSlab lookup returns NULL
   - Reject push if class_idx mismatch detected
   - Added [TLS_SLL_PUSH_NO_SS] diagnostic message
   - Status: ✅ Prevents list corruption (defensive)

3. SuperSlab Allocation Class Fix (superslab_allocate.c)
   - Pass actual class_idx to sp_internal_allocate_superslab
   - Prevents dummy class=8 causing OOB access
   - Status: ✅ Root cause fix for allocation path

4. Debug Output Additions
   - First 256 push/pop operations traced
   - First 4 mismatches logged with details
   - SuperSlab registration state logged
   - Status: ✅ Diagnostic tool (not a fix)

5. TLS Hint Box Removed
   - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization)
   - Simplified to focus on stability first
   - Status: ⏳ Can be re-added after root cause fixed

Current Problem (REMAINS UNSOLVED):
- [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench
- Pointer is 16 bytes offset from expected (class 1 → class 2 boundary)
- hak_super_lookup returns NULL for that pointer
- Suggests: Use-After-Free, Double-Free, or pointer arithmetic error

Root Cause Analysis:
- Pattern: Pointer offset by +16 (one class 1 stride)
- Timing: Cumulative problem (appears after 60s, not immediately)
- Location: Header corruption detected during TLS SLL pop

Remaining Issues:
⚠️ Registry fallback is defensive (may hide registration bugs)
⚠️ Push validation prevents symptoms but not root cause
⚠️ 16-byte pointer offset source unidentified

Next Steps for Investigation:
1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths)
2. Enhanced logging at HDR_RESET point:
   - Expected vs actual pointer value
   - Pointer provenance (where it came from)
   - Allocation trace for that block
3. Verify Headerless flag is OFF throughout build
4. Check for double-offset application in conversions

Technical Assessment:
- 60% root cause fixes (allocation class, validation)
- 40% defensive mitigation (registry fallback, push rejection)

Performance Impact:
- Registry fallback: +10-30 cycles on cold path (negligible)
- Push validation: +5-10 cycles per push (acceptable)
- Overall: < 2% performance impact estimated

Related Issues:
- Phase 1 TLS Hint Box removed temporarily
- Phase 2 Headerless blocked until stability achieved

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-03 20:42:28 +09:00
+								    HAK_TRACE("[init_impl_enter]\n");
-												feat(Phase 1-2): Add atomic initialization wait mechanism (safety improvement)

Implements thread-safe atomic initialization tracking and a wait helper for
non-init threads to avoid libc fallback during the initialization window.

Changes:
- Convert g_initializing to _Atomic type for thread-safe access
- Add g_init_thread to identify which thread performs initialization
- Implement hak_init_wait_for_ready() helper with spin/yield mechanism
- Update hak_core_init.inc.h to use atomic operations
- Update hak_wrappers.inc.h to call wait helper instead of checking g_initializing

Results & Analysis:
- Performance: ±0% (21s → 21s, no measurable improvement)
- Safety: ✓ Prevents recursion in init window
- Investigation: Initialization overhead is <1% of total allocations
  - Expected: 2-8% improvement
  - Actual: 0% improvement (spin/yield overhead ≈ savings)
  - libc overhead: 41% → 57% (relative increase, likely sampling variation)

Key Findings from Perf Analysis:
- getenv: 0% (maintained from Phase 1-1) ✓
- libc malloc/free: ~24.54% of cycles
- libc fragmentation (malloc_consolidate/unlink_chunk): ~16% of cycles
- Total libc overhead: ~41% (difficult to optimize without changing algorithm)

Next Phase Target:
- Phase 2: Investigate libc fragmentation (malloc_consolidate 9.33%, unlink_chunk 6.90%)
- Potential approaches: hakmem Mid/ACE allocator expansion, sh8bench pattern analysis

Recommendation: Keep Phase 1-2 for safety (no performance regression), proceed to Phase 2.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-02 16:44:27 +09:00
+								    g_init_thread = pthread_self();
 								    atomic_store_explicit(&g_initializing, 1, memory_order_release);
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
 								    // Phase 6.X P0 FIX (2025-10-24): Initialize Box 3 (Syscall Layer) FIRST!
 								    // This MUST be called before ANY allocation (Tiny/Mid/Large/Learner)
 								    // dlsym() initializes function pointers to real libc (bypasses LD_PRELOAD)
 								    hkm_syscall_init();
-												WIP: Add TLS SLL validation and SuperSlab registry fallback

ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue.
Current status: Partial mitigation, but root cause remains.

Changes Applied:
1. SuperSlab Registry Fallback (hakmem_super_registry.h)
   - Added legacy table probe when hash map lookup misses
   - Prevents NULL returns for valid SuperSlabs during initialization
   - Status: ✅ Works but may hide underlying registration issues

2. TLS SLL Push Validation (tls_sll_box.h)
   - Reject push if SuperSlab lookup returns NULL
   - Reject push if class_idx mismatch detected
   - Added [TLS_SLL_PUSH_NO_SS] diagnostic message
   - Status: ✅ Prevents list corruption (defensive)

3. SuperSlab Allocation Class Fix (superslab_allocate.c)
   - Pass actual class_idx to sp_internal_allocate_superslab
   - Prevents dummy class=8 causing OOB access
   - Status: ✅ Root cause fix for allocation path

4. Debug Output Additions
   - First 256 push/pop operations traced
   - First 4 mismatches logged with details
   - SuperSlab registration state logged
   - Status: ✅ Diagnostic tool (not a fix)

5. TLS Hint Box Removed
   - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization)
   - Simplified to focus on stability first
   - Status: ⏳ Can be re-added after root cause fixed

Current Problem (REMAINS UNSOLVED):
- [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench
- Pointer is 16 bytes offset from expected (class 1 → class 2 boundary)
- hak_super_lookup returns NULL for that pointer
- Suggests: Use-After-Free, Double-Free, or pointer arithmetic error

Root Cause Analysis:
- Pattern: Pointer offset by +16 (one class 1 stride)
- Timing: Cumulative problem (appears after 60s, not immediately)
- Location: Header corruption detected during TLS SLL pop

Remaining Issues:
⚠️ Registry fallback is defensive (may hide registration bugs)
⚠️ Push validation prevents symptoms but not root cause
⚠️ 16-byte pointer offset source unidentified

Next Steps for Investigation:
1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths)
2. Enhanced logging at HDR_RESET point:
   - Expected vs actual pointer value
   - Pointer provenance (where it came from)
   - Allocation trace for that block
3. Verify Headerless flag is OFF throughout build
4. Check for double-offset application in conversions

Technical Assessment:
- 60% root cause fixes (allocation class, validation)
- 40% defensive mitigation (registry fallback, push rejection)

Performance Impact:
- Registry fallback: +10-30 cycles on cold path (negligible)
- Push validation: +5-10 cycles per push (acceptable)
- Overall: < 2% performance impact estimated

Related Issues:
- Phase 1 TLS Hint Box removed temporarily
- Phase 2 Headerless blocked until stability achieved

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-03 20:42:28 +09:00
+								    HAK_TRACE("[init_impl_after_syscall_init]\n");
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
-												feat: Phase 7 + Phase 2 - Massive performance & stability improvements

Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓

Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
  Result: +180-280% improvement, 85-146% of System malloc

Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)

Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
  Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
  Result: 50% → 95% stability (19/20 4T success)

Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
  Files: core/tiny_adaptive_sizing.c/h (new)

Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
  Files: core/hakmem_bigcache.c/h
  Expected: +10-20% cache hit rate

Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)

Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis

Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files

Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-08 17:08:00 +09:00
+								    // CRITICAL FIX (BUG #10): Pre-detect jemalloc ONCE during init, not on hot path!
 								    // This prevents infinite recursion: malloc → hak_jemalloc_loaded → dlopen → malloc → ...
 								    // We protect dlopen's internal malloc calls with g_hakmem_lock_depth
 								    extern int g_jemalloc_loaded;  // Declared in hakmem.c
 								    if (g_jemalloc_loaded < 0) {
 								        void* h = dlopen("libjemalloc.so.2", RTLD_NOLOAD | RTLD_NOW);
 								        if (!h) h = dlopen("libjemalloc.so.1", RTLD_NOLOAD | RTLD_NOW);
 								        g_jemalloc_loaded = (h != NULL) ? 1 : 0;
 								        if (h) dlclose(h);
 								        if (g_jemalloc_loaded) {
 								            HAKMEM_LOG("Detected jemalloc: will avoid interposing\n");
 								        }
 								    }
-												WIP: Add TLS SLL validation and SuperSlab registry fallback

ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue.
Current status: Partial mitigation, but root cause remains.

Changes Applied:
1. SuperSlab Registry Fallback (hakmem_super_registry.h)
   - Added legacy table probe when hash map lookup misses
   - Prevents NULL returns for valid SuperSlabs during initialization
   - Status: ✅ Works but may hide underlying registration issues

2. TLS SLL Push Validation (tls_sll_box.h)
   - Reject push if SuperSlab lookup returns NULL
   - Reject push if class_idx mismatch detected
   - Added [TLS_SLL_PUSH_NO_SS] diagnostic message
   - Status: ✅ Prevents list corruption (defensive)

3. SuperSlab Allocation Class Fix (superslab_allocate.c)
   - Pass actual class_idx to sp_internal_allocate_superslab
   - Prevents dummy class=8 causing OOB access
   - Status: ✅ Root cause fix for allocation path

4. Debug Output Additions
   - First 256 push/pop operations traced
   - First 4 mismatches logged with details
   - SuperSlab registration state logged
   - Status: ✅ Diagnostic tool (not a fix)

5. TLS Hint Box Removed
   - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization)
   - Simplified to focus on stability first
   - Status: ⏳ Can be re-added after root cause fixed

Current Problem (REMAINS UNSOLVED):
- [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench
- Pointer is 16 bytes offset from expected (class 1 → class 2 boundary)
- hak_super_lookup returns NULL for that pointer
- Suggests: Use-After-Free, Double-Free, or pointer arithmetic error

Root Cause Analysis:
- Pattern: Pointer offset by +16 (one class 1 stride)
- Timing: Cumulative problem (appears after 60s, not immediately)
- Location: Header corruption detected during TLS SLL pop

Remaining Issues:
⚠️ Registry fallback is defensive (may hide registration bugs)
⚠️ Push validation prevents symptoms but not root cause
⚠️ 16-byte pointer offset source unidentified

Next Steps for Investigation:
1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths)
2. Enhanced logging at HDR_RESET point:
   - Expected vs actual pointer value
   - Pointer provenance (where it came from)
   - Allocation trace for that block
3. Verify Headerless flag is OFF throughout build
4. Check for double-offset application in conversions

Technical Assessment:
- 60% root cause fixes (allocation class, validation)
- 40% defensive mitigation (registry fallback, push rejection)

Performance Impact:
- Registry fallback: +10-30 cycles on cold path (negligible)
- Push validation: +5-10 cycles per push (acceptable)
- Overall: < 2% performance impact estimated

Related Issues:
- Phase 1 TLS Hint Box removed temporarily
- Phase 2 Headerless blocked until stability achieved

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-03 20:42:28 +09:00
+								    HAK_TRACE("[init_impl_after_jemalloc_probe]\n");
-												feat: Phase 7 + Phase 2 - Massive performance & stability improvements

Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓

Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
  Result: +180-280% improvement, 85-146% of System malloc

Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)

Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
  Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
  Result: 50% → 95% stability (19/20 4T success)

Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
  Files: core/tiny_adaptive_sizing.c/h (new)

Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
  Files: core/hakmem_bigcache.c/h
  Expected: +10-20% cache hit rate

Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)

Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis

Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files

Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-08 17:08:00 +09:00
-												Phase 6-2.4: Fix SuperSlab free SEGV: remove guess loop and add memory readability check; add registry atomic consistency (base as _Atomic uintptr_t with acq/rel); add debug toggles (SUPER_REG_DEBUG/REQTRACE); update CURRENT_TASK with results and next steps; capture suite results.

											
										
										
											2025-11-07 18:07:48 +09:00
+								    // Optional: one-shot SIGSEGV backtrace for early crash diagnosis
 								    do {
 								        const char* dbg = getenv("HAKMEM_DEBUG_SEGV");
 								        if (dbg && atoi(dbg) != 0) {
 								            struct sigaction sa; memset(&sa, 0, sizeof(sa));
 								            sa.sa_flags = SA_RESETHAND;
 								            sa.sa_handler = hakmem_sigsegv_handler;
 								            sigaction(SIGSEGV, &sa, NULL);
 								        }
 								    } while (0);
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								    // NEW Phase 6.11.1: Initialize debug timing
 								    hkm_timing_init();
 								    // NEW Phase 6.11.1: Initialize whale fast-path cache
 								    hkm_whale_init();
 								    // NEW Phase Hybrid: Initialize Mid Range MT allocator (8-32KB, mimalloc-style)
 								    // NEW Phase 6.8: Initialize configuration system (replaces init_free_policy + init_thp_policy)
 								    hak_config_init();
 								    // Phase 6.16: Initialize FrozenPolicy (SACS-3)
 								    hkm_policy_init();
 								    // Phase 6.15 P0.3: Configure EVO sampling from environment variable
 								    // HAKMEM_EVO_SAMPLE: 0=disabled (default), N=sample every 2^N calls
 								    // Example: HAKMEM_EVO_SAMPLE=10 → sample every 1024 calls
 								    //          HAKMEM_EVO_SAMPLE=16 → sample every 65536 calls
 								    char* evo_sample_str = getenv("HAKMEM_EVO_SAMPLE");
 								    if (evo_sample_str && atoi(evo_sample_str) > 0) {
 								        int freq = atoi(evo_sample_str);
 								        if (freq >= 64) {
-												release: silence runtime logs and stabilize benches

- Fix HAKMEM_LOG gating to use  (numeric) so release builds compile out logs.
- Switch remaining prints to HAKMEM_LOG or guard with :
  - core/box/hak_core_init.inc.h (EVO sample warning, shutdown banner)
  - core/hakmem_config.c (config/feature prints)
  - core/hakmem.c (BigCache eviction prints)
  - core/hakmem_tiny_superslab.c (OOM, head init/expand, C7 init diagnostics)
  - core/hakmem_elo.c (init/evolution)
  - core/hakmem_batch.c (init/flush/stats)
  - core/hakmem_ace.c (33KB route diagnostics)
  - core/hakmem_ace_controller.c (ACE logs macro → no-op in release)
  - core/hakmem_site_rules.c (init banner)
  - core/box/hak_free_api.inc.h (unknown method error → release-gated)
- Rebuilt benches and verified quiet output for release:
  - bench_fixed_size_hakmem/system
  - bench_random_mixed_hakmem/system
  - bench_mid_large_mt_hakmem/system
  - bench_comprehensive_hakmem/system

Note: Kept debug logs available in debug builds and when explicitly toggled via env.

											
										
										
											2025-11-11 01:47:06 +09:00
+								            HAKMEM_LOG("Warning: HAKMEM_EVO_SAMPLE=%d too large, using 63\n", freq);
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								            freq = 63;
 								        }
 								        g_evo_sample_mask = (1ULL << freq) - 1;
 								        HAKMEM_LOG("EVO sampling enabled: every 2^%d = %llu calls\n",
 								                   freq, (unsigned long long)(g_evo_sample_mask + 1));
 								    } else {
 								        g_evo_sample_mask = 0;  // Disabled by default
 								        HAKMEM_LOG("EVO sampling disabled (HAKMEM_EVO_SAMPLE not set or 0)\n");
 								    }
 								#ifdef __linux__
 								    // Record baseline KPIs
 								    memset(g_latency_histogram, 0, sizeof(g_latency_histogram));
 								    g_latency_samples = 0;
 								    get_page_faults(&g_baseline_soft_pf, &g_baseline_hard_pf);
 								    g_baseline_rss_kb = get_rss_kb();
 								    HAKMEM_LOG("Baseline: soft_pf=%lu, hard_pf=%lu, rss=%lu KB\n",
 								           (unsigned long)g_baseline_soft_pf,
 								           (unsigned long)g_baseline_hard_pf,
 								           (unsigned long)g_baseline_rss_kb);
 								#endif
 								    HAKMEM_LOG("Initialized (PoC version)\n");
 								    HAKMEM_LOG("Sampling rate: 1/%d\n", SAMPLING_RATE);
 								    HAKMEM_LOG("Max sites: %d\n", MAX_SITES);
-												Tiny: fix header/stride mismatch and harden refill paths

- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte
  header during allocation, but linear carve/refill and initial slab capacity
  still used bare class block sizes. This mismatch could overrun slab usable
  space and corrupt freelists, causing reproducible SEGV at ~100k iters.

Changes
- Superslab: compute capacity with effective stride (block_size + header for
  classes 0..6; class7 remains headerless) in superslab_init_slab(). Add a
  debug-only bound check in superslab_alloc_from_slab() to fail fast if carve
  would exceed usable bytes.
- Refill (non-P0 and P0): use header-aware stride for all linear carving and
  TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h
  also uses stride, not raw class size.
- Drain: keep existing defense-in-depth for remote sentinel and sanitize nodes
  before splicing into freelist (already present).

Notes
- This unifies the memory layout across alloc/linear-carve/refill with a single
  stride definition and keeps class7 (1024B) headerless as designed.
- Debug builds add fail-fast checks; release builds remain lean.

Next
- Re-run Tiny benches (256/1024B) in debug to confirm stability, then in
  release. If any remaining crash persists, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0
  to isolate P0 batch carve, and continue reducing branch-miss as planned.

											
										
										
											2025-11-09 18:55:50 +09:00
+								    // Build banner (one-shot)
 								    do {
 								        const char* bf = "UNKNOWN";
 								#ifdef HAKMEM_BUILD_RELEASE
 								        bf = "RELEASE";
 								#elif defined(HAKMEM_BUILD_DEBUG)
 								        bf = "DEBUG";
 								#endif
 								        HAKMEM_LOG("[Build] Flavor=%s  Flags: HEADER_CLASSIDX=%d, AGGRESSIVE_INLINE=%d, POOL_TLS_PHASE1=%d, POOL_TLS_PREWARM=%d\n",
 								                   bf,
-												Implement Phase 2: Headerless Allocator Support (Partial)

- Feature: Added HAKMEM_TINY_HEADERLESS toggle (A/B testing)
- Feature: Implemented Headerless layout logic (Offset=0)
- Refactor: Centralized layout definitions in tiny_layout_box.h
- Refactor: Abstracted pointer arithmetic in free path via ptr_conversion_box.h
- Verification: sh8bench passes in Headerless mode (No TLS_SLL_HDR_RESET)
- Known Issue: Regression in Phase 1 mode due to blind pointer conversion logic

											
										
										
											2025-12-03 12:11:27 +09:00
+								#if HAKMEM_TINY_HEADER_CLASSIDX
-												Tiny: fix header/stride mismatch and harden refill paths

- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte
  header during allocation, but linear carve/refill and initial slab capacity
  still used bare class block sizes. This mismatch could overrun slab usable
  space and corrupt freelists, causing reproducible SEGV at ~100k iters.

Changes
- Superslab: compute capacity with effective stride (block_size + header for
  classes 0..6; class7 remains headerless) in superslab_init_slab(). Add a
  debug-only bound check in superslab_alloc_from_slab() to fail fast if carve
  would exceed usable bytes.
- Refill (non-P0 and P0): use header-aware stride for all linear carving and
  TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h
  also uses stride, not raw class size.
- Drain: keep existing defense-in-depth for remote sentinel and sanitize nodes
  before splicing into freelist (already present).

Notes
- This unifies the memory layout across alloc/linear-carve/refill with a single
  stride definition and keeps class7 (1024B) headerless as designed.
- Debug builds add fail-fast checks; release builds remain lean.

Next
- Re-run Tiny benches (256/1024B) in debug to confirm stability, then in
  release. If any remaining crash persists, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0
  to isolate P0 batch carve, and continue reducing branch-miss as planned.

											
										
										
											2025-11-09 18:55:50 +09:00
+,
 								#else
 ,
 								#endif
 								#ifdef HAKMEM_TINY_AGGRESSIVE_INLINE
 ,
 								#else
 ,
 								#endif
 								#ifdef HAKMEM_POOL_TLS_PHASE1
 ,
 								#else
 ,
 								#endif
 								#ifdef HAKMEM_POOL_TLS_PREWARM
 
 								#else
 
 								#endif
 								                   );
 								    } while (0);
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								    // Bench preset: Tiny-only (disable non-essential subsystems)
 								    {
 								        char* bt = getenv("HAKMEM_BENCH_TINY_ONLY");
 								        if (bt && atoi(bt) != 0) {
 								            g_bench_tiny_only = 1;
 								        }
 								    }
 								    // Under LD_PRELOAD, enforce safer defaults for Tiny path unless overridden
 								    {
 								        char* ldpre = getenv("LD_PRELOAD");
 								        if (ldpre && strstr(ldpre, "libhakmem.so")) {
 								            g_ldpreload_mode = 1;
 								            // Default LD-safe mode if not set: 1 (Tiny-only)
 								            char* lds = getenv("HAKMEM_LD_SAFE");
 								            if (lds) { /* NOP used in wrappers */ } else { setenv("HAKMEM_LD_SAFE", "1", 0); }
 								            if (!getenv("HAKMEM_TINY_TLS_SLL")) {
 								                setenv("HAKMEM_TINY_TLS_SLL", "0", 0);  // disable TLS SLL by default
 								            }
 								            if (!getenv("HAKMEM_TINY_USE_SUPERSLAB")) {
 								                setenv("HAKMEM_TINY_USE_SUPERSLAB", "0", 0);  // disable SuperSlab path by default
 								            }
 								        }
 								    }
 								    // Runtime safety toggle
 								    char* safe_free_env = getenv("HAKMEM_SAFE_FREE");
 								    if (safe_free_env && atoi(safe_free_env) != 0) {
 								        g_strict_free = 1;
 								        HAKMEM_LOG("Strict free safety enabled (HAKMEM_SAFE_FREE=1)\n");
 								    } else {
 								        // Heuristic: if loaded via LD_PRELOAD, enable strict free by default
 								        char* ldpre = getenv("LD_PRELOAD");
 								        if (ldpre && strstr(ldpre, "libhakmem.so")) {
 								            g_ldpreload_mode = 1;
 								            g_strict_free = 1;
 								            HAKMEM_LOG("Strict free safety auto-enabled under LD_PRELOAD\n");
 								        }
 								    }
 								    // Invalid free logging toggle (default off to avoid spam under LD_PRELOAD)
 								    char* invlog = getenv("HAKMEM_INVALID_FREE_LOG");
 								    if (invlog && atoi(invlog) != 0) {
 								        g_invalid_free_log = 1;
 								        HAKMEM_LOG("Invalid free logging enabled (HAKMEM_INVALID_FREE_LOG=1)\n");
 								    }
 								    // Phase 7.4: Cache HAKMEM_INVALID_FREE to eliminate 44% CPU overhead
 								    // Perf showed getenv() on hot path consumed 43.96% CPU time (26.41% strcmp + 17.55% getenv)
 								    char* inv = getenv("HAKMEM_INVALID_FREE");
-												feat: Add ACE allocation failure tracing and debug hooks

This commit introduces a comprehensive tracing mechanism for allocation failures within the Adaptive Cache Engine (ACE) component. This feature allows for precise identification of the root cause for Out-Of-Memory (OOM) issues related to ACE allocations.

Key changes include:
- **ACE Tracing Implementation**:
  - Added  environment variable to enable/disable detailed logging of allocation failures.
  - Instrumented , , and  to distinguish between "Threshold" (size class mismatch), "Exhaustion" (pool depletion), and "MapFail" (OS memory allocation failure).
- **Build System Fixes**:
  - Corrected  to ensure  is properly linked into , resolving an  error.
- **LD_PRELOAD Wrapper Adjustments**:
  - Investigated and understood the  wrapper's behavior under , particularly its interaction with  and  checks.
  - Enabled debugging flags for  environment to prevent unintended fallbacks to 's  for non-tiny allocations, allowing comprehensive testing of the  allocator.
- **Debugging & Verification**:
  - Introduced temporary verbose logging to pinpoint execution flow issues within  interception and  routing. These temporary logs have been removed.
  - Created  to facilitate testing of the tracing features.

This feature will significantly aid in diagnosing and resolving allocation-related OOM issues in  by providing clear insights into the failure pathways.

											
										
										
											2025-12-01 16:37:59 +09:00
+								    if (inv && strcmp(inv, "skip") == 0) {
 								        g_invalid_free_mode = 1;  // explicit opt-in to legacy skip mode
 								        HAKMEM_LOG("Invalid free mode: skip check (HAKMEM_INVALID_FREE=skip)\n");
 								    } else if (inv && strcmp(inv, "fallback") == 0) {
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								        g_invalid_free_mode = 0;  // fallback mode: route invalid frees to libc
 								        HAKMEM_LOG("Invalid free mode: fallback to libc (HAKMEM_INVALID_FREE=fallback)\n");
 								    } else {
 								        // Under LD_PRELOAD, prefer safety: default to fallback unless explicitly overridden
 								        char* ldpre = getenv("LD_PRELOAD");
 								        if (ldpre && strstr(ldpre, "libhakmem.so")) {
 								            g_ldpreload_mode = 1;
 								            g_invalid_free_mode = 0;
 								            HAKMEM_LOG("Invalid free mode: fallback to libc (auto under LD_PRELOAD)\n");
 								        } else {
-												feat: Add ACE allocation failure tracing and debug hooks

This commit introduces a comprehensive tracing mechanism for allocation failures within the Adaptive Cache Engine (ACE) component. This feature allows for precise identification of the root cause for Out-Of-Memory (OOM) issues related to ACE allocations.

Key changes include:
- **ACE Tracing Implementation**:
  - Added  environment variable to enable/disable detailed logging of allocation failures.
  - Instrumented , , and  to distinguish between "Threshold" (size class mismatch), "Exhaustion" (pool depletion), and "MapFail" (OS memory allocation failure).
- **Build System Fixes**:
  - Corrected  to ensure  is properly linked into , resolving an  error.
- **LD_PRELOAD Wrapper Adjustments**:
  - Investigated and understood the  wrapper's behavior under , particularly its interaction with  and  checks.
  - Enabled debugging flags for  environment to prevent unintended fallbacks to 's  for non-tiny allocations, allowing comprehensive testing of the  allocator.
- **Debugging & Verification**:
  - Introduced temporary verbose logging to pinpoint execution flow issues within  interception and  routing. These temporary logs have been removed.
  - Created  to facilitate testing of the tracing features.

This feature will significantly aid in diagnosing and resolving allocation-related OOM issues in  by providing clear insights into the failure pathways.

											
										
										
											2025-12-01 16:37:59 +09:00
+								            // Default: safety first (fallback), avoids routing unknown pointers into Tiny
 								            g_invalid_free_mode = 0;
 								            HAKMEM_LOG("Invalid free mode: fallback to libc (default)\n");
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								        }
 								    }
 								    // NEW Phase 6.8: Feature-gated initialization (check g_hakem_config flags)
 								    if (HAK_ENABLED_ALLOC(HAKMEM_FEATURE_POOL)) {
 								        hak_pool_init();
 								    }
 								    // NEW Phase 6.13: L2.5 LargePool (64KB-1MB allocations)
 								    hak_l25_pool_init();
 								    if (!g_bench_tiny_only && HAK_ENABLED_CACHE(HAKMEM_FEATURE_BIGCACHE)) {
 								        hak_bigcache_init();
 								        hak_bigcache_set_free_callback(bigcache_free_callback);
 								    }
 								    if (!g_bench_tiny_only && HAK_ENABLED_LEARNING(HAKMEM_FEATURE_ELO)) {
 								        hak_elo_init();
 								        // Phase 6.11.4 P0-2: Initialize cached strategy to default (strategy 0)
 								        atomic_store(&g_cached_strategy_id, 0);
 								    }
 								    if (!g_bench_tiny_only && HAK_ENABLED_MEMORY(HAKMEM_FEATURE_BATCH_MADVISE)) {
 								        hak_batch_init();
 								    }
 								    if (!g_bench_tiny_only && HAK_ENABLED_LEARNING(HAKMEM_FEATURE_EVOLUTION)) {
 								        hak_evo_init();
 								    }
 								    if (!g_bench_tiny_only) {
 								        // Phase 6.16: Initialize ACE stats (sampling) – default off
 								        hkm_ace_stats_init();
 								        // Phase 6.16: Initialize sampling profiler – default off
 								        hkm_prof_init();
 								        // Size histogram sampling (optional)
 								        hkm_size_hist_init();
 								    }
 								    if (!g_bench_tiny_only) {
 								        // Start CAP learner (optional, env-gated)
 								        hkm_learner_init();
 								    }
 								    // NEW Phase 6.10: Site Rules (MVP: always ON)
 								    // MT note: default disabled unless HAKMEM_SITE_RULES=1
 								    char* sr_env = getenv("HAKMEM_SITE_RULES");
 								    g_site_rules_enabled = (sr_env && atoi(sr_env) != 0);
 								    if (!g_bench_tiny_only && g_site_rules_enabled) {
 								        hak_site_rules_init();
 								    }
-												Phase 23 Unified Cache + PageFaultTelemetry generalization: Mid/VM page-fault bottleneck identified

Summary:
- Phase 23 Unified Cache: +30% improvement (Random Mixed 256B: 18.18M → 23.68M ops/s)
- PageFaultTelemetry: Extended to generic buckets (C0-C7, MID, L25, SSM)
- Measurement-driven decision: Mid/VM page-faults (80-100K) >> Tiny (6K) → prioritize Mid/VM optimization

Phase 23 Changes:
1. Unified Cache implementation (core/front/tiny_unified_cache.{c,h})
   - Direct SuperSlab carve (TLS SLL bypass)
   - Self-contained pop-or-refill pattern
   - ENV: HAKMEM_TINY_UNIFIED_CACHE=1, HAKMEM_TINY_UNIFIED_C{0-7}=128

2. Fast path pruning (tiny_alloc_fast.inc.h, tiny_free_fast_v2.inc.h)
   - Unified ON → direct cache access (skip all intermediate layers)
   - Alloc: unified_cache_pop_or_refill() → immediate fail to slow
   - Free: unified_cache_push() → fallback to SLL only if full

PageFaultTelemetry Changes:
3. Generic bucket architecture (core/box/pagefault_telemetry_box.{c,h})
   - PF_BUCKET_{C0-C7, MID, L25, SSM} for domain-specific measurement
   - Integration: hak_pool_try_alloc(), l25_alloc_new_run(), shared_pool_allocate_superslab_unlocked()

4. Measurement results (Random Mixed 500K / 256B):
   - Tiny C2-C7: 2-33 pages, high reuse (64-3.8 touches/page)
   - SSM: 512 pages (initialization footprint)
   - MID/L25: 0 (unused in this workload)
   - Mid/Large VM benchmarks: 80-100K page-faults (13-16x higher than Tiny)

Ring Cache Enhancements:
5. Hot Ring Cache (core/front/tiny_ring_cache.{c,h})
   - ENV: HAKMEM_TINY_HOT_RING_ENABLE=1, HAKMEM_TINY_HOT_RING_C{0-7}=size
   - Conditional compilation cleanup

Documentation:
6. Analysis reports
   - RANDOM_MIXED_BOTTLENECK_ANALYSIS.md: Page-fault breakdown
   - RANDOM_MIXED_SUMMARY.md: Phase 23 summary
   - RING_CACHE_ACTIVATION_GUIDE.md: Ring cache usage
   - CURRENT_TASK.md: Updated with Phase 23 results and Phase 24 plan

Next Steps (Phase 24):
- Target: Mid/VM PageArena/HotSpanBox (page-fault reduction 80-100K → 30-40K)
- Tiny SSM optimization deferred (low ROI, ~6K page-faults already optimal)
- Expected improvement: +30-50% for Mid/Large workloads

Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-17 02:47:58 +09:00
+								    // Phase 22: Tiny Pool initialization now LAZY (per-class on first use)
 								    // hak_tiny_init() moved to lazy_init_class() in hakmem_tiny_lazy_init.inc.h
 								    // OLD: hak_tiny_init(); (eager init of all 8 classes → 94.94% page faults)
 								    // NEW: Lazy init triggered by tiny_alloc_fast() → only used classes initialized
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
 								    // Env: optional Tiny flush on exit (memory efficiency evaluation)
 								    {
 								        char* tf = getenv("HAKMEM_TINY_FLUSH_ON_EXIT");
 								        if (tf && atoi(tf) != 0) {
 								            g_flush_tiny_on_exit = 1;
 								        }
 								        char* ud = getenv("HAKMEM_TINY_ULTRA_DEBUG");
 								        if (ud && atoi(ud) != 0) {
 								            g_ultra_debug_on_exit = 1;
 								        }
 								        // Register exit hook if any of the debug/flush toggles are on
 								        // or when path debug is requested.
 								        if (g_flush_tiny_on_exit || g_ultra_debug_on_exit || getenv("HAKMEM_TINY_PATH_DEBUG")) {
 								            atexit(hak_flush_tiny_exit);
 								        }
 								    }
 								    // NEW Phase ACE: Initialize Adaptive Control Engine
 								    hkm_ace_controller_init(&g_ace_controller);
 								    if (g_ace_controller.enabled) {
 								        hkm_ace_controller_start(&g_ace_controller);
 								        HAKMEM_LOG("ACE Learning Layer enabled and started\n");
 								    }
-												Phase 19 & 20-1: Frontend optimization + TLS cache prewarm (+16.2% total)

Phase 19: Box FrontMetrics & Box FrontPrune (A/B testing framework)
========================================================================
- Box FrontMetrics: Per-class hit rate measurement for all frontend layers
  - Implementation: core/box/front_metrics_box.{h,c}
  - ENV: HAKMEM_TINY_FRONT_METRICS=1, HAKMEM_TINY_FRONT_DUMP=1
  - Output: CSV format per-class hit rate report

- A/B Test Results (Random Mixed 16-1040B, 500K iterations):
  | Config | Throughput | vs Baseline | C2/C3 Hit Rate |
  |--------|-----------|-------------|----------------|
  | Baseline (UH+HV2) | 10.1M ops/s | - | UH=11.7%, HV2=88.3% |
  | HeapV2 only | 11.4M ops/s | +12.9% ⭐ | HV2=99.3%, SLL=0.7% |
  | UltraHot only | 6.6M ops/s | -34.4% ❌ | UH=96.4%, SLL=94.2% |

- Key Finding: UltraHot removal improves performance by +12.9%
  - Root cause: Branch prediction miss cost > UltraHot hit rate benefit
  - UltraHot check: 88.3% cases = wasted branch → CPU confusion
  - HeapV2 alone: more predictable → better pipeline efficiency

- Default Setting Change: UltraHot default OFF
  - Production: UltraHot OFF (fastest)
  - Research: HAKMEM_TINY_FRONT_ENABLE_ULTRAHOT=1 to enable
  - Code preserved (not deleted) for research/debug use

Phase 20-1: Box SS-HotPrewarm (TLS cache prewarming, +3.3%)
========================================================================
- Box SS-HotPrewarm: ENV-controlled per-class TLS cache prewarm
  - Implementation: core/box/ss_hot_prewarm_box.{h,c}
  - Default targets: C2/C3=128, C4/C5=64 (aggressive prewarm)
  - ENV: HAKMEM_TINY_PREWARM_C2, _C3, _C4, _C5, _ALL
  - Total: 384 blocks pre-allocated

- Benchmark Results (Random Mixed 256B, 500K iterations):
  | Config | Page Faults | Throughput | vs Baseline |
  |--------|-------------|------------|-------------|
  | Baseline (Prewarm OFF) | 10,399 | 15.7M ops/s | - |
  | Phase 20-1 (Prewarm ON) | 10,342 | 16.2M ops/s | +3.3% ⭐ |

  - Page fault reduction: 0.55% (expected: 50-66%, reality: minimal)
  - Performance gain: +3.3% (15.7M → 16.2M ops/s)

- Analysis:
  ❌ Page fault reduction failed:
    - User page-derived faults dominate (benchmark initialization)
    - 384 blocks prewarm = minimal impact on 10K+ total faults
    - Kernel-side cost (asm_exc_page_fault) uncontrollable from userspace

  ✅ Cache warming effect succeeded:
    - TLS SLL pre-filled → reduced initial refill cost
    - CPU cycle savings → +3.3% performance gain
    - Stability improvement: warm state from first allocation

- Decision: Keep as "light +3% box"
  - Prewarm valid: 384 blocks (C2/C3=128, C4/C5=64) preserved
  - No further aggressive scaling: RSS cost vs page fault reduction unbalanced
  - Next phase: BenchFast mode for structural upper limit measurement

Combined Performance Impact:
========================================================================
Phase 19 (HeapV2 only): +12.9% (10.1M → 11.4M ops/s)
Phase 20-1 (Prewarm ON): +3.3% (15.7M → 16.2M ops/s)
Total improvement: +16.2% vs original baseline

Files Changed:
========================================================================
Phase 19:
- core/box/front_metrics_box.{h,c} - NEW
- core/tiny_alloc_fast.inc.h - metrics + ENV gating
- PHASE19_AB_TEST_RESULTS.md - NEW (detailed A/B test report)
- PHASE19_FRONTEND_METRICS_FINDINGS.md - NEW (findings report)

Phase 20-1:
- core/box/ss_hot_prewarm_box.{h,c} - NEW
- core/box/hak_core_init.inc.h - prewarm call integration
- Makefile - ss_hot_prewarm_box.o added
- CURRENT_TASK.md - Phase 19 & 20-1 results documented

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-16 05:48:59 +09:00
+								    // Phase 20-1: Aggressive TLS SLL + SuperSlab prewarming (ChatGPT strategy)
 								    // Box SS-HotPrewarm: ENV-controlled per-class prewarm with page fault reduction
-												Phase 7 Task 3: Pre-warm TLS cache (+180-280% improvement!)

MAJOR SUCCESS: HAKMEM now achieves 85-92% of System malloc on tiny
allocations (128-512B) and BEATS System at 146% on 1024B allocations!

Performance Results:
- Random Mixed 128B: 21M → 59M ops/s (+181%) 🚀
- Random Mixed 256B: 19M → 70M ops/s (+268%) 🚀
- Random Mixed 512B: 21M → 68M ops/s (+224%) 🚀
- Random Mixed 1024B: 21M → 65M ops/s (+210%, 146% of System!) 🏆
- Larson 1T: 2.68M ops/s (stable, no regression)

Implementation:
1. Task 3a: Remove profiling overhead in release builds
   - Wrapped RDTSC calls in #if !HAKMEM_BUILD_RELEASE
   - Compiler can eliminate profiling code completely
   - Effect: +2% (2.68M → 2.73M Larson)

2. Task 3b: Simplify refill logic
   - Use constants from hakmem_build_flags.h
   - TLS cache already optimal
   - Effect: No regression

3. Task 3c: Pre-warm TLS cache (GAME CHANGER!)
   - Pre-allocate 16 blocks per class at init
   - Eliminates cold-start penalty
   - Effect: +180-280% improvement 🚀

Root Cause:
The bottleneck was cold-start, not the hot path! First allocation in
each class triggered a SuperSlab refill (100+ cycles). Pre-warming
eliminated this penalty, revealing Phase 7's true potential.

Files Modified:
- core/hakmem_tiny.c: Pre-warm function implementation
- core/box/hak_core_init.inc.h: Pre-warm initialization call
- core/tiny_alloc_fast.inc.h: Profiling overhead removal
- core/hakmem_phase7_config.h: Task 3 constants (NEW)
- core/hakmem_build_flags.h: Phase 7 feature flags
- Makefile: PREWARM_TLS flag, phase7 targets
- CLAUDE.md: Phase 7 success summary
- PHASE7_TASK3_RESULTS.md: Comprehensive results report (NEW)

Build:
make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 phase7-bench

🎉 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-08 12:54:52 +09:00
+								#if HAKMEM_TINY_PREWARM_TLS
-												Phase 19 & 20-1: Frontend optimization + TLS cache prewarm (+16.2% total)

Phase 19: Box FrontMetrics & Box FrontPrune (A/B testing framework)
========================================================================
- Box FrontMetrics: Per-class hit rate measurement for all frontend layers
  - Implementation: core/box/front_metrics_box.{h,c}
  - ENV: HAKMEM_TINY_FRONT_METRICS=1, HAKMEM_TINY_FRONT_DUMP=1
  - Output: CSV format per-class hit rate report

- A/B Test Results (Random Mixed 16-1040B, 500K iterations):
  | Config | Throughput | vs Baseline | C2/C3 Hit Rate |
  |--------|-----------|-------------|----------------|
  | Baseline (UH+HV2) | 10.1M ops/s | - | UH=11.7%, HV2=88.3% |
  | HeapV2 only | 11.4M ops/s | +12.9% ⭐ | HV2=99.3%, SLL=0.7% |
  | UltraHot only | 6.6M ops/s | -34.4% ❌ | UH=96.4%, SLL=94.2% |

- Key Finding: UltraHot removal improves performance by +12.9%
  - Root cause: Branch prediction miss cost > UltraHot hit rate benefit
  - UltraHot check: 88.3% cases = wasted branch → CPU confusion
  - HeapV2 alone: more predictable → better pipeline efficiency

- Default Setting Change: UltraHot default OFF
  - Production: UltraHot OFF (fastest)
  - Research: HAKMEM_TINY_FRONT_ENABLE_ULTRAHOT=1 to enable
  - Code preserved (not deleted) for research/debug use

Phase 20-1: Box SS-HotPrewarm (TLS cache prewarming, +3.3%)
========================================================================
- Box SS-HotPrewarm: ENV-controlled per-class TLS cache prewarm
  - Implementation: core/box/ss_hot_prewarm_box.{h,c}
  - Default targets: C2/C3=128, C4/C5=64 (aggressive prewarm)
  - ENV: HAKMEM_TINY_PREWARM_C2, _C3, _C4, _C5, _ALL
  - Total: 384 blocks pre-allocated

- Benchmark Results (Random Mixed 256B, 500K iterations):
  | Config | Page Faults | Throughput | vs Baseline |
  |--------|-------------|------------|-------------|
  | Baseline (Prewarm OFF) | 10,399 | 15.7M ops/s | - |
  | Phase 20-1 (Prewarm ON) | 10,342 | 16.2M ops/s | +3.3% ⭐ |

  - Page fault reduction: 0.55% (expected: 50-66%, reality: minimal)
  - Performance gain: +3.3% (15.7M → 16.2M ops/s)

- Analysis:
  ❌ Page fault reduction failed:
    - User page-derived faults dominate (benchmark initialization)
    - 384 blocks prewarm = minimal impact on 10K+ total faults
    - Kernel-side cost (asm_exc_page_fault) uncontrollable from userspace

  ✅ Cache warming effect succeeded:
    - TLS SLL pre-filled → reduced initial refill cost
    - CPU cycle savings → +3.3% performance gain
    - Stability improvement: warm state from first allocation

- Decision: Keep as "light +3% box"
  - Prewarm valid: 384 blocks (C2/C3=128, C4/C5=64) preserved
  - No further aggressive scaling: RSS cost vs page fault reduction unbalanced
  - Next phase: BenchFast mode for structural upper limit measurement

Combined Performance Impact:
========================================================================
Phase 19 (HeapV2 only): +12.9% (10.1M → 11.4M ops/s)
Phase 20-1 (Prewarm ON): +3.3% (15.7M → 16.2M ops/s)
Total improvement: +16.2% vs original baseline

Files Changed:
========================================================================
Phase 19:
- core/box/front_metrics_box.{h,c} - NEW
- core/tiny_alloc_fast.inc.h - metrics + ENV gating
- PHASE19_AB_TEST_RESULTS.md - NEW (detailed A/B test report)
- PHASE19_FRONTEND_METRICS_FINDINGS.md - NEW (findings report)

Phase 20-1:
- core/box/ss_hot_prewarm_box.{h,c} - NEW
- core/box/hak_core_init.inc.h - prewarm call integration
- Makefile - ss_hot_prewarm_box.o added
- CURRENT_TASK.md - Phase 19 & 20-1 results documented

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-16 05:48:59 +09:00
+								    #include "box/ss_hot_prewarm_box.h"
 								    int total_prewarmed = box_ss_hot_prewarm_all();
 								    HAKMEM_LOG("TLS cache pre-warmed: %d blocks total (Phase 20-1)\n", total_prewarmed);
-												Infrastructure and build updates

- Update build configuration and flags
- Add missing header files and dependencies
- Update TLS list implementation with proper scoping
- Fix various compilation warnings and issues
- Update debug ring and tiny allocation infrastructure
- Update benchmark results documentation

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>

											
										
										
											2025-11-11 21:49:05 +09:00
+								    // After TLS prewarm, cascade some hot blocks into SFC to raise early hit rate
 								    {
 								        extern int g_sfc_enabled;
 								        if (g_sfc_enabled) {
 								            extern void sfc_cascade_from_tls_initial(void);
 								            sfc_cascade_from_tls_initial();
 								        }
 								    }
-												Phase 7 Task 3: Pre-warm TLS cache (+180-280% improvement!)

MAJOR SUCCESS: HAKMEM now achieves 85-92% of System malloc on tiny
allocations (128-512B) and BEATS System at 146% on 1024B allocations!

Performance Results:
- Random Mixed 128B: 21M → 59M ops/s (+181%) 🚀
- Random Mixed 256B: 19M → 70M ops/s (+268%) 🚀
- Random Mixed 512B: 21M → 68M ops/s (+224%) 🚀
- Random Mixed 1024B: 21M → 65M ops/s (+210%, 146% of System!) 🏆
- Larson 1T: 2.68M ops/s (stable, no regression)

Implementation:
1. Task 3a: Remove profiling overhead in release builds
   - Wrapped RDTSC calls in #if !HAKMEM_BUILD_RELEASE
   - Compiler can eliminate profiling code completely
   - Effect: +2% (2.68M → 2.73M Larson)

2. Task 3b: Simplify refill logic
   - Use constants from hakmem_build_flags.h
   - TLS cache already optimal
   - Effect: No regression

3. Task 3c: Pre-warm TLS cache (GAME CHANGER!)
   - Pre-allocate 16 blocks per class at init
   - Eliminates cold-start penalty
   - Effect: +180-280% improvement 🚀

Root Cause:
The bottleneck was cold-start, not the hot path! First allocation in
each class triggered a SuperSlab refill (100+ cycles). Pre-warming
eliminated this penalty, revealing Phase 7's true potential.

Files Modified:
- core/hakmem_tiny.c: Pre-warm function implementation
- core/box/hak_core_init.inc.h: Pre-warm initialization call
- core/tiny_alloc_fast.inc.h: Profiling overhead removal
- core/hakmem_phase7_config.h: Task 3 constants (NEW)
- core/hakmem_build_flags.h: Phase 7 feature flags
- Makefile: PREWARM_TLS flag, phase7 targets
- CLAUDE.md: Phase 7 success summary
- PHASE7_TASK3_RESULTS.md: Comprehensive results report (NEW)

Build:
make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 phase7-bench

🎉 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-08 12:54:52 +09:00
+								#endif
-												feat(Phase 1-2): Add atomic initialization wait mechanism (safety improvement)

Implements thread-safe atomic initialization tracking and a wait helper for
non-init threads to avoid libc fallback during the initialization window.

Changes:
- Convert g_initializing to _Atomic type for thread-safe access
- Add g_init_thread to identify which thread performs initialization
- Implement hak_init_wait_for_ready() helper with spin/yield mechanism
- Update hak_core_init.inc.h to use atomic operations
- Update hak_wrappers.inc.h to call wait helper instead of checking g_initializing

Results & Analysis:
- Performance: ±0% (21s → 21s, no measurable improvement)
- Safety: ✓ Prevents recursion in init window
- Investigation: Initialization overhead is <1% of total allocations
  - Expected: 2-8% improvement
  - Actual: 0% improvement (spin/yield overhead ≈ savings)
  - libc overhead: 41% → 57% (relative increase, likely sampling variation)

Key Findings from Perf Analysis:
- getenv: 0% (maintained from Phase 1-1) ✓
- libc malloc/free: ~24.54% of cycles
- libc fragmentation (malloc_consolidate/unlink_chunk): ~16% of cycles
- Total libc overhead: ~41% (difficult to optimize without changing algorithm)

Next Phase Target:
- Phase 2: Investigate libc fragmentation (malloc_consolidate 9.33%, unlink_chunk 6.90%)
- Potential approaches: hakmem Mid/ACE allocator expansion, sh8bench pattern analysis

Recommendation: Keep Phase 1-2 for safety (no performance regression), proceed to Phase 2.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-02 16:44:27 +09:00
+								    atomic_store_explicit(&g_initializing, 0, memory_order_release);
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								    // Publish that initialization is complete
 								    atomic_thread_fence(memory_order_seq_cst);
 								    g_initialized = 1;
 								}
 								void hak_shutdown(void) {
 								    if (!g_initialized) return;
 								    // NEW Phase ACE: Shutdown Adaptive Control Engine FIRST (before other subsystems)
 								    hkm_ace_controller_destroy(&g_ace_controller);
 								    if (!g_bench_tiny_only) {
-												release: silence runtime logs and stabilize benches

- Fix HAKMEM_LOG gating to use  (numeric) so release builds compile out logs.
- Switch remaining prints to HAKMEM_LOG or guard with :
  - core/box/hak_core_init.inc.h (EVO sample warning, shutdown banner)
  - core/hakmem_config.c (config/feature prints)
  - core/hakmem.c (BigCache eviction prints)
  - core/hakmem_tiny_superslab.c (OOM, head init/expand, C7 init diagnostics)
  - core/hakmem_elo.c (init/evolution)
  - core/hakmem_batch.c (init/flush/stats)
  - core/hakmem_ace.c (33KB route diagnostics)
  - core/hakmem_ace_controller.c (ACE logs macro → no-op in release)
  - core/hakmem_site_rules.c (init banner)
  - core/box/hak_free_api.inc.h (unknown method error → release-gated)
- Rebuilt benches and verified quiet output for release:
  - bench_fixed_size_hakmem/system
  - bench_random_mixed_hakmem/system
  - bench_mid_large_mt_hakmem/system
  - bench_comprehensive_hakmem/system

Note: Kept debug logs available in debug builds and when explicitly toggled via env.

											
										
										
											2025-11-11 01:47:06 +09:00
+								        HAKMEM_LOG("Shutting down...\n");
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								        hak_print_stats();
 								    }
 								    // NEW Phase 6.9: Shutdown L2 Pool
 								    if (!g_bench_tiny_only) hak_pool_shutdown();
 								    // NEW Phase 6.13: Shutdown L2.5 LargePool
 								    if (!g_bench_tiny_only) hak_l25_pool_shutdown();
 								    // NEW: Shutdown BigCache Box
 								    if (!g_bench_tiny_only) hak_bigcache_shutdown();
 								    // NEW Phase 6.2: Shutdown ELO Strategy Selection
 								    if (!g_bench_tiny_only) hak_elo_shutdown();
 								    // NEW Phase 6.3: Shutdown madvise Batching
 								    if (!g_bench_tiny_only) hak_batch_shutdown();
 								    // NEW Phase 6.10: Shutdown Site Rules
 								    if (!g_bench_tiny_only) hak_site_rules_shutdown();
 								    // NEW Phase 6.12: Print Tiny Pool statistics
 								    if (!g_bench_tiny_only) hak_tiny_print_stats();
 								    // NEW Phase 6.11.1: Print whale cache statistics
 								    if (!g_bench_tiny_only) {
 								        hkm_whale_dump_stats();
 								        // NEW Phase 6.11.1: Shutdown whale cache
 								        hkm_whale_shutdown();
 								    }
 								    // NEW Phase 6.11.1: Shutdown debug timing (must be last!)
 								    if (!g_bench_tiny_only) hkm_timing_shutdown();
 								    // Phase 6.16: Dump sampling profiler
 								    if (!g_bench_tiny_only) hkm_prof_shutdown();
 								    // Stop learner thread
 								    if (!g_bench_tiny_only) hkm_learner_shutdown();
 								    // Stop Tiny background components (e.g., Intelligence Engine)
 								    hak_tiny_shutdown();
 								    g_initialized = 0;
 								}
 								#endif // HAK_CORE_INIT_INC_H