hakmem/core/hakmem_tiny_init.inc

// hakmem_tiny_init.inc
// Note: uses TLS ops inline helpers for prewarm when class5 hotpath is enabled
#include "hakmem_tiny_tls_ops.h"
#include "box/prewarm_box.h"  // Box Prewarm API (Priority 3)
// Phase 2D-2: Initialization function extraction
//
// This file contains the hak_tiny_init() function extracted from hakmem_tiny.c
// to improve code organization. Reduces main file by 450 lines (24%).
//
// Cold path only - called once at startup.

// hak_tiny_init: one-time, cold-path initialization of the Tiny allocator.
//
// Takes no arguments and returns nothing. If g_tiny_initialized is already
// set the call is a no-op; otherwise the flag is set first and the rest of
// the function configures the subsystem, mostly from HAKMEM_* environment
// variables read with getenv()/atoi(). In order, it:
//   1. reads the class5 hot-path toggle, applies the config preset, and reads
//      the registry / wrapper / remote-drain / mem-diet / magazine-cap knobs;
//   2. sets cap, refill_low and spill_high for every entry of g_tls_lists[],
//      then optionally overrides class 5 and prewarms it via box_prewarm_tls();
//   3. reads the SuperSlab, SFC, TLS list/SLL, bump-shadow and HotMag knobs;
//   4. resets the background bin / remote-target queues and starts the
//      background thread when the bin or spill feature is enabled;
//   5. reads refill, stats, ultra, fast-cap and debug knobs;
//   6. optionally starts the intelligence-engine thread;
//   7. clears the slab registry, initializes per-class mutexes and ACE state,
//      pre-allocates one slab for classes 0-3, and initializes the SuperSlab
//      registry / LRU / prewarm when SuperSlab is enabled.
//
// Numeric env values are parsed with atoi(), so non-numeric text reads as 0.
// NOTE(review): the initialized flag is a plain int set before any work is
// done — presumably to make re-entrant calls during init return early; the
// function itself does no locking, so confirm callers serialize the first call.
void hak_tiny_init(void) {
    if (g_tiny_initialized) return;

    // Step 1: Simple initialization (static global is already zero-initialized)
    g_tiny_initialized = 1;

    // Hot-class toggle: class5 (256B) dedicated TLS fast path
    // Default ON; allow runtime override via HAKMEM_TINY_HOTPATH_CLASS5
    {
        const char* hp5 = getenv("HAKMEM_TINY_HOTPATH_CLASS5");
        if (hp5 && *hp5) {
            g_tiny_hotpath_class5 = (atoi(hp5) != 0) ? 1 : 0;
        }
    }

    // Reset fast-cache defaults and apply preset (if provided).
    // Any value other than TIGHT / ULTRA_TIGHT / ULTRATIGHT (case-insensitive)
    // selects the BALANCED preset; an unset variable applies no preset at all.
    tiny_config_reset_defaults();
    char* preset_env = getenv("HAKMEM_TINY_PRESET");
    if (preset_env) {
        if (strcasecmp(preset_env, "TIGHT") == 0) {
            TINY_PRESET_TIGHT();
        } else if (strcasecmp(preset_env, "ULTRA_TIGHT") == 0 ||
                   strcasecmp(preset_env, "ULTRATIGHT") == 0) {
            TINY_PRESET_ULTRA_TIGHT();
        } else {
            TINY_PRESET_BALANCED();
        }
    }

    // Phase 6.14: Read environment variable for Registry ON/OFF
    char* env = getenv("HAKMEM_USE_REGISTRY");
    if (env) {
        g_use_registry = atoi(env);
    } else {
        g_use_registry = 1;  // Default ON for multi-thread safety
    }

    // Phase 6.15: Runtime toggle to allow Tiny within wrappers
    // HAKMEM_WRAP_TINY=1 → enable Tiny fast-path during wrapper calls
    // (these two knobs can only switch the feature on, never off)
    char* wrap_env = getenv("HAKMEM_WRAP_TINY");
    if (wrap_env && atoi(wrap_env) != 0) {
        g_wrap_tiny_enabled = 1;
    }
    char* wrap_refill_env = getenv("HAKMEM_WRAP_TINY_REFILL");
    if (wrap_refill_env && atoi(wrap_refill_env) != 0) {
        g_wrap_tiny_refill = 1;
    }
    // Remote-drain knobs (non-positive values are ignored)
    char* rth = getenv("HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD");
    if (rth) { int v = atoi(rth); if (v > 0) g_remote_drain_thresh = v; }
    char* rr = getenv("HAKMEM_TINY_REMOTE_DRAIN_TRYRATE");
    if (rr) { int v = atoi(rr); if (v > 0) g_remote_drain_tryrate = v; }
    // Count-sample exponent: accepted range is 0..16
    char* cs = getenv("HAKMEM_TINY_COUNT_SAMPLE");
    if (cs) { int v = atoi(cs); if (v>=0 && v<=16) g_tiny_count_sample_exp = v; }

    int mem_diet_enabled = 1;  // Default: Enable for memory efficiency
    char* memdiet_env = getenv("HAKMEM_TINY_MEM_DIET");
    if (memdiet_env && atoi(memdiet_env) == 0) {
        mem_diet_enabled = 0;  // Allow disabling via env
    }
    if (mem_diet_enabled) {
        // Diet mode caps the magazine limit at 64
        if (g_mag_cap_limit > 64) g_mag_cap_limit = 64;
    }

    // Optional: Magazine cap limit (runtime lower bound).
    // The env value can only lower g_mag_cap_limit, never raise it.
    char* mag_env = getenv("HAKMEM_TINY_MAG_CAP");
    if (mag_env) {
        int val = atoi(mag_env);
        if (val > 0 && val < g_mag_cap_limit) g_mag_cap_limit = val;
    }

    // Phase X: Initialize TLS free-list defaults
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        TinyTLSList* tls = &g_tls_lists[i];
        tls->head = NULL;
        tls->count = 0;
        // Start from the class default, clamped by the class maximum and the
        // global magazine cap limit.
        uint32_t base_cap = (uint32_t)tiny_default_cap(i);
        uint32_t class_max = (uint32_t)tiny_cap_max_for_class(i);
        if (base_cap > class_max) base_cap = class_max;
        if ((uint32_t)g_mag_cap_limit < base_cap) base_cap = (uint32_t)g_mag_cap_limit;
        // NOTE(review): g_mag_cap_override[] is filled from
        // HAKMEM_TINY_MAG_CAP_C<n> further down in this function, so here it
        // only holds whatever was set before hak_tiny_init() ran — confirm
        // that this ordering is intended.
        if (g_mag_cap_override[i] > 0) {
            uint32_t ov = (uint32_t)g_mag_cap_override[i];
            if (ov > class_max) ov = class_max;
            if (ov > (uint32_t)g_mag_cap_limit) ov = (uint32_t)g_mag_cap_limit;
            if (ov != 0u) base_cap = ov;
        }
        if (base_cap == 0u) base_cap = 32u;  // never leave a class with cap 0
        tls->cap = base_cap;
        tls->refill_low = tiny_tls_default_refill(base_cap);
        tls->spill_high = tiny_tls_default_spill(base_cap);
        tiny_tls_publish_targets(i, base_cap);
    }
    // Optional: override TLS parameters for hot class 5 (256B)
    if (g_tiny_hotpath_class5) {
        TinyTLSList* tls5 = &g_tls_lists[5];
        int cap_def = 512;     // thick cache for hot class
        int refill_def = 128;  // refill low-water mark
        int spill_def = 0;     // 0 → use cap as hard spill threshold
        const char* ecap = getenv("HAKMEM_TINY_CLASS5_TLS_CAP");
        const char* eref = getenv("HAKMEM_TINY_CLASS5_TLS_REFILL");
        const char* espl = getenv("HAKMEM_TINY_CLASS5_TLS_SPILL");
        if (ecap && *ecap) cap_def = atoi(ecap);
        if (eref && *eref) refill_def = atoi(eref);
        if (espl && *espl) spill_def = atoi(espl);
        // Clamp: cap to [64, 4096], refill to [16, cap], spill to [0, cap]
        if (cap_def < 64) cap_def = 64;
        if (cap_def > 4096) cap_def = 4096;
        if (refill_def < 16) refill_def = 16;
        if (refill_def > cap_def) refill_def = cap_def;
        if (spill_def < 0) spill_def = 0;
        if (spill_def > cap_def) spill_def = cap_def;
        tls5->cap = (uint32_t)cap_def;
        tls5->refill_low = (uint32_t)refill_def;
        tls5->spill_high = (uint32_t)spill_def;  // 0 → use cap logic in helper
        tiny_tls_publish_targets(5, (uint32_t)cap_def);

        // Optional: one-shot TLS prewarm for class5
        // Env: HAKMEM_TINY_CLASS5_PREWARM=<n> (default 128, 0 disables)
        int prewarm = 128;
        const char* pw = getenv("HAKMEM_TINY_CLASS5_PREWARM");
        if (pw && *pw) prewarm = atoi(pw);
        if (prewarm < 0) prewarm = 0;
        if (prewarm > (int)tls5->cap) prewarm = (int)tls5->cap;

        if (prewarm > 0) {
            // Use Box Prewarm API (safe, simple, handles all initialization)
            // Box Prewarm guarantees:
            //   - Correct initialization order (capacity system initialized first)
            //   - No orphaned blocks (atomic carve-and-push)
            //   - No double-free risk (all-or-nothing semantics)
            //   - Clear error handling
            // NOTE(review): this runs before the SuperSlab env toggle, the
            // per-class locks and the SuperSlab registry are set up below —
            // the guarantees above are taken from the Box API's description.
            int taken = box_prewarm_tls(5, prewarm);

            #if !HAKMEM_BUILD_RELEASE
            // Debug logging (optional)
            fprintf(stderr, "[PREWARM] class=5 requested=%d taken=%d\n", prewarm, taken);
            #endif
            (void)taken; // Suppress unused warning in release builds
        }
    }
    if (mem_diet_enabled) {
        tiny_apply_mem_diet();
    }

    // Enable signal-triggered stats dump if requested (SIGUSR1)
    hak_tiny_enable_signal_dump();

    // Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
    // Allow runtime disable/enable via env (0=off, 1=on)
    // Phase 6-2.5 FIX: SuperSlab is independent from diet mode (both are performance-critical)
    // - SuperSlab: Fast allocation/free (defaults to 1, set in hakmem_config.c:334)
    // - Diet mode: Magazine capacity limits only (doesn't disable subsystems)
    char* superslab_env = getenv("HAKMEM_TINY_USE_SUPERSLAB");
    if (superslab_env) {
        g_use_superslab = (atoi(superslab_env) != 0) ? 1 : 0;
    }

    // Initialize Super Front Cache (SFC) with bench-friendly defaults
    // Enabled by default; can be disabled via HAKMEM_SFC_ENABLE=0
    {
        extern void sfc_init(void);
        sfc_init();
    }
    // Note: Diet mode no longer overrides g_use_superslab (removed lines 104-105)
    // SuperSlab defaults to 1 unless explicitly disabled via env var
    // One-shot hint: publish/adopt requires SuperSlab ON
    {
        static int hint_once = 0;
        if (!hint_once) {
            const char* must_adopt = getenv("HAKMEM_TINY_MUST_ADOPT");
            // Warn only when SuperSlab is effectively OFF. An unset
            // HAKMEM_TINY_USE_SUPERSLAB leaves g_use_superslab at its default,
            // so the env pointer alone must not trigger the hint.
            if (g_use_superslab == 0 && must_adopt && atoi(must_adopt) != 0) {
                fprintf(stderr, "[HINT] HAKMEM_TINY_USE_SUPERSLAB=0: publish/adopt pipeline is disabled. Set =1 for mailbox/adopt.\n");
            }
            hint_once = 1;
        }
    }
    {
        char* tlslist_env = getenv("HAKMEM_TINY_TLS_LIST");
        if (tlslist_env) {
            g_tls_list_enable = (atoi(tlslist_env) != 0) ? 1 : 0;
        }
    }
    // Phase 9.4: TLS SLL toggle (default ON); env can only switch it off
    char* sll_env = getenv("HAKMEM_TINY_TLS_SLL");
    if (sll_env && atoi(sll_env) == 0) {
        g_tls_sll_enable = 0;
    }
    // Path debug enabled? (always assigned: unset env means 0)
    {
        char* pd = getenv("HAKMEM_TINY_PATH_DEBUG");
        g_path_debug_enabled = (pd && atoi(pd) != 0) ? 1 : 0;
    }
    // Ultra-Bump TLS shadow (default ON, can be disabled via env)
    {
        char* ub = getenv("HAKMEM_TINY_BUMP_SHADOW");
        if (ub) { g_ultra_bump_shadow = (atoi(ub) != 0) ? 1 : 0; }
        // Bump chunk: accepted range is 1..32767
        char* bc = getenv("HAKMEM_TINY_BUMP_CHUNK");
        if (bc) { int v = atoi(bc); if (v > 0 && v < 32768) g_bump_chunk = v; }
    }
    // Refill-one-on-miss (avoid building a chain; allocate and return just one block)
    {
        char* ro = getenv("HAKMEM_TINY_REFILL_ONE_ON_MISS");
        if (ro) g_refill_one_on_miss = (atoi(ro) != 0) ? 1 : 0;
    }
    // SLL multiplier (hot tiny classes), clamped to [1, 16]
    char* sllmul = getenv("HAKMEM_SLL_MULTIPLIER");
    if (sllmul) {
        int v = atoi(sllmul);
        if (v < 1) {
            v = 1;
        } else if (v > 16) {
            v = 16;  // guardrail
        }
        g_sll_multiplier = v;
    }

    // HotMag enable / tuning (default OFF, can be enabled via env)
    {
        char* hm = getenv("HAKMEM_TINY_HOTMAG");
        if (hm) g_hotmag_enable = (atoi(hm) != 0) ? 1 : 0;
        // Global default capacity, clamped to [16, 1024]
        char* hmcap = getenv("HAKMEM_TINY_HOTMAG_CAP");
        if (hmcap) {
            int v = atoi(hmcap);
            if (v < 16) v = 16;
            else if (v > 1024) v = 1024;
            g_hotmag_cap_default = v;
        }
        // Global default refill, clamped to [0, cap default]
        char* hmrefill = getenv("HAKMEM_TINY_HOTMAG_REFILL");
        if (hmrefill) {
            int v = atoi(hmrefill);
            if (v < 0) v = 0;
            if (v > g_hotmag_cap_default) v = g_hotmag_cap_default;
            g_hotmag_refill_default = v;
        }
        // Re-apply the invariant 0 <= refill <= cap even when no env was given
        if (g_hotmag_refill_default > g_hotmag_cap_default) {
            g_hotmag_refill_default = g_hotmag_cap_default;
        }
        if (g_hotmag_refill_default < 0) g_hotmag_refill_default = 0;

        // Seed per-class state; all locks are cleared and classes 0-3 enabled
        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
            uint16_t cap = hotmag_effective_cap(k);
            g_hotmag_cap_current[k] = cap;
            g_hotmag_cap_locked[k] = 0;
            uint16_t refill = (uint16_t)g_hotmag_refill_default;
            if (refill > cap) refill = cap;
            g_hotmag_refill_current[k] = refill;
            g_hotmag_refill_locked[k] = 0;
            g_hotmag_class_en[k] = (k <= 3) ? 1 : 0;
        }

        // Heuristic defaults for the three hottest classes when not overridden.
        // Each target is min(cap default, ceiling) raised to a floor, and is
        // only applied when it lowers the current cap.
        if (!g_hotmag_cap_locked[0]) {
            uint16_t cap = g_hotmag_cap_current[0];
            uint16_t cap_target = (g_hotmag_cap_default > 48) ? 48 : (uint16_t)g_hotmag_cap_default;
            if (cap_target < 16) cap_target = 16;
            if (cap_target < cap) g_hotmag_cap_current[0] = cap_target;
        }
        if (!g_hotmag_cap_locked[1]) {
            uint16_t cap = g_hotmag_cap_current[1];
            uint16_t cap_target = (g_hotmag_cap_default > 80) ? 80 : (uint16_t)g_hotmag_cap_default;
            if (cap_target < 32) cap_target = 32;
            if (cap_target < cap) g_hotmag_cap_current[1] = cap_target;
        }
        if (!g_hotmag_cap_locked[2]) {
            uint16_t cap = g_hotmag_cap_current[2];
            uint16_t cap_target = (g_hotmag_cap_default > 112) ? 112 : (uint16_t)g_hotmag_cap_default;
            if (cap_target < 48) cap_target = 48;
            if (cap_target < cap) g_hotmag_cap_current[2] = cap_target;
        }

        // Refill heuristics: class 0 gets 0; classes 1 and 2 are limited to
        // 20 and 40 respectively (and never above their cap).
        if (!g_hotmag_refill_locked[0]) {
            g_hotmag_refill_current[0] = 0;
        }
        if (!g_hotmag_refill_locked[1]) {
            uint16_t cap = g_hotmag_cap_current[1];
            uint16_t ref = (g_hotmag_refill_default > 0) ? (uint16_t)g_hotmag_refill_default : 0;
            if (ref > 0) {
                uint16_t limit = (cap > 20) ? 20 : cap;
                if (ref > limit) ref = limit;
                if (ref > cap) ref = cap;
            }
            g_hotmag_refill_current[1] = ref;
        }
        if (!g_hotmag_refill_locked[2]) {
            uint16_t cap = g_hotmag_cap_current[2];
            uint16_t ref = (g_hotmag_refill_default > 0) ? (uint16_t)g_hotmag_refill_default : 0;
            if (ref > 0) {
                uint16_t limit = (cap > 40) ? 40 : cap;
                if (ref > limit) ref = limit;
                if (ref > cap) ref = cap;
            }
            g_hotmag_refill_current[2] = ref;
        }

        // Default: disable class 2 (32B) HotMag entirely unless explicitly enabled by env
        if (!getenv("HAKMEM_TINY_HOTMAG_C2")) {
            g_hotmag_class_en[2] = 0;
        }

        // Per-class env overrides; a cap/refill override also sets its lock flag
        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
            char key_cap[64];
            snprintf(key_cap, sizeof(key_cap), "HAKMEM_TINY_HOTMAG_CAP_C%d", k);
            char* cap_env = getenv(key_cap);
            if (cap_env) {
                int v = atoi(cap_env);
                if (v < 16) v = 16;
                else if (v > 1024) v = 1024;
                g_hotmag_cap_current[k] = (uint16_t)v;
                g_hotmag_cap_locked[k] = 1;
                // Keep refill <= the new cap
                if (!g_hotmag_refill_locked[k] && g_hotmag_refill_current[k] > g_hotmag_cap_current[k]) {
                    g_hotmag_refill_current[k] = g_hotmag_cap_current[k];
                }
            }
            char key_ref[64];
            snprintf(key_ref, sizeof(key_ref), "HAKMEM_TINY_HOTMAG_REFILL_C%d", k);
            char* ref_env = getenv(key_ref);
            if (ref_env) {
                int v = atoi(ref_env);
                if (v < 0) v = 0;
                if (v > g_hotmag_cap_current[k]) v = g_hotmag_cap_current[k];
                g_hotmag_refill_current[k] = (uint16_t)v;
                g_hotmag_refill_locked[k] = 1;
            }
            char key_en[64];
            snprintf(key_en, sizeof(key_en), "HAKMEM_TINY_HOTMAG_C%d", k);
            char* en_env = getenv(key_en);
            if (en_env) {
                g_hotmag_class_en[k] = (uint8_t)((atoi(en_env) != 0) ? 1 : 0);
            }
        }

        // Publish caps into the magazines; cap 0 marks "not set up yet"
        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
            if (g_hotmag_enable && hkm_is_hot_class(k)) {
                g_tls_hot_mag[k].cap = g_hotmag_cap_current[k];
            } else {
                g_tls_hot_mag[k].cap = 0; // lazy init
            }
            g_tls_hot_mag[k].top = 0;
        }
    }

    // Ultra-Simple front enable (default OFF, for A/B testing)
    {
        char* us = getenv("HAKMEM_TINY_ULTRA_SIMPLE");
        if (us) g_ultra_simple = (atoi(us) != 0) ? 1 : 0;
        // zero-initialized by default
    }

    // Background Refill Bin (default OFF, for A/B testing)
    {
        char* bb = getenv("HAKMEM_TINY_BG_BIN");
        if (bb) g_bg_bin_enable = (atoi(bb) != 0) ? 1 : 0;
        // Target: accepted range is 1..4096
        char* bt = getenv("HAKMEM_TINY_BG_TARGET");
        if (bt) { int v = atoi(bt); if (v > 0 && v <= 4096) g_bg_bin_target = v; }
        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
            atomic_store_explicit(&g_bg_bin_head[k], (uintptr_t)0, memory_order_relaxed);
        }
        if (g_bg_bin_enable && !g_bg_bin_started) {
            if (pthread_create(&g_bg_bin_thread, NULL, tiny_bg_refill_main, NULL) == 0) {
                g_bg_bin_started = 1;
            } else {
                g_bg_bin_enable = 0; // disable on failure
            }
        }
    }
    // Background Spill/Drain (integrated into bg thread)
    // EXTRACTED: bg_spill init moved to hakmem_tiny_bg_spill.c (Phase 2C-2)
    {
        bg_spill_init();  // Initialize bg_spill module from environment

        // Remote target queue init (Phase 2C-1)
        char* br = getenv("HAKMEM_TINY_BG_REMOTE");
        if (br) g_bg_remote_enable = (atoi(br) != 0) ? 1 : 0;
        // Batch: accepted range is 1..4096
        char* rb = getenv("HAKMEM_TINY_BG_REMOTE_BATCH");
        if (rb) { int v = atoi(rb); if (v > 0 && v <= 4096) g_bg_remote_batch = v; }
        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
            atomic_store_explicit(&g_remote_target_head[k], (uintptr_t)0, memory_order_relaxed);
            atomic_store_explicit(&g_remote_target_len[k], 0u, memory_order_relaxed);
        }

        // bg thread already started above if bg_bin_enable=1; if only spill is enabled, start thread
        if (g_bg_spill_enable && !g_bg_bin_started) {
            if (pthread_create(&g_bg_bin_thread, NULL, tiny_bg_refill_main, NULL) == 0) {
                g_bg_bin_started = 1;
                g_bg_bin_enable = 1; // reuse loop
            } else {
                g_bg_spill_enable = 0;  // disable spill when the thread cannot start
            }
        }
    }
    // Optional prefetch enable
    {
        char* pf = getenv("HAKMEM_TINY_PREFETCH");
        if (pf && atoi(pf) != 0) g_tiny_prefetch = 1;
    }
    // Refill batch tuning (non-positive values are ignored)
    char* rmax = getenv("HAKMEM_TINY_REFILL_MAX");
    if (rmax) { int v = atoi(rmax); if (v > 0) g_tiny_refill_max = v; }
    char* rmaxh = getenv("HAKMEM_TINY_REFILL_MAX_HOT");
    if (rmaxh) { int v = atoi(rmaxh); if (v > 0) g_tiny_refill_max_hot = v; }
    // Per-class overrides: HAKMEM_TINY_REFILL_MAX_C{0..7}, HAKMEM_TINY_REFILL_MAX_HOT_C{0..7}
    for (int k = 0; k < TINY_NUM_CLASSES; k++) {
        char key1[64]; snprintf(key1, sizeof(key1), "HAKMEM_TINY_REFILL_MAX_C%d", k);
        char* v1 = getenv(key1); if (v1) { int vv = atoi(v1); if (vv > 0) g_refill_max_c[k] = vv; }
        char key2[64]; snprintf(key2, sizeof(key2), "HAKMEM_TINY_REFILL_MAX_HOT_C%d", k);
        char* v2 = getenv(key2); if (v2) { int vv = atoi(v2); if (vv > 0) g_refill_max_hot_c[k] = vv; }
    }
    // Stats sampling rate (compile-time gated) via env HAKMEM_TINY_STAT_RATE_LG
#if defined(HAKMEM_ENABLE_STATS) && defined(HAKMEM_TINY_STAT_SAMPLING)
    {
        char* sr = getenv("HAKMEM_TINY_STAT_RATE_LG");
        if (sr) { int lg = atoi(sr); if (lg >= 0 && lg <= 31) g_stat_rate_lg = lg; }
        // Select the function pointer (removes the branch from the hot path)
        g_stat_alloc_fn = (g_stat_rate_lg == 0) ? hkm_stat_alloc_always : hkm_stat_alloc_sampled;
    }
#elif defined(HAKMEM_ENABLE_STATS)
    // Without sampling, update on every call
    // FIXME: g_stat_alloc_fn and hkm_stat_alloc_always not yet implemented
    // Stats are recorded via hkm_stat_alloc() in HAK_RET_ALLOC macro instead
    // g_stat_alloc_fn = hkm_stat_alloc_always;
#endif

    // Spill hysteresis (read here so the free hot path never calls getenv)
    {
        char* sh = getenv("HAKMEM_TINY_SPILL_HYST");
        if (sh) { int v = atoi(sh); if (v < 0) v = 0; g_spill_hyst = v; }
    }
    char* ultra_env = getenv("HAKMEM_TINY_ULTRA");
    if (ultra_env && atoi(ultra_env) != 0) {
        g_tiny_ultra = 1;
    }
    char* uval = getenv("HAKMEM_TINY_ULTRA_VALIDATE");
    if (uval && atoi(uval) != 0) {
        g_ultra_validate = 1;
    }

    // Ultra env overrides: per-class batch and sll_cap
    // HAKMEM_TINY_ULTRA_BATCH_C{0..7}, HAKMEM_TINY_ULTRA_SLL_CAP_C{0..7}
    // `var` is a scratch buffer for env names, reused by the fast-cap loop below.
    char var[64];
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        snprintf(var, sizeof(var), "HAKMEM_TINY_ULTRA_BATCH_C%d", i);
        char* vb = getenv(var);
        if (vb) { int v = atoi(vb); if (v > 0) g_ultra_batch_override[i] = v; }
        snprintf(var, sizeof(var), "HAKMEM_TINY_ULTRA_SLL_CAP_C%d", i);
        char* vc = getenv(var);
        if (vc) { int v = atoi(vc); if (v > 0) g_ultra_sll_cap_override[i] = v; }
        // Normal-path per-class overrides (accepted range 1..TINY_TLS_MAG_CAP)
        snprintf(var, sizeof(var), "HAKMEM_TINY_MAG_CAP_C%d", i);
        char* vm = getenv(var);
        if (vm) { int v = atoi(vm); if (v > 0 && v <= TINY_TLS_MAG_CAP) g_mag_cap_override[i] = v; }
        snprintf(var, sizeof(var), "HAKMEM_TINY_SLL_CAP_C%d", i);
        char* vs = getenv(var);
        if (vs) { int v = atoi(vs); if (v > 0 && v <= TINY_TLS_MAG_CAP) g_sll_cap_override[i] = v; }

        // Front refill count per-class override (fast path tuning), clamped to [0, 256]
        snprintf(var, sizeof(var), "HAKMEM_TINY_REFILL_COUNT_C%d", i);
        char* rc = getenv(var);
        if (rc) { int v = atoi(rc); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_class[i] = v; }
    }

    // Front refill count globals
    // Phase 10: Set aggressive defaults for hot and mid classes
    // Env values are clamped to [0, 256]; the defaults apply only when unset.
    {
        char* g = getenv("HAKMEM_TINY_REFILL_COUNT");
        if (g) { int v = atoi(g); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_global = v; }
        else { g_refill_count_global = 64; }  // Phase 10: default 64 (was 16)

        char* h = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
        if (h) { int v = atoi(h); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_hot = v; }
        else { g_refill_count_hot = 128; }  // Phase 10: default 128 for hot classes (C0-C3)

        char* m = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
        if (m) { int v = atoi(m); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_mid = v; }
        else { g_refill_count_mid = 96; }  // Phase 10: default 96 for mid classes (C4-C7)
    }
    // Sensible default for class 7 (1024B): favor larger refill to reduce refills/syscalls
    // NOTE(review): an explicit HAKMEM_TINY_REFILL_COUNT_C7=0 is stored as 0
    // above and is therefore also replaced by this default.
    if (g_refill_count_class[7] == 0) {
        g_refill_count_class[7] = 128;  // Phase 10: increased from 64 to 128
    }
    {
        char* fast_env = getenv("HAKMEM_TINY_FAST");
        if (fast_env && atoi(fast_env) == 0) g_fast_enable = 0;
        int fast_global = -1;  // -1 = no global fast-cap override given
        char* fast_cap_env = getenv("HAKMEM_TINY_FAST_CAP");
        if (fast_cap_env) {
            int v = atoi(fast_cap_env);
            if (v >= 0 && v <= TINY_TLS_MAG_CAP) fast_global = v;
        }
        // Precedence per class: per-class env > global env > compiled default.
        // The cap is marked locked whenever either env override applied.
        for (int i = 0; i < TINY_NUM_CLASSES; i++) {
            uint16_t cap = g_fast_cap_defaults[i];
            if (fast_global >= 0) cap = (uint16_t)fast_global;
            snprintf(var, sizeof(var), "HAKMEM_TINY_FAST_CAP_C%d", i);
            char* fc = getenv(var);
            if (fc) {
                int v = atoi(fc);
                if (v < 0) v = 0;
                if (v > TINY_TLS_MAG_CAP) v = TINY_TLS_MAG_CAP;
                cap = (uint16_t)v;
                g_fast_cap_locked[i] = 1;
            } else if (fast_global >= 0) {
                g_fast_cap_locked[i] = 1;
            } else {
                g_fast_cap_locked[i] = 0;
            }
            g_fast_cap[i] = cap;
        }
    }

    {
        // Debug switch: turns off the fast cache, HotMag and the TLS list together
        const char* dbg_fast = getenv("HAKMEM_TINY_DEBUG_FAST0");
        if (dbg_fast && atoi(dbg_fast) != 0) {
            g_debug_fast0 = 1;
            g_fast_enable = 0;
            g_hotmag_enable = 0;
            g_tls_list_enable = 0;
        }
        const char* dbg_remote = getenv("HAKMEM_TINY_DEBUG_REMOTE_GUARD");
        if (dbg_remote && atoi(dbg_remote) != 0) {
            g_debug_remote_guard = 1;
        }
        const char* rf_force = getenv("HAKMEM_TINY_RF_FORCE_NOTIFY");
        if (rf_force && atoi(rf_force) != 0) {
            extern int g_remote_force_notify;
            g_remote_force_notify = 1;
        }
        const char* safe_free = getenv("HAKMEM_SAFE_FREE");
        if (safe_free && atoi(safe_free) != 0) {
            extern int g_tiny_safe_free; g_tiny_safe_free = 1;
        }
        const char* safe_free_strict = getenv("HAKMEM_SAFE_FREE_STRICT");
        if (safe_free_strict && atoi(safe_free_strict) != 0) {
            extern int g_tiny_safe_free_strict; g_tiny_safe_free_strict = 1;
        }
        const char* force_remote = getenv("HAKMEM_TINY_FORCE_REMOTE");
        if (force_remote && atoi(force_remote) != 0) {
            extern int g_tiny_force_remote; g_tiny_force_remote = 1;
        }
        // Remote side-table (debug only)
        tiny_remote_side_init_from_env();
    }
    // Optional one-line configuration trace (HAKMEM_TINY_SUPERSLAB_TRACE=1)
    static int g_super_trace = -1;  // -1 = env not read yet
    if (__builtin_expect(g_super_trace == -1, 0)) {
        const char* tr = getenv("HAKMEM_TINY_SUPERSLAB_TRACE");
        g_super_trace = (tr && atoi(tr) != 0) ? 1 : 0;
    }
    if (g_super_trace) {
        static int logged_once = 0;
        if (!logged_once) {
            fprintf(stderr, "[SUPERTRACE] mem_diet=%d env=%s g_use_superslab=%d fast_enable=%d cap0=%u cap1=%u cap2=%u cap3=%u cap4=%u reslist=%d\n",
                    mem_diet_enabled,
                    superslab_env ? superslab_env : "(null)",
                    g_use_superslab,
                    g_fast_enable,
                    (unsigned)g_fast_cap[0],
                    (unsigned)g_fast_cap[1],
                    (unsigned)g_fast_cap[2],
                    (unsigned)g_fast_cap[3],
                    (unsigned)g_fast_cap[4],
                    g_tls_list_enable);
            logged_once = 1;
        }
    }
    tiny_ace_init_defaults();
    // Opt-in front-end features (env can only switch them on)
    char* fc_env = getenv("HAKMEM_TINY_FASTCACHE");
    if (fc_env && atoi(fc_env) != 0) {
        g_fastcache_enable = 1;
    }
    char* fe_env = getenv("HAKMEM_TINY_FRONTEND");
    if (fe_env && atoi(fe_env) != 0) {
        g_frontend_enable = 1;
    }
    // TinyQuickSlot opt-in
    {
        char* q = getenv("HAKMEM_TINY_QUICK");
        if (q && atoi(q) != 0) g_quick_enable = 1;
    }

    tiny_obs_start_if_needed();

    // Deferred Intelligence Engine
    char* ie = getenv("HAKMEM_INT_ENGINE");
    if (ie && atoi(ie) != 0) {
        g_int_engine = 1;
        // Initialize frontend fill targets to zero (let engine grow if hot)
        for (int i = 0; i < TINY_NUM_CLASSES; i++) atomic_store(&g_frontend_fill_target[i], 0);
        // Event logging knobs (optional)
        char* its = getenv("HAKMEM_INT_EVENT_TS");
        if (its && atoi(its) != 0) g_int_event_ts = 1;
        // Sample mask = low n bits set; n limited to 1..30 so the shift stays in range
        char* ism = getenv("HAKMEM_INT_SAMPLE");
        if (ism) { int n = atoi(ism); if (n > 0 && n < 31) g_int_sample_mask = ((1u << n) - 1u); }
        // NOTE(review): on pthread_create failure g_int_engine stays 1 while
        // g_int_started stays 0 — confirm consumers check g_int_started.
        if (pthread_create(&g_int_thread, NULL, intelligence_engine_main, NULL) == 0) {
            g_int_started = 1;
        }
    }

    // Step 2: Initialize Slab Registry (only if enabled)
    if (g_use_registry) {
        memset(g_slab_registry, 0, sizeof(g_slab_registry));
    }

    // Initialize per-class locks
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        pthread_mutex_init(&g_tiny_class_locks[i].m, NULL);
    }

    // Phase 8.3: Initialize ACE (Adaptive Cache Engine) state
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        g_ss_ace[i].current_lg = 20;  // Start with 1MB SuperSlabs
        g_ss_ace[i].target_lg = 20;   // Default to 1MB
        g_ss_ace[i].hot_score = 0;
        g_ss_ace[i].alloc_count = 0;
        g_ss_ace[i].refill_count = 0;
        g_ss_ace[i].spill_count = 0;
        g_ss_ace[i].live_blocks = 0;
        g_ss_ace[i].last_tick_ns = 0;
    }

    // Lite P1: Pre-allocate Tier 1 (8-64B) hot classes only
    // This avoids initialization overhead for common small allocations
    // Classes 0-3: 8B, 16B, 32B, 64B (256KB total, not 512KB)
    for (int class_idx = 0; class_idx < 4; class_idx++) {
        TinySlab* slab = allocate_new_slab(class_idx);
        if (slab) {
            // Push onto the head of this class's free-slab list;
            // an allocation failure is silently skipped.
            slab->next = g_tiny_pool.free_slabs[class_idx];
            g_tiny_pool.free_slabs[class_idx] = slab;
        }
    }

    // Phase 11: Initialize SuperSlab Registry and LRU Cache
    if (g_use_superslab) {
        extern void hak_super_registry_init(void);
        extern void hak_ss_lru_init(void);
        extern void hak_ss_prewarm_init(void);

        hak_super_registry_init();
        hak_ss_lru_init();

        // Phase 11: Prewarm SuperSlabs to eliminate mmap/munmap churn
        // ENV: HAKMEM_PREWARM_SUPERSLABS=<count> (e.g., 32, 128)
        hak_ss_prewarm_init();
    }

    // Record an init marker in the debug ring when routing is enabled
    // (0x494E4954 is the ASCII bytes "INIT").
    if (__builtin_expect(route_enabled_runtime(), 0)) {
        tiny_debug_ring_record(TINY_RING_EVENT_ROUTE, (uint16_t)0xFFFFu, NULL, (uintptr_t)0x494E4954u);
    }
}
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								// hakmem_tiny_init.inc
-												Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default

## Major Changes

### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies

### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed

### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review

### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
  - Hotpath works (+24% vs baseline) after POOL_TLS fix
  - Still 9.2x slower than system malloc due to:
    * Heavy initialization (23.85% of cycles)
    * Syscall overhead (2,382 syscalls per 100K ops)
    * Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
    * 9.4x more instructions than system malloc

### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation

## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)

## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-12 01:01:23 +09:00
+								// Note: uses TLS ops inline helpers for prewarm when class5 hotpath is enabled
 								#include "hakmem_tiny_tls_ops.h"
-												Box API Phase 1-3: Capacity Manager, Carve-Push, Prewarm 実装

Priority 1-3のBox Modulesを実装し、安全なpre-warming APIを提供。
既存の複雑なprewarmコードを1行のBox API呼び出しに置き換え。

## 新規Box Modules

1. **Box Capacity Manager** (capacity_box.h/c)
   - TLS SLL容量の一元管理
   - adaptive_sizing初期化保証
   - Double-free バグ防止

2. **Box Carve-And-Push** (carve_push_box.h/c)
   - アトミックなblock carve + TLS SLL push
   - All-or-nothing semantics
   - Rollback保証（partial failure防止）

3. **Box Prewarm** (prewarm_box.h/c)
   - 安全なTLS cache pre-warming
   - 初期化依存性を隠蔽
   - シンプルなAPI (1関数呼び出し)

## コード簡略化

hakmem_tiny_init.inc: 20行 → 1行
```c
// BEFORE: 複雑なP0分岐とエラー処理
adaptive_sizing_init();
if (prewarm > 0) {
    #if HAKMEM_TINY_P0_BATCH_REFILL
        int taken = sll_refill_batch_from_ss(5, prewarm);
    #else
        int taken = sll_refill_small_from_ss(5, prewarm);
    #endif
}

// AFTER: Box API 1行
int taken = box_prewarm_tls(5, prewarm);
```

## シンボルExport修正

hakmem_tiny.c: 5つのシンボルをstatic → non-static
- g_tls_slabs[] (TLS slab配列)
- g_sll_multiplier (SLL容量乗数)
- g_sll_cap_override[] (容量オーバーライド)
- superslab_refill() (SuperSlab再充填)
- ss_active_add() (アクティブカウンタ)

## ビルドシステム

Makefile: TINY_BENCH_OBJS_BASEに3つのBox modules追加
- core/box/capacity_box.o
- core/box/carve_push_box.o
- core/box/prewarm_box.o

## 動作確認

✅ Debug build成功
✅ Box Prewarm API動作確認
   [PREWARM] class=5 requested=128 taken=32

## 次のステップ

- Box Refill Manager (Priority 4)
- Box SuperSlab Allocator (Priority 5)
- Release build修正（tiny_debug_ring_record）

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-13 01:45:30 +09:00
+								#include "box/prewarm_box.h"  // Box Prewarm API (Priority 3)
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								// Phase 2D-2: Initialization function extraction
 								//
 								// This file contains the hak_tiny_init() function extracted from hakmem_tiny.c
 								// to improve code organization. Reduces main file by 450 lines (24%).
 								//
 								// Cold path only - called once at startup.
 								void hak_tiny_init(void) {
 								    if (g_tiny_initialized) return;
 								    // Step 1: Simple initialization (static global is already zero-initialized)
 								    g_tiny_initialized = 1;
-												Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default

## Major Changes

### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies

### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed

### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review

### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
  - Hotpath works (+24% vs baseline) after POOL_TLS fix
  - Still 9.2x slower than system malloc due to:
    * Heavy initialization (23.85% of cycles)
    * Syscall overhead (2,382 syscalls per 100K ops)
    * Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
    * 9.4x more instructions than system malloc

### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation

## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)

## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-12 01:01:23 +09:00
+								    // Hot-class toggle: class5 (256B) dedicated TLS fast path
 								    // Default ON; allow runtime override via HAKMEM_TINY_HOTPATH_CLASS5
 								    {
 								        const char* hp5 = getenv("HAKMEM_TINY_HOTPATH_CLASS5");
 								        if (hp5 && *hp5) {
 								            g_tiny_hotpath_class5 = (atoi(hp5) != 0) ? 1 : 0;
 								        }
 								    }
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								    // Reset fast-cache defaults and apply preset (if provided)
 								    tiny_config_reset_defaults();
 								    char* preset_env = getenv("HAKMEM_TINY_PRESET");
 								    if (preset_env) {
 								        if (strcasecmp(preset_env, "TIGHT") == 0) {
 								            TINY_PRESET_TIGHT();
 								        } else if (strcasecmp(preset_env, "ULTRA_TIGHT") == 0 ||
 								                   strcasecmp(preset_env, "ULTRATIGHT") == 0) {
 								            TINY_PRESET_ULTRA_TIGHT();
 								        } else {
 								            TINY_PRESET_BALANCED();
 								        }
 								    }
 								    // Phase 6.14: Read environment variable for Registry ON/OFF
 								    char* env = getenv("HAKMEM_USE_REGISTRY");
 								    if (env) {
 								        g_use_registry = atoi(env);
 								    } else {
 								        g_use_registry = 1;  // Default ON for multi-thread safety
 								    }
 								    // Phase 6.15: Runtime toggle to allow Tiny within wrappers
 								    // HAKMEM_WRAP_TINY=1 → enable Tiny fast-path during wrapper calls
 								    char* wrap_env = getenv("HAKMEM_WRAP_TINY");
 								    if (wrap_env && atoi(wrap_env) != 0) {
 								        g_wrap_tiny_enabled = 1;
 								    }
 								    char* wrap_refill_env = getenv("HAKMEM_WRAP_TINY_REFILL");
 								    if (wrap_refill_env && atoi(wrap_refill_env) != 0) {
 								        g_wrap_tiny_refill = 1;
 								    }
 								    // Remote-drain knobs
 								    char* rth = getenv("HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD");
 								    if (rth) { int v = atoi(rth); if (v > 0) g_remote_drain_thresh = v; }
 								    char* rr = getenv("HAKMEM_TINY_REMOTE_DRAIN_TRYRATE");
 								    if (rr) { int v = atoi(rr); if (v > 0) g_remote_drain_tryrate = v; }
 								    char* cs = getenv("HAKMEM_TINY_COUNT_SAMPLE");
 								    if (cs) { int v = atoi(cs); if (v>=0 && v<=16) g_tiny_count_sample_exp = v; }
 								    int mem_diet_enabled = 1;  // Default: Enable for memory efficiency
 								    char* memdiet_env = getenv("HAKMEM_TINY_MEM_DIET");
 								    if (memdiet_env && atoi(memdiet_env) == 0) {
 								        mem_diet_enabled = 0;  // Allow disabling via env
 								    }
 								    if (mem_diet_enabled) {
 								        if (g_mag_cap_limit > 64) g_mag_cap_limit = 64;
 								    }
 								    // Optional: Magazine cap limit (runtime lower bound)
 								    char* mag_env = getenv("HAKMEM_TINY_MAG_CAP");
 								    if (mag_env) {
 								        int val = atoi(mag_env);
 								        if (val > 0 && val < g_mag_cap_limit) g_mag_cap_limit = val;
 								    }
 								    // Phase X: Initialize TLS free-list defaults
 								    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
 								        TinyTLSList* tls = &g_tls_lists[i];
 								        tls->head = NULL;
 								        tls->count = 0;
 								        uint32_t base_cap = (uint32_t)tiny_default_cap(i);
 								        uint32_t class_max = (uint32_t)tiny_cap_max_for_class(i);
 								        if (base_cap > class_max) base_cap = class_max;
 								        if ((uint32_t)g_mag_cap_limit < base_cap) base_cap = (uint32_t)g_mag_cap_limit;
 								        if (g_mag_cap_override[i] > 0) {
 								            uint32_t ov = (uint32_t)g_mag_cap_override[i];
 								            if (ov > class_max) ov = class_max;
 								            if (ov > (uint32_t)g_mag_cap_limit) ov = (uint32_t)g_mag_cap_limit;
 								            if (ov != 0u) base_cap = ov;
 								        }
 								        if (base_cap == 0u) base_cap = 32u;
 								        tls->cap = base_cap;
 								        tls->refill_low = tiny_tls_default_refill(base_cap);
 								        tls->spill_high = tiny_tls_default_spill(base_cap);
 								        tiny_tls_publish_targets(i, base_cap);
 								    }
-												Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default

## Major Changes

### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies

### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed

### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review

### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
  - Hotpath works (+24% vs baseline) after POOL_TLS fix
  - Still 9.2x slower than system malloc due to:
    * Heavy initialization (23.85% of cycles)
    * Syscall overhead (2,382 syscalls per 100K ops)
    * Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
    * 9.4x more instructions than system malloc

### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation

## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)

## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-12 01:01:23 +09:00
+								    // Optional: override TLS parameters for hot class 5 (256B)
 								    if (g_tiny_hotpath_class5) {
 								        TinyTLSList* tls5 = &g_tls_lists[5];
 								        int cap_def = 512;     // thick cache for hot class
 								        int refill_def = 128;  // refill low-water mark
 								        int spill_def = 0;     // 0 → use cap as hard spill threshold
 								        const char* ecap = getenv("HAKMEM_TINY_CLASS5_TLS_CAP");
 								        const char* eref = getenv("HAKMEM_TINY_CLASS5_TLS_REFILL");
 								        const char* espl = getenv("HAKMEM_TINY_CLASS5_TLS_SPILL");
 								        if (ecap && *ecap) cap_def = atoi(ecap);
 								        if (eref && *eref) refill_def = atoi(eref);
 								        if (espl && *espl) spill_def = atoi(espl);
 								        if (cap_def < 64) cap_def = 64; if (cap_def > 4096) cap_def = 4096;
 								        if (refill_def < 16) refill_def = 16; if (refill_def > cap_def) refill_def = cap_def;
 								        if (spill_def < 0) spill_def = 0; if (spill_def > cap_def) spill_def = cap_def;
 								        tls5->cap = (uint32_t)cap_def;
 								        tls5->refill_low = (uint32_t)refill_def;
 								        tls5->spill_high = (uint32_t)spill_def;  // 0 → use cap logic in helper
 								        tiny_tls_publish_targets(5, (uint32_t)cap_def);
 								        // Optional: one-shot TLS prewarm for class5
 								        // Env: HAKMEM_TINY_CLASS5_PREWARM=<n> (default 128, 0 disables)
 								        int prewarm = 128;
 								        const char* pw = getenv("HAKMEM_TINY_CLASS5_PREWARM");
 								        if (pw && *pw) prewarm = atoi(pw);
 								        if (prewarm < 0) prewarm = 0;
 								        if (prewarm > (int)tls5->cap) prewarm = (int)tls5->cap;
-												Box API Phase 1-3: Capacity Manager, Carve-Push, Prewarm 実装

Priority 1-3のBox Modulesを実装し、安全なpre-warming APIを提供。
既存の複雑なprewarmコードを1行のBox API呼び出しに置き換え。

## 新規Box Modules

1. **Box Capacity Manager** (capacity_box.h/c)
   - TLS SLL容量の一元管理
   - adaptive_sizing初期化保証
   - Double-free バグ防止

2. **Box Carve-And-Push** (carve_push_box.h/c)
   - アトミックなblock carve + TLS SLL push
   - All-or-nothing semantics
   - Rollback保証（partial failure防止）

3. **Box Prewarm** (prewarm_box.h/c)
   - 安全なTLS cache pre-warming
   - 初期化依存性を隠蔽
   - シンプルなAPI (1関数呼び出し)

## コード簡略化

hakmem_tiny_init.inc: 20行 → 1行
```c
// BEFORE: 複雑なP0分岐とエラー処理
adaptive_sizing_init();
if (prewarm > 0) {
    #if HAKMEM_TINY_P0_BATCH_REFILL
        int taken = sll_refill_batch_from_ss(5, prewarm);
    #else
        int taken = sll_refill_small_from_ss(5, prewarm);
    #endif
}

// AFTER: Box API 1行
int taken = box_prewarm_tls(5, prewarm);
```

## シンボルExport修正

hakmem_tiny.c: 5つのシンボルをstatic → non-static
- g_tls_slabs[] (TLS slab配列)
- g_sll_multiplier (SLL容量乗数)
- g_sll_cap_override[] (容量オーバーライド)
- superslab_refill() (SuperSlab再充填)
- ss_active_add() (アクティブカウンタ)

## ビルドシステム

Makefile: TINY_BENCH_OBJS_BASEに3つのBox modules追加
- core/box/capacity_box.o
- core/box/carve_push_box.o
- core/box/prewarm_box.o

## 動作確認

✅ Debug build成功
✅ Box Prewarm API動作確認
   [PREWARM] class=5 requested=128 taken=32

## 次のステップ

- Box Refill Manager (Priority 4)
- Box SuperSlab Allocator (Priority 5)
- Release build修正（tiny_debug_ring_record）

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-13 01:45:30 +09:00
-												Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default

## Major Changes

### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies

### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed

### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review

### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
  - Hotpath works (+24% vs baseline) after POOL_TLS fix
  - Still 9.2x slower than system malloc due to:
    * Heavy initialization (23.85% of cycles)
    * Syscall overhead (2,382 syscalls per 100K ops)
    * Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
    * 9.4x more instructions than system malloc

### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation

## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)

## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-12 01:01:23 +09:00
+								        if (prewarm > 0) {
-												Box API Phase 1-3: Capacity Manager, Carve-Push, Prewarm 実装

Priority 1-3のBox Modulesを実装し、安全なpre-warming APIを提供。
既存の複雑なprewarmコードを1行のBox API呼び出しに置き換え。

## 新規Box Modules

1. **Box Capacity Manager** (capacity_box.h/c)
   - TLS SLL容量の一元管理
   - adaptive_sizing初期化保証
   - Double-free バグ防止

2. **Box Carve-And-Push** (carve_push_box.h/c)
   - アトミックなblock carve + TLS SLL push
   - All-or-nothing semantics
   - Rollback保証（partial failure防止）

3. **Box Prewarm** (prewarm_box.h/c)
   - 安全なTLS cache pre-warming
   - 初期化依存性を隠蔽
   - シンプルなAPI (1関数呼び出し)

## コード簡略化

hakmem_tiny_init.inc: 20行 → 1行
```c
// BEFORE: 複雑なP0分岐とエラー処理
adaptive_sizing_init();
if (prewarm > 0) {
    #if HAKMEM_TINY_P0_BATCH_REFILL
        int taken = sll_refill_batch_from_ss(5, prewarm);
    #else
        int taken = sll_refill_small_from_ss(5, prewarm);
    #endif
}

// AFTER: Box API 1行
int taken = box_prewarm_tls(5, prewarm);
```

## シンボルExport修正

hakmem_tiny.c: 5つのシンボルをstatic → non-static
- g_tls_slabs[] (TLS slab配列)
- g_sll_multiplier (SLL容量乗数)
- g_sll_cap_override[] (容量オーバーライド)
- superslab_refill() (SuperSlab再充填)
- ss_active_add() (アクティブカウンタ)

## ビルドシステム

Makefile: TINY_BENCH_OBJS_BASEに3つのBox modules追加
- core/box/capacity_box.o
- core/box/carve_push_box.o
- core/box/prewarm_box.o

## 動作確認

✅ Debug build成功
✅ Box Prewarm API動作確認
   [PREWARM] class=5 requested=128 taken=32

## 次のステップ

- Box Refill Manager (Priority 4)
- Box SuperSlab Allocator (Priority 5)
- Release build修正（tiny_debug_ring_record）

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-13 01:45:30 +09:00
+								            // ✅ NEW: Use Box Prewarm API (safe, simple, handles all initialization)
 								            // Box Prewarm guarantees:
 								            //   - Correct initialization order (capacity system initialized first)
 								            //   - No orphaned blocks (atomic carve-and-push)
 								            //   - No double-free risk (all-or-nothing semantics)
 								            //   - Clear error handling
 								            int taken = box_prewarm_tls(5, prewarm);
 								            #if !HAKMEM_BUILD_RELEASE
 								            // Debug logging (optional)
 								            fprintf(stderr, "[PREWARM] class=5 requested=%d taken=%d\n", prewarm, taken);
 								            #endif
 								            (void)taken; // Suppress unused warning in release builds
-												Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default

## Major Changes

### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies

### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed

### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review

### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
  - Hotpath works (+24% vs baseline) after POOL_TLS fix
  - Still 9.2x slower than system malloc due to:
    * Heavy initialization (23.85% of cycles)
    * Syscall overhead (2,382 syscalls per 100K ops)
    * Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
    * 9.4x more instructions than system malloc

### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation

## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)

## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-12 01:01:23 +09:00
+								        }
 								    }
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								    if (mem_diet_enabled) {
 								        tiny_apply_mem_diet();
 								    }
 								    // Enable signal-triggered stats dump if requested (SIGUSR1)
 								    hak_tiny_enable_signal_dump();
 								    // Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
 								    // Allow runtime disable/enable via env (0=off, 1=on)
-												Phase 6-2.3~6-2.5: Critical bug fixes + SuperSlab optimization (WIP)

## Phase 6-2.3: Fix 4T Larson crash (active counter bug) ✅
**Problem:** 4T Larson crashed with "free(): invalid pointer", OOM errors
**Root cause:** core/hakmem_tiny_refill_p0.inc.h:103
  - P0 batch refill moved freelist blocks to TLS cache
  - Active counter NOT incremented → double-decrement on free
  - Counter underflows → SuperSlab appears full → OOM → crash
**Fix:** Added ss_active_add(tls->ss, from_freelist);
**Result:** 4T stable at 838K ops/s ✅

## Phase 6-2.4: Fix SEGV in random_mixed/mid_large_mt benchmarks ✅
**Problem:** bench_random_mixed_hakmem, bench_mid_large_mt_hakmem → immediate SEGV
**Root cause #1:** core/box/hak_free_api.inc.h:92-95
  - "Guess loop" dereferenced unmapped memory when registry lookup failed
**Root cause #2:** core/box/hak_free_api.inc.h:115
  - Header magic check dereferenced unmapped memory
**Fix:**
  1. Removed dangerous guess loop (lines 92-95)
  2. Added hak_is_memory_readable() check before dereferencing header
     (core/hakmem_internal.h:277-294 - uses mincore() syscall)
**Result:**
  - random_mixed (2KB): SEGV → 2.22M ops/s ✅
  - random_mixed (4KB): SEGV → 2.58M ops/s ✅
  - Larson 4T: no regression (838K ops/s) ✅

## Phase 6-2.5: Performance investigation + SuperSlab fix (WIP) ⚠️
**Problem:** Severe performance gaps (19-26x slower than system malloc)
**Investigation:** Task agent identified root cause
  - hak_is_memory_readable() syscall overhead (100-300 cycles per free)
  - ALL frees hit unmapped_header_fallback path
  - SuperSlab lookup NEVER called
  - Why? g_use_superslab = 0 (disabled by diet mode)

**Root cause:** core/hakmem_tiny_init.inc:104-105
  - Diet mode (default ON) disables SuperSlab
  - SuperSlab defaults to 1 (hakmem_config.c:334)
  - BUT diet mode overrides it to 0 during init

**Fix:** Separate SuperSlab from diet mode
  - SuperSlab: Performance-critical (fast alloc/free)
  - Diet mode: Memory efficiency (magazine capacity limits only)
  - Both are independent features, should not interfere

**Status:** ⚠️ INCOMPLETE - New SEGV discovered after fix
  - SuperSlab lookup now works (confirmed via debug output)
  - But benchmark crashes (Exit 139) after ~20 lookups
  - Needs further investigation

**Files modified:**
- core/hakmem_tiny_init.inc:99-109 - Removed diet mode override
- PERFORMANCE_INVESTIGATION_REPORT.md - Task agent analysis (303x instruction gap)

**Next steps:**
- Investigate new SEGV (likely SuperSlab free path bug)
- OR: Revert Phase 6-2.5 changes if blocking progress

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 20:31:01 +09:00
+								    // Phase 6-2.5 FIX: SuperSlab is independent from diet mode (the two features must not interfere)
 								    // - SuperSlab: Fast allocation/free (defaults to 1, set in hakmem_config.c:334)
 								    // - Diet mode: Magazine capacity limits only (doesn't disable subsystems)
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								    char* superslab_env = getenv("HAKMEM_TINY_USE_SUPERSLAB");
 								    if (superslab_env) {
 								        g_use_superslab = (atoi(superslab_env) != 0) ? 1 : 0;
 								    }
-												Infrastructure and build updates

- Update build configuration and flags
- Add missing header files and dependencies
- Update TLS list implementation with proper scoping
- Fix various compilation warnings and issues
- Update debug ring and tiny allocation infrastructure
- Update benchmark results documentation

Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com>

											
										
										
											2025-11-11 21:49:05 +09:00
 								    // Initialize Super Front Cache (SFC) with bench-friendly defaults
 								    // Enabled by default; can be disabled via HAKMEM_SFC_ENABLE=0
 								    {
 								        extern void sfc_init(void);
 								        sfc_init();
 								    }
-												Phase 6-2.3~6-2.5: Critical bug fixes + SuperSlab optimization (WIP)

## Phase 6-2.3: Fix 4T Larson crash (active counter bug) ✅
**Problem:** 4T Larson crashed with "free(): invalid pointer", OOM errors
**Root cause:** core/hakmem_tiny_refill_p0.inc.h:103
  - P0 batch refill moved freelist blocks to TLS cache
  - Active counter NOT incremented → double-decrement on free
  - Counter underflows → SuperSlab appears full → OOM → crash
**Fix:** Added ss_active_add(tls->ss, from_freelist);
**Result:** 4T stable at 838K ops/s ✅

## Phase 6-2.4: Fix SEGV in random_mixed/mid_large_mt benchmarks ✅
**Problem:** bench_random_mixed_hakmem, bench_mid_large_mt_hakmem → immediate SEGV
**Root cause #1:** core/box/hak_free_api.inc.h:92-95
  - "Guess loop" dereferenced unmapped memory when registry lookup failed
**Root cause #2:** core/box/hak_free_api.inc.h:115
  - Header magic check dereferenced unmapped memory
**Fix:**
  1. Removed dangerous guess loop (lines 92-95)
  2. Added hak_is_memory_readable() check before dereferencing header
     (core/hakmem_internal.h:277-294 - uses mincore() syscall)
**Result:**
  - random_mixed (2KB): SEGV → 2.22M ops/s ✅
  - random_mixed (4KB): SEGV → 2.58M ops/s ✅
  - Larson 4T: no regression (838K ops/s) ✅

## Phase 6-2.5: Performance investigation + SuperSlab fix (WIP) ⚠️
**Problem:** Severe performance gaps (19-26x slower than system malloc)
**Investigation:** Task agent identified root cause
  - hak_is_memory_readable() syscall overhead (100-300 cycles per free)
  - ALL frees hit unmapped_header_fallback path
  - SuperSlab lookup NEVER called
  - Why? g_use_superslab = 0 (disabled by diet mode)

**Root cause:** core/hakmem_tiny_init.inc:104-105
  - Diet mode (default ON) disables SuperSlab
  - SuperSlab defaults to 1 (hakmem_config.c:334)
  - BUT diet mode overrides it to 0 during init

**Fix:** Separate SuperSlab from diet mode
  - SuperSlab: Performance-critical (fast alloc/free)
  - Diet mode: Memory efficiency (magazine capacity limits only)
  - Both are independent features, should not interfere

**Status:** ⚠️ INCOMPLETE - New SEGV discovered after fix
  - SuperSlab lookup now works (confirmed via debug output)
  - But benchmark crashes (Exit 139) after ~20 lookups
  - Needs further investigation

**Files modified:**
- core/hakmem_tiny_init.inc:99-109 - Removed diet mode override
- PERFORMANCE_INVESTIGATION_REPORT.md - Task agent analysis (303x instruction gap)

**Next steps:**
- Investigate new SEGV (likely SuperSlab free path bug)
- OR: Revert Phase 6-2.5 changes if blocking progress

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 20:31:01 +09:00
+								    // Note: Diet mode no longer overrides g_use_superslab (the override formerly at lines 104-105 was removed)
 								    // SuperSlab defaults to 1 unless explicitly disabled via env var
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								    // One-shot hint: publish/adopt requires SuperSlab ON
 								    {
 								        static int hint_once = 0;
 								        if (!hint_once) {
 								            const char* must_adopt = getenv("HAKMEM_TINY_MUST_ADOPT");
 								            if ((!superslab_env || g_use_superslab == 0) && must_adopt && atoi(must_adopt) != 0) {
 								                fprintf(stderr, "[HINT] HAKMEM_TINY_USE_SUPERSLAB=0: publish/adopt pipeline is disabled. Set =1 for mailbox/adopt.\n");
 								            }
 								            hint_once = 1;
 								        }
 								    }
 								    {
 								        char* tlslist_env = getenv("HAKMEM_TINY_TLS_LIST");
 								        if (tlslist_env) {
 								            g_tls_list_enable = (atoi(tlslist_env) != 0) ? 1 : 0;
 								        }
 								    }
 								    // Phase 9.4: TLS SLL toggle (default ON)
 								    char* sll_env = getenv("HAKMEM_TINY_TLS_SLL");
 								    if (sll_env && atoi(sll_env) == 0) {
 								        g_tls_sll_enable = 0;
 								    }
 								    // Path debug enabled?
 								    {
 								        char* pd = getenv("HAKMEM_TINY_PATH_DEBUG");
 								        g_path_debug_enabled = (pd && atoi(pd) != 0) ? 1 : 0;
 								    }
 								    // Ultra-Bump TLS shadow（既定ON、envでOFF可能）
 								    {
 								        char* ub = getenv("HAKMEM_TINY_BUMP_SHADOW");
 								        if (ub) { g_ultra_bump_shadow = (atoi(ub) != 0) ? 1 : 0; }
 								        char* bc = getenv("HAKMEM_TINY_BUMP_CHUNK");
 								        if (bc) { int v = atoi(bc); if (v > 0 && v < 32768) g_bump_chunk = v; }
 								    }
 								    // Refill-one-on-miss（チェーン生成を避け、1個だけ確保して返す）
 								    {
 								        char* ro = getenv("HAKMEM_TINY_REFILL_ONE_ON_MISS");
 								        if (ro) g_refill_one_on_miss = (atoi(ro) != 0) ? 1 : 0;
 								    }
 								    // SLL multiplier (hot tiny classes)
 								    char* sllmul = getenv("HAKMEM_SLL_MULTIPLIER");
 								    if (sllmul) {
 								        int v = atoi(sllmul);
 								        if (v < 1) {
 								            v = 1;
 								        } else if (v > 16) {
 								            v = 16;  // guardrail
 								        }
 								        g_sll_multiplier = v;
 								    }
 								    // HotMag enable / tuning（既定OFF, envでON可）
 								    {
 								        char* hm = getenv("HAKMEM_TINY_HOTMAG");
 								        if (hm) g_hotmag_enable = (atoi(hm) != 0) ? 1 : 0;
 								        char* hmcap = getenv("HAKMEM_TINY_HOTMAG_CAP");
 								        if (hmcap) {
 								            int v = atoi(hmcap);
 								            if (v < 16) v = 16;
 								            else if (v > 1024) v = 1024;
 								            g_hotmag_cap_default = v;
 								        }
 								        char* hmrefill = getenv("HAKMEM_TINY_HOTMAG_REFILL");
 								        if (hmrefill) {
 								            int v = atoi(hmrefill);
 								            if (v < 0) v = 0;
 								            if (v > g_hotmag_cap_default) v = g_hotmag_cap_default;
 								            g_hotmag_refill_default = v;
 								        }
 								        if (g_hotmag_refill_default > g_hotmag_cap_default) {
 								            g_hotmag_refill_default = g_hotmag_cap_default;
 								        }
 								        if (g_hotmag_refill_default < 0) g_hotmag_refill_default = 0;
 								        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
 								            uint16_t cap = hotmag_effective_cap(k);
 								            g_hotmag_cap_current[k] = cap;
 								            g_hotmag_cap_locked[k] = 0;
 								            uint16_t refill = (uint16_t)g_hotmag_refill_default;
 								            if (refill > cap) refill = cap;
 								            g_hotmag_refill_current[k] = refill;
 								            g_hotmag_refill_locked[k] = 0;
 								            g_hotmag_class_en[k] = (k <= 3) ? 1 : 0;
 								        }
 								        // Heuristic defaults for the three hottest classes when not overridden
 								        if (!g_hotmag_cap_locked[0]) {
 								            uint16_t cap = g_hotmag_cap_current[0];
 								            uint16_t cap_target = (g_hotmag_cap_default > 48) ? 48 : (uint16_t)g_hotmag_cap_default;
 								            if (cap_target < 16) cap_target = 16;
 								            if (cap_target < cap) g_hotmag_cap_current[0] = cap_target;
 								        }
 								        if (!g_hotmag_cap_locked[1]) {
 								            uint16_t cap = g_hotmag_cap_current[1];
 								            uint16_t cap_target = (g_hotmag_cap_default > 80) ? 80 : (uint16_t)g_hotmag_cap_default;
 								            if (cap_target < 32) cap_target = 32;
 								            if (cap_target < cap) g_hotmag_cap_current[1] = cap_target;
 								        }
 								        if (!g_hotmag_cap_locked[2]) {
 								            uint16_t cap = g_hotmag_cap_current[2];
 								            uint16_t cap_target = (g_hotmag_cap_default > 112) ? 112 : (uint16_t)g_hotmag_cap_default;
 								            if (cap_target < 48) cap_target = 48;
 								            if (cap_target < cap) g_hotmag_cap_current[2] = cap_target;
 								        }
 								        if (!g_hotmag_refill_locked[0]) {
 								            g_hotmag_refill_current[0] = 0;
 								        }
 								        if (!g_hotmag_refill_locked[1]) {
 								            uint16_t cap = g_hotmag_cap_current[1];
 								            uint16_t ref = (g_hotmag_refill_default > 0) ? (uint16_t)g_hotmag_refill_default : 0;
 								            if (ref > 0) {
 								                uint16_t limit = (cap > 20) ? 20 : cap;
 								                if (ref > limit) ref = limit;
 								                if (ref > cap) ref = cap;
 								            }
 								            g_hotmag_refill_current[1] = ref;
 								        }
 								        if (!g_hotmag_refill_locked[2]) {
 								            uint16_t cap = g_hotmag_cap_current[2];
 								            uint16_t ref = (g_hotmag_refill_default > 0) ? (uint16_t)g_hotmag_refill_default : 0;
 								            if (ref > 0) {
 								                uint16_t limit = (cap > 40) ? 40 : cap;
 								                if (ref > limit) ref = limit;
 								                if (ref > cap) ref = cap;
 								            }
 								            g_hotmag_refill_current[2] = ref;
 								        }
 								        // Default: disable class 2 (32B) HotMag entirely unless explicitly enabled by env
 								        if (!getenv("HAKMEM_TINY_HOTMAG_C2")) {
 								            g_hotmag_class_en[2] = 0;
 								        }
 								        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
 								            char key_cap[64];
 								            snprintf(key_cap, sizeof(key_cap), "HAKMEM_TINY_HOTMAG_CAP_C%d", k);
 								            char* cap_env = getenv(key_cap);
 								            if (cap_env) {
 								                int v = atoi(cap_env);
 								                if (v < 16) v = 16;
 								                else if (v > 1024) v = 1024;
 								                g_hotmag_cap_current[k] = (uint16_t)v;
 								                g_hotmag_cap_locked[k] = 1;
 								                if (!g_hotmag_refill_locked[k] && g_hotmag_refill_current[k] > g_hotmag_cap_current[k]) {
 								                    g_hotmag_refill_current[k] = g_hotmag_cap_current[k];
 								                }
 								            }
 								            char key_ref[64];
 								            snprintf(key_ref, sizeof(key_ref), "HAKMEM_TINY_HOTMAG_REFILL_C%d", k);
 								            char* ref_env = getenv(key_ref);
 								            if (ref_env) {
 								                int v = atoi(ref_env);
 								                if (v < 0) v = 0;
 								                if (v > g_hotmag_cap_current[k]) v = g_hotmag_cap_current[k];
 								                g_hotmag_refill_current[k] = (uint16_t)v;
 								                g_hotmag_refill_locked[k] = 1;
 								            }
 								            char key_en[64];
 								            snprintf(key_en, sizeof(key_en), "HAKMEM_TINY_HOTMAG_C%d", k);
 								            char* en_env = getenv(key_en);
 								            if (en_env) {
 								                g_hotmag_class_en[k] = (uint8_t)((atoi(en_env) != 0) ? 1 : 0);
 								            }
 								        }
 								        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
 								            if (g_hotmag_enable && hkm_is_hot_class(k)) {
 								                g_tls_hot_mag[k].cap = g_hotmag_cap_current[k];
 								            } else {
 								                g_tls_hot_mag[k].cap = 0; // lazy init
 								            }
 								            g_tls_hot_mag[k].top = 0;
 								        }
 								    }
 								    // Ultra-Simple front enable（既定OFF, A/B用）
 								    {
 								        char* us = getenv("HAKMEM_TINY_ULTRA_SIMPLE");
 								        if (us) g_ultra_simple = (atoi(us) != 0) ? 1 : 0;
 								        // zero-initialized by default
 								    }
 								    // Background Refill Bin（既定OFF, A/B用）
 								    {
 								        char* bb = getenv("HAKMEM_TINY_BG_BIN");
 								        if (bb) g_bg_bin_enable = (atoi(bb) != 0) ? 1 : 0;
 								        char* bt = getenv("HAKMEM_TINY_BG_TARGET");
 								        if (bt) { int v = atoi(bt); if (v > 0 && v <= 4096) g_bg_bin_target = v; }
 								        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
 								            atomic_store_explicit(&g_bg_bin_head[k], (uintptr_t)0, memory_order_relaxed);
 								        }
 								        if (g_bg_bin_enable && !g_bg_bin_started) {
 								            if (pthread_create(&g_bg_bin_thread, NULL, tiny_bg_refill_main, NULL) == 0) {
 								                g_bg_bin_started = 1;
 								            } else {
 								                g_bg_bin_enable = 0; // disable on failure
 								            }
 								        }
 								    }
 								    // Background Spill/Drain (integrated into bg thread)
 								    // EXTRACTED: bg_spill init moved to hakmem_tiny_bg_spill.c (Phase 2C-2)
 								    {
 								        bg_spill_init();  // Initialize bg_spill module from environment
 								        // Remote target queue init (Phase 2C-1)
 								        char* br = getenv("HAKMEM_TINY_BG_REMOTE");
 								        if (br) g_bg_remote_enable = (atoi(br) != 0) ? 1 : 0;
 								        char* rb = getenv("HAKMEM_TINY_BG_REMOTE_BATCH");
 								        if (rb) { int v = atoi(rb); if (v > 0 && v <= 4096) g_bg_remote_batch = v; }
 								        for (int k = 0; k < TINY_NUM_CLASSES; k++) {
 								            atomic_store_explicit(&g_remote_target_head[k], (uintptr_t)0, memory_order_relaxed);
 								            atomic_store_explicit(&g_remote_target_len[k], 0u, memory_order_relaxed);
 								        }
 								        // bg thread already started above if bg_bin_enable=1; if only spill is enabled, start thread
 								        if (g_bg_spill_enable && !g_bg_bin_started) {
 								            if (pthread_create(&g_bg_bin_thread, NULL, tiny_bg_refill_main, NULL) == 0) {
 								                g_bg_bin_started = 1;
 								                g_bg_bin_enable = 1; // reuse loop
 								            } else {
 								                g_bg_spill_enable = 0;
 								            }
 								        }
 								    }
 								    // Optional prefetch enable
 								    {
 								        char* pf = getenv("HAKMEM_TINY_PREFETCH");
 								        if (pf && atoi(pf) != 0) g_tiny_prefetch = 1;
 								    }
 								    // Refill batch tuning
 								    char* rmax = getenv("HAKMEM_TINY_REFILL_MAX");
 								    if (rmax) { int v = atoi(rmax); if (v > 0) g_tiny_refill_max = v; }
 								    char* rmaxh = getenv("HAKMEM_TINY_REFILL_MAX_HOT");
 								    if (rmaxh) { int v = atoi(rmaxh); if (v > 0) g_tiny_refill_max_hot = v; }
 								    // Per-class overrides: HAKMEM_TINY_REFILL_MAX_C{0..7}, HAKMEM_TINY_REFILL_MAX_HOT_C{0..7}
 								    for (int k = 0; k < TINY_NUM_CLASSES; k++) {
 								        char key1[64]; snprintf(key1, sizeof(key1), "HAKMEM_TINY_REFILL_MAX_C%d", k);
 								        char* v1 = getenv(key1); if (v1) { int vv = atoi(v1); if (vv > 0) g_refill_max_c[k] = vv; }
 								        char key2[64]; snprintf(key2, sizeof(key2), "HAKMEM_TINY_REFILL_MAX_HOT_C%d", k);
 								        char* v2 = getenv(key2); if (v2) { int vv = atoi(v2); if (vv > 0) g_refill_max_hot_c[k] = vv; }
 								    }
 								    // Stats sampling rate (compile-time gated) via env HAKMEM_TINY_STAT_RATE_LG
 								#if defined(HAKMEM_ENABLE_STATS) && defined(HAKMEM_TINY_STAT_SAMPLING)
 								    {
 								        char* sr = getenv("HAKMEM_TINY_STAT_RATE_LG");
 								        if (sr) { int lg = atoi(sr); if (lg >= 0 && lg <= 31) g_stat_rate_lg = lg; }
 								        // 関数ポインタ選択（分岐をホットパスから排除）
 								        g_stat_alloc_fn = (g_stat_rate_lg == 0) ? hkm_stat_alloc_always : hkm_stat_alloc_sampled;
 								    }
 								#elif defined(HAKMEM_ENABLE_STATS)
 								    // サンプリング未使用時は毎回更新
 								    // FIXME: g_stat_alloc_fn and hkm_stat_alloc_always not yet implemented
 								    // Stats are recorded via hkm_stat_alloc() in HAK_RET_ALLOC macro instead
 								    // g_stat_alloc_fn = hkm_stat_alloc_always;
 								#endif
 								    // Spill hysteresis（freeホットパスでgetenvしない）
 								    {
 								        char* sh = getenv("HAKMEM_TINY_SPILL_HYST");
 								        if (sh) { int v = atoi(sh); if (v < 0) v = 0; g_spill_hyst = v; }
 								    }
 								    char* ultra_env = getenv("HAKMEM_TINY_ULTRA");
 								    if (ultra_env && atoi(ultra_env) != 0) {
 								        g_tiny_ultra = 1;
 								    }
 								    char* uval = getenv("HAKMEM_TINY_ULTRA_VALIDATE");
 								    if (uval && atoi(uval) != 0) {
 								        g_ultra_validate = 1;
 								    }
 								    // Ultra env overrides: per-class batch and sll_cap
 								    // HAKMEM_TINY_ULTRA_BATCH_C{0..7}, HAKMEM_TINY_ULTRA_SLL_CAP_C{0..7}
 								    char var[64];
 								    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
 								        snprintf(var, sizeof(var), "HAKMEM_TINY_ULTRA_BATCH_C%d", i);
 								        char* vb = getenv(var);
 								        if (vb) { int v = atoi(vb); if (v > 0) g_ultra_batch_override[i] = v; }
 								        snprintf(var, sizeof(var), "HAKMEM_TINY_ULTRA_SLL_CAP_C%d", i);
 								        char* vc = getenv(var);
 								        if (vc) { int v = atoi(vc); if (v > 0) g_ultra_sll_cap_override[i] = v; }
 								        // Normal-path per-class overrides
 								        snprintf(var, sizeof(var), "HAKMEM_TINY_MAG_CAP_C%d", i);
 								        char* vm = getenv(var);
 								        if (vm) { int v = atoi(vm); if (v > 0 && v <= TINY_TLS_MAG_CAP) g_mag_cap_override[i] = v; }
 								        snprintf(var, sizeof(var), "HAKMEM_TINY_SLL_CAP_C%d", i);
 								        char* vs = getenv(var);
 								        if (vs) { int v = atoi(vs); if (v > 0 && v <= TINY_TLS_MAG_CAP) g_sll_cap_override[i] = v; }
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity: global → hot/mid → per-class precedence
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
 								        // Front refill count per-class override (fast path tuning)
 								        snprintf(var, sizeof(var), "HAKMEM_TINY_REFILL_COUNT_C%d", i);
 								        char* rc = getenv(var);
 								        if (rc) { int v = atoi(rc); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_class[i] = v; }
 								    }
 								    // Front refill count globals
-												Phase 10: TLS/SFC aggressive cache tuning (syscall reduction failed)

Goal: Reduce backend transitions by increasing frontend hit rate
Result: +2% best case, syscalls unchanged (root cause: SuperSlab churn)

Implementation:

1. Cache capacity expansion (2-8x per-class)
   - Hot classes (C0-C3): 4x increase (512 slots)
   - Medium classes (C4-C6): 2-3x increase
   - Class 7 (1KB): 2x increase (128 slots)
   - Fast cache: 2x default capacity

2. Refill batch size increase (4-8x)
   - Global default: 16 → 64 (4x)
   - Hot classes: 128 (8x) via HAKMEM_TINY_REFILL_COUNT_HOT
   - Mid classes: 96 (6x) via HAKMEM_TINY_REFILL_COUNT_MID
   - Class 7: 64 → 128 (2x)
   - SFC refill: 64 → 128 (2x)

3. Adaptive sizing aggressive parameters
   - Grow threshold: 80% → 70% (expand earlier)
   - Shrink threshold: 20% → 10% (shrink less)
   - Growth rate: 2x → 1.5x (smoother growth)
   - Max capacity: 2048 → 4096 (2x ceiling)
   - Adapt frequency: Every 10 → 5 refills (more responsive)

Performance Results (100K iterations):

Before (Phase 9):
- Performance: 9.71M ops/s
- Syscalls: 1,729 (mmap:877, munmap:852)

After (Phase 10):
- Default settings: 8.77M ops/s (-9.7%) ⚠️
- Optimal ENV: 9.89M ops/s (+2%) ✅
- Syscalls: 1,729 (unchanged) ❌

Optimal ENV configuration:
export HAKMEM_TINY_REFILL_COUNT_HOT=256
export HAKMEM_TINY_REFILL_COUNT_MID=192

Root Cause Analysis:

Bottleneck is NOT TLS/SFC hit rate, but SuperSlab allocation churn:
- 877 SuperSlabs allocated (877MB via mmap)
- Phase 9 LRU cache not utilized (no frees during benchmark)
- All SuperSlabs retained until program exit
- System malloc: 9 syscalls vs HAKMEM: 1,729 syscalls (192x gap)

Conclusion:

TLS/SFC tuning cannot solve SuperSlab allocation policy problem.
Next step: Phase 11 SuperSlab Prewarm strategy to eliminate
mmap/munmap during benchmark execution.

ChatGPT review: Strategy validated, Option A (Prewarm) recommended.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-13 14:25:54 +09:00
+								    // Phase 10: Set aggressive defaults for hot and mid classes
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity: global → hot/mid → per-class precedence
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
+								    {
 								        char* g = getenv("HAKMEM_TINY_REFILL_COUNT");
 								        if (g) { int v = atoi(g); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_global = v; }
-												Phase 10: TLS/SFC aggressive cache tuning (syscall reduction failed)

Goal: Reduce backend transitions by increasing frontend hit rate
Result: +2% best case, syscalls unchanged (root cause: SuperSlab churn)

Implementation:

1. Cache capacity expansion (2-8x per-class)
   - Hot classes (C0-C3): 4x increase (512 slots)
   - Medium classes (C4-C6): 2-3x increase
   - Class 7 (1KB): 2x increase (128 slots)
   - Fast cache: 2x default capacity

2. Refill batch size increase (4-8x)
   - Global default: 16 → 64 (4x)
   - Hot classes: 128 (8x) via HAKMEM_TINY_REFILL_COUNT_HOT
   - Mid classes: 96 (6x) via HAKMEM_TINY_REFILL_COUNT_MID
   - Class 7: 64 → 128 (2x)
   - SFC refill: 64 → 128 (2x)

3. Adaptive sizing aggressive parameters
   - Grow threshold: 80% → 70% (expand earlier)
   - Shrink threshold: 20% → 10% (shrink less)
   - Growth rate: 2x → 1.5x (smoother growth)
   - Max capacity: 2048 → 4096 (2x ceiling)
   - Adapt frequency: Every 10 → 5 refills (more responsive)

Performance Results (100K iterations):

Before (Phase 9):
- Performance: 9.71M ops/s
- Syscalls: 1,729 (mmap:877, munmap:852)

After (Phase 10):
- Default settings: 8.77M ops/s (-9.7%) ⚠️
- Optimal ENV: 9.89M ops/s (+2%) ✅
- Syscalls: 1,729 (unchanged) ❌

Optimal ENV configuration:
export HAKMEM_TINY_REFILL_COUNT_HOT=256
export HAKMEM_TINY_REFILL_COUNT_MID=192

Root Cause Analysis:

Bottleneck is NOT TLS/SFC hit rate, but SuperSlab allocation churn:
- 877 SuperSlabs allocated (877MB via mmap)
- Phase 9 LRU cache not utilized (no frees during benchmark)
- All SuperSlabs retained until program exit
- System malloc: 9 syscalls vs HAKMEM: 1,729 syscalls (192x gap)

Conclusion:

TLS/SFC tuning cannot solve SuperSlab allocation policy problem.
Next step: Phase 11 SuperSlab Prewarm strategy to eliminate
mmap/munmap during benchmark execution.

ChatGPT review: Strategy validated, Option A (Prewarm) recommended.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-13 14:25:54 +09:00
+								        else { g_refill_count_global = 64; }  // Phase 10: default 64 (was 16)
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity: global → hot/mid → per-class precedence
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
+								        char* h = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
 								        if (h) { int v = atoi(h); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_hot = v; }
-												Phase 10: TLS/SFC aggressive cache tuning (syscall reduction failed)

Goal: Reduce backend transitions by increasing frontend hit rate
Result: +2% best case, syscalls unchanged (root cause: SuperSlab churn)

Implementation:

1. Cache capacity expansion (2-8x per-class)
   - Hot classes (C0-C3): 4x increase (512 slots)
   - Medium classes (C4-C6): 2-3x increase
   - Class 7 (1KB): 2x increase (128 slots)
   - Fast cache: 2x default capacity

2. Refill batch size increase (4-8x)
   - Global default: 16 → 64 (4x)
   - Hot classes: 128 (8x) via HAKMEM_TINY_REFILL_COUNT_HOT
   - Mid classes: 96 (6x) via HAKMEM_TINY_REFILL_COUNT_MID
   - Class 7: 64 → 128 (2x)
   - SFC refill: 64 → 128 (2x)

3. Adaptive sizing aggressive parameters
   - Grow threshold: 80% → 70% (expand earlier)
   - Shrink threshold: 20% → 10% (shrink less)
   - Growth rate: 2x → 1.5x (smoother growth)
   - Max capacity: 2048 → 4096 (2x ceiling)
   - Adapt frequency: Every 10 → 5 refills (more responsive)

Performance Results (100K iterations):

Before (Phase 9):
- Performance: 9.71M ops/s
- Syscalls: 1,729 (mmap:877, munmap:852)

After (Phase 10):
- Default settings: 8.77M ops/s (-9.7%) ⚠️
- Optimal ENV: 9.89M ops/s (+2%) ✅
- Syscalls: 1,729 (unchanged) ❌

Optimal ENV configuration:
export HAKMEM_TINY_REFILL_COUNT_HOT=256
export HAKMEM_TINY_REFILL_COUNT_MID=192

Root Cause Analysis:

Bottleneck is NOT TLS/SFC hit rate, but SuperSlab allocation churn:
- 877 SuperSlabs allocated (877MB via mmap)
- Phase 9 LRU cache not utilized (no frees during benchmark)
- All SuperSlabs retained until program exit
- System malloc: 9 syscalls vs HAKMEM: 1,729 syscalls (192x gap)

Conclusion:

TLS/SFC tuning cannot solve SuperSlab allocation policy problem.
Next step: Phase 11 SuperSlab Prewarm strategy to eliminate
mmap/munmap during benchmark execution.

ChatGPT review: Strategy validated, Option A (Prewarm) recommended.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-13 14:25:54 +09:00
+								        else { g_refill_count_hot = 128; }  // Phase 10: default 128 for hot classes (C0-C3)
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity: global → hot/mid → per-class precedence
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
+								        char* m = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
 								        if (m) { int v = atoi(m); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_mid = v; }
-												Phase 10: TLS/SFC aggressive cache tuning (syscall reduction failed)

Goal: Reduce backend transitions by increasing frontend hit rate
Result: +2% best case, syscalls unchanged (root cause: SuperSlab churn)

Implementation:

1. Cache capacity expansion (2-8x per-class)
   - Hot classes (C0-C3): 4x increase (512 slots)
   - Medium classes (C4-C6): 2-3x increase
   - Class 7 (1KB): 2x increase (128 slots)
   - Fast cache: 2x default capacity

2. Refill batch size increase (4-8x)
   - Global default: 16 → 64 (4x)
   - Hot classes: 128 (8x) via HAKMEM_TINY_REFILL_COUNT_HOT
   - Mid classes: 96 (6x) via HAKMEM_TINY_REFILL_COUNT_MID
   - Class 7: 64 → 128 (2x)
   - SFC refill: 64 → 128 (2x)

3. Adaptive sizing aggressive parameters
   - Grow threshold: 80% → 70% (expand earlier)
   - Shrink threshold: 20% → 10% (shrink less)
   - Growth rate: 2x → 1.5x (smoother growth)
   - Max capacity: 2048 → 4096 (2x ceiling)
   - Adapt frequency: Every 10 → 5 refills (more responsive)

Performance Results (100K iterations):

Before (Phase 9):
- Performance: 9.71M ops/s
- Syscalls: 1,729 (mmap:877, munmap:852)

After (Phase 10):
- Default settings: 8.77M ops/s (-9.7%) ⚠️
- Optimal ENV: 9.89M ops/s (+2%) ✅
- Syscalls: 1,729 (unchanged) ❌

Optimal ENV configuration:
export HAKMEM_TINY_REFILL_COUNT_HOT=256
export HAKMEM_TINY_REFILL_COUNT_MID=192

Root Cause Analysis:

Bottleneck is NOT TLS/SFC hit rate, but SuperSlab allocation churn:
- 877 SuperSlabs allocated (877MB via mmap)
- Phase 9 LRU cache not utilized (no frees during benchmark)
- All SuperSlabs retained until program exit
- System malloc: 9 syscalls vs HAKMEM: 1,729 syscalls (192x gap)

Conclusion:

TLS/SFC tuning cannot solve SuperSlab allocation policy problem.
Next step: Phase 11 SuperSlab Prewarm strategy to eliminate
mmap/munmap during benchmark execution.

ChatGPT review: Strategy validated, Option A (Prewarm) recommended.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-13 14:25:54 +09:00
+								        else { g_refill_count_mid = 96; }  // Phase 10: default 96 for mid classes (C4-C7)
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								    }
-												Tiny: fix header/stride mismatch and harden refill paths

- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte
  header during allocation, but linear carve/refill and initial slab capacity
  still used bare class block sizes. This mismatch could overrun slab usable
  space and corrupt freelists, causing reproducible SEGV at ~100k iters.

Changes
- Superslab: compute capacity with effective stride (block_size + header for
  classes 0..6; class7 remains headerless) in superslab_init_slab(). Add a
  debug-only bound check in superslab_alloc_from_slab() to fail fast if carve
  would exceed usable bytes.
- Refill (non-P0 and P0): use header-aware stride for all linear carving and
  TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h
  also uses stride, not raw class size.
- Drain: keep existing defense-in-depth for remote sentinel and sanitize nodes
  before splicing into freelist (already present).

Notes
- This unifies the memory layout across alloc/linear-carve/refill with a single
  stride definition and keeps class7 (1024B) headerless as designed.
- Debug builds add fail-fast checks; release builds remain lean.

Next
- Re-run Tiny benches (256/1024B) in debug to confirm stability, then in
  release. If a crash persists, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0
  to isolate the P0 batch carve, then continue reducing branch misses as planned.

											
										
										
											2025-11-09 18:55:50 +09:00
+								    // Sensible default for class 7 (1024B): favor larger refill to reduce refills/syscalls
 								    if (g_refill_count_class[7] == 0) {
-												Phase 10: TLS/SFC aggressive cache tuning (syscall reduction failed)

Goal: Reduce backend transitions by increasing frontend hit rate
Result: +2% best case, syscalls unchanged (root cause: SuperSlab churn)

Implementation:

1. Cache capacity expansion (2-8x per-class)
   - Hot classes (C0-C3): 4x increase (512 slots)
   - Medium classes (C4-C6): 2-3x increase
   - Class 7 (1KB): 2x increase (128 slots)
   - Fast cache: 2x default capacity

2. Refill batch size increase (4-8x)
   - Global default: 16 → 64 (4x)
   - Hot classes: 128 (8x) via HAKMEM_TINY_REFILL_COUNT_HOT
   - Mid classes: 96 (6x) via HAKMEM_TINY_REFILL_COUNT_MID
   - Class 7: 64 → 128 (2x)
   - SFC refill: 64 → 128 (2x)

3. Adaptive sizing aggressive parameters
   - Grow threshold: 80% → 70% (expand earlier)
   - Shrink threshold: 20% → 10% (shrink less)
   - Growth rate: 2x → 1.5x (smoother growth)
   - Max capacity: 2048 → 4096 (2x ceiling)
   - Adapt frequency: Every 10 → 5 refills (more responsive)

Performance Results (100K iterations):

Before (Phase 9):
- Performance: 9.71M ops/s
- Syscalls: 1,729 (mmap:877, munmap:852)

After (Phase 10):
- Default settings: 8.77M ops/s (-9.7%) ⚠️
- Optimal ENV: 9.89M ops/s (+2%) ✅
- Syscalls: 1,729 (unchanged) ❌

Optimal ENV configuration:
export HAKMEM_TINY_REFILL_COUNT_HOT=256
export HAKMEM_TINY_REFILL_COUNT_MID=192

Root Cause Analysis:

The bottleneck is NOT the TLS/SFC hit rate, but SuperSlab allocation churn:
- 877 SuperSlabs allocated (877MB via mmap)
- Phase 9 LRU cache not utilized (no frees during benchmark)
- All SuperSlabs retained until program exit
- System malloc: 9 syscalls vs HAKMEM: 1,729 syscalls (192x gap)

Conclusion:

TLS/SFC tuning cannot solve the SuperSlab allocation-policy problem.
Next step: Phase 11 SuperSlab Prewarm strategy, to eliminate
mmap/munmap calls during benchmark execution.

ChatGPT review: Strategy validated, Option A (Prewarm) recommended.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-13 14:25:54 +09:00
+								        g_refill_count_class[7] = 128;  // Phase 10: increased from 64 to 128
-												Tiny: fix header/stride mismatch and harden refill paths

- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte
  header during allocation, but linear carve/refill and initial slab capacity
  still used bare class block sizes. This mismatch could overrun slab usable
  space and corrupt freelists, causing reproducible SEGV at ~100k iters.

Changes
- Superslab: compute capacity with effective stride (block_size + header for
  classes 0..6; class7 remains headerless) in superslab_init_slab(). Add a
  debug-only bound check in superslab_alloc_from_slab() to fail fast if carve
  would exceed usable bytes.
- Refill (non-P0 and P0): use header-aware stride for all linear carving and
  TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h
  also uses stride, not raw class size.
- Drain: keep existing defense-in-depth for remote sentinel and sanitize nodes
  before splicing into freelist (already present).

Notes
- This unifies the memory layout across alloc/linear-carve/refill with a single
  stride definition and keeps class7 (1024B) headerless as designed.
- Debug builds add fail-fast checks; release builds remain lean.

Next
- Re-run Tiny benches (256/1024B) in debug to confirm stability, then in
  release. If any crash persists, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0
  to isolate the P0 batch carve, and continue reducing branch misses as planned.

											
										
										
											2025-11-09 18:55:50 +09:00
+								    }
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								    {
 								        char* fast_env = getenv("HAKMEM_TINY_FAST");
 								        if (fast_env && atoi(fast_env) == 0) g_fast_enable = 0;
 								        int fast_global = -1;
 								        char* fast_cap_env = getenv("HAKMEM_TINY_FAST_CAP");
 								        if (fast_cap_env) {
 								            int v = atoi(fast_cap_env);
 								            if (v >= 0 && v <= TINY_TLS_MAG_CAP) fast_global = v;
 								        }
 								        for (int i = 0; i < TINY_NUM_CLASSES; i++) {
 								            uint16_t cap = g_fast_cap_defaults[i];
 								            if (fast_global >= 0) cap = (uint16_t)fast_global;
 								            snprintf(var, sizeof(var), "HAKMEM_TINY_FAST_CAP_C%d", i);
 								            char* fc = getenv(var);
 								            if (fc) {
 								                int v = atoi(fc);
 								                if (v < 0) v = 0;
 								                if (v > TINY_TLS_MAG_CAP) v = TINY_TLS_MAG_CAP;
 								                cap = (uint16_t)v;
 								                g_fast_cap_locked[i] = 1;
 								            } else if (fast_global >= 0) {
 								                g_fast_cap_locked[i] = 1;
 								            } else {
 								                g_fast_cap_locked[i] = 0;
 								            }
 								            g_fast_cap[i] = cap;
 								        }
 								    }
 								    {
 								        const char* dbg_fast = getenv("HAKMEM_TINY_DEBUG_FAST0");
 								        if (dbg_fast && atoi(dbg_fast) != 0) {
 								            g_debug_fast0 = 1;
 								            g_fast_enable = 0;
 								            g_hotmag_enable = 0;
 								            g_tls_list_enable = 0;
 								        }
 								        const char* dbg_remote = getenv("HAKMEM_TINY_DEBUG_REMOTE_GUARD");
 								        if (dbg_remote && atoi(dbg_remote) != 0) {
 								            g_debug_remote_guard = 1;
 								        }
 								        const char* rf_force = getenv("HAKMEM_TINY_RF_FORCE_NOTIFY");
 								        if (rf_force && atoi(rf_force) != 0) {
 								            extern int g_remote_force_notify;
 								            g_remote_force_notify = 1;
 								        }
 								        const char* safe_free = getenv("HAKMEM_SAFE_FREE");
 								        if (safe_free && atoi(safe_free) != 0) {
 								            extern int g_tiny_safe_free; g_tiny_safe_free = 1;
 								        }
 								        const char* safe_free_strict = getenv("HAKMEM_SAFE_FREE_STRICT");
 								        if (safe_free_strict && atoi(safe_free_strict) != 0) {
 								            extern int g_tiny_safe_free_strict; g_tiny_safe_free_strict = 1;
 								        }
 								        const char* force_remote = getenv("HAKMEM_TINY_FORCE_REMOTE");
 								        if (force_remote && atoi(force_remote) != 0) {
 								            extern int g_tiny_force_remote; g_tiny_force_remote = 1;
 								        }
 								        // Remote side-table (debug only)
 								        tiny_remote_side_init_from_env();
 								    }
 								    static int g_super_trace = -1;
 								    if (__builtin_expect(g_super_trace == -1, 0)) {
 								        const char* tr = getenv("HAKMEM_TINY_SUPERSLAB_TRACE");
 								        g_super_trace = (tr && atoi(tr) != 0) ? 1 : 0;
 								    }
 								    if (g_super_trace) {
 								        static int logged_once = 0;
 								        if (!logged_once) {
 								            fprintf(stderr, "[SUPERTRACE] mem_diet=%d env=%s g_use_superslab=%d fast_enable=%d cap0=%u cap1=%u cap2=%u cap3=%u cap4=%u reslist=%d\n",
 								                    mem_diet_enabled,
 								                    superslab_env ? superslab_env : "(null)",
 								                    g_use_superslab,
 								                    g_fast_enable,
 								                    (unsigned)g_fast_cap[0],
 								                    (unsigned)g_fast_cap[1],
 								                    (unsigned)g_fast_cap[2],
 								                    (unsigned)g_fast_cap[3],
 								                    (unsigned)g_fast_cap[4],
 								                    g_tls_list_enable);
 								            logged_once = 1;
 								        }
 								    }
 								    tiny_ace_init_defaults();
 								    char* fc_env = getenv("HAKMEM_TINY_FASTCACHE");
 								    if (fc_env && atoi(fc_env) != 0) {
 								        g_fastcache_enable = 1;
 								    }
 								    char* fe_env = getenv("HAKMEM_TINY_FRONTEND");
 								    if (fe_env && atoi(fe_env) != 0) {
 								        g_frontend_enable = 1;
 								    }
 								    // TinyQuickSlot opt-in
 								    {
 								        char* q = getenv("HAKMEM_TINY_QUICK");
 								        if (q && atoi(q) != 0) g_quick_enable = 1;
 								    }
 								    tiny_obs_start_if_needed();
 								    // Deferred Intelligence Engine
 								    char* ie = getenv("HAKMEM_INT_ENGINE");
 								    if (ie && atoi(ie) != 0) {
 								        g_int_engine = 1;
 								        // Initialize frontend fill targets to zero (let engine grow if hot)
 								        for (int i = 0; i < TINY_NUM_CLASSES; i++) atomic_store(&g_frontend_fill_target[i], 0);
 								        // Event logging knobs (optional)
 								        char* its = getenv("HAKMEM_INT_EVENT_TS");
 								        if (its && atoi(its) != 0) g_int_event_ts = 1;
 								        char* ism = getenv("HAKMEM_INT_SAMPLE");
 								        if (ism) { int n = atoi(ism); if (n > 0 && n < 31) g_int_sample_mask = ((1u << n) - 1u); }
 								        if (pthread_create(&g_int_thread, NULL, intelligence_engine_main, NULL) == 0) {
 								            g_int_started = 1;
 								        }
 								    }
 								    // Step 2: Initialize Slab Registry (only if enabled)
 								    if (g_use_registry) {
 								        memset(g_slab_registry, 0, sizeof(g_slab_registry));
 								    }
 								    // Initialize per-class locks
 								    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
 								        pthread_mutex_init(&g_tiny_class_locks[i].m, NULL);
 								    }
 								    // Phase 8.3: Initialize ACE (Adaptive Cache Engine) state
 								    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
 								        g_ss_ace[i].current_lg = 20;  // Start with 1MB SuperSlabs
 								        g_ss_ace[i].target_lg = 20;   // Default to 1MB
 								        g_ss_ace[i].hot_score = 0;
 								        g_ss_ace[i].alloc_count = 0;
 								        g_ss_ace[i].refill_count = 0;
 								        g_ss_ace[i].spill_count = 0;
 								        g_ss_ace[i].live_blocks = 0;
 								        g_ss_ace[i].last_tick_ns = 0;
 								    }
 								    // Lite P1: Pre-allocate Tier 1 (8-64B) hot classes only
 								    // This avoids initialization overhead for common small allocations
 								    // Classes 0-3: 8B, 16B, 32B, 64B (256KB total, not 512KB)
 								    for (int class_idx = 0; class_idx < 4; class_idx++) {
 								        TinySlab* slab = allocate_new_slab(class_idx);
 								        if (slab) {
 								            slab->next = g_tiny_pool.free_slabs[class_idx];
 								            g_tiny_pool.free_slabs[class_idx] = slab;
 								        }
 								    }
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
-												Phase 11: SuperSlab Prewarm implementation (startup pre-allocation)

## Summary
Pre-allocate SuperSlabs at startup to eliminate runtime mmap overhead.
Result: +6.4% improvement (8.82M → 9.38M ops/s) but still 9x slower than System malloc.

## Key Findings (Lesson Learned)
- The syscall-reduction strategy targeted the WRONG bottleneck
- Real bottleneck: SuperSlab allocation churn (877 SuperSlabs needed)
- Prewarm reduces mmap frequency but doesn't solve the fundamental architecture issue

## Implementation
- Two-phase allocation with atomic bypass flag
- Environment variable: HAKMEM_PREWARM_SUPERSLABS (default: 0)
- Best result: Prewarm=8 → 9.38M ops/s (+6.4%)

## Next Step
Pivot to Phase 12: Shared SuperSlab Pool (mimalloc-style)
- Expected: 877 → 100-200 SuperSlabs (-70-80%)
- This addresses the ROOT CAUSE (allocation churn), not the symptoms

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-13 14:45:43 +09:00
+								    // Phase 11: Initialize SuperSlab Registry and LRU Cache
 								    if (g_use_superslab) {
 								        extern void hak_super_registry_init(void);
 								        extern void hak_ss_lru_init(void);
 								        extern void hak_ss_prewarm_init(void);
 								        hak_super_registry_init();
 								        hak_ss_lru_init();
 								        // Phase 11: Prewarm SuperSlabs to eliminate mmap/munmap churn
 								        // ENV: HAKMEM_PREWARM_SUPERSLABS=<count> (e.g., 32, 128)
 								        hak_ss_prewarm_init();
 								    }
-												CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消

**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M ✅  |  4T: SEGV 💀
After:  1T: 2.41M ✅  |  4T: 4.19M ✅  (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: 完走（以前は SEGV）
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-07 01:27:04 +09:00
+								    if (__builtin_expect(route_enabled_runtime(), 0)) {
 								        tiny_debug_ring_record(TINY_RING_EVENT_ROUTE, (uint16_t)0xFFFFu, NULL, (uintptr_t)0x494E4954u);
 								    }
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								}