Refactor: Extract 5 Box modules from hakmem_tiny.c (-52% size reduction)

Split hakmem_tiny.c (2081 lines) into focused modules for better maintainability.

## Changes

**hakmem_tiny.c**: 2081 → 995 lines (-1086 lines, -52% reduction)

## Extracted Modules (5 boxes)

1. **config_box** (211 lines)
   - Size class tables, integrity counters
   - Debug flags, benchmark macros
   - HAK_RET_ALLOC/HAK_STAT_FREE instrumentation

2. **publish_box** (419 lines)
   - Publish/Adopt counters and statistics
   - Bench mailbox, partial ring
   - Live cap/Hot slot management
   - TLS helper functions (tiny_tls_default_*)

3. **globals_box** (256 lines)
   - Global variable declarations (~70 variables)
   - TinyPool instance and initialization flag
   - TLS variables (g_tls_lists, g_fast_head, g_fast_count)
   - SuperSlab configuration (partial ring, empty reserves)
   - Adopt gate functions

4. **phase6_wrappers_box** (122 lines)
   - Phase 6 Box Theory wrapper layer
   - hak_tiny_alloc_fast_wrapper()
   - hak_tiny_free_fast_wrapper()
   - Diagnostic instrumentation

5. **ace_guard_box** (100 lines)
   - ACE Learning Layer (hkm_ace_set_drain_threshold)
   - FastCache API (tiny_fc_room, tiny_fc_push_bulk)
   - Tiny Guard debugging system (5 functions)

## Benefits

- **Readability**: Giant 2k file → focused 1k core + 5 coherent modules
- **Maintainability**: Each box has clear responsibility and boundaries
- **Build**: All modules compile successfully 

## Technical Details

- Phase 1: ChatGPT extracted config_box + publish_box (-625 lines)
- Phase 2-4: Claude extracted globals_box + phase6_wrappers_box + ace_guard_box (-461 lines)
- All extractions use .inc files (same translation unit, preserves static/TLS linkage; see the sketch below)
- Fixed Makefile: Added tiny_sizeclass_hist_box.o to OBJS_BASE and BENCH_HAKMEM_OBJS_BASE
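
For reference, the include pattern looks roughly like the sketch below. The two `.inc` names shown match boxes extracted in this commit; the include order and the remaining box file names are illustrative, not the exact contents of hakmem_tiny.c:

```c
/* hakmem_tiny.c -- sketch of the Box include pattern (order/names illustrative) */
#include "hakmem_tiny.h"

/* Boxes are textually included, so `static` and `__thread` definitions keep the
 * same translation-unit linkage they had before the split. */
#include "hakmem_tiny_config_box.inc"   /* size classes, debug flags, HAK_RET_ALLOC */
#include "hakmem_tiny_publish_box.inc"  /* publish/adopt counters, bench mailboxes  */
/* ... globals, Phase 6 wrappers, and ACE/guard boxes follow the same pattern ... */
```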

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Moe Charm (CI)
2025-11-21 01:16:45 +09:00
parent b3a156879a
commit 6b6ad69aca
7 changed files with 1123 additions and 1101 deletions


@@ -190,7 +190,7 @@ LDFLAGS += $(EXTRA_LDFLAGS)
# Targets
TARGET = test_hakmem
-OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o
+OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o
OBJS = $(OBJS_BASE)
# Shared library
@@ -222,7 +222,7 @@ endif
# Benchmark targets
BENCH_HAKMEM = bench_allocators_hakmem
BENCH_SYSTEM = bench_allocators_system
-BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o
+BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o
BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE)
ifeq ($(POOL_TLS_PHASE1),1)
BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o

File diff suppressed because it is too large


@@ -0,0 +1,100 @@
// ============================================================================
// ACE Learning Layer: Runtime parameter setters
// ============================================================================
void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold) {
// Validate inputs
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) {
return;
}
if (threshold < 16 || threshold > 2048) {
return;
}
// Set per-class threshold (used by remote free drain logic)
g_remote_drain_thresh_per_class[class_idx] = (int)threshold;
}
#include "tiny_fc_api.h"
int tiny_fc_room(int class_idx) {
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
TinyFastCache* fc = &g_fast_cache[class_idx];
// Effective per-class cap comes from g_fast_cap (env-tunable),
// clamped by the static storage capacity TINY_FASTCACHE_CAP.
uint16_t eff_cap = g_fast_cap[class_idx];
if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP;
int room = (int)eff_cap - fc->top;
return room > 0 ? room : 0;
}
int tiny_fc_push_bulk(int class_idx, void** arr, int n) {
if (!arr || n <= 0) return 0;
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
TinyFastCache* fc = &g_fast_cache[class_idx];
uint16_t eff_cap = g_fast_cap[class_idx];
if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP;
int room = (int)eff_cap - fc->top;
if (room <= 0) return 0;
int take = n < room ? n : room;
// Forward fill with light unrolling to reduce branch overhead
int i = 0;
for (; i + 3 < take; i += 4) {
fc->items[fc->top++] = arr[i];
fc->items[fc->top++] = arr[i + 1];
fc->items[fc->top++] = arr[i + 2];
fc->items[fc->top++] = arr[i + 3];
}
for (; i < take; i++) {
fc->items[fc->top++] = arr[i];
}
return take;
}
// ========= Tiny Guard (targeted debug; low overhead when disabled) =========
static int g_tiny_guard_enabled = -1;
static int g_tiny_guard_class = 2;
static int g_tiny_guard_limit = 8;
static __thread int g_tiny_guard_seen = 0;
static inline int tiny_guard_enabled_runtime(void) {
if (__builtin_expect(g_tiny_guard_enabled == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_GUARD");
g_tiny_guard_enabled = (e && *e && *e != '0') ? 1 : 0;
const char* ec = getenv("HAKMEM_TINY_GUARD_CLASS");
if (ec && *ec) g_tiny_guard_class = atoi(ec);
const char* el = getenv("HAKMEM_TINY_GUARD_MAX");
if (el && *el) g_tiny_guard_limit = atoi(el);
if (g_tiny_guard_limit <= 0) g_tiny_guard_limit = 8;
}
return g_tiny_guard_enabled;
}
int tiny_guard_is_enabled(void) { return tiny_guard_enabled_runtime(); }
static void tiny_guard_dump_bytes(const char* tag, const uint8_t* p, size_t n) {
fprintf(stderr, "[TGUARD] %s:", tag);
for (size_t i = 0; i < n; i++) fprintf(stderr, " %02x", p[i]);
fprintf(stderr, "\n");
}
void tiny_guard_on_alloc(int cls, void* base, void* user, size_t stride) {
if (!tiny_guard_enabled_runtime() || cls != g_tiny_guard_class) return;
if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return;
uint8_t* b = (uint8_t*)base;
uint8_t* u = (uint8_t*)user;
fprintf(stderr, "[TGUARD] alloc cls=%d base=%p user=%p stride=%zu hdr=%02x\n",
cls, base, user, stride, b[0]);
// Visualize neighboring headers (this block and the next)
tiny_guard_dump_bytes("around_base", b, (stride >= 8 ? 8 : stride));
tiny_guard_dump_bytes("next_header", b + stride, 4);
}
void tiny_guard_on_invalid(void* user_ptr, uint8_t hdr) {
if (!tiny_guard_enabled_runtime()) return;
if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return;
uint8_t* u = (uint8_t*)user_ptr;
fprintf(stderr, "[TGUARD] invalid header at user=%p hdr=%02x prev=%02x next=%02x\n",
user_ptr, hdr, *(u - 2), *(u));
tiny_guard_dump_bytes("dump_before", u - 8, 8);
tiny_guard_dump_bytes("dump_after", u, 8);
}
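// Illustrative setup sketch (hypothetical test-harness helper; assumes <stdlib.h> is
// available, as the getenv/atoi calls above imply): the guard reads its configuration
// from environment variables on first use, so a harness can enable it before the first
// tiny allocation.
static inline void example_enable_tiny_guard(void) {
    setenv("HAKMEM_TINY_GUARD", "1", 1);        // turn the guard on
    setenv("HAKMEM_TINY_GUARD_CLASS", "2", 1);  // watch class 2 (32B stride)
    setenv("HAKMEM_TINY_GUARD_MAX", "16", 1);   // cap per-thread log output
}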


@@ -0,0 +1,211 @@
// hakmem_tiny_config_box.inc
// Box: Tiny allocator configuration, debug counters, and return helpers.
// Extracted from hakmem_tiny.c to reduce file size and isolate config logic.
// ============================================================================
// Size class table (Box 3 dependency)
// ============================================================================
// Phase E1-CORRECT: ALL classes have 1-byte header
// These sizes represent TOTAL BLOCK SIZE (stride) = [Header 1B][Data N-1B]
// Usable data = stride - 1 (implicit)
const size_t g_tiny_class_sizes[TINY_NUM_CLASSES] = {
8, // Class 0: 8B total = [Header 1B][Data 7B]
16, // Class 1: 16B total = [Header 1B][Data 15B]
32, // Class 2: 32B total = [Header 1B][Data 31B]
64, // Class 3: 64B total = [Header 1B][Data 63B]
128, // Class 4: 128B total = [Header 1B][Data 127B]
256, // Class 5: 256B total = [Header 1B][Data 255B]
512, // Class 6: 512B total = [Header 1B][Data 511B]
1024 // Class 7: 1024B total = [Header 1B][Data 1023B]
};
// ============================================================================
// Phase 16: Dynamic Tiny Max Size (ENV: HAKMEM_TINY_MAX_CLASS)
// Phase 17-1: Auto-adjust when Small-Mid enabled
// ============================================================================
// Forward declaration for Small-Mid check
extern bool smallmid_is_enabled(void);
// Get dynamic max size for Tiny allocator based on ENV configuration
// Default: 1023B (C0-C7), can be reduced to 255B (C0-C5)
// Phase 17-1: Auto-reduces to 255B when Small-Mid is enabled
size_t tiny_get_max_size(void) {
static int g_max_class = -1;
if (__builtin_expect(g_max_class == -1, 0)) {
const char* env = getenv("HAKMEM_TINY_MAX_CLASS");
if (env && *env) {
int max_class = atoi(env);
if (max_class >= 0 && max_class < TINY_NUM_CLASSES) {
g_max_class = max_class;
} else {
g_max_class = 7; // Default: all classes (C0-C7)
}
} else {
g_max_class = 7; // Default: all classes
}
}
// Phase 17-1: Auto-adjust when Small-Mid enabled
// Small-Mid handles 256B-1KB, so Tiny should only handle 0-255B
int effective_class = g_max_class;
if (smallmid_is_enabled() && effective_class > 5) {
effective_class = 5; // Limit to C0-C5 (0-255B)
}
// Map class to max usable size (stride - 1)
// C0=8B, C1=16B, C2=32B, C3=64B, C4=128B, C5=256B, C6=512B, C7=1024B
static const size_t class_to_max_size[TINY_NUM_CLASSES] = {
7, 15, 31, 63, 127, 255, 511, 1023
};
return class_to_max_size[effective_class];
}
// ============================================================================
// PRIORITY 1-4: Integrity Check Counters
// ============================================================================
_Atomic uint64_t g_integrity_check_class_bounds = 0;
_Atomic uint64_t g_integrity_check_freelist = 0;
_Atomic uint64_t g_integrity_check_canary = 0;
_Atomic uint64_t g_integrity_check_header = 0;
// Build-time gate for debug counters (path/ultra). Default OFF.
#ifndef HAKMEM_DEBUG_COUNTERS
#define HAKMEM_DEBUG_COUNTERS 0
#endif
int g_debug_fast0 = 0;
int g_debug_remote_guard = 0;
int g_remote_force_notify = 0;
// Tiny free safety (debug)
int g_tiny_safe_free = 0; // Default OFF for performance; enable via HAKMEM_SAFE_FREE=1
int g_tiny_safe_free_strict = 0; // env: HAKMEM_SAFE_FREE_STRICT=1
int g_tiny_force_remote = 0; // env: HAKMEM_TINY_FORCE_REMOTE=1
// Build-time gate: Minimal Tiny front (bench-only)
static inline int superslab_trace_enabled(void) {
static int g_ss_trace_flag = -1;
if (__builtin_expect(g_ss_trace_flag == -1, 0)) {
const char* tr = getenv("HAKMEM_TINY_SUPERSLAB_TRACE");
g_ss_trace_flag = (tr && atoi(tr) != 0) ? 1 : 0;
}
return g_ss_trace_flag;
}
// When enabled, physically excludes optional front tiers from the hot path
// (UltraFront/Quick/Frontend/HotMag/SS-try/BumpShadow), leaving:
// SLL → TLS Magazine → SuperSlab → (remaining slow path)
#ifndef HAKMEM_TINY_MINIMAL_FRONT
#define HAKMEM_TINY_MINIMAL_FRONT 1
#endif
// Strict front: compile-out optional front tiers but keep baseline structure intact
#ifndef HAKMEM_TINY_STRICT_FRONT
#define HAKMEM_TINY_STRICT_FRONT 0
#endif
// Bench-only fast path knobs (defaults)
#ifndef HAKMEM_TINY_BENCH_REFILL
#define HAKMEM_TINY_BENCH_REFILL 8
#endif
// Optional per-class overrides (bench-only)
#ifndef HAKMEM_TINY_BENCH_REFILL8
#define HAKMEM_TINY_BENCH_REFILL8 HAKMEM_TINY_BENCH_REFILL
#endif
#ifndef HAKMEM_TINY_BENCH_REFILL16
#define HAKMEM_TINY_BENCH_REFILL16 HAKMEM_TINY_BENCH_REFILL
#endif
#ifndef HAKMEM_TINY_BENCH_REFILL32
#define HAKMEM_TINY_BENCH_REFILL32 HAKMEM_TINY_BENCH_REFILL
#endif
#ifndef HAKMEM_TINY_BENCH_REFILL64
#define HAKMEM_TINY_BENCH_REFILL64 HAKMEM_TINY_BENCH_REFILL
#endif
// Bench-only warmup amounts (pre-fill TLS SLL on first alloc per class)
#ifndef HAKMEM_TINY_BENCH_WARMUP8
#define HAKMEM_TINY_BENCH_WARMUP8 64
#endif
#ifndef HAKMEM_TINY_BENCH_WARMUP16
#define HAKMEM_TINY_BENCH_WARMUP16 96
#endif
#ifndef HAKMEM_TINY_BENCH_WARMUP32
#define HAKMEM_TINY_BENCH_WARMUP32 160
#endif
#ifndef HAKMEM_TINY_BENCH_WARMUP64
#define HAKMEM_TINY_BENCH_WARMUP64 192
#endif
#ifdef HAKMEM_TINY_BENCH_FASTPATH
static __thread unsigned char g_tls_bench_warm_done[4];
#endif
#if HAKMEM_DEBUG_COUNTERS
#define HAK_PATHDBG_INC(arr, idx) do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0)
#else
#define HAK_PATHDBG_INC(arr, idx) do { (void)(idx); } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0)
#endif
// Simple scalar debug increment (no-op when HAKMEM_DEBUG_COUNTERS=0)
#if HAKMEM_DEBUG_COUNTERS
#define HAK_DBG_INC(var) do { (var)++; } while(0)
#else
#define HAK_DBG_INC(var) do { (void)0; } while(0)
#endif
// Return helper: record tiny alloc stat (guarded) then return pointer
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr);
// ========== HAK_RET_ALLOC: Single Definition Point ==========
// Choose implementation based on HAKMEM_TINY_HEADER_CLASSIDX
// - Phase 7 enabled: Write header and return user pointer
// - Phase 7 disabled: Legacy behavior (stats + route + return)
#if HAKMEM_TINY_HEADER_CLASSIDX
#if HAKMEM_BUILD_RELEASE
// Phase E1-CORRECT: ALL classes have 1-byte headers (including C7)
// Ultra-fast inline macro (3-4 instructions)
#define HAK_RET_ALLOC(cls, base_ptr) do { \
*(uint8_t*)(base_ptr) = HEADER_MAGIC | ((cls) & HEADER_CLASS_MASK); \
return (void*)((uint8_t*)(base_ptr) + 1); \
} while(0)
#else
// Debug: Keep full validation via tiny_region_id_write_header()
#define HAK_RET_ALLOC(cls, ptr) return tiny_region_id_write_header((ptr), (cls))
#endif
#else
// Legacy: Stats and routing before return
#ifdef HAKMEM_ENABLE_STATS
// Optional: enabled in sampling builds. The hot path uses a direct inline call (no indirect branches).
#ifdef HAKMEM_TINY_STAT_SAMPLING
static __thread unsigned g_tls_stat_accum_alloc[TINY_NUM_CLASSES];
static int g_stat_rate_lg = 0; // 0 = record every time; otherwise record every 2^lg calls
static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) {
if (__builtin_expect(g_stat_rate_lg == 0, 1)) { stats_record_alloc(cls); return; }
unsigned m = (1u << g_stat_rate_lg) - 1u;
if (((++g_tls_stat_accum_alloc[cls]) & m) == 0u) stats_record_alloc(cls);
}
#else
static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { stats_record_alloc(cls); }
#endif
#define HAK_RET_ALLOC(cls, ptr) do { \
tiny_debug_track_alloc_ret((cls), (ptr)); \
hkm_stat_alloc((cls)); \
ROUTE_COMMIT((cls), 0x7F); \
return (ptr); \
} while(0)
#else
#define HAK_RET_ALLOC(cls, ptr) do { \
tiny_debug_track_alloc_ret((cls), (ptr)); \
ROUTE_COMMIT((cls), 0x7F); \
return (ptr); \
} while(0)
#endif
#endif // HAKMEM_TINY_HEADER_CLASSIDX
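// Illustrative caller sketch (hypothetical function; not a committed code path):
// HAK_RET_ALLOC is used as the tail of an allocation routine. In header builds it
// stamps the 1-byte header on the base pointer and returns the user pointer
// (base + 1); in all builds it expands to a return statement, not an expression.
static inline void* example_tiny_alloc_tail(int cls, void* base_ptr) {
    if (!base_ptr) return NULL;     // caller failed to carve or refill a block
    HAK_RET_ALLOC(cls, base_ptr);   // writes header (when enabled) and returns
}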
// Free-side stats: compile-time zero when stats disabled
#ifdef HAKMEM_ENABLE_STATS
#define HAK_STAT_FREE(cls) do { stats_record_free((cls)); } while(0)
#else
#define HAK_STAT_FREE(cls) do { } while(0)
#endif


@@ -0,0 +1,256 @@
// ============================================================================
// Global State
// ============================================================================
// Global pool instance (extern declared in hakmem_tiny.h)
TinyPool g_tiny_pool;
int g_tiny_initialized = 0; // Not static (extern in header for inline access)
// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
//
// Results:
// Phase 1 (Push - deferred free): +1 instruction, zero benefit
// Phase 2 (Pull - background refill): +77 instructions, -3% performance
//
// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
// - TLS Magazine capacity: 2048 items
// - Benchmark working set: 100 items
// - Magazine hit rate: 100% after warmup
// - Slow path never executed!
//
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
// - glibc: ~40 instructions/op (5-7× faster)
// - Gap is architectural (bitmap vs free-list, research features)
//
// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
// Decision: Enable by default (proven production-ready)
static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!)
// Optional: allow limited trylock-based refill during wrapper calls
static int g_wrap_tiny_refill = 0;
// Remote-free drain controls
static int g_remote_drain_thresh = 32; // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)
// ACE Learning Layer: Per-class remote drain thresholds
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};
// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
// Old: XOR RNG sampling (10-15 ns overhead)
// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)
// Step 2: Slab Registry (Hash Table)
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];
PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];
// Registry lock
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;
// Phase 6.14: Runtime toggle for Registry ON/OFF
// O(N) Sequential Access is faster than O(1) Random Access for Small-N (8-32 slabs)
// Reason: L1 cache hit rate 95%+ (Sequential) vs 50-70% (Random Hash)
static int g_use_registry = 1; // Default ON for thread-safety
// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
static int g_tiny_refill_max = 64; // HAKMEM_TINY_REFILL_MAX (default 64)
static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)
// hakmem_tiny_tls_list.h already included at top
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
static int g_tls_list_enable = 0; // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static int g_fast_enable = 1;
static int g_fastcache_enable = 1; // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];
typedef void* (*TinyHotAllocFn)(void);
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
static __thread void* g_fast_head[TINY_NUM_CLASSES];
static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);
uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];
// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
// NEW: Per-thread active slabs (up to 2 per class)
static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];
static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
TinySlab* cand = g_tls_active_slab_a[class_idx];
if (cand) {
uintptr_t base = (uintptr_t)cand->base;
if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
return cand;
}
}
cand = g_tls_active_slab_b[class_idx];
if (cand) {
uintptr_t base = (uintptr_t)cand->base;
if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
return cand;
}
}
return NULL;
}
// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line.
extern int g_use_superslab;
#if !HAKMEM_BUILD_RELEASE
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
if (!ptr) return;
// ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE
void* base_ptr = ptr ? (void*)((uint8_t*)ptr - 1) : NULL;
if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
SuperSlab* ss = hak_super_lookup(ptr);
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
} else {
int slab_idx = slab_index_for(ss, base_ptr);
if (slab_idx < 0) {
tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
} else {
// Fail-Fast: class vs SuperSlab size_class must be consistent.
TinySlabMeta* meta = &ss->slabs[slab_idx];
if (meta->class_idx != (uint8_t)cls) {
tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
}
size_t blk = g_tiny_class_sizes[cls];
uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
uintptr_t delta = (uintptr_t)base_ptr - base;
if (blk == 0 || (delta % blk) != 0) {
tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
} else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) {
tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
}
}
}
}
if (!__builtin_expect(g_debug_remote_guard, 0)) return;
if (!g_use_superslab) return;
SuperSlab* ss = hak_super_lookup(ptr);
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
int slab_idx = slab_index_for(ss, base_ptr);
if (slab_idx >= 0) {
tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
}
}
#else
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
#endif
// Debug counters for SuperSlab investigation
#if HAKMEM_DEBUG_COUNTERS
int g_superslab_alloc_count = 0;
int g_superslab_fail_count = 0;
int g_superslab_free_count = 0; // Phase 7.6: Track SuperSlab frees
int g_empty_superslab_count = 0; // Phase 7.6: Track empty SuperSlabs detected
int g_magazine_push_count = 0; // Phase 7.6: Track Magazine pushes
int g_tiny_free_with_slab_count = 0; // Phase 7.6: Track tiny_free_with_slab calls
#endif
// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
#define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default)
static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class
static int g_empty_counts[TINY_NUM_CLASSES] = {0}; // Count of empty SuperSlabs
static int g_empty_reserve = -1; // Env: HAKMEM_TINY_SS_RESERVE (default=1)
static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default
static uint32_t g_ss_partial_interval = 4;
static _Atomic uint32_t g_ss_partial_epoch = 0;
// Phase 6.24: Unified TLS slab cache (Medium fix)
// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
// Phase E4: 64B alignment for L1 cache optimization
__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64)));
static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];
// ----------------------------------------------------------------------------
// Per-class partial SuperSlab slot (single-slot publish/adopt)
// ----------------------------------------------------------------------------
// Small ring of partial SuperSlabs per class (publish/adopt)
#ifndef SS_PARTIAL_RING
#define SS_PARTIAL_RING 64
#endif
static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];
static __thread int g_tls_adopt_cd[TINY_NUM_CLASSES];
static int g_adopt_cool_period = -1; // env: HAKMEM_TINY_SS_ADOPT_COOLDOWN
// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};
_Atomic int g_ss_remote_seen = 0; // becomes 1 when any remote free occurs
static int g_ss_adopt_env = -2; // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
static _Atomic int g_ss_adopt_runtime = 0; // 0=inactive, 1=active
static _Atomic int g_ss_adopt_log_once = 0;
static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
reason ? reason : "unknown", class_idx);
}
}
static inline void tiny_adopt_gate_parse_env(void) {
if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
if (!env || *env == '\0') {
g_ss_adopt_env = 0; // auto
} else if (*env == '0') {
g_ss_adopt_env = -1; // forced OFF
atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
} else {
g_ss_adopt_env = 1; // forced ON
atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
tiny_adopt_gate_log_activation("env", -1);
}
}
}
int tiny_adopt_gate_should_publish(void) {
tiny_adopt_gate_parse_env();
if (g_ss_adopt_env == 1) return 1;
if (g_ss_adopt_env == -1) return 0;
return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}
int tiny_adopt_gate_should_adopt(void) {
tiny_adopt_gate_parse_env();
if (g_ss_adopt_env == 1) return 1;
if (g_ss_adopt_env == -1) return 0;
return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}
void tiny_adopt_gate_on_remote_seen(int class_idx) {
tiny_adopt_gate_parse_env();
atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
if (g_ss_adopt_env == -1) return;
int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
if (prev == 0) {
tiny_adopt_gate_log_activation("remote", class_idx);
}
}
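// Illustrative call pattern (hypothetical helper; assumes only the gate functions above):
// a remote free flips the gate via tiny_adopt_gate_on_remote_seen(); afterwards a
// publisher consults the gate before handing a partial SuperSlab to other threads.
// ss_partial_publish() is defined in the publish box included later in this TU.
void ss_partial_publish(int class_idx, SuperSlab* ss);
static inline void example_maybe_publish(int class_idx, SuperSlab* ss) {
    if (ss && tiny_adopt_gate_should_publish()) {
        ss_partial_publish(class_idx, ss);
    }
}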
// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers


@@ -0,0 +1,122 @@
// Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check
#if HAKMEM_TINY_PHASE6_BOX_REFACTOR
#if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
#error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options"
#endif
// Box 1: Atomic Operations (Layer 0 - Foundation)
#include "tiny_atomic.h"
// Box 5: Allocation Fast Path (Layer 1 - 3-4 instructions)
#include "tiny_alloc_fast.inc.h"
// Box 6: Free Fast Path (Layer 2 - 2-3 instructions)
#include "tiny_free_fast.inc.h"
// ---------------- Refill count (Front) global config ----------------
// Parsed once at init; hot path reads plain ints (no getenv).
int g_refill_count_global = 0; // HAKMEM_TINY_REFILL_COUNT
int g_refill_count_hot = 0; // HAKMEM_TINY_REFILL_COUNT_HOT
int g_refill_count_mid = 0; // HAKMEM_TINY_REFILL_COUNT_MID
int g_refill_count_class[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_REFILL_COUNT_C{0..7}
// Export wrapper functions for hakmem.c to call
// Phase 6-1.7 Optimization: Remove diagnostic overhead, rely on LTO for inlining
void* hak_tiny_alloc_fast_wrapper(size_t size) {
// Phase E5: Ultra fast path (8-instruction alloc, bypasses all layers)
// Enable with: HAKMEM_ULTRA_FAST_PATH=1 (compile-time)
#if HAKMEM_ULTRA_FAST_PATH
void* ret = tiny_alloc_fast_ultra(size);
if (ret) return ret;
// Miss → fallback to full fast path
#endif
// Bench-only ultra-short path: bypass diagnostics and pointer tracking
// Enable with: HAKMEM_BENCH_FAST_FRONT=1
static int g_bench_fast_front = -1;
if (__builtin_expect(g_bench_fast_front == -1, 0)) {
const char* e = getenv("HAKMEM_BENCH_FAST_FRONT");
g_bench_fast_front = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(g_bench_fast_front, 0)) {
return tiny_alloc_fast(size);
}
static _Atomic uint64_t wrapper_call_count = 0;
uint64_t call_num = atomic_fetch_add(&wrapper_call_count, 1);
// Pointer tracking init (first call only)
PTR_TRACK_INIT();
// PRIORITY 3: Periodic canary validation (every 1000 ops)
periodic_canary_check(call_num, "hak_tiny_alloc_fast_wrapper");
// Box I: Periodic full integrity check (every 5000 ops)
#if HAKMEM_INTEGRITY_LEVEL >= 3
if ((call_num % 5000) == 0) {
extern void integrity_periodic_full_check(const char*);
integrity_periodic_full_check("periodic check in alloc wrapper");
}
#endif
#if !HAKMEM_BUILD_RELEASE
if (call_num > 14250 && call_num < 14280 && size <= 1024) {
fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu size=%zu\n", call_num, size);
fflush(stderr);
}
#endif
void* result = tiny_alloc_fast(size);
#if !HAKMEM_BUILD_RELEASE
if (call_num > 14250 && call_num < 14280 && size <= 1024) {
fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu returned %p\n", call_num, result);
fflush(stderr);
}
#endif
return result;
}
void hak_tiny_free_fast_wrapper(void* ptr) {
// Phase E5: Ultra fast path (6-8 instruction free)
#if HAKMEM_ULTRA_FAST_PATH
tiny_free_fast_ultra(ptr);
return;
#endif
static _Atomic uint64_t free_call_count = 0;
uint64_t call_num = atomic_fetch_add(&free_call_count, 1);
if (call_num > 14135 && call_num < 14145) {
fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu ptr=%p\n", call_num, ptr);
fflush(stderr);
}
tiny_free_fast(ptr);
if (call_num > 14135 && call_num < 14145) {
fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu completed\n", call_num);
fflush(stderr);
}
}
#elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
// Phase 6-1.5: Alignment guessing (legacy)
// Refill count globals (needed for compatibility)
int g_refill_count_global = 0;
int g_refill_count_hot = 0;
int g_refill_count_mid = 0;
int g_refill_count_class[TINY_NUM_CLASSES] = {0};
#include "hakmem_tiny_ultra_simple.inc"
// Wrapper functions for hakmem.c compatibility (not used in ULTRA_SIMPLE but needed for linking)
void* hak_tiny_alloc_fast_wrapper(size_t size) {
return hak_tiny_alloc_ultra_simple(size);
}
void hak_tiny_free_fast_wrapper(void* ptr) {
hak_tiny_free_ultra_simple(ptr);
}
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
// Phase 6-1.6: Metadata header (recommended)
#include "hakmem_tiny_metadata.inc"
#endif


@@ -0,0 +1,419 @@
// hakmem_tiny_publish_box.inc
// Box: Publish/adopt instrumentation, bench mailboxes, and TLS target helpers.
// Extracted from hakmem_tiny.c to keep hot-path logic focused.
// TLS hint: last adopted SuperSlab/slab to avoid rescans
#include "tiny_sticky.h"
// Mailbox box
#include "box/mailbox_box.h"
// Publish pipeline counters (visibility)
unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_same_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0};
// Slab-ring counters (debug)
unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES] = {0};
// Slab entry encoding helpers (used by Bench/Slab-ring paths)
static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) {
return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu);
}
static inline SuperSlab* slab_entry_ss(uintptr_t ent) {
// SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base
return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
}
static inline int slab_entry_idx(uintptr_t ent) {
return (int)(ent & 0x3Fu);
}
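// Illustrative round-trip check (hypothetical helper; assumes only the two encoders
// above): the encoding works because SuperSlab bases are aligned to at least
// SUPERSLAB_SIZE_MIN, leaving the low bits free to carry a 6-bit slab index.
static inline int example_slab_entry_roundtrip(SuperSlab* ss, int slab_idx) {
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    return slab_entry_ss(ent) == ss && slab_entry_idx(ent) == (slab_idx & 0x3F);
}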
// ----------------------------------------------------------------------------
// Bench Mode Publish Mailbox (single-slot per class)
// ----------------------------------------------------------------------------
static int g_bench_mode = -1; // env: HAKMEM_TINY_BENCH_MODE=1
static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES];
#ifndef BENCH_MAILBOX_WIDTH
#define BENCH_MAILBOX_WIDTH 16
#endif
static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH];
static inline int bench_mode_enabled(void) {
if (__builtin_expect(g_bench_mode == -1, 0)) {
const char* b = getenv("HAKMEM_TINY_BENCH_MODE");
g_bench_mode = (b && atoi(b) != 0) ? 1 : 0;
}
return g_bench_mode;
}
static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) {
if (!bench_mode_enabled()) return;
uintptr_t ent = slab_entry_make(ss, slab_idx);
uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed);
idx &= (BENCH_MAILBOX_WIDTH - 1);
atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release);
}
static inline uintptr_t bench_pub_pop(int class_idx) {
if (!bench_mode_enabled()) return (uintptr_t)0;
for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) {
uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
if (ent) return ent;
}
return 0;
}
// ----------------------------------------------------------------------------
// Slab-Granular Partial Publish/Adopt (encoded entries)
// ----------------------------------------------------------------------------
#ifndef SLAB_PARTIAL_RING
#define SLAB_PARTIAL_RING 128
#endif
static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING];
static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES];
// ----------------------------------------------------------------------------
// Refill-stage counters (per class)
// ----------------------------------------------------------------------------
unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES] = {0};
// Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0)
unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0};
// Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE)
unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0};
// Refill item source breakdown (freelist vs carve)
unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0};
static int g_rf_trace_en = -1;
static inline int rf_trace_enabled(void) {
if (__builtin_expect(g_rf_trace_en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_RF_TRACE");
g_rf_trace_en = (e && atoi(e) != 0) ? 1 : 0;
}
return g_rf_trace_en;
}
static inline unsigned long long rf_now_ns(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec;
}
// Publish-side counters (debug)
unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES] = {0};
// Free pipeline counters
unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0};
// Front Gate Breakdown (debug counters)
unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0};
// Free-side trigger counters
unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0};
// Adopt/Registry gate counters
unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_full[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_none = 0;
// ----------------------------------------------------------------------------
// Live Superslab cap (must-adopt-before-mmap support)
// ----------------------------------------------------------------------------
static int g_live_cap_env = -2; // -2=unparsed, -1=disabled, >=0=cap value
__thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0};
static inline int live_cap_for_class(int class_idx) {
if (__builtin_expect(g_live_cap_env == -2, 0)) {
const char* s = getenv("HAKMEM_SS_LIVE_CAP");
if (!s) g_live_cap_env = -1; else { int v = atoi(s); g_live_cap_env = (v>0? v : -1); }
}
(void)class_idx;
return g_live_cap_env;
}
// ----------------------------------------------------------------------------
// Hot Slot (global simple path)
// ----------------------------------------------------------------------------
static int g_hot_slot_en = -1; // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot)
static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES];
static inline int hot_slot_enabled(void) {
if (__builtin_expect(g_hot_slot_en == -1, 0)) {
const char* s = getenv("HAKMEM_HOT_SLOT");
g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0;
}
return g_hot_slot_en || bench_mode_enabled();
}
static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) {
if (!hot_slot_enabled()) return;
uintptr_t ent = slab_entry_make(ss, slab_idx);
atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release);
}
static inline uintptr_t hot_slot_pop(int class_idx) {
if (!hot_slot_enabled()) return (uintptr_t)0;
return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel);
}
static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) {
if (!ss) return;
uintptr_t ent = slab_entry_make(ss, slab_idx);
for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
uintptr_t expected = 0;
if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent,
memory_order_release, memory_order_relaxed)) {
g_slab_publish_dbg[class_idx]++;
return;
}
}
// Ring full: round-robin replace and try to requeue the displaced entry into another empty slot
uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel);
if (old) {
for (int t = 0; t < 8; t++) {
uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
uintptr_t expected = 0;
if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old,
memory_order_release, memory_order_relaxed)) {
g_slab_requeue_dbg[class_idx]++;
old = 0; break;
}
}
}
g_slab_publish_dbg[class_idx]++;
}
static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) {
for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
if (ent) return ent;
}
return 0;
}
void ss_partial_publish(int class_idx, SuperSlab* ss) {
if (!ss) return;
// Gate by listed flag to avoid repeated publishes of the same SS
unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel);
if (prev != 0u) return; // already listed
// CRITICAL: Release ownership of all slabs so adopters can claim them!
// Without this, ss_owner_try_acquire() will fail (requires owner_tid==0).
// The publishing thread must stop using this SS after publishing.
int cap_pub = ss_slabs_capacity(ss);
for (int s = 0; s < cap_pub; s++) {
// TODO Phase 3d-B: Add atomic accessor when implementing Hot/Cold split
TinySlabMeta* meta = ss_slab_meta_ptr(ss, s);
uint8_t prev = __atomic_exchange_n(&meta->owner_tid_low, 0u, __ATOMIC_RELEASE);
if (__builtin_expect(g_debug_remote_guard && prev != 0u, 0)) {
uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev;
tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE,
(uint16_t)ss_slab_meta_class_idx_get(ss, s),
meta,
aux);
}
}
// CRITICAL: Unbind current thread's TLS if it points to this SS!
// Otherwise, the publishing thread will continue allocating from the published SS,
// racing with adopters who acquire ownership.
extern __thread TinyTLSSlab g_tls_slabs[];
if (g_tls_slabs[class_idx].ss == ss) {
g_tls_slabs[class_idx].ss = NULL;
g_tls_slabs[class_idx].meta = NULL;
g_tls_slabs[class_idx].slab_base = NULL;
g_tls_slabs[class_idx].slab_idx = 0;
}
// Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1)
int best = -1; uint32_t best_score = 0;
for (int s = 0; s < cap_pub; s++) {
TinySlabMeta* m = &ss->slabs[s];
uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed);
int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0);
unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u;
uint32_t score = rc
+ (m->freelist ? (1u<<30) : 0u)
+ (listed ? (1u<<29) : 0u)
+ (has_remote ? 1u : 0u);
if (score > best_score) { best_score = score; best = s; }
}
if (best >= 0 && best < 256) {
ss->publish_hint = (uint8_t)best;
// Box: Ready push — provide slab-level candidate to adopters
tiny_ready_push(class_idx, ss, best);
} else {
ss->publish_hint = 0xFF;
}
for (int i = 0; i < SS_PARTIAL_RING; i++) {
SuperSlab* expected = NULL;
if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss,
memory_order_release, memory_order_relaxed)) {
g_ss_publish_dbg[class_idx]++;
return; // published
}
}
// Ring full: replace one entry in round-robin to avoid dropping supply
uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed);
idx %= SS_PARTIAL_RING;
SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel);
if (old) {
// NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads
// that just adopted from it. Draining without ownership checks causes freelist corruption.
// The adopter will drain when needed (with proper ownership checks in tiny_refill.h).
//
// Previous code (UNSAFE):
// for (int s = 0; s < cap; s++) {
// ss_remote_drain_to_freelist(old, s); // ← Race with concurrent adopter!
// }
// Keep listed=1 while in overflow so it stays eligible for adopt
// Push old onto the overflow stack (waiting box)
SuperSlab* head;
do {
head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
old->partial_next = head;
} while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old,
memory_order_release, memory_order_relaxed));
}
g_ss_publish_dbg[class_idx]++;
}
SuperSlab* ss_partial_adopt(int class_idx) {
for (int i = 0; i < SS_PARTIAL_RING; i++) {
SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel);
if (ss) {
// Clear listed flag on adopt to allow future publish of this SS
atomic_store_explicit(&ss->listed, 0u, memory_order_release);
g_ss_adopt_dbg[class_idx]++;
return ss;
}
}
// Fallback: adopt from overflow stack (LIFO)
while (1) {
SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
if (!head) break;
SuperSlab* next = head->partial_next;
if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next,
memory_order_acq_rel, memory_order_relaxed)) {
atomic_store_explicit(&head->listed, 0u, memory_order_release);
g_ss_adopt_dbg[class_idx]++;
return head;
}
}
return NULL;
}
static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) {
// Canonical binding under Phase 12:
// - Per-slab TinySlabMeta.class_idx defines class for this slab
// - slab_idx is the owning slab index within ss
// - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx)
tls->ss = ss;
tls->slab_idx = (uint8_t)slab_idx;
tls->meta = &ss->slabs[slab_idx];
tls->slab_base = tiny_slab_base_for(ss, slab_idx);
}
static inline uint32_t tiny_tls_default_refill(uint32_t cap) {
if (cap == 0u) return 8u;
uint32_t low = (cap >= 32u) ? (cap / 4u) : 8u;
if (low < 4u) low = 4u;
return low;
}
static inline uint32_t tiny_tls_default_spill(uint32_t cap) {
if (cap == 0u) return 0u;
uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u);
if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP;
if (spill < cap) spill = cap;
return (uint32_t)spill;
}
static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) {
atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release);
atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed);
atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed);
atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
}
static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) {
atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release);
atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
}
static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) {
uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire);
if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) {
return;
}
uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire);
if (target_cap != 0u && tls->cap != target_cap) {
tls->cap = target_cap;
uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed);
if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap);
tls->refill_low = target_refill;
uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed);
if (target_spill < target_cap) target_spill = target_cap;
tls->spill_high = target_spill;
}
uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire);
if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) {
g_tls_trim_seen[class_idx] = trim_epoch;
if (tls->count > tls->cap) {
tls_list_spill_excess(class_idx, tls);
}
}
g_tls_param_seen[class_idx] = seq;
}