Refactor: Extract 5 Box modules from hakmem_tiny.c (-52% size reduction)

Split hakmem_tiny.c (2081 lines) into focused modules for better maintainability. ## Changes **hakmem_tiny.c**: 2081 → 995 lines (-1086 lines, -52% reduction) ## Extracted Modules (5 boxes) 1. **config_box** (211 lines) - Size class tables, integrity counters - Debug flags, benchmark macros - HAK_RET_ALLOC/HAK_STAT_FREE instrumentation 2. **publish_box** (419 lines) - Publish/Adopt counters and statistics - Bench mailbox, partial ring - Live cap/Hot slot management - TLS helper functions (tiny_tls_default_*) 3. **globals_box** (256 lines) - Global variable declarations (~70 variables) - TinyPool instance and initialization flag - TLS variables (g_tls_lists, g_fast_head, g_fast_count) - SuperSlab configuration (partial ring, empty reserves) - Adopt gate functions 4. **phase6_wrappers_box** (122 lines) - Phase 6 Box Theory wrapper layer - hak_tiny_alloc_fast_wrapper() - hak_tiny_free_fast_wrapper() - Diagnostic instrumentation 5. **ace_guard_box** (100 lines) - ACE Learning Layer (hkm_ace_set_drain_threshold) - FastCache API (tiny_fc_room, tiny_fc_push_bulk) - Tiny Guard debugging system (5 functions) ## Benefits - **Readability**: Giant 2k file → focused 1k core + 5 coherent modules - **Maintainability**: Each box has clear responsibility and boundaries - **Build**: All modules compile successfully ✅ ## Technical Details - Phase 1: ChatGPT extracted config_box + publish_box (-625 lines) - Phase 2-4: Claude extracted globals_box + phase6_wrappers_box + ace_guard_box (-461 lines) - All extractions use .inc files (same translation unit, preserves static/TLS linkage) - Fixed Makefile: Added tiny_sizeclass_hist_box.o to OBJS_BASE and BENCH_HAKMEM_OBJS_BASE 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 01:16:45 +09:00
parent b3a156879a
commit 6b6ad69aca
7 changed files with 1123 additions and 1101 deletions
--- a/4
+++ b/4
@ -190,7 +190,7 @@ LDFLAGS += $(EXTRA_LDFLAGS)

 # Targets
 TARGET = test_hakmem
-OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o
+OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o
 OBJS = $(OBJS_BASE)

 # Shared library
@ -222,7 +222,7 @@ endif
 # Benchmark targets
 BENCH_HAKMEM = bench_allocators_hakmem
 BENCH_SYSTEM = bench_allocators_system
-BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o
+BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o
 BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE)
 ifeq ($(POOL_TLS_PHASE1),1)
 BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o
--- a/core/hakmem_tiny.c
+++ b/core/hakmem_tiny.c
--- a/core/hakmem_tiny_ace_guard_box.inc
+++ b/core/hakmem_tiny_ace_guard_box.inc
@ -0,0 +1,100 @@
+// ============================================================================
+// ACE Learning Layer: Runtime parameter setters
+// ============================================================================
+
+void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold) {
+    // Validate inputs
+    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) {
+        return;
+    }
+    if (threshold < 16 || threshold > 2048) {
+        return;
+    }
+
+    // Set per-class threshold (used by remote free drain logic)
+    g_remote_drain_thresh_per_class[class_idx] = (int)threshold;
+}
+#include "tiny_fc_api.h"
+int tiny_fc_room(int class_idx) {
+    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
+    TinyFastCache* fc = &g_fast_cache[class_idx];
+    // Effective per-class cap comes from g_fast_cap (env-tunable),
+    // clamped by the static storage capacity TINY_FASTCACHE_CAP.
+    uint16_t eff_cap = g_fast_cap[class_idx];
+    if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP;
+    int room = (int)eff_cap - fc->top;
+    return room > 0 ? room : 0;
+}
+
+int tiny_fc_push_bulk(int class_idx, void** arr, int n) {
+    if (!arr || n <= 0) return 0;
+    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
+    TinyFastCache* fc = &g_fast_cache[class_idx];
+    uint16_t eff_cap = g_fast_cap[class_idx];
+    if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP;
+    int room = (int)eff_cap - fc->top;
+    if (room <= 0) return 0;
+    int take = n < room ? n : room;
+    // Forward fill with light unrolling to reduce branch overhead
+    int i = 0;
+    for (; i + 3 < take; i += 4) {
+        fc->items[fc->top++] = arr[i];
+        fc->items[fc->top++] = arr[i + 1];
+        fc->items[fc->top++] = arr[i + 2];
+        fc->items[fc->top++] = arr[i + 3];
+    }
+    for (; i < take; i++) {
+        fc->items[fc->top++] = arr[i];
+    }
+    return take;
+}
+
+// ========= Tiny Guard (targeted debug; low overhead when disabled) =========
+static int g_tiny_guard_enabled = -1;
+static int g_tiny_guard_class = 2;
+static int g_tiny_guard_limit = 8;
+static __thread int g_tiny_guard_seen = 0;
+
+static inline int tiny_guard_enabled_runtime(void) {
+    if (__builtin_expect(g_tiny_guard_enabled == -1, 0)) {
+        const char* e = getenv("HAKMEM_TINY_GUARD");
+        g_tiny_guard_enabled = (e && *e && *e != '0') ? 1 : 0;
+        const char* ec = getenv("HAKMEM_TINY_GUARD_CLASS");
+        if (ec && *ec) g_tiny_guard_class = atoi(ec);
+        const char* el = getenv("HAKMEM_TINY_GUARD_MAX");
+        if (el && *el) g_tiny_guard_limit = atoi(el);
+        if (g_tiny_guard_limit <= 0) g_tiny_guard_limit = 8;
+    }
+    return g_tiny_guard_enabled;
+}
+
+int tiny_guard_is_enabled(void) { return tiny_guard_enabled_runtime(); }
+
+static void tiny_guard_dump_bytes(const char* tag, const uint8_t* p, size_t n) {
+    fprintf(stderr, "[TGUARD] %s:", tag);
+    for (size_t i = 0; i < n; i++) fprintf(stderr, " %02x", p[i]);
+    fprintf(stderr, "\n");
+}
+
+void tiny_guard_on_alloc(int cls, void* base, void* user, size_t stride) {
+    if (!tiny_guard_enabled_runtime() || cls != g_tiny_guard_class) return;
+    if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return;
+    uint8_t* b = (uint8_t*)base;
+    uint8_t* u = (uint8_t*)user;
+    fprintf(stderr, "[TGUARD] alloc cls=%d base=%p user=%p stride=%zu hdr=%02x\n",
+            cls, base, user, stride, b[0]);
+    // 隣接ヘッダ可視化（前後）
+    tiny_guard_dump_bytes("around_base", b, (stride >= 8 ? 8 : stride));
+    tiny_guard_dump_bytes("next_header", b + stride, 4);
+}
+
+void tiny_guard_on_invalid(void* user_ptr, uint8_t hdr) {
+    if (!tiny_guard_enabled_runtime()) return;
+    if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return;
+    uint8_t* u = (uint8_t*)user_ptr;
+    fprintf(stderr, "[TGUARD] invalid header at user=%p hdr=%02x prev=%02x next=%02x\n",
+            user_ptr, hdr, *(u - 2), *(u));
+    tiny_guard_dump_bytes("dump_before", u - 8, 8);
+    tiny_guard_dump_bytes("dump_after", u, 8);
+}
+
--- a/core/hakmem_tiny_config_box.inc
+++ b/core/hakmem_tiny_config_box.inc
@ -0,0 +1,211 @@
+// hakmem_tiny_config_box.inc
+// Box: Tiny allocator configuration, debug counters, and return helpers.
+// Extracted from hakmem_tiny.c to reduce file size and isolate config logic.
+
+// ============================================================================
+// Size class table (Box 3 dependency)
+// ============================================================================
+// Phase E1-CORRECT: ALL classes have 1-byte header
+// These sizes represent TOTAL BLOCK SIZE (stride) = [Header 1B][Data N-1B]
+// Usable data = stride - 1 (implicit)
+const size_t g_tiny_class_sizes[TINY_NUM_CLASSES] = {
+    8,      // Class 0:   8B total = [Header 1B][Data  7B]
+    16,     // Class 1:  16B total = [Header 1B][Data 15B]
+    32,     // Class 2:  32B total = [Header 1B][Data 31B]
+    64,     // Class 3:  64B total = [Header 1B][Data 63B]
+    128,    // Class 4: 128B total = [Header 1B][Data 127B]
+    256,    // Class 5: 256B total = [Header 1B][Data 255B]
+    512,    // Class 6: 512B total = [Header 1B][Data 511B]
+    1024    // Class 7: 1024B total = [Header 1B][Data 1023B]
+};
+
+// ============================================================================
+// Phase 16: Dynamic Tiny Max Size (ENV: HAKMEM_TINY_MAX_CLASS)
+// Phase 17-1: Auto-adjust when Small-Mid enabled
+// ============================================================================
+
+// Forward declaration for Small-Mid check
+extern bool smallmid_is_enabled(void);
+
+// Get dynamic max size for Tiny allocator based on ENV configuration
+// Default: 1023B (C0-C7), can be reduced to 255B (C0-C5)
+// Phase 17-1: Auto-reduces to 255B when Small-Mid is enabled
+size_t tiny_get_max_size(void) {
+    static int g_max_class = -1;
+    if (__builtin_expect(g_max_class == -1, 0)) {
+        const char* env = getenv("HAKMEM_TINY_MAX_CLASS");
+        if (env && *env) {
+            int max_class = atoi(env);
+            if (max_class >= 0 && max_class < TINY_NUM_CLASSES) {
+                g_max_class = max_class;
+            } else {
+                g_max_class = 7;  // Default: all classes (C0-C7)
+            }
+        } else {
+            g_max_class = 7;  // Default: all classes
+        }
+    }
+
+    // Phase 17-1: Auto-adjust when Small-Mid enabled
+    // Small-Mid handles 256B-1KB, so Tiny should only handle 0-255B
+    int effective_class = g_max_class;
+    if (smallmid_is_enabled() && effective_class > 5) {
+        effective_class = 5;  // Limit to C0-C5 (0-255B)
+    }
+
+    // Map class to max usable size (stride - 1)
+    // C0=8B, C1=16B, C2=32B, C3=64B, C4=128B, C5=256B, C6=512B, C7=1024B
+    static const size_t class_to_max_size[TINY_NUM_CLASSES] = {
+        7, 15, 31, 63, 127, 255, 511, 1023
+    };
+    return class_to_max_size[effective_class];
+}
+
+// ============================================================================
+// PRIORITY 1-4: Integrity Check Counters
+// ============================================================================
+_Atomic uint64_t g_integrity_check_class_bounds = 0;
+_Atomic uint64_t g_integrity_check_freelist = 0;
+_Atomic uint64_t g_integrity_check_canary = 0;
+_Atomic uint64_t g_integrity_check_header = 0;
+
+// Build-time gate for debug counters (path/ultra). Default OFF.
+#ifndef HAKMEM_DEBUG_COUNTERS
+#define HAKMEM_DEBUG_COUNTERS 0
+#endif
+
+int g_debug_fast0 = 0;
+int g_debug_remote_guard = 0;
+int g_remote_force_notify = 0;
+// Tiny free safety (debug)
+int g_tiny_safe_free = 0;         // Default OFF for performance; env: HAKMEM_SAFE_FREE=1 でON
+int g_tiny_safe_free_strict = 0;  // env: HAKMEM_SAFE_FREE_STRICT=1
+int g_tiny_force_remote = 0;      // env: HAKMEM_TINY_FORCE_REMOTE=1
+
+// Build-time gate: Minimal Tiny front (bench-only)
+
+static inline int superslab_trace_enabled(void) {
+    static int g_ss_trace_flag = -1;
+    if (__builtin_expect(g_ss_trace_flag == -1, 0)) {
+        const char* tr = getenv("HAKMEM_TINY_SUPERSLAB_TRACE");
+        g_ss_trace_flag = (tr && atoi(tr) != 0) ? 1 : 0;
+    }
+    return g_ss_trace_flag;
+}
+// When enabled, physically excludes optional front tiers from the hot path
+// (UltraFront/Quick/Frontend/HotMag/SS-try/BumpShadow), leaving:
+//   SLL → TLS Magazine → SuperSlab → (remaining slow path)
+#ifndef HAKMEM_TINY_MINIMAL_FRONT
+#define HAKMEM_TINY_MINIMAL_FRONT 1
+#endif
+// Strict front: compile-out optional front tiers but keep baseline structure intact
+#ifndef HAKMEM_TINY_STRICT_FRONT
+#define HAKMEM_TINY_STRICT_FRONT 0
+#endif
+
+// Bench-only fast path knobs (defaults)
+#ifndef HAKMEM_TINY_BENCH_REFILL
+#define HAKMEM_TINY_BENCH_REFILL 8
+#endif
+// Optional per-class overrides (bench-only)
+#ifndef HAKMEM_TINY_BENCH_REFILL8
+#define HAKMEM_TINY_BENCH_REFILL8 HAKMEM_TINY_BENCH_REFILL
+#endif
+#ifndef HAKMEM_TINY_BENCH_REFILL16
+#define HAKMEM_TINY_BENCH_REFILL16 HAKMEM_TINY_BENCH_REFILL
+#endif
+#ifndef HAKMEM_TINY_BENCH_REFILL32
+#define HAKMEM_TINY_BENCH_REFILL32 HAKMEM_TINY_BENCH_REFILL
+#endif
+#ifndef HAKMEM_TINY_BENCH_REFILL64
+#define HAKMEM_TINY_BENCH_REFILL64 HAKMEM_TINY_BENCH_REFILL
+#endif
+
+// Bench-only warmup amounts (pre-fill TLS SLL on first alloc per class)
+#ifndef HAKMEM_TINY_BENCH_WARMUP8
+#define HAKMEM_TINY_BENCH_WARMUP8 64
+#endif
+#ifndef HAKMEM_TINY_BENCH_WARMUP16
+#define HAKMEM_TINY_BENCH_WARMUP16 96
+#endif
+#ifndef HAKMEM_TINY_BENCH_WARMUP32
+#define HAKMEM_TINY_BENCH_WARMUP32 160
+#endif
+#ifndef HAKMEM_TINY_BENCH_WARMUP64
+#define HAKMEM_TINY_BENCH_WARMUP64 192
+#endif
+
+#ifdef HAKMEM_TINY_BENCH_FASTPATH
+static __thread unsigned char g_tls_bench_warm_done[4];
+#endif
+
+#if HAKMEM_DEBUG_COUNTERS
+#define HAK_PATHDBG_INC(arr, idx) do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0)
+#define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0)
+#else
+#define HAK_PATHDBG_INC(arr, idx) do { (void)(idx); } while(0)
+#define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0)
+#endif
+// Simple scalar debug increment (no-op when HAKMEM_DEBUG_COUNTERS=0)
+#if HAKMEM_DEBUG_COUNTERS
+#define HAK_DBG_INC(var) do { (var)++; } while(0)
+#else
+#define HAK_DBG_INC(var) do { (void)0; } while(0)
+#endif
+// Return helper: record tiny alloc stat (guarded) then return pointer
+static inline void tiny_debug_track_alloc_ret(int cls, void* ptr);
+
+// ========== HAK_RET_ALLOC: Single Definition Point ==========
+// Choose implementation based on HAKMEM_TINY_HEADER_CLASSIDX
+// - Phase 7 enabled: Write header and return user pointer
+// - Phase 7 disabled: Legacy behavior (stats + route + return)
+
+#if HAKMEM_TINY_HEADER_CLASSIDX
+    #if HAKMEM_BUILD_RELEASE
+        // Phase E1-CORRECT: ALL classes have 1-byte headers (including C7)
+        // Ultra-fast inline macro (3-4 instructions)
+        #define HAK_RET_ALLOC(cls, base_ptr) do { \
+            *(uint8_t*)(base_ptr) = HEADER_MAGIC | ((cls) & HEADER_CLASS_MASK); \
+            return (void*)((uint8_t*)(base_ptr) + 1); \
+        } while(0)
+    #else
+        // Debug: Keep full validation via tiny_region_id_write_header()
+        #define HAK_RET_ALLOC(cls, ptr) return tiny_region_id_write_header((ptr), (cls))
+    #endif
+#else
+    // Legacy: Stats and routing before return
+    #ifdef HAKMEM_ENABLE_STATS
+        // Optional: sampling（ビルド時に有効化）。ホットパスは直接インライン呼び出し（間接分岐なし）。
+        #ifdef HAKMEM_TINY_STAT_SAMPLING
+            static __thread unsigned g_tls_stat_accum_alloc[TINY_NUM_CLASSES];
+            static int g_stat_rate_lg = 0;  // 0=毎回、それ以外=2^lgごと
+            static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) {
+                if (__builtin_expect(g_stat_rate_lg == 0, 1)) { stats_record_alloc(cls); return; }
+                unsigned m = (1u << g_stat_rate_lg) - 1u;
+                if (((++g_tls_stat_accum_alloc[cls]) & m) == 0u) stats_record_alloc(cls);
+            }
+        #else
+            static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { stats_record_alloc(cls); }
+        #endif
+        #define HAK_RET_ALLOC(cls, ptr) do { \
+            tiny_debug_track_alloc_ret((cls), (ptr)); \
+            hkm_stat_alloc((cls)); \
+            ROUTE_COMMIT((cls), 0x7F); \
+            return (ptr); \
+        } while(0)
+    #else
+        #define HAK_RET_ALLOC(cls, ptr) do { \
+            tiny_debug_track_alloc_ret((cls), (ptr)); \
+            ROUTE_COMMIT((cls), 0x7F); \
+            return (ptr); \
+        } while(0)
+    #endif
+#endif  // HAKMEM_TINY_HEADER_CLASSIDX
+
+// Free-side stats: compile-time zero when stats disabled
+#ifdef HAKMEM_ENABLE_STATS
+#define HAK_STAT_FREE(cls) do { stats_record_free((cls)); } while(0)
+#else
+#define HAK_STAT_FREE(cls) do { } while(0)
+#endif
+
--- a/core/hakmem_tiny_globals_box.inc
+++ b/core/hakmem_tiny_globals_box.inc
@ -0,0 +1,256 @@
+// ============================================================================
+// Global State
+// ============================================================================
+
+// Global pool instance (extern declared in hakmem_tiny.h)
+TinyPool g_tiny_pool;
+int g_tiny_initialized = 0;  // Not static (extern in header for inline access)
+// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
+// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
+//
+// Results:
+//   Phase 1 (Push - deferred free): +1 instruction, zero benefit
+//   Phase 2 (Pull - background refill): +77 instructions, -3% performance
+//
+// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
+//   - TLS Magazine capacity: 2048 items
+//   - Benchmark working set: 100 items
+//   - Magazine hit rate: 100% after warmup
+//   - Slow path never executed!
+//
+// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
+//   - glibc: ~40 instructions/op (5-7× faster)
+//   - Gap is architectural (bitmap vs free-list, research features)
+//
+// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
+// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
+// Decision: Enable by default (proven production-ready)
+static int g_wrap_tiny_enabled = 1;  // ON by default (faster than glibc!)
+// Optional: allow limited trylock-based refill during wrapper calls
+static int g_wrap_tiny_refill = 0;
+// Remote-free drain controls
+static int g_remote_drain_thresh = 32;   // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
+static int g_remote_drain_tryrate = 16;  // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)
+
+// ACE Learning Layer: Per-class remote drain thresholds
+int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};
+// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
+// Old: XOR RNG sampling (10-15 ns overhead)
+// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
+static int g_tiny_count_sample_exp = 8;   // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)
+
+// Step 2: Slab Registry (Hash Table)
+SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];
+
+PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];
+
+// Registry lock
+pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;
+
+// Phase 6.14: Runtime toggle for Registry ON/OFF (default OFF)
+// O(N) Sequential Access is faster than O(1) Random Access for Small-N (8-32 slabs)
+// Reason: L1 cache hit率 95%+ (Sequential) vs 50-70% (Random Hash)
+static int g_use_registry = 1;  // Default ON for thread-safety
+
+// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
+// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
+static int g_tiny_refill_max = 64;        // HAKMEM_TINY_REFILL_MAX (default 64)
+static int g_tiny_refill_max_hot = 192;   // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)
+
+// hakmem_tiny_tls_list.h already included at top
+static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
+static int g_tls_list_enable = 0;  // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
+static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
+static int g_fast_enable = 1;
+static int g_fastcache_enable = 1;           // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
+static uint16_t g_fast_cap[TINY_NUM_CLASSES];
+static int g_ultra_bump_shadow = 0;          // HAKMEM_TINY_BUMP_SHADOW=1
+static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];
+
+
+typedef void* (*TinyHotAllocFn)(void);
+static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
+static __thread void* g_fast_head[TINY_NUM_CLASSES];
+static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
+static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);
+
+uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
+uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
+uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
+uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
+uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
+uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];
+
+// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
+// NEW: Per-thread active slabs (up to 2 per class)
+static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
+static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];
+
+static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
+    TinySlab* cand = g_tls_active_slab_a[class_idx];
+    if (cand) {
+        uintptr_t base = (uintptr_t)cand->base;
+        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
+            return cand;
+        }
+    }
+    cand = g_tls_active_slab_b[class_idx];
+    if (cand) {
+        uintptr_t base = (uintptr_t)cand->base;
+        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
+            return cand;
+        }
+    }
+    return NULL;
+}
+
+// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
+// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line.
+extern int g_use_superslab;
+
+#if !HAKMEM_BUILD_RELEASE
+static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
+    if (!ptr) return;
+    // ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE
+    void* base_ptr = ptr ? (void*)((uint8_t*)ptr - 1) : NULL;
+    if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
+        SuperSlab* ss = hak_super_lookup(ptr);
+        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
+            tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
+        } else {
+            int slab_idx = slab_index_for(ss, base_ptr);
+            if (slab_idx < 0) {
+                tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
+            } else {
+                // Fail-Fast: class vs SuperSlab size_class must be consistent.
+    TinySlabMeta* meta = &ss->slabs[slab_idx];
+    if (meta->class_idx != (uint8_t)cls) {
+        tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
+    }
+                size_t blk = g_tiny_class_sizes[cls];
+                uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
+                uintptr_t delta = (uintptr_t)base_ptr - base;
+                if (blk == 0 || (delta % blk) != 0) {
+                    tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
+                } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) {
+                    tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
+                }
+            }
+        }
+    }
+    if (!__builtin_expect(g_debug_remote_guard, 0)) return;
+    if (!g_use_superslab) return;
+    SuperSlab* ss = hak_super_lookup(ptr);
+    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
+    int slab_idx = slab_index_for(ss, base_ptr);
+    if (slab_idx >= 0) {
+        tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
+    }
+}
+#else
+static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
+#endif
+
+// Debug counters for SuperSlab investigation
+#if HAKMEM_DEBUG_COUNTERS
+int g_superslab_alloc_count = 0;
+int g_superslab_fail_count = 0;
+int g_superslab_free_count = 0;   // Phase 7.6: Track SuperSlab frees
+int g_empty_superslab_count = 0;  // Phase 7.6: Track empty SuperSlabs detected
+int g_magazine_push_count = 0;    // Phase 7.6: Track Magazine pushes
+int g_tiny_free_with_slab_count = 0;  // Phase 7.6: Track tiny_free_with_slab calls
+#endif
+
+// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
+// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
+// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
+#define EMPTY_SUPERSLAB_RESERVE 0  // Keep up to N empty SuperSlabs per class (default)
+static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES];  // One empty SuperSlab per class
+static int g_empty_counts[TINY_NUM_CLASSES] = {0};  // Count of empty SuperSlabs
+static int g_empty_reserve = -1; // Env: HAKMEM_TINY_SS_RESERVE (default=1)
+static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
+static int g_ss_partial_enable = 1;  // Enable partial SuperSlab release by default
+static uint32_t g_ss_partial_interval = 4;
+static _Atomic uint32_t g_ss_partial_epoch = 0;
+
+// Phase 6.24: Unified TLS slab cache (Medium fix)
+// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
+// Phase E4: 64B alignment for L1 cache optimization
+__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64)));
+static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
+static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
+static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
+static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
+static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
+static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
+static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];
+
+// ----------------------------------------------------------------------------
+// Per-class partial SuperSlab slot (single-slot publish/adopt)
+// ----------------------------------------------------------------------------
+// Small ring of partial SuperSlabs per class (publish/adopt)
+#ifndef SS_PARTIAL_RING
+#define SS_PARTIAL_RING 64
+#endif
+static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
+static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
+static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];
+static __thread int g_tls_adopt_cd[TINY_NUM_CLASSES];
+static int g_adopt_cool_period = -1; // env: HAKMEM_TINY_SS_ADOPT_COOLDOWN
+
+// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
+unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
+unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};
+_Atomic int g_ss_remote_seen = 0;  // becomes 1 when any remote free occurs
+static int g_ss_adopt_env = -2;    // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
+static _Atomic int g_ss_adopt_runtime = 0;  // 0=inactive, 1=active
+static _Atomic int g_ss_adopt_log_once = 0;
+
+static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
+    if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
+        fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
+                reason ? reason : "unknown", class_idx);
+    }
+}
+
+static inline void tiny_adopt_gate_parse_env(void) {
+    if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
+        const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
+        if (!env || *env == '\0') {
+            g_ss_adopt_env = 0;  // auto
+        } else if (*env == '0') {
+            g_ss_adopt_env = -1; // forced OFF
+            atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
+        } else {
+            g_ss_adopt_env = 1;  // forced ON
+            atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
+            tiny_adopt_gate_log_activation("env", -1);
+        }
+    }
+}
+
+int tiny_adopt_gate_should_publish(void) {
+    tiny_adopt_gate_parse_env();
+    if (g_ss_adopt_env == 1) return 1;
+    if (g_ss_adopt_env == -1) return 0;
+    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
+}
+
+int tiny_adopt_gate_should_adopt(void) {
+    tiny_adopt_gate_parse_env();
+    if (g_ss_adopt_env == 1) return 1;
+    if (g_ss_adopt_env == -1) return 0;
+    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
+}
+
+void tiny_adopt_gate_on_remote_seen(int class_idx) {
+    tiny_adopt_gate_parse_env();
+    atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
+    if (g_ss_adopt_env == -1) return;
+    int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
+    if (prev == 0) {
+        tiny_adopt_gate_log_activation("remote", class_idx);
+    }
+}
+
+// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers
--- a/core/hakmem_tiny_phase6_wrappers_box.inc
+++ b/core/hakmem_tiny_phase6_wrappers_box.inc
@ -0,0 +1,122 @@
+    // Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check
+#if HAKMEM_TINY_PHASE6_BOX_REFACTOR
+    #if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
+        #error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options"
+    #endif
+
+    // Box 1: Atomic Operations (Layer 0 - Foundation)
+    #include "tiny_atomic.h"
+
+    // Box 5: Allocation Fast Path (Layer 1 - 3-4 instructions)
+    #include "tiny_alloc_fast.inc.h"
+
+    // Box 6: Free Fast Path (Layer 2 - 2-3 instructions)
+    #include "tiny_free_fast.inc.h"
+
+    // ---------------- Refill count (Front) global config ----------------
+    // Parsed once at init; hot path reads plain ints (no getenv).
+    int g_refill_count_global = 0;              // HAKMEM_TINY_REFILL_COUNT
+    int g_refill_count_hot = 0;                 // HAKMEM_TINY_REFILL_COUNT_HOT
+    int g_refill_count_mid = 0;                 // HAKMEM_TINY_REFILL_COUNT_MID
+    int g_refill_count_class[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_REFILL_COUNT_C{0..7}
+
+    // Export wrapper functions for hakmem.c to call
+    // Phase 6-1.7 Optimization: Remove diagnostic overhead, rely on LTO for inlining
+    void* hak_tiny_alloc_fast_wrapper(size_t size) {
+        // Phase E5: Ultra fast path (8-instruction alloc, bypasses all layers)
+        // Enable with: HAKMEM_ULTRA_FAST_PATH=1 (compile-time)
+        #if HAKMEM_ULTRA_FAST_PATH
+            void* ret = tiny_alloc_fast_ultra(size);
+            if (ret) return ret;
+            // Miss → fallback to full fast path
+        #endif
+
+        // Bench-only ultra-short path: bypass diagnostics and pointer tracking
+        // Enable with: HAKMEM_BENCH_FAST_FRONT=1
+        static int g_bench_fast_front = -1;
+        if (__builtin_expect(g_bench_fast_front == -1, 0)) {
+            const char* e = getenv("HAKMEM_BENCH_FAST_FRONT");
+            g_bench_fast_front = (e && *e && *e != '0') ? 1 : 0;
+        }
+        if (__builtin_expect(g_bench_fast_front, 0)) {
+            return tiny_alloc_fast(size);
+        }
+
+        static _Atomic uint64_t wrapper_call_count = 0;
+        uint64_t call_num = atomic_fetch_add(&wrapper_call_count, 1);
+
+        // Pointer tracking init (first call only)
+        PTR_TRACK_INIT();
+
+        // PRIORITY 3: Periodic canary validation (every 1000 ops)
+        periodic_canary_check(call_num, "hak_tiny_alloc_fast_wrapper");
+
+        // Box I: Periodic full integrity check (every 5000 ops)
+        #if HAKMEM_INTEGRITY_LEVEL >= 3
+        if ((call_num % 5000) == 0) {
+            extern void integrity_periodic_full_check(const char*);
+            integrity_periodic_full_check("periodic check in alloc wrapper");
+        }
+        #endif
+
+        #if !HAKMEM_BUILD_RELEASE
+        if (call_num > 14250 && call_num < 14280 && size <= 1024) {
+            fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu size=%zu\n", call_num, size);
+            fflush(stderr);
+        }
+        #endif
+
+        void* result = tiny_alloc_fast(size);
+
+        #if !HAKMEM_BUILD_RELEASE
+        if (call_num > 14250 && call_num < 14280 && size <= 1024) {
+            fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu returned %p\n", call_num, result);
+            fflush(stderr);
+        }
+        #endif
+        return result;
+    }
+
+    void hak_tiny_free_fast_wrapper(void* ptr) {
+        // Phase E5: Ultra fast path (6-8 instruction free)
+        #if HAKMEM_ULTRA_FAST_PATH
+            tiny_free_fast_ultra(ptr);
+            return;
+        #endif
+
+        static _Atomic uint64_t free_call_count = 0;
+        uint64_t call_num = atomic_fetch_add(&free_call_count, 1);
+        if (call_num > 14135 && call_num < 14145) {
+            fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu ptr=%p\n", call_num, ptr);
+            fflush(stderr);
+        }
+        tiny_free_fast(ptr);
+        if (call_num > 14135 && call_num < 14145) {
+            fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu completed\n", call_num);
+            fflush(stderr);
+        }
+    }
+
+#elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
+    // Phase 6-1.5: Alignment guessing (legacy)
+
+    // Refill count globals (needed for compatibility)
+    int g_refill_count_global = 0;
+    int g_refill_count_hot = 0;
+    int g_refill_count_mid = 0;
+    int g_refill_count_class[TINY_NUM_CLASSES] = {0};
+
+    #include "hakmem_tiny_ultra_simple.inc"
+
+    // Wrapper functions for hakmem.c compatibility (not used in ULTRA_SIMPLE but needed for linking)
+    void* hak_tiny_alloc_fast_wrapper(size_t size) {
+        return hak_tiny_alloc_ultra_simple(size);
+    }
+
+    void hak_tiny_free_fast_wrapper(void* ptr) {
+        hak_tiny_free_ultra_simple(ptr);
+    }
+#elif defined(HAKMEM_TINY_PHASE6_METADATA)
+    // Phase 6-1.6: Metadata header (recommended)
+    #include "hakmem_tiny_metadata.inc"
+#endif
--- a/core/hakmem_tiny_publish_box.inc
+++ b/core/hakmem_tiny_publish_box.inc
@ -0,0 +1,419 @@
+// hakmem_tiny_publish_box.inc
+// Box: Publish/adopt instrumentation, bench mailboxes, and TLS target helpers.
+// Extracted from hakmem_tiny.c to keep hot-path logic focused.
+
+// TLS hint: last adopted SuperSlab/slab to avoid rescans
+#include "tiny_sticky.h"
+
+// Mailbox box
+#include "box/mailbox_box.h"
+
+// Publish pipeline counters (visibility)
+unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES]       = {0};
+unsigned long long g_pub_same_empty[TINY_NUM_CLASSES]         = {0};
+unsigned long long g_remote_transitions[TINY_NUM_CLASSES]     = {0};
+unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0};
+unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0};
+
+// Slab-ring counters (debug)
+unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0};
+unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0};
+unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES]    = {0};
+
+// Slab entry encoding helpers (used by Bench/Slab-ring paths)
+static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) {
+    return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu);
+}
+static inline SuperSlab* slab_entry_ss(uintptr_t ent) {
+    // SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base
+    return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
+}
+static inline int slab_entry_idx(uintptr_t ent) {
+    return (int)(ent & 0x3Fu);
+}
+
+// ----------------------------------------------------------------------------
+// Bench Mode Publish Mailbox (single-slot per class)
+// ----------------------------------------------------------------------------
+static int g_bench_mode = -1; // env: HAKMEM_TINY_BENCH_MODE=1
+static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES];
+#ifndef BENCH_MAILBOX_WIDTH
+#define BENCH_MAILBOX_WIDTH 16
+#endif
+static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH];
+
+static inline int bench_mode_enabled(void) {
+    if (__builtin_expect(g_bench_mode == -1, 0)) {
+        const char* b = getenv("HAKMEM_TINY_BENCH_MODE");
+        g_bench_mode = (b && atoi(b) != 0) ? 1 : 0;
+    }
+    return g_bench_mode;
+}
+
+static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) {
+    if (!bench_mode_enabled()) return;
+    uintptr_t ent = slab_entry_make(ss, slab_idx);
+    uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed);
+    idx &= (BENCH_MAILBOX_WIDTH - 1);
+    atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release);
+}
+
+static inline uintptr_t bench_pub_pop(int class_idx) {
+    if (!bench_mode_enabled()) return (uintptr_t)0;
+    for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) {
+        uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
+        if (ent) return ent;
+    }
+    return 0;
+}
+
+// ----------------------------------------------------------------------------
+// Slab-Granular Partial Publish/Adopt (encoded entries)
+// ----------------------------------------------------------------------------
+#ifndef SLAB_PARTIAL_RING
+#define SLAB_PARTIAL_RING 128
+#endif
+static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING];
+static _Atomic(uint32_t)  g_slab_partial_rr2[TINY_NUM_CLASSES];
+
+// ----------------------------------------------------------------------------
+// Refill-stage counters (per class)
+// ----------------------------------------------------------------------------
+unsigned long long g_rf_total_calls[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES]     = {0};
+unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES]       = {0};
+unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES]     = {0};
+unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES]      = {0};
+unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES]      = {0};
+unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES]        = {0};
+unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES]       = {0};
+unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES]    = {0};
+
+// Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0)
+unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0};
+unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0};
+unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0};
+
+// Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE)
+unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0};
+unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0};
+unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES]  = {0};
+unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES]  = {0};
+unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES]    = {0};
+unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES]  = {0};
+
+// Refill item source breakdown (freelist vs carve)
+unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0};
+unsigned long long g_rf_carve_items[TINY_NUM_CLASSES]    = {0};
+
+static int g_rf_trace_en = -1;
+static inline int rf_trace_enabled(void) {
+    if (__builtin_expect(g_rf_trace_en == -1, 0)) {
+        const char* e = getenv("HAKMEM_TINY_RF_TRACE");
+        g_rf_trace_en = (e && atoi(e) != 0) ? 1 : 0;
+    }
+    return g_rf_trace_en;
+}
+
+static inline unsigned long long rf_now_ns(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec;
+}
+
+// Publish-side counters (debug)
+unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0};
+unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES]  = {0};
+
+// Free pipeline counters
+unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES]  = {0};
+unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0};
+unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_free_via_mag[TINY_NUM_CLASSES]       = {0};
+
+// Front Gate Breakdown (debug counters)
+unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_front_sll_hit[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0};
+unsigned long long g_front_mag_hit[TINY_NUM_CLASSES]   = {0};
+
+// Free-side trigger counters
+unsigned long long g_first_free_transitions[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES]  = {0};
+
+// Adopt/Registry gate counters
+unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0};
+unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES]  = {0};
+unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES]      = {0};
+unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES]  = {0};
+unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0};
+unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_fast_push_hits[TINY_NUM_CLASSES]     = {0};
+unsigned long long g_fast_push_full[TINY_NUM_CLASSES]     = {0};
+unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0};
+unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0};
+unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0};
+unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0};
+unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES]      = {0};
+unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES]      = {0};
+unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES]         = {0};
+unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES]   = {0};
+unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES]     = {0};
+unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES]           = {0};
+unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES]         = {0};
+unsigned long long g_fast_lookup_none                            = 0;
+
+// ----------------------------------------------------------------------------
+// Live Superslab cap (must-adopt-before-mmap support)
+// ----------------------------------------------------------------------------
+static int g_live_cap_env = -2; // -2=unparsed, -1=disabled, >=0=cap value
+__thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0};
+static inline int live_cap_for_class(int class_idx) {
+    if (__builtin_expect(g_live_cap_env == -2, 0)) {
+        const char* s = getenv("HAKMEM_SS_LIVE_CAP");
+        if (!s) g_live_cap_env = -1; else { int v = atoi(s); g_live_cap_env = (v>0? v : -1); }
+    }
+    (void)class_idx;
+    return g_live_cap_env;
+}
+
+// ----------------------------------------------------------------------------
+// Hot Slot (global simple path)
+// ----------------------------------------------------------------------------
+static int g_hot_slot_en = -1; // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot)
+static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES];
+static inline int hot_slot_enabled(void) {
+    if (__builtin_expect(g_hot_slot_en == -1, 0)) {
+        const char* s = getenv("HAKMEM_HOT_SLOT");
+        g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0;
+    }
+    return g_hot_slot_en || bench_mode_enabled();
+}
+static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) {
+    if (!hot_slot_enabled()) return;
+    uintptr_t ent = slab_entry_make(ss, slab_idx);
+    atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release);
+}
+static inline uintptr_t hot_slot_pop(int class_idx) {
+    if (!hot_slot_enabled()) return (uintptr_t)0;
+    return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel);
+}
+
+static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) {
+    if (!ss) return;
+    uintptr_t ent = slab_entry_make(ss, slab_idx);
+    for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
+        uintptr_t expected = 0;
+        if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent,
+                                                    memory_order_release, memory_order_relaxed)) {
+            g_slab_publish_dbg[class_idx]++;
+            return;
+        }
+    }
+    // Ring full: round-robin replace and try to requeue the displaced entry into another empty slot
+    uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
+    uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel);
+    if (old) {
+        for (int t = 0; t < 8; t++) {
+            uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
+            uintptr_t expected = 0;
+            if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old,
+                                                      memory_order_release, memory_order_relaxed)) {
+                g_slab_requeue_dbg[class_idx]++;
+                old = 0; break;
+            }
+        }
+    }
+    g_slab_publish_dbg[class_idx]++;
+}
+
+static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) {
+    for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
+        uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
+        if (ent) return ent;
+    }
+    return 0;
+}
+
+void ss_partial_publish(int class_idx, SuperSlab* ss) {
+    if (!ss) return;
+    // Gate by listed flag to avoid repeated publishes of the same SS
+    unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel);
+    if (prev != 0u) return; // already listed
+
+    // CRITICAL: Release ownership of all slabs so adopters can claim them!
+    // Without this, ss_owner_try_acquire() will fail (requires owner_tid==0).
+    // The publishing thread must stop using this SS after publishing.
+    int cap_pub = ss_slabs_capacity(ss);
+    for (int s = 0; s < cap_pub; s++) {
+        // TODO Phase 3d-B: Add atomic accessor when implementing Hot/Cold split
+        TinySlabMeta* meta = ss_slab_meta_ptr(ss, s);
+        uint8_t prev = __atomic_exchange_n(&meta->owner_tid_low, 0u, __ATOMIC_RELEASE);
+        if (__builtin_expect(g_debug_remote_guard && prev != 0u, 0)) {
+            uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev;
+            tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE,
+                                   (uint16_t)ss_slab_meta_class_idx_get(ss, s),
+                                   meta,
+                                   aux);
+        }
+    }
+
+    // CRITICAL: Unbind current thread's TLS if it points to this SS!
+    // Otherwise, the publishing thread will continue allocating from the published SS,
+    // racing with adopters who acquire ownership.
+    extern __thread TinyTLSSlab g_tls_slabs[];
+    if (g_tls_slabs[class_idx].ss == ss) {
+        g_tls_slabs[class_idx].ss = NULL;
+        g_tls_slabs[class_idx].meta = NULL;
+        g_tls_slabs[class_idx].slab_base = NULL;
+        g_tls_slabs[class_idx].slab_idx = 0;
+    }
+
+    // Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1)
+    int best = -1; uint32_t best_score = 0;
+    for (int s = 0; s < cap_pub; s++) {
+        TinySlabMeta* m = &ss->slabs[s];
+        uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed);
+        int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0);
+        unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u;
+        uint32_t score = rc
+                        + (m->freelist ? (1u<<30) : 0u)
+                        + (listed ? (1u<<29) : 0u)
+                        + (has_remote ? 1u : 0u);
+        if (score > best_score) { best_score = score; best = s; }
+    }
+    if (best >= 0 && best < 256) {
+        ss->publish_hint = (uint8_t)best;
+        // Box: Ready push — provide slab-level candidate to adopters
+        tiny_ready_push(class_idx, ss, best);
+    } else {
+        ss->publish_hint = 0xFF;
+    }
+    for (int i = 0; i < SS_PARTIAL_RING; i++) {
+        SuperSlab* expected = NULL;
+        if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss,
+                                                    memory_order_release, memory_order_relaxed)) {
+            g_ss_publish_dbg[class_idx]++;
+            return;  // published
+        }
+    }
+    // Ring full: replace one entry in round-robin to avoid dropping supply
+    uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed);
+    idx %= SS_PARTIAL_RING;
+    SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel);
+    if (old) {
+        // NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads
+        // that just adopted from it. Draining without ownership checks causes freelist corruption.
+        // The adopter will drain when needed (with proper ownership checks in tiny_refill.h).
+        //
+        // Previous code (UNSAFE):
+        //   for (int s = 0; s < cap; s++) {
+        //       ss_remote_drain_to_freelist(old, s);  // ← Race with concurrent adopter!
+        //   }
+
+        // Keep listed=1 while in overflow so it stays eligible for adopt
+        // Push old into overflow stack (待機箱)
+        SuperSlab* head;
+        do {
+            head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
+            old->partial_next = head;
+        } while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old,
+                                                        memory_order_release, memory_order_relaxed));
+    }
+    g_ss_publish_dbg[class_idx]++;
+}
+
+SuperSlab* ss_partial_adopt(int class_idx) {
+    for (int i = 0; i < SS_PARTIAL_RING; i++) {
+        SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel);
+        if (ss) {
+            // Clear listed flag on adopt to allow future publish of this SS
+            atomic_store_explicit(&ss->listed, 0u, memory_order_release);
+            g_ss_adopt_dbg[class_idx]++;
+            return ss;
+        }
+    }
+    // Fallback: adopt from overflow stack (LIFO)
+    while (1) {
+        SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
+        if (!head) break;
+        SuperSlab* next = head->partial_next;
+        if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next,
+                                                  memory_order_acq_rel, memory_order_relaxed)) {
+            atomic_store_explicit(&head->listed, 0u, memory_order_release);
+            g_ss_adopt_dbg[class_idx]++;
+            return head;
+        }
+    }
+    return NULL;
+}
+
+static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) {
+    // Canonical binding under Phase 12:
+    // - Per-slab TinySlabMeta.class_idx defines class for this slab
+    // - slab_idx is the owning slab index within ss
+    // - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx)
+    tls->ss = ss;
+    tls->slab_idx = (uint8_t)slab_idx;
+    tls->meta = &ss->slabs[slab_idx];
+    tls->slab_base = tiny_slab_base_for(ss, slab_idx);
+}
+
+static inline uint32_t tiny_tls_default_refill(uint32_t cap) {
+    if (cap == 0u) return 8u;
+    uint32_t low = (cap >= 32u) ? (cap / 4u) : 8u;
+    if (low < 4u) low = 4u;
+    return low;
+}
+
+static inline uint32_t tiny_tls_default_spill(uint32_t cap) {
+    if (cap == 0u) return 0u;
+    uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u);
+    if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP;
+    if (spill < cap) spill = cap;
+    return (uint32_t)spill;
+}
+
+static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) {
+    atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release);
+    atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed);
+    atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed);
+    atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
+}
+
+static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) {
+    atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release);
+    atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
+}
+
+static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) {
+    uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire);
+    if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) {
+        return;
+    }
+    uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire);
+    if (target_cap != 0u && tls->cap != target_cap) {
+        tls->cap = target_cap;
+        uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed);
+        if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap);
+        tls->refill_low = target_refill;
+        uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed);
+        if (target_spill < target_cap) target_spill = target_cap;
+        tls->spill_high = target_spill;
+    }
+    uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire);
+    if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) {
+        g_tls_trim_seen[class_idx] = trim_epoch;
+        if (tls->count > tls->cap) {
+            tls_list_spill_excess(class_idx, tls);
+        }
+    }
+    g_tls_param_seen[class_idx] = seq;
+}
+