diff --git a/Makefile b/Makefile index b38a826a..97e7bd67 100644 --- a/Makefile +++ b/Makefile @@ -191,12 +191,12 @@ LDFLAGS += $(EXTRA_LDFLAGS) # Targets TARGET = test_hakmem -OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o +OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o superslab_allocate.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o superslab_head.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o OBJS = $(OBJS_BASE) # Shared library SHARED_LIB = libhakmem.so -SHARED_OBJS = hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o hakmem_ucb1_shared.o hakmem_bigcache_shared.o hakmem_pool_shared.o hakmem_l25_pool_shared.o hakmem_site_rules_shared.o hakmem_tiny_shared.o 
hakmem_tiny_superslab_shared.o hakmem_smallmid_shared.o hakmem_smallmid_superslab_shared.o core/box/superslab_expansion_box_shared.o core/box/integrity_box_shared.o core/box/mailbox_box_shared.o core/box/front_gate_box_shared.o core/box/front_gate_classifier_shared.o core/box/free_local_box_shared.o core/box/free_remote_box_shared.o core/box/free_publish_box_shared.o core/box/capacity_box_shared.o core/box/carve_push_box_shared.o core/box/unified_batch_box_shared.o core/box/prewarm_box_shared.o core/box/ss_hot_prewarm_box_shared.o core/box/front_metrics_box_shared.o core/box/bench_fast_box_shared.o core/box/pagefault_telemetry_box_shared.o core/box/tiny_sizeclass_hist_box_shared.o core/page_arena_shared.o core/front/tiny_unified_cache_shared.o core/tiny_alloc_fast_push_shared.o core/link_stubs_shared.o core/tiny_failfast_shared.o tiny_sticky_shared.o tiny_remote_shared.o tiny_publish_shared.o tiny_debug_ring_shared.o hakmem_tiny_magazine_shared.o hakmem_tiny_stats_shared.o hakmem_tiny_sfc_shared.o hakmem_tiny_query_shared.o hakmem_tiny_rss_shared.o hakmem_tiny_registry_shared.o hakmem_tiny_remote_target_shared.o hakmem_tiny_bg_spill_shared.o tiny_adaptive_sizing_shared.o hakmem_mid_mt_shared.o hakmem_super_registry_shared.o hakmem_shared_pool_shared.o hakmem_elo_shared.o hakmem_batch_shared.o hakmem_p2_shared.o hakmem_sizeclass_dist_shared.o hakmem_evo_shared.o hakmem_debug_shared.o hakmem_sys_shared.o hakmem_whale_shared.o hakmem_policy_shared.o hakmem_ace_shared.o hakmem_ace_stats_shared.o hakmem_ace_controller_shared.o hakmem_ace_metrics_shared.o hakmem_ace_ucb1_shared.o hakmem_prof_shared.o hakmem_learner_shared.o hakmem_size_hist_shared.o hakmem_learn_log_shared.o hakmem_syscall_shared.o tiny_fastcache_shared.o +SHARED_OBJS = hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o hakmem_ucb1_shared.o hakmem_bigcache_shared.o hakmem_pool_shared.o hakmem_l25_pool_shared.o hakmem_site_rules_shared.o hakmem_tiny_shared.o superslab_allocate_shared.o superslab_stats_shared.o superslab_cache_shared.o superslab_ace_shared.o superslab_slab_shared.o superslab_backend_shared.o superslab_head_shared.o hakmem_smallmid_shared.o hakmem_smallmid_superslab_shared.o core/box/superslab_expansion_box_shared.o core/box/integrity_box_shared.o core/box/mailbox_box_shared.o core/box/front_gate_box_shared.o core/box/front_gate_classifier_shared.o core/box/free_local_box_shared.o core/box/free_remote_box_shared.o core/box/free_publish_box_shared.o core/box/capacity_box_shared.o core/box/carve_push_box_shared.o core/box/unified_batch_box_shared.o core/box/prewarm_box_shared.o core/box/ss_hot_prewarm_box_shared.o core/box/front_metrics_box_shared.o core/box/bench_fast_box_shared.o core/box/pagefault_telemetry_box_shared.o core/box/tiny_sizeclass_hist_box_shared.o core/page_arena_shared.o core/front/tiny_unified_cache_shared.o core/tiny_alloc_fast_push_shared.o core/link_stubs_shared.o core/tiny_failfast_shared.o tiny_sticky_shared.o tiny_remote_shared.o tiny_publish_shared.o tiny_debug_ring_shared.o hakmem_tiny_magazine_shared.o hakmem_tiny_stats_shared.o hakmem_tiny_sfc_shared.o hakmem_tiny_query_shared.o hakmem_tiny_rss_shared.o hakmem_tiny_registry_shared.o hakmem_tiny_remote_target_shared.o hakmem_tiny_bg_spill_shared.o tiny_adaptive_sizing_shared.o hakmem_mid_mt_shared.o hakmem_super_registry_shared.o hakmem_shared_pool_shared.o hakmem_elo_shared.o hakmem_batch_shared.o hakmem_p2_shared.o hakmem_sizeclass_dist_shared.o hakmem_evo_shared.o hakmem_debug_shared.o hakmem_sys_shared.o 
hakmem_whale_shared.o hakmem_policy_shared.o hakmem_ace_shared.o hakmem_ace_stats_shared.o hakmem_ace_controller_shared.o hakmem_ace_metrics_shared.o hakmem_ace_ucb1_shared.o hakmem_prof_shared.o hakmem_learner_shared.o hakmem_size_hist_shared.o hakmem_learn_log_shared.o hakmem_syscall_shared.o tiny_fastcache_shared.o # Pool TLS Phase 1 (enable with POOL_TLS_PHASE1=1) ifeq ($(POOL_TLS_PHASE1),1) @@ -223,7 +223,7 @@ endif # Benchmark targets BENCH_HAKMEM = bench_allocators_hakmem BENCH_SYSTEM = bench_allocators_system -BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o +BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o superslab_allocate.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o superslab_head.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o 
core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE) ifeq ($(POOL_TLS_PHASE1),1) BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o @@ -400,7 +400,7 @@ test-box-refactor: box-refactor ./larson_hakmem 10 8 128 1024 1 12345 4 # Phase 4: Tiny Pool benchmarks (properly linked with hakmem) -TINY_BENCH_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/tiny_sizeclass_hist_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_unified_cache.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o +TINY_BENCH_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o superslab_allocate.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o superslab_head.o hakmem_smallmid.o hakmem_smallmid_superslab.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/tiny_sizeclass_hist_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_unified_cache.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o TINY_BENCH_OBJS = 
$(TINY_BENCH_OBJS_BASE) ifeq ($(POOL_TLS_PHASE1),1) TINY_BENCH_OBJS += pool_tls.o pool_refill.o core/pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o diff --git a/core/box/ss_legacy_backend_box.c b/core/box/ss_legacy_backend_box.c index 24f42915..f8e6783a 100644 --- a/core/box/ss_legacy_backend_box.c +++ b/core/box/ss_legacy_backend_box.c @@ -5,6 +5,7 @@ #include "ss_allocation_box.h" #include "hakmem_tiny_config.h" #include "hakmem_tiny.h" // For tiny_self_u32 +#include "../tiny_region_id.h" // For tiny_region_id_write_header #include #include #include @@ -88,7 +89,11 @@ void* hak_tiny_alloc_superslab_backend_hint(int class_idx) g_ss_legacy_hint_ss[class_idx] = NULL; } +#if HAKMEM_TINY_HEADER_CLASSIDX + return tiny_region_id_write_header(base, class_idx); +#else return (void*)base; +#endif } // ============================================================================ @@ -156,7 +161,11 @@ void* hak_tiny_alloc_superslab_backend_legacy(int class_idx) hak_tiny_ss_hint_record(class_idx, chunk, slab_idx); meta->used++; atomic_fetch_add_explicit(&chunk->total_active_blocks, 1, memory_order_relaxed); +#if HAKMEM_TINY_HEADER_CLASSIDX + return tiny_region_id_write_header(base, class_idx); +#else return (void*)base; +#endif } } chunk = chunk->next_chunk; @@ -197,7 +206,11 @@ void* hak_tiny_alloc_superslab_backend_legacy(int class_idx) hak_tiny_ss_hint_record(class_idx, new_chunk, slab_idx); meta->used++; atomic_fetch_add_explicit(&new_chunk->total_active_blocks, 1, memory_order_relaxed); +#if HAKMEM_TINY_HEADER_CLASSIDX + return tiny_region_id_write_header(base, class_idx); +#else return (void*)base; +#endif } } diff --git a/core/box/ss_unified_backend_box.c b/core/box/ss_unified_backend_box.c index 7151c874..2bde19e7 100644 --- a/core/box/ss_unified_backend_box.c +++ b/core/box/ss_unified_backend_box.c @@ -7,6 +7,7 @@ #include "hakmem_shared_pool.h" #include "hakmem_tiny_config.h" #include "ss_allocation_box.h" +#include "../tiny_region_id.h" // For tiny_region_id_write_header #include #include @@ -109,7 +110,11 @@ void* hak_tiny_alloc_superslab_backend_shared(int class_idx) hak_tiny_ss_hint_record(class_idx, ss, slab_idx); +#if HAKMEM_TINY_HEADER_CLASSIDX + return tiny_region_id_write_header(base, class_idx); +#else return (void*)base; +#endif } // ============================================================================ diff --git a/core/hakmem_tiny_superslab.c b/core/hakmem_tiny_superslab.c deleted file mode 100644 index 43f05342..00000000 --- a/core/hakmem_tiny_superslab.c +++ /dev/null @@ -1,1521 +0,0 @@ -// hakmem_tiny_superslab.c - SuperSlab allocator implementation (Phase 6.22) -// Purpose: 2MB aligned slab allocation with fast pointer→slab lookup -// License: MIT -// Date: 2025-10-24 - -#include "hakmem_tiny_superslab.h" -#include "box/ss_hot_cold_box.h" // Phase 3d-C: Hot/Cold Split -#include "hakmem_super_registry.h" // Phase 1: Registry integration -#include "hakmem_tiny.h" // For tiny_self_u32 -#include "hakmem_tiny_config.h" // For extern g_tiny_class_sizes -#include "hakmem_shared_pool.h" // Phase 12: Shared SuperSlab pool backend (skeleton) -#include -#include -#include -#include -#include -#include // getenv, atoi -#include -#include -#include // getrlimit for OOM diagnostics -#include -#include "hakmem_internal.h" // HAKMEM_LOG for release-silent logging -#include "tiny_region_id.h" // For HEADER_MAGIC / HEADER_CLASS_MASK (restore header on remote-drain) -#include "hakmem_tiny_integrity.h" // HAK_CHECK_CLASS_IDX -#include "box/tiny_next_ptr_box.h" // For 
tiny_next_write -#include "box/slab_freelist_atomic.h" // Phase 1: Atomic freelist accessor - -static int g_ss_force_lg = -1; -static _Atomic int g_ss_populate_once = 0; - -// Forward: decide next SuperSlab lg for a class (ACE-aware, clamped) -static inline uint8_t hak_tiny_superslab_next_lg(int class_idx) -{ - if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { - return SUPERSLAB_LG_DEFAULT; - } - // Prefer ACE target if within allowed range - uint8_t t = atomic_load_explicit((_Atomic uint8_t*)&g_ss_ace[class_idx].target_lg, - memory_order_relaxed); - if (t < SUPERSLAB_LG_MIN || t > SUPERSLAB_LG_MAX) { - return SUPERSLAB_LG_DEFAULT; - } - return t; -} - -// ============================================================================ -// Global Statistics -// ============================================================================ - -static pthread_mutex_t g_superslab_lock = PTHREAD_MUTEX_INITIALIZER; -uint64_t g_superslabs_allocated = 0; // Non-static for debugging -uint64_t g_superslabs_freed = 0; // Phase 7.6: Non-static for test access -uint64_t g_bytes_allocated = 0; // Non-static for debugging - -// ============================================================================ -// Phase 2a: Dynamic Expansion - Global per-class SuperSlabHeads -// ============================================================================ - -SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS] = {NULL}; - -// Debug counters -_Atomic uint64_t g_ss_active_dec_calls = 0; -_Atomic uint64_t g_hak_tiny_free_calls = 0; -_Atomic uint64_t g_ss_remote_push_calls = 0; -// Free path instrumentation (lightweight, for OOM/route diagnosis) -_Atomic uint64_t g_free_ss_enter = 0; // hak_tiny_free_superslab() entries -_Atomic uint64_t g_free_local_box_calls = 0; // same-thread freelist pushes -_Atomic uint64_t g_free_remote_box_calls = 0; // cross-thread remote pushes -// Per-class counters for gating/metrics (Tiny classes = 8) -uint64_t g_ss_alloc_by_class[8] = {0}; -uint64_t g_ss_freed_by_class[8] = {0}; - -typedef struct SuperslabCacheEntry { - struct SuperslabCacheEntry* next; -} SuperslabCacheEntry; - -static SuperslabCacheEntry* g_ss_cache_head[8] = {0}; -static size_t g_ss_cache_count[8] = {0}; -static size_t g_ss_cache_cap[8] = {0}; -static size_t g_ss_precharge_target[8] = {0}; -static _Atomic int g_ss_precharge_done[8] = {0}; -static int g_ss_cache_enabled = 0; - -static pthread_once_t g_ss_cache_once = PTHREAD_ONCE_INIT; -static pthread_mutex_t g_ss_cache_lock[8]; - -uint64_t g_ss_cache_hits[8] = {0}; -uint64_t g_ss_cache_misses[8] = {0}; -uint64_t g_ss_cache_puts[8] = {0}; -uint64_t g_ss_cache_drops[8] = {0}; -uint64_t g_ss_cache_precharged[8] = {0}; - -uint64_t g_superslabs_reused = 0; -uint64_t g_superslabs_cached = 0; - -static void ss_cache_global_init(void) { - for (int i = 0; i < 8; i++) { - pthread_mutex_init(&g_ss_cache_lock[i], NULL); - } -} - -static inline void ss_cache_ensure_init(void) { - pthread_once(&g_ss_cache_once, ss_cache_global_init); -} - -static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate); -static void ss_cache_precharge(uint8_t size_class, size_t ss_size, uintptr_t ss_mask); -static SuperslabCacheEntry* ss_cache_pop(uint8_t size_class); -static int ss_cache_push(uint8_t size_class, SuperSlab* ss); - -// Drain remote MPSC stack into freelist (ownership already verified by caller) -void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMeta* meta) -{ - if (!ss || slab_idx < 0 || slab_idx >= 
ss_slabs_capacity(ss) || !meta) return; - - static _Atomic uint32_t g_remote_drain_diag_once = 0; - static int g_remote_drain_diag_en = -1; - - // Atomically take the whole remote list - uintptr_t head = atomic_exchange_explicit(&ss->remote_heads[slab_idx], 0, - memory_order_acq_rel); - if (head == 0) return; - - // Convert remote stack (offset 0 next) into freelist encoding via Box API - // and splice in front of current freelist preserving relative order. - void* prev = meta->freelist; - int cls = (int)meta->class_idx; - HAK_CHECK_CLASS_IDX(cls, "_ss_remote_drain_to_freelist_unsafe"); - if (__builtin_expect(cls < 0 || cls >= TINY_NUM_CLASSES, 0)) { - static _Atomic int g_remote_drain_cls_oob = 0; - if (atomic_fetch_add_explicit(&g_remote_drain_cls_oob, 1, memory_order_relaxed) == 0) { - fprintf(stderr, - "[REMOTE_DRAIN_CLASS_OOB] ss=%p slab_idx=%d meta=%p cls=%d head=%#lx\n", - (void*)ss, slab_idx, (void*)meta, cls, (unsigned long)head); - } - return; - } - uintptr_t cur = head; - while (cur != 0) { - uintptr_t next = *(uintptr_t*)cur; // remote-next stored at offset 0 -#if !HAKMEM_BUILD_RELEASE - if (__builtin_expect(g_remote_drain_diag_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_SLL_DIAG"); - g_remote_drain_diag_en = (e && *e && *e != '0') ? 1 : 0; - } -#else - if (__builtin_expect(g_remote_drain_diag_en == -1, 0)) { - g_remote_drain_diag_en = 0; - } -#endif - if (__builtin_expect(g_remote_drain_diag_en, 0)) { - uintptr_t addr = (uintptr_t)next; - if (addr != 0 && (addr < 4096 || addr > 0x00007fffffffffffULL)) { - uint32_t shot = atomic_fetch_add_explicit(&g_remote_drain_diag_once, 1, memory_order_relaxed); - if (shot < 8) { - fprintf(stderr, - "[REMOTE_DRAIN_NEXT_INVALID] cls=%d slab=%d cur=%p next=%p head=%#lx prev=%p count=%u\n", - cls, - slab_idx, - (void*)cur, - (void*)next, - (unsigned long)head, - prev, - (unsigned)meta->used); - } - } -#if HAKMEM_TINY_HEADER_CLASSIDX - int hdr_cls = tiny_region_id_read_header((uint8_t*)cur + 1); - if (hdr_cls >= 0 && hdr_cls != cls) { - uint32_t shot = atomic_fetch_add_explicit(&g_remote_drain_diag_once, 1, memory_order_relaxed); - if (shot < 8) { - fprintf(stderr, - "[REMOTE_DRAIN_HDR_MISMATCH] cls=%d slab=%d cur=%p hdr_cls=%d meta_cls=%d head=%#lx\n", - cls, slab_idx, (void*)cur, hdr_cls, (int)meta->class_idx, (unsigned long)head); - } - } -#endif - } -#if HAKMEM_TINY_HEADER_CLASSIDX - // Cross-check header vs meta before writing next (even if diag is off) - { - int hdr_cls_pre = tiny_region_id_read_header((uint8_t*)cur + 1); - if (hdr_cls_pre >= 0 && hdr_cls_pre != cls) { - static _Atomic uint32_t g_hdr_meta_mismatch_rd = 0; - uint32_t n = atomic_fetch_add_explicit(&g_hdr_meta_mismatch_rd, 1, memory_order_relaxed); - if (n < 16) { - fprintf(stderr, - "[REMOTE_DRAIN_HDR_META_MISMATCH] cls=%d slab=%d cur=%p hdr_cls=%d meta_cls=%d\n", - cls, slab_idx, (void*)cur, hdr_cls_pre, (int)meta->class_idx); - } - } - } -#endif - // Restore header for header-classes (class 1-6) which were clobbered by remote push -#if HAKMEM_TINY_HEADER_CLASSIDX - if (cls != 0) { - uint8_t expected = (uint8_t)(HEADER_MAGIC | (cls & HEADER_CLASS_MASK)); - *(uint8_t*)(uintptr_t)cur = expected; - } -#endif - // Rewrite next pointer to Box representation for this class - tiny_next_write(cls, (void*)cur, prev); - prev = (void*)cur; - cur = next; - } - meta->freelist = prev; - // Reset remote count after full drain - atomic_store_explicit(&ss->remote_counts[slab_idx], 0, memory_order_release); - - // Update freelist/nonempty visibility bits - uint32_t bit = (1u << 
slab_idx); - atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); - atomic_fetch_or_explicit(&ss->nonempty_mask, bit, memory_order_release); -} - -static inline void ss_stats_os_alloc(uint8_t size_class, size_t ss_size) { - pthread_mutex_lock(&g_superslab_lock); - g_superslabs_allocated++; - if (size_class < 8) { - g_ss_alloc_by_class[size_class]++; - } - g_bytes_allocated += ss_size; - pthread_mutex_unlock(&g_superslab_lock); -} - -static inline void ss_stats_cache_reuse(void) { - pthread_mutex_lock(&g_superslab_lock); - g_superslabs_reused++; - pthread_mutex_unlock(&g_superslab_lock); -} - -static inline void ss_stats_cache_store(void) { - pthread_mutex_lock(&g_superslab_lock); - g_superslabs_cached++; - pthread_mutex_unlock(&g_superslab_lock); -} - -// ============================================================================ -// Phase 8.3: ACE (Adaptive Cache Engine) State -// ============================================================================ - -SuperSlabACEState g_ss_ace[TINY_NUM_CLASSES_SS] = {{0}}; - -// Phase 8.3: hak_now_ns() is now defined in hakmem_tiny_superslab.h as static inline - -// ============================================================================ -// Diagnostics -// ============================================================================ - -static void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err) { - static int logged = 0; - if (logged) return; - logged = 1; - - // CRITICAL FIX: Increment lock depth FIRST before any LIBC calls - // fopen/fclose/getrlimit/fprintf all may call malloc internally - // Must bypass HAKMEM wrapper to avoid header mismatch crash - extern __thread int g_hakmem_lock_depth; - g_hakmem_lock_depth++; // Force wrapper to use __libc_malloc - - struct rlimit rl = {0}; - if (getrlimit(RLIMIT_AS, &rl) != 0) { - rl.rlim_cur = RLIM_INFINITY; - rl.rlim_max = RLIM_INFINITY; - } - - unsigned long vm_size_kb = 0; - unsigned long vm_rss_kb = 0; - FILE* status = fopen("/proc/self/status", "r"); - if (status) { - char line[256]; - while (fgets(line, sizeof(line), status)) { - if (strncmp(line, "VmSize:", 7) == 0) { - (void)sscanf(line + 7, "%lu", &vm_size_kb); - } else if (strncmp(line, "VmRSS:", 6) == 0) { - (void)sscanf(line + 6, "%lu", &vm_rss_kb); - } - } - fclose(status); - } - // CRITICAL FIX: Do NOT decrement lock_depth yet! 
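The lock-depth bracketing in log_superslab_oom_once() above is the allocator's re-entrancy guard: while g_hakmem_lock_depth is non-zero, the malloc wrapper sends nested allocations from libc (fopen, getrlimit, fprintf stream buffering) to __libc_malloc instead of back into hakmem. A minimal sketch of the pattern, with an illustrative helper name; only g_hakmem_lock_depth and the bracketing rule are taken from the code above:

#include <stdio.h>

extern __thread int g_hakmem_lock_depth;   // hakmem's TLS re-entrancy counter

// Hypothetical helper: any libc call that may allocate is bracketed so the
// nested malloc bypasses hakmem and cannot recurse into the OOM path.
static void diag_log_line(const char* msg) {
    g_hakmem_lock_depth++;                 // nested malloc -> __libc_malloc
    fprintf(stderr, "%s\n", msg);          // may allocate for stderr buffering
    g_hakmem_lock_depth--;                 // restore only after libc returns
}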
- // fprintf() below may call malloc for buffering - - char rl_cur_buf[32]; - char rl_max_buf[32]; - if (rl.rlim_cur == RLIM_INFINITY) { - strcpy(rl_cur_buf, "inf"); - } else { - snprintf(rl_cur_buf, sizeof(rl_cur_buf), "%llu", (unsigned long long)rl.rlim_cur); - } - if (rl.rlim_max == RLIM_INFINITY) { - strcpy(rl_max_buf, "inf"); - } else { - snprintf(rl_max_buf, sizeof(rl_max_buf), "%llu", (unsigned long long)rl.rlim_max); - } - -#if !HAKMEM_BUILD_RELEASE - fprintf(stderr, - "[SS OOM] mmap failed: err=%d ss_size=%zu alloc_size=%zu " - "alloc=%llu freed=%llu bytes=%llu " - "RLIMIT_AS(cur=%s max=%s) VmSize=%lu kB VmRSS=%lu kB\n", - err, - ss_size, - alloc_size, - (unsigned long long)g_superslabs_allocated, - (unsigned long long)g_superslabs_freed, - (unsigned long long)g_bytes_allocated, - rl_cur_buf, - rl_max_buf, - vm_size_kb, - vm_rss_kb); -#endif - - g_hakmem_lock_depth--; // Now safe to restore (all libc calls complete) -} - -// Global counters for debugging (non-static for external access) -_Atomic uint64_t g_ss_mmap_count = 0; -_Atomic uint64_t g_final_fallback_mmap_count = 0; - -static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate) { - void* ptr = NULL; - static int log_count = 0; - -#ifdef MAP_ALIGNED_SUPER - int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER; -#ifdef MAP_POPULATE - if (populate) { - map_flags |= MAP_POPULATE; - } -#endif - ptr = mmap(NULL, ss_size, - PROT_READ | PROT_WRITE, - map_flags, - -1, 0); - if (ptr != MAP_FAILED) { - atomic_fetch_add(&g_ss_mmap_count, 1); - if (((uintptr_t)ptr & ss_mask) == 0) { - ss_stats_os_alloc(size_class, ss_size); - return ptr; - } - munmap(ptr, ss_size); - ptr = NULL; - } else { - log_superslab_oom_once(ss_size, ss_size, errno); - } -#endif - - size_t alloc_size = ss_size * 2; - int flags = MAP_PRIVATE | MAP_ANONYMOUS; -#ifdef MAP_POPULATE - if (populate) { - flags |= MAP_POPULATE; - } -#endif - void* raw = mmap(NULL, alloc_size, - PROT_READ | PROT_WRITE, - flags, - -1, 0); - if (raw != MAP_FAILED) { - uint64_t count = atomic_fetch_add(&g_ss_mmap_count, 1) + 1; - #if !HAKMEM_BUILD_RELEASE - if (log_count < 10) { - fprintf(stderr, "[SUPERSLAB_MMAP] #%lu: class=%d size=%zu (total SuperSlab mmaps so far)\n", - (unsigned long)count, size_class, ss_size); - log_count++; - } - #endif - } - if (raw == MAP_FAILED) { - log_superslab_oom_once(ss_size, alloc_size, errno); - return NULL; - } - - uintptr_t raw_addr = (uintptr_t)raw; - uintptr_t aligned_addr = (raw_addr + ss_mask) & ~ss_mask; - ptr = (void*)aligned_addr; - - size_t prefix_size = aligned_addr - raw_addr; - if (prefix_size > 0) { - munmap(raw, prefix_size); - } - size_t suffix_size = alloc_size - prefix_size - ss_size; - if (suffix_size > 0) { - if (populate) { -#ifdef MADV_DONTNEED - madvise((char*)ptr + ss_size, suffix_size, MADV_DONTNEED); -#endif - } else { - munmap((char*)ptr + ss_size, suffix_size); - } - } - - ss_stats_os_alloc(size_class, ss_size); - return ptr; -} - -static void ss_cache_precharge(uint8_t size_class, size_t ss_size, uintptr_t ss_mask) { - if (!g_ss_cache_enabled) return; - if (size_class >= 8) return; - if (g_ss_precharge_target[size_class] == 0) return; - if (atomic_load_explicit(&g_ss_precharge_done[size_class], memory_order_acquire)) return; - - ss_cache_ensure_init(); - pthread_mutex_lock(&g_ss_cache_lock[size_class]); - size_t target = g_ss_precharge_target[size_class]; - size_t cap = g_ss_cache_cap[size_class]; - size_t desired = target; - if (cap != 0 && desired > cap) { - desired = cap; - } - 
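The fallback path in ss_os_acquire() above obtains a naturally aligned SuperSlab without MAP_ALIGNED_SUPER by over-allocating and trimming. A self-contained sketch of that technique, assuming size is a power of two that doubles as the alignment (the helper name is illustrative, not part of hakmem):

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

// Map 2*size bytes, round the start up to the next size-aligned boundary,
// then return the unused prefix and suffix to the kernel.
static void* aligned_mmap_pow2(size_t size) {
    uintptr_t mask = (uintptr_t)size - 1;          // valid because size is 2^k
    size_t alloc_size = size * 2;
    void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (raw == MAP_FAILED) return NULL;

    uintptr_t aligned = ((uintptr_t)raw + mask) & ~mask;
    size_t prefix = aligned - (uintptr_t)raw;
    size_t suffix = alloc_size - prefix - size;

    if (prefix) munmap(raw, prefix);                      // drop the head
    if (suffix) munmap((void*)(aligned + size), suffix);  // drop the tail
    return (void*)aligned;
}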
while (g_ss_cache_count[size_class] < desired) { - void* raw = ss_os_acquire(size_class, ss_size, ss_mask, 1); - if (!raw) { - break; - } - SuperslabCacheEntry* entry = (SuperslabCacheEntry*)raw; - entry->next = g_ss_cache_head[size_class]; - g_ss_cache_head[size_class] = entry; - g_ss_cache_count[size_class]++; - g_ss_cache_precharged[size_class]++; - } - atomic_store_explicit(&g_ss_precharge_done[size_class], 1, memory_order_release); - pthread_mutex_unlock(&g_ss_cache_lock[size_class]); -} - -static SuperslabCacheEntry* ss_cache_pop(uint8_t size_class) { - if (!g_ss_cache_enabled) return NULL; - if (size_class >= 8) return NULL; - - ss_cache_ensure_init(); - - pthread_mutex_lock(&g_ss_cache_lock[size_class]); - SuperslabCacheEntry* entry = g_ss_cache_head[size_class]; - if (entry) { - g_ss_cache_head[size_class] = entry->next; - if (g_ss_cache_count[size_class] > 0) { - g_ss_cache_count[size_class]--; - } - entry->next = NULL; - g_ss_cache_hits[size_class]++; - } else { - g_ss_cache_misses[size_class]++; - } - pthread_mutex_unlock(&g_ss_cache_lock[size_class]); - return entry; -} - -static int ss_cache_push(uint8_t size_class, SuperSlab* ss) { - if (!g_ss_cache_enabled) return 0; - if (size_class >= 8) return 0; - - ss_cache_ensure_init(); - pthread_mutex_lock(&g_ss_cache_lock[size_class]); - size_t cap = g_ss_cache_cap[size_class]; - if (cap != 0 && g_ss_cache_count[size_class] >= cap) { - g_ss_cache_drops[size_class]++; - pthread_mutex_unlock(&g_ss_cache_lock[size_class]); - return 0; - } - SuperslabCacheEntry* entry = (SuperslabCacheEntry*)ss; - entry->next = g_ss_cache_head[size_class]; - g_ss_cache_head[size_class] = entry; - g_ss_cache_count[size_class]++; - g_ss_cache_puts[size_class]++; - pthread_mutex_unlock(&g_ss_cache_lock[size_class]); - return 1; -} - -/* - * Legacy backend for hak_tiny_alloc_superslab_box(). - * - * Phase 12 Stage A/B: - * - Uses per-class SuperSlabHead (g_superslab_heads) as the implementation. - * - Callers MUST use hak_tiny_alloc_superslab_box() and never touch this directly. - * - Later Stage C: this function will be replaced by a shared_pool backend. - */ -static SuperSlabHead* init_superslab_head(int class_idx); -static int expand_superslab_head(SuperSlabHead* head); - -static void* hak_tiny_alloc_superslab_backend_legacy(int class_idx) -{ - if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { - return NULL; - } - - SuperSlabHead* head = g_superslab_heads[class_idx]; - if (!head) { - head = init_superslab_head(class_idx); - if (!head) { - return NULL; - } - g_superslab_heads[class_idx] = head; - } - - SuperSlab* chunk = head->current_chunk ? head->current_chunk : head->first_chunk; - - while (chunk) { - int cap = ss_slabs_capacity(chunk); - for (int slab_idx = 0; slab_idx < cap; slab_idx++) { - TinySlabMeta* meta = &chunk->slabs[slab_idx]; - - // Skip slabs that belong to a different class (or are uninitialized). 
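The block address computed a few lines below follows a fixed layout: data starts at a constant offset from the SuperSlab base and each slab occupies a fixed usable stride, so a block is located purely by (slab_idx, used, class stride). A worked sketch of that geometry; the EX_* constants are assumptions standing in for SUPERSLAB_SLAB0_DATA_OFFSET and SUPERSLAB_SLAB_USABLE_SIZE, whose real values live in hakmem_tiny_superslab.h:

#include <stddef.h>

#define EX_SLAB0_DATA_OFFSET  4096u    // assumed metadata/header area size
#define EX_SLAB_USABLE_SIZE   65536u   // assumed usable bytes per slab

// Base of the next bump-allocated block for a class whose stride is 'stride'.
static inline void* ex_block_addr(void* ss_base, int slab_idx,
                                  size_t stride, unsigned used) {
    return (unsigned char*)ss_base
         + EX_SLAB0_DATA_OFFSET
         + (size_t)slab_idx * EX_SLAB_USABLE_SIZE
         + (size_t)used * stride;
}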
- if (meta->class_idx != (uint8_t)class_idx && meta->class_idx != 255) { - continue; - } - - // P1.2 FIX: Initialize slab on first use (like shared backend does) - // This ensures class_map is populated for all slabs, not just slab 0 - if (meta->capacity == 0) { - size_t block_size = g_tiny_class_sizes[class_idx]; - uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self(); - superslab_init_slab(chunk, slab_idx, block_size, owner_tid); - meta = &chunk->slabs[slab_idx]; // Refresh pointer after init - meta->class_idx = (uint8_t)class_idx; - // P1.2: Update class_map for dynamic slab initialization - chunk->class_map[slab_idx] = (uint8_t)class_idx; - } - - if (meta->used < meta->capacity) { - size_t stride = tiny_block_stride_for_class(class_idx); - size_t offset = (size_t)meta->used * stride; - uint8_t* base = (uint8_t*)chunk - + SUPERSLAB_SLAB0_DATA_OFFSET - + (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE - + offset; - - meta->used++; - atomic_fetch_add_explicit(&chunk->total_active_blocks, 1, memory_order_relaxed); - return (void*)base; - } - } - chunk = chunk->next_chunk; - } - - if (expand_superslab_head(head) < 0) { - return NULL; - } - - SuperSlab* new_chunk = head->current_chunk; - if (!new_chunk) { - return NULL; - } - - int cap2 = ss_slabs_capacity(new_chunk); - for (int slab_idx = 0; slab_idx < cap2; slab_idx++) { - TinySlabMeta* meta = &new_chunk->slabs[slab_idx]; - - // P1.2 FIX: Initialize slab on first use (like shared backend does) - if (meta->capacity == 0) { - size_t block_size = g_tiny_class_sizes[class_idx]; - uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self(); - superslab_init_slab(new_chunk, slab_idx, block_size, owner_tid); - meta = &new_chunk->slabs[slab_idx]; // Refresh pointer after init - meta->class_idx = (uint8_t)class_idx; - // P1.2: Update class_map for dynamic slab initialization - new_chunk->class_map[slab_idx] = (uint8_t)class_idx; - } - - if (meta->used < meta->capacity) { - size_t stride = tiny_block_stride_for_class(class_idx); - size_t offset = (size_t)meta->used * stride; - uint8_t* base = (uint8_t*)new_chunk - + SUPERSLAB_SLAB0_DATA_OFFSET - + (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE - + offset; - - meta->used++; - atomic_fetch_add_explicit(&new_chunk->total_active_blocks, 1, memory_order_relaxed); - return (void*)base; - } - } - - return NULL; -} - -/* - * Shared pool backend for hak_tiny_alloc_superslab_box(). - * - * Phase 12-2: - * - Uses SharedSuperSlabPool (g_shared_pool) to obtain a SuperSlab/slab - * for the requested class_idx. - * - This backend EXPRESSLY owns only: - * - choosing (ss, slab_idx) via shared_pool_acquire_slab() - * - initializing that slab's TinySlabMeta via superslab_init_slab() - * and nothing else; all callers must go through hak_tiny_alloc_superslab_box(). - * - * - For now this is a minimal, conservative implementation: - * - One linear bump-run is carved from the acquired slab using tiny_block_stride_for_class(). - * - No complex per-slab freelist or refill policy yet (Phase 12-3+). - * - If shared_pool_acquire_slab() fails, we fall back to legacy backend. - */ -static void* hak_tiny_alloc_superslab_backend_shared(int class_idx) -{ - if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { - return NULL; - } - - SuperSlab* ss = NULL; - int slab_idx = -1; - - if (shared_pool_acquire_slab(class_idx, &ss, &slab_idx) != 0 || !ss) { - // Shared pool could not provide a slab; caller may choose to fall back. 
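Elsewhere in this patch every successful backend return is routed through tiny_region_id_write_header() when HAKMEM_TINY_HEADER_CLASSIDX is enabled, and the remote-drain code above restores the same one-byte tag with HEADER_MAGIC | class. The real encoding lives in tiny_region_id.h, which is not part of this diff, so the following is only a plausible sketch under the assumption of a one-byte tag stored just before the user pointer; constants and names are illustrative:

#include <stdint.h>

#define EX_HEADER_MAGIC       0xA0u    // assumed tag pattern
#define EX_HEADER_CLASS_MASK  0x0Fu    // assumed low bits carrying class_idx

// Write the tag at the block base and hand the caller the byte after it.
static inline void* ex_write_header(void* base, int class_idx) {
    uint8_t* p = (uint8_t*)base;
    p[0] = (uint8_t)(EX_HEADER_MAGIC | ((unsigned)class_idx & EX_HEADER_CLASS_MASK));
    return p + 1;
}

// Recover the class from a user pointer, or -1 if the tag does not match.
static inline int ex_read_header(const void* user_ptr) {
    uint8_t tag = ((const uint8_t*)user_ptr)[-1];
    if ((tag & (uint8_t)~EX_HEADER_CLASS_MASK) != EX_HEADER_MAGIC) return -1;
    return (int)(tag & EX_HEADER_CLASS_MASK);
}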
- return NULL; - } - - TinySlabMeta* meta = &ss->slabs[slab_idx]; - - // Defensive: shared_pool must either hand us an UNASSIGNED slab or one - // already bound to this class. Anything else is a hard bug. - if (meta->class_idx != 255 && meta->class_idx != (uint8_t)class_idx) { -#if !HAKMEM_BUILD_RELEASE - fprintf(stderr, - "[HAKMEM][SS_SHARED] BUG: acquire_slab mismatch: cls=%d meta->class_idx=%u slab_idx=%d ss=%p\n", - class_idx, (unsigned)meta->class_idx, slab_idx, (void*)ss); -#endif - return NULL; - } - - // Initialize slab geometry once for this class. - if (meta->capacity == 0) { - size_t block_size = g_tiny_class_sizes[class_idx]; - // LARSON FIX: Pass actual thread ID for cross-thread free detection - uint32_t my_tid = (uint32_t)(uintptr_t)pthread_self(); - superslab_init_slab(ss, slab_idx, block_size, my_tid); - meta = &ss->slabs[slab_idx]; - - // CRITICAL FIX: Always set class_idx after init to avoid C0/C7 confusion. - // New SuperSlabs start with meta->class_idx=0 (mmap zero-init). - // Must explicitly set to requested class, not just when class_idx==255. - meta->class_idx = (uint8_t)class_idx; - // P1.1: Update class_map in shared acquire path - ss->class_map[slab_idx] = (uint8_t)class_idx; - } - - // Final contract check before computing addresses. - if (meta->class_idx != (uint8_t)class_idx || - meta->capacity == 0 || - meta->used > meta->capacity) { -#if !HAKMEM_BUILD_RELEASE - fprintf(stderr, - "[HAKMEM][SS_SHARED] BUG: invalid slab meta before alloc: " - "cls=%d slab_idx=%d meta_cls=%u used=%u cap=%u ss=%p\n", - class_idx, slab_idx, - (unsigned)meta->class_idx, - (unsigned)meta->used, - (unsigned)meta->capacity, - (void*)ss); -#endif - return NULL; - } - - // Simple bump allocation within this slab. - if (meta->used >= meta->capacity) { - // Slab exhausted: in minimal Phase12-2 backend we do not loop; - // caller or future logic must acquire another slab. - return NULL; - } - - size_t stride = tiny_block_stride_for_class(class_idx); - size_t offset = (size_t)meta->used * stride; - - // Phase 12-2 minimal geometry: - // - slab 0 data offset via SUPERSLAB_SLAB0_DATA_OFFSET - // - subsequent slabs at fixed SUPERSLAB_SLAB_USABLE_SIZE strides. - size_t slab_base_off = SUPERSLAB_SLAB0_DATA_OFFSET - + (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE; - uint8_t* base = (uint8_t*)ss + slab_base_off + offset; - - meta->used++; - atomic_fetch_add_explicit(&ss->total_active_blocks, 1, memory_order_relaxed); - - return (void*)base; -} - -/* - * Box API entry: - * - Single front-door for tiny-side SuperSlab allocations. - * - * Phase 12 policy: - * - HAKMEM_TINY_SS_SHARED=0 → legacy backend only (for regression checks) - * - HAKMEM_TINY_SS_SHARED=1 → prefer the shared backend, fall back to legacy only on failure - */ -void* hak_tiny_alloc_superslab_box(int class_idx) -{ - static int g_ss_shared_mode = -1; - static _Atomic uint32_t g_ss_backend_log = 0; - if (__builtin_expect(g_ss_shared_mode == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_SS_SHARED"); - if (!e || !*e) { - g_ss_shared_mode = 1; // default: shared enabled - } else { - int v = atoi(e); - g_ss_shared_mode = (v != 0) ?
1 : 0; - } - } - - if (g_ss_shared_mode == 1) { - void* p = hak_tiny_alloc_superslab_backend_shared(class_idx); - if (p != NULL) { - uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed); - if (n < 4) { - fprintf(stderr, "[SS_BACKEND] shared cls=%d ptr=%p\n", class_idx, p); - } - return p; - } - // If the shared backend fails, fall back to the legacy backend on the safe side - uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed); - if (n < 4) { - fprintf(stderr, "[SS_BACKEND] shared_fail→legacy cls=%d\n", class_idx); - } - return hak_tiny_alloc_superslab_backend_legacy(class_idx); - } - - // When shared is OFF, use the legacy backend only - uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed); - if (n < 4) { - fprintf(stderr, "[SS_BACKEND] legacy cls=%d\n", class_idx); - } - return hak_tiny_alloc_superslab_backend_legacy(class_idx); -} - -// ============================================================================ -// SuperSlab Allocation (2MB aligned) -// ============================================================================ - -SuperSlab* superslab_allocate(uint8_t size_class) { - // Optional fault injection for testing: HAKMEM_TINY_SS_FAULT_RATE=N → fail 1 in N allocations - static int fault_rate = -1; // -1=unparsed, 0=disabled, >0=rate - static __thread unsigned long fault_tick = 0; - if (__builtin_expect(fault_rate == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_SS_FAULT_RATE"); - if (e && *e) { - int v = atoi(e); if (v < 0) v = 0; fault_rate = v; - } else { - fault_rate = 0; - } - } - if (fault_rate > 0) { - unsigned long t = ++fault_tick; - if ((t % (unsigned long)fault_rate) == 0ul) { - return NULL; // simulate OOM - } - } - // Optional env clamp for SuperSlab size - static int env_parsed = 0; - static uint8_t g_ss_min_lg_env = SUPERSLAB_LG_DEFAULT; // Start with default (2MB) - static uint8_t g_ss_max_lg_env = SUPERSLAB_LG_MAX; - if (!env_parsed) { - char* maxmb = getenv("HAKMEM_TINY_SS_MAX_MB"); - if (maxmb) { - int m = atoi(maxmb); if (m == 1) g_ss_max_lg_env = 20; else if (m == 2) g_ss_max_lg_env = 21; - } - char* minmb = getenv("HAKMEM_TINY_SS_MIN_MB"); - if (minmb) { - int m = atoi(minmb); if (m == 1) g_ss_min_lg_env = 20; else if (m == 2) g_ss_min_lg_env = 21; - } - if (g_ss_min_lg_env > g_ss_max_lg_env) g_ss_min_lg_env = g_ss_max_lg_env; - const char* force_lg_env = getenv("HAKMEM_TINY_SS_FORCE_LG"); - if (force_lg_env && *force_lg_env) { - int v = atoi(force_lg_env); - if (v >= SUPERSLAB_LG_MIN && v <= SUPERSLAB_LG_MAX) { - g_ss_force_lg = v; - g_ss_min_lg_env = g_ss_max_lg_env = v; - } - } - size_t precharge_default = 0; - const char* precharge_env = getenv("HAKMEM_TINY_SS_PRECHARGE"); - if (precharge_env && *precharge_env) { - long v = atol(precharge_env); - if (v < 0) v = 0; - precharge_default = (size_t)v; - if (v > 0) { - atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); - } - } - size_t cache_default = 0; - const char* cache_env = getenv("HAKMEM_TINY_SS_CACHE"); - if (cache_env && *cache_env) { - long v = atol(cache_env); - if (v < 0) v = 0; - cache_default = (size_t)v; - } - for (int i = 0; i < 8; i++) { - g_ss_cache_cap[i] = cache_default; - g_ss_precharge_target[i] = precharge_default; - } - for (int i = 0; i < 8; i++) { - char name[64]; - snprintf(name, sizeof(name), "HAKMEM_TINY_SS_CACHE_C%d", i); - char* cap_env = getenv(name); - if (cap_env && *cap_env) { - long v = atol(cap_env); - if (v < 0) v = 0; - g_ss_cache_cap[i] = (size_t)v; - } - snprintf(name, sizeof(name), "HAKMEM_TINY_SS_PRECHARGE_C%d", i); - char*
pre_env = getenv(name); - if (pre_env && *pre_env) { - long v = atol(pre_env); - if (v < 0) v = 0; - g_ss_precharge_target[i] = (size_t)v; - if (v > 0) { - atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); - } - } - if (g_ss_cache_cap[i] > 0 || g_ss_precharge_target[i] > 0) { - g_ss_cache_enabled = 1; - } - } - const char* populate_env = getenv("HAKMEM_TINY_SS_POPULATE_ONCE"); - if (populate_env && atoi(populate_env) != 0) { - atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); - } - env_parsed = 1; - } - - uint8_t lg = (g_ss_force_lg >= 0) ? (uint8_t)g_ss_force_lg : hak_tiny_superslab_next_lg(size_class); - if (lg < g_ss_min_lg_env) lg = g_ss_min_lg_env; - if (lg > g_ss_max_lg_env) lg = g_ss_max_lg_env; - size_t ss_size = (size_t)1 << lg; // 2^20 = 1MB, 2^21 = 2MB - uintptr_t ss_mask = ss_size - 1; - int from_cache = 0; - void* ptr = NULL; - - // Debug logging flag (lazy init) - static __thread int dbg = -1; -#if HAKMEM_BUILD_RELEASE - dbg = 0; -#else - if (__builtin_expect(dbg == -1, 0)) { - const char* e = getenv("HAKMEM_SS_PREWARM_DEBUG"); - dbg = (e && *e && *e != '0') ? 1 : 0; - } -#endif - - // Phase 9: Try LRU cache first (lazy deallocation) - SuperSlab* cached_ss = hak_ss_lru_pop(size_class); - if (cached_ss) { - ptr = (void*)cached_ss; - from_cache = 1; - // Debug logging for REFILL from LRU - if (dbg == 1) { - fprintf(stderr, "[REFILL] class=%d from_lru=1 ss=%p\n", - size_class, (void*)cached_ss); - } - // Skip old cache path - LRU cache takes priority - } else if (g_ss_cache_enabled && size_class < 8) { - // Fallback to old cache (will be deprecated) - ss_cache_precharge(size_class, ss_size, ss_mask); - SuperslabCacheEntry* old_cached = ss_cache_pop(size_class); - if (old_cached) { - ptr = (void*)old_cached; - from_cache = 1; - // Debug logging for REFILL from prewarm (old cache is essentially prewarm) - if (dbg == 1) { - fprintf(stderr, "[REFILL] class=%d from_prewarm=1 ss=%p\n", - size_class, (void*)old_cached); - } - } - } - - if (!ptr) { - int populate = atomic_exchange_explicit(&g_ss_populate_once, 0, memory_order_acq_rel); - ptr = ss_os_acquire(size_class, ss_size, ss_mask, populate); - if (!ptr) { - return NULL; - } - // Debug logging for REFILL with new allocation - if (dbg == 1) { - fprintf(stderr, "[REFILL] class=%d new_alloc=1 ss=%p\n", - size_class, (void*)ptr); - } - } - - // Initialize SuperSlab header (Phase 12: no global size_class field) - SuperSlab* ss = (SuperSlab*)ptr; - ss->magic = SUPERSLAB_MAGIC; - ss->active_slabs = 0; - ss->lg_size = lg; // Phase 8.3: Use ACE-determined lg_size (20=1MB, 21=2MB) - ss->slab_bitmap = 0; - ss->nonempty_mask = 0; // Phase 6-2.1: ChatGPT Pro P0 - init nonempty mask - ss->partial_epoch = 0; - ss->publish_hint = 0xFF; - - // Initialize atomics explicitly - atomic_store_explicit(&ss->total_active_blocks, 0, memory_order_relaxed); - atomic_store_explicit(&ss->refcount, 0, memory_order_relaxed); - atomic_store_explicit(&ss->listed, 0, memory_order_relaxed); - ss->partial_next = NULL; - - // Phase 9: Initialize LRU fields - ss->last_used_ns = 0; - ss->generation = 0; - ss->lru_prev = NULL; - ss->lru_next = NULL; - - // Phase 3d-C: Initialize Hot/Cold Split fields - ss->hot_count = 0; - ss->cold_count = 0; - for (int i = 0; i < 16; i++) { - ss->hot_indices[i] = 0; - ss->cold_indices[i] = 0; - } - - // Initialize all slab metadata (only up to max slabs for this size) - int max_slabs = (int)(ss_size / SLAB_SIZE); - - // PERF_OPT: memset removed - mmap() already returns zero-initialized pages - // 
Previous memset calls consumed 23.83% CPU time (perf analysis 2025-11-28) - // Measured improvement: +1.3% throughput (71.86M → 72.78M ops/s) - // Note: ASan/debug builds may need these, but production mmap guarantees zero pages - // memset(ss->slabs, 0, max_slabs * sizeof(TinySlabMeta)); - // memset(ss->remote_heads, 0, max_slabs * sizeof(uintptr_t)); - // memset(ss->remote_counts, 0, max_slabs * sizeof(uint32_t)); - // memset(ss->slab_listed, 0, max_slabs * sizeof(uint32_t)); - - for (int i = 0; i < max_slabs; i++) { - // Phase 1: Atomic initialization (freelist + used are now _Atomic) - slab_freelist_store_relaxed(&ss->slabs[i], NULL); // Explicit NULL (redundant after memset, but clear intent) - atomic_store_explicit(&ss->slabs[i].used, 0, memory_order_relaxed); - ss->slabs[i].capacity = 0; - ss->slabs[i].owner_tid_low = 0; - - // Initialize remote queue atomics (memset already zeroed, but use proper atomic init) - atomic_store_explicit(&ss->remote_heads[i], 0, memory_order_relaxed); - atomic_store_explicit(&ss->remote_counts[i], 0, memory_order_relaxed); - atomic_store_explicit(&ss->slab_listed[i], 0, memory_order_relaxed); - } - - if (from_cache) { - ss_stats_cache_reuse(); - } - - // Phase 8.3: Update ACE current_lg to match allocated size - g_ss_ace[size_class].current_lg = lg; - - // Phase 1: Register SuperSlab in global registry for fast lookup - // CRITICAL: Register AFTER full initialization (ss structure is ready) - uintptr_t base = (uintptr_t)ss; - if (!hak_super_register(base, ss)) { - // Registry full - this is a fatal error - fprintf(stderr, "HAKMEM FATAL: SuperSlab registry full, cannot register %p\n", ss); - // Still return ss to avoid memory leak, but lookups may fail - } - - return ss; -} - -// ============================================================================ -// Phase 2a: Dynamic Expansion - Chunk Management Functions -// ============================================================================ - -// Initialize SuperSlabHead for a class -SuperSlabHead* init_superslab_head(int class_idx) { - if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { - return NULL; - } - - // Allocate SuperSlabHead structure - SuperSlabHead* head = (SuperSlabHead*)calloc(1, sizeof(SuperSlabHead)); - if (!head) { - extern __thread int g_hakmem_lock_depth; - g_hakmem_lock_depth++; - fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate SuperSlabHead for class %d\n", class_idx); - g_hakmem_lock_depth--; - return NULL; - } - - head->class_idx = (uint8_t)class_idx; - atomic_store_explicit(&head->total_chunks, 0, memory_order_relaxed); - head->first_chunk = NULL; - head->current_chunk = NULL; - pthread_mutex_init(&head->expansion_lock, NULL); - - // Allocate initial chunk(s) - // Hot classes (1, 4, 6) get 2 initial chunks to reduce contention - int initial_chunks = 1; - - // Phase 2a: Start with 1 chunk for all classes (expansion will handle growth) - // This reduces startup memory overhead while still allowing unlimited growth - initial_chunks = 1; - - for (int i = 0; i < initial_chunks; i++) { - if (expand_superslab_head(head) < 0) { - extern __thread int g_hakmem_lock_depth; - g_hakmem_lock_depth++; - fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate initial chunk %d for class %d\n", - i, class_idx); - g_hakmem_lock_depth--; - - // Cleanup on failure - SuperSlab* chunk = head->first_chunk; - while (chunk) { - SuperSlab* next = chunk->next_chunk; - superslab_free(chunk); - chunk = next; - } - pthread_mutex_destroy(&head->expansion_lock); - free(head); - return NULL; - } - 
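The PERF_OPT note above removes the metadata memset on the strength of a kernel guarantee: anonymous private mappings are zero-filled on first touch. A tiny standalone check of that assumption (purely illustrative, not part of the allocator; debug/ASan builds may still want the explicit memset):

#include <assert.h>
#include <stddef.h>
#include <sys/mman.h>

int main(void) {
    size_t len = (size_t)1 << 21;   // one 2MB SuperSlab-sized region
    unsigned char* p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    assert(p != MAP_FAILED);
    for (size_t i = 0; i < len; i += 4096)
        assert(p[i] == 0);          // every page reads back as zero
    munmap(p, len);
    return 0;
}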
} - - extern __thread int g_hakmem_lock_depth; - g_hakmem_lock_depth++; -#if !HAKMEM_BUILD_RELEASE - fprintf(stderr, "[HAKMEM] Initialized SuperSlabHead for class %d: %zu initial chunks\n", - class_idx, atomic_load_explicit(&head->total_chunks, memory_order_relaxed)); -#endif - g_hakmem_lock_depth--; - - return head; -} - -// Expand SuperSlabHead by allocating and linking a new chunk -int expand_superslab_head(SuperSlabHead* head) { - if (!head) { - return -1; - } - - // Allocate new chunk via existing superslab_allocate - SuperSlab* new_chunk = superslab_allocate(head->class_idx); - if (!new_chunk) { -#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) - extern __thread int g_hakmem_lock_depth; - g_hakmem_lock_depth++; - fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate new chunk for class %d (system OOM)\n", - head->class_idx); - g_hakmem_lock_depth--; -#endif - return -1; // True OOM (system out of memory) - } - - // CRITICAL FIX: Initialize slab 0 so bitmap != 0x00000000 - // Phase 2a chunks must have at least one usable slab after allocation - size_t block_size = g_tiny_class_sizes[head->class_idx]; - // Use pthread_self() directly since tiny_self_u32() is static inline in hakmem_tiny.c - uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self(); - - superslab_init_slab(new_chunk, 0, block_size, owner_tid); - - // Initialize the next_chunk link to NULL - new_chunk->next_chunk = NULL; - - // Thread-safe linking - pthread_mutex_lock(&head->expansion_lock); - - if (head->current_chunk) { - // Find the tail of the list (optimization: could cache tail pointer) - SuperSlab* tail = head->current_chunk; - while (tail->next_chunk) { - tail = tail->next_chunk; - } - tail->next_chunk = new_chunk; - } else { - // First chunk - head->first_chunk = new_chunk; - } - - // Update current chunk to new chunk (for fast allocation) - head->current_chunk = new_chunk; - - // Increment total chunks atomically - size_t old_count = atomic_fetch_add_explicit(&head->total_chunks, 1, memory_order_relaxed); - size_t new_count = old_count + 1; - - pthread_mutex_unlock(&head->expansion_lock); - -#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) - extern __thread int g_hakmem_lock_depth; - g_hakmem_lock_depth++; - fprintf(stderr, "[HAKMEM] Expanded SuperSlabHead for class %d: %zu chunks now (bitmap=0x%08x)\n", - head->class_idx, new_count, new_chunk->slab_bitmap); - g_hakmem_lock_depth--; -#endif - - return 0; -} - -// Find which chunk a pointer belongs to -SuperSlab* find_chunk_for_ptr(void* ptr, int class_idx) { - if (!ptr || class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { - return NULL; - } - - SuperSlabHead* head = g_superslab_heads[class_idx]; - if (!head) { - return NULL; - } - - uintptr_t ptr_addr = (uintptr_t)ptr; - - // Walk the chunk list - SuperSlab* chunk = head->first_chunk; - while (chunk) { - // Check if ptr is within this chunk's memory range - // Each chunk is aligned to SUPERSLAB_SIZE (1MB or 2MB) - uintptr_t chunk_start = (uintptr_t)chunk; - size_t chunk_size = (size_t)1 << chunk->lg_size; // Use actual chunk size - uintptr_t chunk_end = chunk_start + chunk_size; - - if (ptr_addr >= chunk_start && ptr_addr < chunk_end) { - // Found the chunk - return chunk; - } - - chunk = chunk->next_chunk; - } - - return NULL; // Not found in any chunk -} - -// ============================================================================ -// SuperSlab Deallocation -// ============================================================================ - -void superslab_free(SuperSlab* ss) { - if 
(!ss || ss->magic != SUPERSLAB_MAGIC) { - return; // Invalid SuperSlab - } - - // ADD DEBUG LOGGING - static __thread int dbg = -1; -#if HAKMEM_BUILD_RELEASE - dbg = 0; -#else - if (__builtin_expect(dbg == -1, 0)) { - const char* e = getenv("HAKMEM_SS_FREE_DEBUG"); - dbg = (e && *e && *e != '0') ? 1 : 0; - } -#endif - if (dbg == 1) { - fprintf(stderr, "[SS_FREE] CALLED: ss=%p lg_size=%d active_slabs=%u\n", - (void*)ss, ss->lg_size, ss->active_slabs); - } - - // Phase 9: Lazy Deallocation - try to cache in LRU instead of munmap - size_t ss_size = (size_t)1 << ss->lg_size; - - // Phase 1: Unregister SuperSlab from registry FIRST - // CRITICAL: Must unregister BEFORE adding to LRU cache - // Reason: Cached SuperSlabs should NOT be found by lookups - uintptr_t base = (uintptr_t)ss; - hak_super_unregister(base); - - // Memory fence to ensure unregister is visible - atomic_thread_fence(memory_order_release); - - // Phase 9: Try LRU cache first (lazy deallocation) - // NOTE: LRU cache keeps magic=SUPERSLAB_MAGIC for validation - // Magic will be cleared on eviction or reuse - int lru_cached = hak_ss_lru_push(ss); - if (dbg == 1) { - fprintf(stderr, "[SS_FREE] hak_ss_lru_push() returned %d\n", lru_cached); - } - if (lru_cached) { - // Successfully cached in LRU - defer munmap - return; - } - - // LRU cache full or disabled - try old cache using head class_idx (if known) - int old_cached = ss_cache_push(0, ss); - if (old_cached) { - ss_stats_cache_store(); - return; - } - - // Both caches full - immediately free to OS (eager deallocation) - // Clear magic to prevent use-after-free - ss->magic = 0; - -#if !HAKMEM_BUILD_RELEASE - fprintf(stderr, "[DEBUG ss_os_release] Freeing SuperSlab ss=%p size=%zu active=%u (LRU full)\n", - (void*)ss, ss_size, - atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed)); -#endif - - munmap(ss, ss_size); - - // Update statistics for actual release to OS - pthread_mutex_lock(&g_superslab_lock); - g_superslabs_freed++; - // Phase 12: we no longer track per-SS size_class on header; skip g_ss_freed_by_class here - g_bytes_allocated -= ss_size; - pthread_mutex_unlock(&g_superslab_lock); - -#if !HAKMEM_BUILD_RELEASE - fprintf(stderr, "[DEBUG ss_os_release] g_superslabs_freed now = %llu\n", - (unsigned long long)g_superslabs_freed); -#endif -} - -// ============================================================================ -// Slab Initialization within SuperSlab -// ============================================================================ - -void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_t owner_tid) -{ - if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { - return; - } - - // Phase E1-CORRECT unified geometry: - // - block_size is the TOTAL stride for this class (g_tiny_class_sizes[cls]) - // - usable bytes are determined by slab index (slab0 vs others) - // - capacity = usable / stride for ALL classes (including former C7) - size_t usable_size = (slab_idx == 0) - ? 
SUPERSLAB_SLAB0_USABLE_SIZE - : SUPERSLAB_SLAB_USABLE_SIZE; - size_t stride = block_size; - uint16_t capacity = (uint16_t)(usable_size / stride); - -#if !HAKMEM_BUILD_RELEASE - if (slab_idx == 0) { - fprintf(stderr, - "[SUPERSLAB_INIT] slab 0: usable_size=%zu stride=%zu capacity=%u\n", - usable_size, stride, (unsigned)capacity); - } -#endif - - TinySlabMeta* meta = &ss->slabs[slab_idx]; - meta->freelist = NULL; // NULL = linear allocation mode - meta->used = 0; - meta->active = 0; // P1.3: blocks in use by user (starts at 0) - meta->tls_cached = 0; // P2.2: blocks cached in TLS SLL (starts at 0) - meta->capacity = capacity; - meta->carved = 0; - // LARSON FIX: Use bits 8-15 instead of 0-7 since pthread TIDs are aligned to 256 bytes - meta->owner_tid_low = (uint8_t)((owner_tid >> 8) & 0xFFu); - // Fail-safe: stamp class_idx from geometry (stride → class). - // This ensures legacy/shared/legacy-refill paths all end with a correct class. - for (int i = 0; i < TINY_NUM_CLASSES; i++) { - if (g_tiny_class_sizes[i] == stride) { - meta->class_idx = (uint8_t)i; - // P1.1: Update class_map for out-of-band lookup on free path - ss->class_map[slab_idx] = (uint8_t)i; - break; - } - } - - superslab_activate_slab(ss, slab_idx); -} - -// ============================================================================ -// Slab Bitmap Management -// ============================================================================ - -void superslab_activate_slab(SuperSlab* ss, int slab_idx) { - if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { - return; - } - uint32_t mask = 1u << slab_idx; - if ((ss->slab_bitmap & mask) == 0) { - ss->slab_bitmap |= mask; - ss->active_slabs++; - - // Phase 3d-C: Update hot/cold indices after activating new slab - ss_update_hot_cold_indices(ss); - } -} - -void superslab_deactivate_slab(SuperSlab* ss, int slab_idx) { - if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { - return; - } - uint32_t mask = 1u << slab_idx; - if (ss->slab_bitmap & mask) { - ss->slab_bitmap &= ~mask; - ss->active_slabs--; - } -} - -int superslab_find_free_slab(SuperSlab* ss) { - if (!ss) return -1; - if ((int)ss->active_slabs >= ss_slabs_capacity(ss)) { - return -1; // No free slabs - } - // Find first 0 bit in bitmap - int cap = ss_slabs_capacity(ss); - for (int i = 0; i < cap; i++) { - if ((ss->slab_bitmap & (1u << i)) == 0) { - return i; - } - } - return -1; -} - -// ============================================================================ -// Statistics / Debugging -// ============================================================================ - -void superslab_print_stats(SuperSlab* ss) { - if (!ss || ss->magic != SUPERSLAB_MAGIC) { - printf("Invalid SuperSlab\n"); - return; - } - - printf("=== SuperSlab Stats ===\n"); - printf("Address: %p\n", (void*)ss); - // Phase 12: per-SS size_class removed; classes are per-slab via meta->class_idx. 
- printf("Active slabs: %u / %d\n", ss->active_slabs, ss_slabs_capacity(ss)); - printf("Bitmap: 0x%08X\n", ss->slab_bitmap); - printf("\nPer-slab details:\n"); - for (int i = 0; i < ss_slabs_capacity(ss); i++) { - if (ss->slab_bitmap & (1u << i)) { - TinySlabMeta* meta = &ss->slabs[i]; - printf(" Slab %2d: used=%u/%u freelist=%p class=%u owner_tid_low=%u\n", - i, meta->used, meta->capacity, meta->freelist, - (unsigned)meta->class_idx, (unsigned)meta->owner_tid_low); - } - } - printf("\n"); -} - -// Global statistics -void superslab_print_global_stats(void) { - pthread_mutex_lock(&g_superslab_lock); - printf("=== Global SuperSlab Stats ===\n"); - printf("SuperSlabs allocated: %lu\n", g_superslabs_allocated); - printf("SuperSlabs freed: %lu\n", g_superslabs_freed); - printf("SuperSlabs active: %lu\n", g_superslabs_allocated - g_superslabs_freed); - printf("Total bytes allocated: %lu MB\n", g_bytes_allocated / (1024 * 1024)); - pthread_mutex_unlock(&g_superslab_lock); -} - -// ============================================================================ -// Phase 8.3: ACE Statistics / Debugging -// ============================================================================ - -void superslab_ace_print_stats(void) { - printf("=== ACE (Adaptive Cache Engine) Stats ===\n"); - const char* class_names[8] = {"8B", "16B", "24B", "32B", "40B", "48B", "56B", "64B"}; - - printf("Class Curr Targ Hot Allocs Refills Spills LiveBlks\n"); - printf("--------------------------------------------------------------\n"); - - for (int i = 0; i < TINY_NUM_CLASSES_SS; i++) { - SuperSlabACEState* c = &g_ss_ace[i]; - printf("%-6s %2uMB %2uMB %4u %7u %8u %7u %9u\n", - class_names[i], - (1u << c->current_lg) / (1024 * 1024), - (1u << c->target_lg) / (1024 * 1024), - c->hot_score, - c->alloc_count, - c->refill_count, - c->spill_count, - c->live_blocks); - } - printf("\n"); -} - -// ============================================================================ -// Phase 8.3: ACE Tick Function (Promotion/Demotion Logic) -// ============================================================================ - -#define ACE_TICK_NS (150ULL * 1000 * 1000) // 150ms tick interval -#define ACE_COOLDOWN_NS (800ULL * 1000 * 1000) // 0.8s cooldown (anti-oscillation) - -// Simplified thresholds for refill activity -#define HI_REFILL(k) (g_ss_ace[k].refill_count > 64) // High refill rate -#define MID_REFILL(k) (g_ss_ace[k].refill_count > 16) // Medium refill rate - -// Object sizes per class (for capacity calculation) -// Must match TINY size classes: 8, 16, 24, 32, 40, 48, 56, 64 bytes -static const int g_tiny_obj_sizes[TINY_NUM_CLASSES_SS] = {8, 16, 24, 32, 40, 48, 56, 64}; - -void hak_tiny_superslab_ace_tick(int k, uint64_t now) { - if (k < 0 || k >= TINY_NUM_CLASSES_SS) return; - - SuperSlabACEState* c = &g_ss_ace[k]; - - // Rate limiting: only tick every ACE_TICK_NS (~150ms) - if (now - c->last_tick_ns < ACE_TICK_NS) return; - - // Calculate capacity for 1MB and 2MB SuperSlabs - int obj_size = g_tiny_obj_sizes[k]; - double cap1MB = (double)((1U << 20) / obj_size); // 1MB capacity - double cap2MB = (double)((1U << 21) / obj_size); // 2MB capacity - - // Calculate hotness score (weighted: 60% live blocks, 40% refill rate) - double hot = 0.6 * (double)c->live_blocks + 0.4 * (double)c->refill_count; - if (hot < 0) hot = 0; - if (hot > 1000) hot = 1000; - c->hot_score = (uint16_t)hot; - - // Cooldown mechanism: prevent size changes within 0.8s of last change - static uint64_t last_switch_ns[TINY_NUM_CLASSES_SS] = {0}; - - if (now - 
last_switch_ns[k] >= ACE_COOLDOWN_NS) { - if (c->current_lg <= 20) { - // Promotion condition: 1MB → 2MB - // High demand (live > 75% capacity) AND high refill rate - if (c->live_blocks > 0.75 * cap1MB && HI_REFILL(k)) { - c->target_lg = 21; // Promote to 2MB - last_switch_ns[k] = now; - } - } else { - // Demotion condition: 2MB → 1MB - // Low demand (live < 35% capacity) AND low refill rate - if (c->live_blocks < 0.35 * cap2MB && !MID_REFILL(k)) { - c->target_lg = 20; // Demote to 1MB - last_switch_ns[k] = now; - } - } - } - - // EMA-style decay for counters (reduce by 75% each tick) - c->alloc_count = c->alloc_count / 4; - c->refill_count = c->refill_count / 4; - c->spill_count = c->spill_count / 4; - // live_blocks is updated incrementally by alloc/free, not decayed here - - c->last_tick_ns = now; -} - -// ============================================================================ -// Phase 8.4: ACE Observer (Registry-based, zero hot-path overhead) -// ============================================================================ - -// Global debug flag (set once at initialization) -static int g_ace_debug = 0; - -// Registry-based observation: scan all SuperSlabs for usage stats -static void ace_observe_and_decide(int k) { - if (k < 0 || k >= TINY_NUM_CLASSES_SS) return; - - SuperSlabACEState* c = &g_ss_ace[k]; - - // Scan Registry to count SuperSlabs and total live blocks - int ss_count = 0; - uint32_t total_live = 0; - - for (int i = 0; i < SUPER_REG_SIZE; i++) { - SuperRegEntry* e = &g_super_reg[i]; - - // Atomic read (thread-safe) - uintptr_t base = atomic_load_explicit( - (_Atomic uintptr_t*)&e->base, - memory_order_acquire); - - if (base == 0) continue; // Empty slot - - // Phase 8.4: Safety check - skip if ss pointer is invalid - if (!e->ss) continue; - // Phase 12: per-SS size_class removed; registry entries are per-class by construction. - - ss_count++; - // Phase 8.4: Scan all slabs to count used blocks (zero hot-path overhead) - uint32_t ss_live = 0; - int cap_scan = ss_slabs_capacity(e->ss); - for (int slab_idx = 0; slab_idx < cap_scan; slab_idx++) { - TinySlabMeta* meta = &e->ss->slabs[slab_idx]; - // Relaxed read is OK (stats only, no hot-path impact) - ss_live += meta->used; - } - total_live += ss_live; - } - - // Calculate utilization - int obj_size = g_tiny_obj_sizes[k]; - uint8_t current_lg = atomic_load_explicit( - (_Atomic uint8_t*)&c->current_lg, - memory_order_relaxed); - - uint32_t capacity = (ss_count > 0) ? 
ss_count * ((1U << current_lg) / obj_size) : 1; - double util = (double)total_live / capacity; - - // Update hot_score (for debugging/visualization) - c->hot_score = (uint16_t)(util * 1000); - if (c->hot_score > 1000) c->hot_score = 1000; - - // Promotion/Demotion decision - uint8_t new_target = current_lg; - - if (current_lg <= 20) { - // Promotion: 1MB → 2MB - if (util > 0.75) { - new_target = 21; - } - } else { - // Demotion: 2MB → 1MB - if (util < 0.35) { - new_target = 20; - } - } - - // Debug output (if enabled) - if (g_ace_debug && ss_count > 0) { - fprintf(stderr, "[ACE] Class %d (%dB): ss=%d live=%u cap=%u util=%.2f%% lg=%d->%d hot=%d\n", - k, obj_size, ss_count, total_live, capacity, util * 100.0, - current_lg, new_target, c->hot_score); - } - - // Atomic write (thread-safe) - if (new_target != current_lg) { - atomic_store_explicit( - (_Atomic uint8_t*)&c->target_lg, - new_target, - memory_order_release); - if (g_ace_debug) { - fprintf(stderr, "[ACE] *** Class %d: SIZE CHANGE %dMB -> %dMB (util=%.2f%%)\n", - k, 1 << (current_lg - 20), 1 << (new_target - 20), util * 100.0); - } - } -} - -// Called from Learner thread (background observation) -void hak_tiny_superslab_ace_observe_all(void) { - // Initialize debug flag once - static int initialized = 0; - if (!initialized) { - const char* ace_debug = getenv("HAKMEM_ACE_DEBUG"); - g_ace_debug = (ace_debug && atoi(ace_debug) != 0) ? 1 : 0; - initialized = 1; - } - - for (int k = 0; k < TINY_NUM_CLASSES_SS; k++) { - ace_observe_and_decide(k); - } -} diff --git a/core/hakmem_tiny_superslab_internal.h b/core/hakmem_tiny_superslab_internal.h new file mode 100644 index 00000000..1e0d307e --- /dev/null +++ b/core/hakmem_tiny_superslab_internal.h @@ -0,0 +1,128 @@ +// hakmem_tiny_superslab_internal.h - Internal declarations for superslab refactor +// Purpose: Shared declarations between superslab implementation files +// License: MIT +// Date: 2025-11-28 + +#ifndef HAKMEM_TINY_SUPERSLAB_INTERNAL_H +#define HAKMEM_TINY_SUPERSLAB_INTERNAL_H + +#include "hakmem_tiny_superslab.h" +#include "box/ss_hot_cold_box.h" +#include "hakmem_super_registry.h" +#include "hakmem_tiny.h" +#include "hakmem_tiny_config.h" +#include "hakmem_shared_pool.h" +#include "hakmem_internal.h" +#include "tiny_region_id.h" +#include "hakmem_tiny_integrity.h" +#include "box/tiny_next_ptr_box.h" +#include "box/slab_freelist_atomic.h" +#include +#include +#include +#include +#include +#include +#include +#include + +// ============================================================================ +// Global Variables (defined in superslab_stats.c) +// ============================================================================ + +extern pthread_mutex_t g_superslab_lock; +extern uint64_t g_superslabs_allocated; +extern uint64_t g_superslabs_freed; +extern uint64_t g_bytes_allocated; +extern _Atomic uint64_t g_ss_active_dec_calls; +extern _Atomic uint64_t g_hak_tiny_free_calls; +extern _Atomic uint64_t g_ss_remote_push_calls; +extern _Atomic uint64_t g_free_ss_enter; +extern _Atomic uint64_t g_free_local_box_calls; +extern _Atomic uint64_t g_free_remote_box_calls; +extern uint64_t g_ss_alloc_by_class[8]; +extern uint64_t g_ss_freed_by_class[8]; +extern _Atomic uint64_t g_ss_mmap_count; +extern _Atomic uint64_t g_final_fallback_mmap_count; + +// ============================================================================ +// SuperSlabHead Management (defined in superslab_head.c) +// ============================================================================ + +extern 
SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS]; + +// ============================================================================ +// Cache System (defined in superslab_cache.c) +// ============================================================================ + +typedef struct SuperslabCacheEntry { + struct SuperslabCacheEntry* next; +} SuperslabCacheEntry; + +extern SuperslabCacheEntry* g_ss_cache_head[8]; +extern size_t g_ss_cache_count[8]; +extern size_t g_ss_cache_cap[8]; +extern size_t g_ss_precharge_target[8]; +extern _Atomic int g_ss_precharge_done[8]; +extern int g_ss_cache_enabled; +extern pthread_once_t g_ss_cache_once; +extern pthread_mutex_t g_ss_cache_lock[8]; +extern uint64_t g_ss_cache_hits[8]; +extern uint64_t g_ss_cache_misses[8]; +extern uint64_t g_ss_cache_puts[8]; +extern uint64_t g_ss_cache_drops[8]; +extern uint64_t g_ss_cache_precharged[8]; +extern uint64_t g_superslabs_reused; +extern uint64_t g_superslabs_cached; + +// Cache functions (defined in superslab_cache.c) +void ss_cache_global_init(void); +void ss_cache_ensure_init(void); +void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate); +void ss_cache_precharge(uint8_t size_class, size_t ss_size, uintptr_t ss_mask); +SuperslabCacheEntry* ss_cache_pop(uint8_t size_class); +int ss_cache_push(uint8_t size_class, SuperSlab* ss); + +// ============================================================================ +// ACE (Adaptive Cache Engine) - defined in superslab_ace.c +// ============================================================================ + +extern SuperSlabACEState g_ss_ace[TINY_NUM_CLASSES_SS]; +extern int g_ss_force_lg; +extern _Atomic int g_ss_populate_once; + +uint8_t hak_tiny_superslab_next_lg(int class_idx); +void ace_observe_and_decide(int k); + +// ============================================================================ +// Statistics (defined in superslab_stats.c) +// ============================================================================ + +void ss_stats_os_alloc(uint8_t size_class, size_t ss_size); +void ss_stats_cache_reuse(void); +void ss_stats_cache_store(void); +void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err); + +// ============================================================================ +// Slab Management (defined in superslab_slab.c) +// ============================================================================ + +// Drain remote MPSC stack into freelist (ownership already verified by caller) +void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMeta* meta); + +// ============================================================================ +// Backend Allocation (defined in superslab_backend.c) +// ============================================================================ + +void* hak_tiny_alloc_superslab_backend_legacy(int class_idx); +void* hak_tiny_alloc_superslab_backend_shared(int class_idx); + +// ============================================================================ +// SuperSlabHead Management (defined in superslab_head.c) +// ============================================================================ + +SuperSlabHead* init_superslab_head(int class_idx); +int expand_superslab_head(SuperSlabHead* head); +SuperSlab* find_chunk_for_ptr(void* ptr, int class_idx); + +#endif // HAKMEM_TINY_SUPERSLAB_INTERNAL_H diff --git a/core/superslab_ace.c b/core/superslab_ace.c new file mode 100644 index 00000000..65583150 --- /dev/null +++ b/core/superslab_ace.c @@ -0,0 +1,230 @@ +// 
superslab_ace.c - ACE (Adaptive Cache Engine) for SuperSlab allocator +// Purpose: Dynamic SuperSlab size adaptation based on usage patterns +// License: MIT +// Date: 2025-11-28 + +#include "hakmem_tiny_superslab_internal.h" + +// ============================================================================ +// ACE (Adaptive Cache Engine) State +// ============================================================================ + +SuperSlabACEState g_ss_ace[TINY_NUM_CLASSES_SS] = {{0}}; + +int g_ss_force_lg = -1; +_Atomic int g_ss_populate_once = 0; + +// ============================================================================ +// ACE Helper Functions +// ============================================================================ + +// Forward: decide next SuperSlab lg for a class (ACE-aware, clamped) +uint8_t hak_tiny_superslab_next_lg(int class_idx) +{ + if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { + return SUPERSLAB_LG_DEFAULT; + } + // Prefer ACE target if within allowed range + uint8_t t = atomic_load_explicit((_Atomic uint8_t*)&g_ss_ace[class_idx].target_lg, + memory_order_relaxed); + if (t < SUPERSLAB_LG_MIN || t > SUPERSLAB_LG_MAX) { + return SUPERSLAB_LG_DEFAULT; + } + return t; +} + +// ============================================================================ +// ACE Statistics / Debugging +// ============================================================================ + +void superslab_ace_print_stats(void) { + printf("=== ACE (Adaptive Cache Engine) Stats ===\n"); + const char* class_names[8] = {"8B", "16B", "24B", "32B", "40B", "48B", "56B", "64B"}; + + printf("Class Curr Targ Hot Allocs Refills Spills LiveBlks\n"); + printf("--------------------------------------------------------------\n"); + + for (int i = 0; i < TINY_NUM_CLASSES_SS; i++) { + SuperSlabACEState* c = &g_ss_ace[i]; + printf("%-6s %2uMB %2uMB %4u %7u %8u %7u %9u\n", + class_names[i], + (1u << c->current_lg) / (1024 * 1024), + (1u << c->target_lg) / (1024 * 1024), + c->hot_score, + c->alloc_count, + c->refill_count, + c->spill_count, + c->live_blocks); + } + printf("\n"); +} + +// ============================================================================ +// ACE Tick Function (Promotion/Demotion Logic) +// ============================================================================ + +#define ACE_TICK_NS (150ULL * 1000 * 1000) // 150ms tick interval +#define ACE_COOLDOWN_NS (800ULL * 1000 * 1000) // 0.8s cooldown (anti-oscillation) + +// Simplified thresholds for refill activity +#define HI_REFILL(k) (g_ss_ace[k].refill_count > 64) // High refill rate +#define MID_REFILL(k) (g_ss_ace[k].refill_count > 16) // Medium refill rate + +// Object sizes per class (for capacity calculation) +// Must match TINY size classes: 8, 16, 24, 32, 40, 48, 56, 64 bytes +static const int g_tiny_obj_sizes[TINY_NUM_CLASSES_SS] = {8, 16, 24, 32, 40, 48, 56, 64}; + +void hak_tiny_superslab_ace_tick(int k, uint64_t now) { + if (k < 0 || k >= TINY_NUM_CLASSES_SS) return; + + SuperSlabACEState* c = &g_ss_ace[k]; + + // Rate limiting: only tick every ACE_TICK_NS (~150ms) + if (now - c->last_tick_ns < ACE_TICK_NS) return; + + // Calculate capacity for 1MB and 2MB SuperSlabs + int obj_size = g_tiny_obj_sizes[k]; + double cap1MB = (double)((1U << 20) / obj_size); // 1MB capacity + double cap2MB = (double)((1U << 21) / obj_size); // 2MB capacity + + // Calculate hotness score (weighted: 60% live blocks, 40% refill rate) + double hot = 0.6 * (double)c->live_blocks + 0.4 * (double)c->refill_count; + if (hot < 0) hot = 0; + if 
(hot > 1000) hot = 1000; + c->hot_score = (uint16_t)hot; + + // Cooldown mechanism: prevent size changes within 0.8s of last change + static uint64_t last_switch_ns[TINY_NUM_CLASSES_SS] = {0}; + + if (now - last_switch_ns[k] >= ACE_COOLDOWN_NS) { + if (c->current_lg <= 20) { + // Promotion condition: 1MB → 2MB + // High demand (live > 75% capacity) AND high refill rate + if (c->live_blocks > 0.75 * cap1MB && HI_REFILL(k)) { + c->target_lg = 21; // Promote to 2MB + last_switch_ns[k] = now; + } + } else { + // Demotion condition: 2MB → 1MB + // Low demand (live < 35% capacity) AND low refill rate + if (c->live_blocks < 0.35 * cap2MB && !MID_REFILL(k)) { + c->target_lg = 20; // Demote to 1MB + last_switch_ns[k] = now; + } + } + } + + // EMA-style decay for counters (reduce by 75% each tick) + c->alloc_count = c->alloc_count / 4; + c->refill_count = c->refill_count / 4; + c->spill_count = c->spill_count / 4; + // live_blocks is updated incrementally by alloc/free, not decayed here + + c->last_tick_ns = now; +} + +// ============================================================================ +// ACE Observer (Registry-based, zero hot-path overhead) +// ============================================================================ + +// Global debug flag (set once at initialization) +static int g_ace_debug = 0; + +// Registry-based observation: scan all SuperSlabs for usage stats +void ace_observe_and_decide(int k) { + if (k < 0 || k >= TINY_NUM_CLASSES_SS) return; + + SuperSlabACEState* c = &g_ss_ace[k]; + + // Scan Registry to count SuperSlabs and total live blocks + int ss_count = 0; + uint32_t total_live = 0; + + for (int i = 0; i < SUPER_REG_SIZE; i++) { + SuperRegEntry* e = &g_super_reg[i]; + + // Atomic read (thread-safe) + uintptr_t base = atomic_load_explicit( + (_Atomic uintptr_t*)&e->base, + memory_order_acquire); + + if (base == 0) continue; // Empty slot + + // Phase 8.4: Safety check - skip if ss pointer is invalid + if (!e->ss) continue; + // Phase 12: per-SS size_class removed; registry entries are per-class by construction. + + ss_count++; + // Phase 8.4: Scan all slabs to count used blocks (zero hot-path overhead) + uint32_t ss_live = 0; + int cap_scan = ss_slabs_capacity(e->ss); + for (int slab_idx = 0; slab_idx < cap_scan; slab_idx++) { + TinySlabMeta* meta = &e->ss->slabs[slab_idx]; + // Relaxed read is OK (stats only, no hot-path impact) + ss_live += meta->used; + } + total_live += ss_live; + } + + // Calculate utilization + int obj_size = g_tiny_obj_sizes[k]; + uint8_t current_lg = atomic_load_explicit( + (_Atomic uint8_t*)&c->current_lg, + memory_order_relaxed); + + uint32_t capacity = (ss_count > 0) ? 
ss_count * ((1U << current_lg) / obj_size) : 1; + double util = (double)total_live / capacity; + + // Update hot_score (for debugging/visualization) + c->hot_score = (uint16_t)(util * 1000); + if (c->hot_score > 1000) c->hot_score = 1000; + + // Promotion/Demotion decision + uint8_t new_target = current_lg; + + if (current_lg <= 20) { + // Promotion: 1MB → 2MB + if (util > 0.75) { + new_target = 21; + } + } else { + // Demotion: 2MB → 1MB + if (util < 0.35) { + new_target = 20; + } + } + + // Debug output (if enabled) + if (g_ace_debug && ss_count > 0) { + fprintf(stderr, "[ACE] Class %d (%dB): ss=%d live=%u cap=%u util=%.2f%% lg=%d->%d hot=%d\n", + k, obj_size, ss_count, total_live, capacity, util * 100.0, + current_lg, new_target, c->hot_score); + } + + // Atomic write (thread-safe) + if (new_target != current_lg) { + atomic_store_explicit( + (_Atomic uint8_t*)&c->target_lg, + new_target, + memory_order_release); + if (g_ace_debug) { + fprintf(stderr, "[ACE] *** Class %d: SIZE CHANGE %dMB -> %dMB (util=%.2f%%)\n", + k, 1 << (current_lg - 20), 1 << (new_target - 20), util * 100.0); + } + } +} + +// Called from Learner thread (background observation) +void hak_tiny_superslab_ace_observe_all(void) { + // Initialize debug flag once + static int initialized = 0; + if (!initialized) { + const char* ace_debug = getenv("HAKMEM_ACE_DEBUG"); + g_ace_debug = (ace_debug && atoi(ace_debug) != 0) ? 1 : 0; + initialized = 1; + } + + for (int k = 0; k < TINY_NUM_CLASSES_SS; k++) { + ace_observe_and_decide(k); + } +} diff --git a/core/superslab_allocate.c b/core/superslab_allocate.c new file mode 100644 index 00000000..f4f03b11 --- /dev/null +++ b/core/superslab_allocate.c @@ -0,0 +1,313 @@ +// superslab_allocate.c - SuperSlab allocation and deallocation +// Purpose: Main allocation/free entry points for SuperSlabs +// License: MIT +// Date: 2025-11-28 + +#include "hakmem_tiny_superslab_internal.h" + +// ============================================================================ +// SuperSlab Allocation (2MB aligned) +// ============================================================================ + +SuperSlab* superslab_allocate(uint8_t size_class) { + // Optional fault injection for testing: HAKMEM_TINY_SS_FAULT_RATE=N → 1/N で失敗 + static int fault_rate = -1; // -1=unparsed, 0=disabled, >0=rate + static __thread unsigned long fault_tick = 0; + if (__builtin_expect(fault_rate == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_SS_FAULT_RATE"); + if (e && *e) { + int v = atoi(e); if (v < 0) v = 0; fault_rate = v; + } else { + fault_rate = 0; + } + } + if (fault_rate > 0) { + unsigned long t = ++fault_tick; + if ((t % (unsigned long)fault_rate) == 0ul) { + return NULL; // simulate OOM + } + } + // Optional env clamp for SuperSlab size + static int env_parsed = 0; + static uint8_t g_ss_min_lg_env = SUPERSLAB_LG_DEFAULT; // Start with default (2MB) + static uint8_t g_ss_max_lg_env = SUPERSLAB_LG_MAX; + if (!env_parsed) { + char* maxmb = getenv("HAKMEM_TINY_SS_MAX_MB"); + if (maxmb) { + int m = atoi(maxmb); if (m == 1) g_ss_max_lg_env = 20; else if (m == 2) g_ss_max_lg_env = 21; + } + char* minmb = getenv("HAKMEM_TINY_SS_MIN_MB"); + if (minmb) { + int m = atoi(minmb); if (m == 1) g_ss_min_lg_env = 20; else if (m == 2) g_ss_min_lg_env = 21; + } + if (g_ss_min_lg_env > g_ss_max_lg_env) g_ss_min_lg_env = g_ss_max_lg_env; + const char* force_lg_env = getenv("HAKMEM_TINY_SS_FORCE_LG"); + if (force_lg_env && *force_lg_env) { + int v = atoi(force_lg_env); + if (v >= SUPERSLAB_LG_MIN && v <= SUPERSLAB_LG_MAX) { + 
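+                // In-range values pin every new SuperSlab to 2^v bytes (e.g. setting
+                // HAKMEM_TINY_SS_FORCE_LG=21 forces 2MB chunks and overrides the ACE
+                // target below); out-of-range values are silently ignored.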
g_ss_force_lg = v; + g_ss_min_lg_env = g_ss_max_lg_env = v; + } + } + size_t precharge_default = 0; + const char* precharge_env = getenv("HAKMEM_TINY_SS_PRECHARGE"); + if (precharge_env && *precharge_env) { + long v = atol(precharge_env); + if (v < 0) v = 0; + precharge_default = (size_t)v; + if (v > 0) { + atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); + } + } + size_t cache_default = 0; + const char* cache_env = getenv("HAKMEM_TINY_SS_CACHE"); + if (cache_env && *cache_env) { + long v = atol(cache_env); + if (v < 0) v = 0; + cache_default = (size_t)v; + } + for (int i = 0; i < 8; i++) { + g_ss_cache_cap[i] = cache_default; + g_ss_precharge_target[i] = precharge_default; + } + for (int i = 0; i < 8; i++) { + char name[64]; + snprintf(name, sizeof(name), "HAKMEM_TINY_SS_CACHE_C%d", i); + char* cap_env = getenv(name); + if (cap_env && *cap_env) { + long v = atol(cap_env); + if (v < 0) v = 0; + g_ss_cache_cap[i] = (size_t)v; + } + snprintf(name, sizeof(name), "HAKMEM_TINY_SS_PRECHARGE_C%d", i); + char* pre_env = getenv(name); + if (pre_env && *pre_env) { + long v = atol(pre_env); + if (v < 0) v = 0; + g_ss_precharge_target[i] = (size_t)v; + if (v > 0) { + atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); + } + } + if (g_ss_cache_cap[i] > 0 || g_ss_precharge_target[i] > 0) { + g_ss_cache_enabled = 1; + } + } + const char* populate_env = getenv("HAKMEM_TINY_SS_POPULATE_ONCE"); + if (populate_env && atoi(populate_env) != 0) { + atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); + } + env_parsed = 1; + } + + uint8_t lg = (g_ss_force_lg >= 0) ? (uint8_t)g_ss_force_lg : hak_tiny_superslab_next_lg(size_class); + if (lg < g_ss_min_lg_env) lg = g_ss_min_lg_env; + if (lg > g_ss_max_lg_env) lg = g_ss_max_lg_env; + size_t ss_size = (size_t)1 << lg; // 2^20 = 1MB, 2^21 = 2MB + uintptr_t ss_mask = ss_size - 1; + int from_cache = 0; + void* ptr = NULL; + + // Debug logging flag (lazy init) + static __thread int dbg = -1; +#if HAKMEM_BUILD_RELEASE + dbg = 0; +#else + if (__builtin_expect(dbg == -1, 0)) { + const char* e = getenv("HAKMEM_SS_PREWARM_DEBUG"); + dbg = (e && *e && *e != '0') ? 
1 : 0; + } +#endif + + // Phase 9: Try LRU cache first (lazy deallocation) + SuperSlab* cached_ss = hak_ss_lru_pop(size_class); + if (cached_ss) { + ptr = (void*)cached_ss; + from_cache = 1; + // Debug logging for REFILL from LRU + if (dbg == 1) { + fprintf(stderr, "[REFILL] class=%d from_lru=1 ss=%p\n", + size_class, (void*)cached_ss); + } + // Skip old cache path - LRU cache takes priority + } else if (g_ss_cache_enabled && size_class < 8) { + // Fallback to old cache (will be deprecated) + ss_cache_precharge(size_class, ss_size, ss_mask); + SuperslabCacheEntry* old_cached = ss_cache_pop(size_class); + if (old_cached) { + ptr = (void*)old_cached; + from_cache = 1; + // Debug logging for REFILL from prewarm (old cache is essentially prewarm) + if (dbg == 1) { + fprintf(stderr, "[REFILL] class=%d from_prewarm=1 ss=%p\n", + size_class, (void*)old_cached); + } + } + } + + if (!ptr) { + int populate = atomic_exchange_explicit(&g_ss_populate_once, 0, memory_order_acq_rel); + ptr = ss_os_acquire(size_class, ss_size, ss_mask, populate); + if (!ptr) { + return NULL; + } + // Debug logging for REFILL with new allocation + if (dbg == 1) { + fprintf(stderr, "[REFILL] class=%d new_alloc=1 ss=%p\n", + size_class, (void*)ptr); + } + } + + // Initialize SuperSlab header (Phase 12: no global size_class field) + SuperSlab* ss = (SuperSlab*)ptr; + ss->magic = SUPERSLAB_MAGIC; + ss->active_slabs = 0; + ss->lg_size = lg; // Phase 8.3: Use ACE-determined lg_size (20=1MB, 21=2MB) + ss->slab_bitmap = 0; + ss->nonempty_mask = 0; // Phase 6-2.1: ChatGPT Pro P0 - init nonempty mask + ss->partial_epoch = 0; + ss->publish_hint = 0xFF; + + // Initialize atomics explicitly + atomic_store_explicit(&ss->total_active_blocks, 0, memory_order_relaxed); + atomic_store_explicit(&ss->refcount, 0, memory_order_relaxed); + atomic_store_explicit(&ss->listed, 0, memory_order_relaxed); + ss->partial_next = NULL; + + // Phase 9: Initialize LRU fields + ss->last_used_ns = 0; + ss->generation = 0; + ss->lru_prev = NULL; + ss->lru_next = NULL; + + // Phase 3d-C: Initialize Hot/Cold Split fields + ss->hot_count = 0; + ss->cold_count = 0; + for (int i = 0; i < 16; i++) { + ss->hot_indices[i] = 0; + ss->cold_indices[i] = 0; + } + + // Initialize all slab metadata (only up to max slabs for this size) + int max_slabs = (int)(ss_size / SLAB_SIZE); + + // PERF_OPT: memset removed - mmap() already returns zero-initialized pages + // Previous memset calls consumed 23.83% CPU time (perf analysis 2025-11-28) + // Measured improvement: +1.3% throughput (71.86M → 72.78M ops/s) + // Note: ASan/debug builds may need these, but production mmap guarantees zero pages + // memset(ss->slabs, 0, max_slabs * sizeof(TinySlabMeta)); + // memset(ss->remote_heads, 0, max_slabs * sizeof(uintptr_t)); + // memset(ss->remote_counts, 0, max_slabs * sizeof(uint32_t)); + // memset(ss->slab_listed, 0, max_slabs * sizeof(uint32_t)); + + for (int i = 0; i < max_slabs; i++) { + // Phase 1: Atomic initialization (freelist + used are now _Atomic) + slab_freelist_store_relaxed(&ss->slabs[i], NULL); // Explicit NULL (redundant after memset, but clear intent) + atomic_store_explicit(&ss->slabs[i].used, 0, memory_order_relaxed); + ss->slabs[i].capacity = 0; + ss->slabs[i].owner_tid_low = 0; + + // Initialize remote queue atomics (memset already zeroed, but use proper atomic init) + atomic_store_explicit(&ss->remote_heads[i], 0, memory_order_relaxed); + atomic_store_explicit(&ss->remote_counts[i], 0, memory_order_relaxed); + atomic_store_explicit(&ss->slab_listed[i], 0, 
memory_order_relaxed); + } + + if (from_cache) { + ss_stats_cache_reuse(); + } + + // Phase 8.3: Update ACE current_lg to match allocated size + g_ss_ace[size_class].current_lg = lg; + + // Phase 1: Register SuperSlab in global registry for fast lookup + // CRITICAL: Register AFTER full initialization (ss structure is ready) + uintptr_t base = (uintptr_t)ss; + if (!hak_super_register(base, ss)) { + // Registry full - this is a fatal error + fprintf(stderr, "HAKMEM FATAL: SuperSlab registry full, cannot register %p\n", ss); + // Still return ss to avoid memory leak, but lookups may fail + } + + return ss; +} + +// ============================================================================ +// SuperSlab Deallocation +// ============================================================================ + +void superslab_free(SuperSlab* ss) { + if (!ss || ss->magic != SUPERSLAB_MAGIC) { + return; // Invalid SuperSlab + } + + // ADD DEBUG LOGGING + static __thread int dbg = -1; +#if HAKMEM_BUILD_RELEASE + dbg = 0; +#else + if (__builtin_expect(dbg == -1, 0)) { + const char* e = getenv("HAKMEM_SS_FREE_DEBUG"); + dbg = (e && *e && *e != '0') ? 1 : 0; + } +#endif + if (dbg == 1) { + fprintf(stderr, "[SS_FREE] CALLED: ss=%p lg_size=%d active_slabs=%u\n", + (void*)ss, ss->lg_size, ss->active_slabs); + } + + // Phase 9: Lazy Deallocation - try to cache in LRU instead of munmap + size_t ss_size = (size_t)1 << ss->lg_size; + + // Phase 1: Unregister SuperSlab from registry FIRST + // CRITICAL: Must unregister BEFORE adding to LRU cache + // Reason: Cached SuperSlabs should NOT be found by lookups + uintptr_t base = (uintptr_t)ss; + hak_super_unregister(base); + + // Memory fence to ensure unregister is visible + atomic_thread_fence(memory_order_release); + + // Phase 9: Try LRU cache first (lazy deallocation) + // NOTE: LRU cache keeps magic=SUPERSLAB_MAGIC for validation + // Magic will be cleared on eviction or reuse + int lru_cached = hak_ss_lru_push(ss); + if (dbg == 1) { + fprintf(stderr, "[SS_FREE] hak_ss_lru_push() returned %d\n", lru_cached); + } + if (lru_cached) { + // Successfully cached in LRU - defer munmap + return; + } + + // LRU cache full or disabled - try old cache using head class_idx (if known) + int old_cached = ss_cache_push(0, ss); + if (old_cached) { + ss_stats_cache_store(); + return; + } + + // Both caches full - immediately free to OS (eager deallocation) + // Clear magic to prevent use-after-free + ss->magic = 0; + +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, "[DEBUG ss_os_release] Freeing SuperSlab ss=%p size=%zu active=%u (LRU full)\n", + (void*)ss, ss_size, + atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed)); +#endif + + munmap(ss, ss_size); + + // Update statistics for actual release to OS + pthread_mutex_lock(&g_superslab_lock); + g_superslabs_freed++; + // Phase 12: we no longer track per-SS size_class on header; skip g_ss_freed_by_class here + g_bytes_allocated -= ss_size; + pthread_mutex_unlock(&g_superslab_lock); + +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, "[DEBUG ss_os_release] g_superslabs_freed now = %llu\n", + (unsigned long long)g_superslabs_freed); +#endif +} diff --git a/core/superslab_backend.c b/core/superslab_backend.c new file mode 100644 index 00000000..b7593f07 --- /dev/null +++ b/core/superslab_backend.c @@ -0,0 +1,281 @@ +// superslab_backend.c - Backend allocation paths for SuperSlab allocator +// Purpose: Legacy and shared pool backend implementations +// License: MIT +// Date: 2025-11-28 + +#include 
"hakmem_tiny_superslab_internal.h" + +/* + * superslab_return_block() - Single exit point for all SuperSlab allocations + * + * Purpose: Ensures consistent header writing across all allocation paths. + * This prevents bugs where headers are written in some paths but not others. + * + * Parameters: + * base - Block start address from SuperSlab geometry + * class_idx - Tiny class index (0-7) + * + * Returns: + * User pointer (base + 1 if headers enabled, base otherwise) + * + * Header writing behavior: + * - If HAKMEM_TINY_HEADER_CLASSIDX=1: Writes header via tiny_region_id_write_header() + * - If HAKMEM_TINY_HEADER_CLASSIDX=0: Returns base directly (no header) + */ +static inline void* superslab_return_block(void* base, int class_idx) { +#if HAKMEM_TINY_HEADER_CLASSIDX + return tiny_region_id_write_header(base, class_idx); +#else + return (void*)base; +#endif +} + +/* + * Legacy backend for hak_tiny_alloc_superslab_box(). + * + * Phase 12 Stage A/B: + * - Uses per-class SuperSlabHead (g_superslab_heads) as the implementation. + * - Callers MUST use hak_tiny_alloc_superslab_box() and never touch this directly. + * - Later Stage C: this function will be replaced by a shared_pool backend. + */ +void* hak_tiny_alloc_superslab_backend_legacy(int class_idx) +{ + if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { + return NULL; + } + + SuperSlabHead* head = g_superslab_heads[class_idx]; + if (!head) { + head = init_superslab_head(class_idx); + if (!head) { + return NULL; + } + g_superslab_heads[class_idx] = head; + } + + SuperSlab* chunk = head->current_chunk ? head->current_chunk : head->first_chunk; + + while (chunk) { + int cap = ss_slabs_capacity(chunk); + for (int slab_idx = 0; slab_idx < cap; slab_idx++) { + TinySlabMeta* meta = &chunk->slabs[slab_idx]; + + // Skip slabs that belong to a different class (or are uninitialized). 
+ if (meta->class_idx != (uint8_t)class_idx && meta->class_idx != 255) { + continue; + } + + // P1.2 FIX: Initialize slab on first use (like shared backend does) + // This ensures class_map is populated for all slabs, not just slab 0 + if (meta->capacity == 0) { + size_t block_size = g_tiny_class_sizes[class_idx]; + uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self(); + superslab_init_slab(chunk, slab_idx, block_size, owner_tid); + meta = &chunk->slabs[slab_idx]; // Refresh pointer after init + meta->class_idx = (uint8_t)class_idx; + // P1.2: Update class_map for dynamic slab initialization + chunk->class_map[slab_idx] = (uint8_t)class_idx; + } + + if (meta->used < meta->capacity) { + size_t stride = tiny_block_stride_for_class(class_idx); + size_t offset = (size_t)meta->used * stride; + uint8_t* base = (uint8_t*)chunk + + SUPERSLAB_SLAB0_DATA_OFFSET + + (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE + + offset; + + meta->used++; + atomic_fetch_add_explicit(&chunk->total_active_blocks, 1, memory_order_relaxed); + return superslab_return_block(base, class_idx); + } + } + chunk = chunk->next_chunk; + } + + if (expand_superslab_head(head) < 0) { + return NULL; + } + + SuperSlab* new_chunk = head->current_chunk; + if (!new_chunk) { + return NULL; + } + + int cap2 = ss_slabs_capacity(new_chunk); + for (int slab_idx = 0; slab_idx < cap2; slab_idx++) { + TinySlabMeta* meta = &new_chunk->slabs[slab_idx]; + + // P1.2 FIX: Initialize slab on first use (like shared backend does) + if (meta->capacity == 0) { + size_t block_size = g_tiny_class_sizes[class_idx]; + uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self(); + superslab_init_slab(new_chunk, slab_idx, block_size, owner_tid); + meta = &new_chunk->slabs[slab_idx]; // Refresh pointer after init + meta->class_idx = (uint8_t)class_idx; + // P1.2: Update class_map for dynamic slab initialization + new_chunk->class_map[slab_idx] = (uint8_t)class_idx; + } + + if (meta->used < meta->capacity) { + size_t stride = tiny_block_stride_for_class(class_idx); + size_t offset = (size_t)meta->used * stride; + uint8_t* base = (uint8_t*)new_chunk + + SUPERSLAB_SLAB0_DATA_OFFSET + + (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE + + offset; + + meta->used++; + atomic_fetch_add_explicit(&new_chunk->total_active_blocks, 1, memory_order_relaxed); + return superslab_return_block(base, class_idx); + } + } + + return NULL; +} + +/* + * Shared pool backend for hak_tiny_alloc_superslab_box(). + * + * Phase 12-2: + * - Uses SharedSuperSlabPool (g_shared_pool) to obtain a SuperSlab/slab + * for the requested class_idx. + * - This backend EXPRESSLY owns only: + * - choosing (ss, slab_idx) via shared_pool_acquire_slab() + * - initializing that slab's TinySlabMeta via superslab_init_slab() + * and nothing else; all callers must go through hak_tiny_alloc_superslab_box(). + * + * - For now this is a minimal, conservative implementation: + * - One linear bump-run is carved from the acquired slab using tiny_block_stride_for_class(). + * - No complex per-slab freelist or refill policy yet (Phase 12-3+). + * - If shared_pool_acquire_slab() fails, we fall back to legacy backend. + */ +void* hak_tiny_alloc_superslab_backend_shared(int class_idx) +{ + if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { + return NULL; + } + + SuperSlab* ss = NULL; + int slab_idx = -1; + + if (shared_pool_acquire_slab(class_idx, &ss, &slab_idx) != 0 || !ss) { + // Shared pool could not provide a slab; caller may choose to fall back. 
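+        // (A NULL return here is not fatal: hak_tiny_alloc_superslab_box() falls
+        // back to hak_tiny_alloc_superslab_backend_legacy() when this backend fails.)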
+ return NULL; + } + + TinySlabMeta* meta = &ss->slabs[slab_idx]; + + // Defensive: shared_pool must either hand us an UNASSIGNED slab or one + // already bound to this class. Anything else is a hard bug. + if (meta->class_idx != 255 && meta->class_idx != (uint8_t)class_idx) { +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, + "[HAKMEM][SS_SHARED] BUG: acquire_slab mismatch: cls=%d meta->class_idx=%u slab_idx=%d ss=%p\n", + class_idx, (unsigned)meta->class_idx, slab_idx, (void*)ss); +#endif + return NULL; + } + + // Initialize slab geometry once for this class. + if (meta->capacity == 0) { + size_t block_size = g_tiny_class_sizes[class_idx]; + // LARSON FIX: Pass actual thread ID for cross-thread free detection + uint32_t my_tid = (uint32_t)(uintptr_t)pthread_self(); + superslab_init_slab(ss, slab_idx, block_size, my_tid); + meta = &ss->slabs[slab_idx]; + + // CRITICAL FIX: Always set class_idx after init to avoid C0/C7 confusion. + // New SuperSlabs start with meta->class_idx=0 (mmap zero-init). + // Must explicitly set to requested class, not just when class_idx==255. + meta->class_idx = (uint8_t)class_idx; + // P1.1: Update class_map in shared acquire path + ss->class_map[slab_idx] = (uint8_t)class_idx; + } + + // Final contract check before computing addresses. + if (meta->class_idx != (uint8_t)class_idx || + meta->capacity == 0 || + meta->used > meta->capacity) { +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, + "[HAKMEM][SS_SHARED] BUG: invalid slab meta before alloc: " + "cls=%d slab_idx=%d meta_cls=%u used=%u cap=%u ss=%p\n", + class_idx, slab_idx, + (unsigned)meta->class_idx, + (unsigned)meta->used, + (unsigned)meta->capacity, + (void*)ss); +#endif + return NULL; + } + + // Simple bump allocation within this slab. + if (meta->used >= meta->capacity) { + // Slab exhausted: in minimal Phase12-2 backend we do not loop; + // caller or future logic must acquire another slab. + return NULL; + } + + size_t stride = tiny_block_stride_for_class(class_idx); + size_t offset = (size_t)meta->used * stride; + + // Phase 12-2 minimal geometry: + // - slab 0 data offset via SUPERSLAB_SLAB0_DATA_OFFSET + // - subsequent slabs at fixed SUPERSLAB_SLAB_USABLE_SIZE strides. + size_t slab_base_off = SUPERSLAB_SLAB0_DATA_OFFSET + + (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE; + uint8_t* base = (uint8_t*)ss + slab_base_off + offset; + + meta->used++; + atomic_fetch_add_explicit(&ss->total_active_blocks, 1, memory_order_relaxed); + + return superslab_return_block(base, class_idx); +} + +/* + * Box API entry: + * - Single front-door for tiny-side Superslab allocations. + * + * Phase 12 policy: + * - HAKMEM_TINY_SS_SHARED=0 → legacy backendのみ(回帰確認用) + * - HAKMEM_TINY_SS_SHARED=1 → shared backendを優先し、失敗時のみ legacy にフォールバック + */ +void* hak_tiny_alloc_superslab_box(int class_idx) +{ + static int g_ss_shared_mode = -1; + static _Atomic uint32_t g_ss_backend_log = 0; + if (__builtin_expect(g_ss_shared_mode == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_SS_SHARED"); + if (!e || !*e) { + g_ss_shared_mode = 1; // デフォルト: shared 有効 + } else { + int v = atoi(e); + g_ss_shared_mode = (v != 0) ? 
1 : 0; + } + } + + if (g_ss_shared_mode == 1) { + void* p = hak_tiny_alloc_superslab_backend_shared(class_idx); + if (p != NULL) { + uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed); + if (n < 4) { + fprintf(stderr, "[SS_BACKEND] shared cls=%d ptr=%p\n", class_idx, p); + } + return p; + } + // shared backend が失敗した場合は安全側で legacy にフォールバック + uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed); + if (n < 4) { + fprintf(stderr, "[SS_BACKEND] shared_fail→legacy cls=%d\n", class_idx); + } + return hak_tiny_alloc_superslab_backend_legacy(class_idx); + } + + // shared OFF 時は legacy のみ + uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed); + if (n < 4) { + fprintf(stderr, "[SS_BACKEND] legacy cls=%d\n", class_idx); + } + return hak_tiny_alloc_superslab_backend_legacy(class_idx); +} diff --git a/core/superslab_cache.c b/core/superslab_cache.c new file mode 100644 index 00000000..d26bef6d --- /dev/null +++ b/core/superslab_cache.c @@ -0,0 +1,204 @@ +// superslab_cache.c - Cache management for SuperSlab allocator +// Purpose: LRU cache and old cache (prewarm) for SuperSlabs +// License: MIT +// Date: 2025-11-28 + +#include "hakmem_tiny_superslab_internal.h" + +// ============================================================================ +// Cache System - Global Variables +// ============================================================================ + +SuperslabCacheEntry* g_ss_cache_head[8] = {0}; +size_t g_ss_cache_count[8] = {0}; +size_t g_ss_cache_cap[8] = {0}; +size_t g_ss_precharge_target[8] = {0}; +_Atomic int g_ss_precharge_done[8] = {0}; +int g_ss_cache_enabled = 0; + +pthread_once_t g_ss_cache_once = PTHREAD_ONCE_INIT; +pthread_mutex_t g_ss_cache_lock[8]; + +uint64_t g_ss_cache_hits[8] = {0}; +uint64_t g_ss_cache_misses[8] = {0}; +uint64_t g_ss_cache_puts[8] = {0}; +uint64_t g_ss_cache_drops[8] = {0}; +uint64_t g_ss_cache_precharged[8] = {0}; + +uint64_t g_superslabs_reused = 0; +uint64_t g_superslabs_cached = 0; + +// ============================================================================ +// Cache Initialization +// ============================================================================ + +void ss_cache_global_init(void) { + for (int i = 0; i < 8; i++) { + pthread_mutex_init(&g_ss_cache_lock[i], NULL); + } +} + +void ss_cache_ensure_init(void) { + pthread_once(&g_ss_cache_once, ss_cache_global_init); +} + +// ============================================================================ +// OS Acquisition (mmap with alignment) +// ============================================================================ + +void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate) { + void* ptr = NULL; + static int log_count = 0; + +#ifdef MAP_ALIGNED_SUPER + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER; +#ifdef MAP_POPULATE + if (populate) { + map_flags |= MAP_POPULATE; + } +#endif + ptr = mmap(NULL, ss_size, + PROT_READ | PROT_WRITE, + map_flags, + -1, 0); + if (ptr != MAP_FAILED) { + atomic_fetch_add(&g_ss_mmap_count, 1); + if (((uintptr_t)ptr & ss_mask) == 0) { + ss_stats_os_alloc(size_class, ss_size); + return ptr; + } + munmap(ptr, ss_size); + ptr = NULL; + } else { + log_superslab_oom_once(ss_size, ss_size, errno); + } +#endif + + size_t alloc_size = ss_size * 2; + int flags = MAP_PRIVATE | MAP_ANONYMOUS; +#ifdef MAP_POPULATE + if (populate) { + flags |= MAP_POPULATE; + } +#endif + void* raw = mmap(NULL, alloc_size, + PROT_READ | PROT_WRITE, + 
flags, + -1, 0); + if (raw != MAP_FAILED) { + uint64_t count = atomic_fetch_add(&g_ss_mmap_count, 1) + 1; + #if !HAKMEM_BUILD_RELEASE + if (log_count < 10) { + fprintf(stderr, "[SUPERSLAB_MMAP] #%lu: class=%d size=%zu (total SuperSlab mmaps so far)\n", + (unsigned long)count, size_class, ss_size); + log_count++; + } + #endif + } + if (raw == MAP_FAILED) { + log_superslab_oom_once(ss_size, alloc_size, errno); + return NULL; + } + + uintptr_t raw_addr = (uintptr_t)raw; + uintptr_t aligned_addr = (raw_addr + ss_mask) & ~ss_mask; + ptr = (void*)aligned_addr; + + size_t prefix_size = aligned_addr - raw_addr; + if (prefix_size > 0) { + munmap(raw, prefix_size); + } + size_t suffix_size = alloc_size - prefix_size - ss_size; + if (suffix_size > 0) { + if (populate) { +#ifdef MADV_DONTNEED + madvise((char*)ptr + ss_size, suffix_size, MADV_DONTNEED); +#endif + } else { + munmap((char*)ptr + ss_size, suffix_size); + } + } + + ss_stats_os_alloc(size_class, ss_size); + return ptr; +} + +// ============================================================================ +// Cache Precharge (prewarm) +// ============================================================================ + +void ss_cache_precharge(uint8_t size_class, size_t ss_size, uintptr_t ss_mask) { + if (!g_ss_cache_enabled) return; + if (size_class >= 8) return; + if (g_ss_precharge_target[size_class] == 0) return; + if (atomic_load_explicit(&g_ss_precharge_done[size_class], memory_order_acquire)) return; + + ss_cache_ensure_init(); + pthread_mutex_lock(&g_ss_cache_lock[size_class]); + size_t target = g_ss_precharge_target[size_class]; + size_t cap = g_ss_cache_cap[size_class]; + size_t desired = target; + if (cap != 0 && desired > cap) { + desired = cap; + } + while (g_ss_cache_count[size_class] < desired) { + void* raw = ss_os_acquire(size_class, ss_size, ss_mask, 1); + if (!raw) { + break; + } + SuperslabCacheEntry* entry = (SuperslabCacheEntry*)raw; + entry->next = g_ss_cache_head[size_class]; + g_ss_cache_head[size_class] = entry; + g_ss_cache_count[size_class]++; + g_ss_cache_precharged[size_class]++; + } + atomic_store_explicit(&g_ss_precharge_done[size_class], 1, memory_order_release); + pthread_mutex_unlock(&g_ss_cache_lock[size_class]); +} + +// ============================================================================ +// Cache Pop/Push Operations +// ============================================================================ + +SuperslabCacheEntry* ss_cache_pop(uint8_t size_class) { + if (!g_ss_cache_enabled) return NULL; + if (size_class >= 8) return NULL; + + ss_cache_ensure_init(); + + pthread_mutex_lock(&g_ss_cache_lock[size_class]); + SuperslabCacheEntry* entry = g_ss_cache_head[size_class]; + if (entry) { + g_ss_cache_head[size_class] = entry->next; + if (g_ss_cache_count[size_class] > 0) { + g_ss_cache_count[size_class]--; + } + entry->next = NULL; + g_ss_cache_hits[size_class]++; + } else { + g_ss_cache_misses[size_class]++; + } + pthread_mutex_unlock(&g_ss_cache_lock[size_class]); + return entry; +} + +int ss_cache_push(uint8_t size_class, SuperSlab* ss) { + if (!g_ss_cache_enabled) return 0; + if (size_class >= 8) return 0; + + ss_cache_ensure_init(); + pthread_mutex_lock(&g_ss_cache_lock[size_class]); + size_t cap = g_ss_cache_cap[size_class]; + if (cap != 0 && g_ss_cache_count[size_class] >= cap) { + g_ss_cache_drops[size_class]++; + pthread_mutex_unlock(&g_ss_cache_lock[size_class]); + return 0; + } + SuperslabCacheEntry* entry = (SuperslabCacheEntry*)ss; + entry->next = g_ss_cache_head[size_class]; + 
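+    // Publish at the head (LIFO): ss_cache_pop() hands this chunk back first,
+    // while it is most likely still TLB/cache-warm.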
g_ss_cache_head[size_class] = entry; + g_ss_cache_count[size_class]++; + g_ss_cache_puts[size_class]++; + pthread_mutex_unlock(&g_ss_cache_lock[size_class]); + return 1; +} diff --git a/core/superslab_head.c b/core/superslab_head.c new file mode 100644 index 00000000..e9841181 --- /dev/null +++ b/core/superslab_head.c @@ -0,0 +1,176 @@ +// superslab_head.c - SuperSlabHead management for dynamic expansion +// Purpose: Per-class chunk lists and expansion logic +// License: MIT +// Date: 2025-11-28 + +#include "hakmem_tiny_superslab_internal.h" + +// ============================================================================ +// Phase 2a: Dynamic Expansion - Global per-class SuperSlabHeads +// ============================================================================ + +SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS] = {NULL}; + +// ============================================================================ +// SuperSlabHead Management Functions +// ============================================================================ + +// Initialize SuperSlabHead for a class +SuperSlabHead* init_superslab_head(int class_idx) { + if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { + return NULL; + } + + // Allocate SuperSlabHead structure + SuperSlabHead* head = (SuperSlabHead*)calloc(1, sizeof(SuperSlabHead)); + if (!head) { + extern __thread int g_hakmem_lock_depth; + g_hakmem_lock_depth++; + fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate SuperSlabHead for class %d\n", class_idx); + g_hakmem_lock_depth--; + return NULL; + } + + head->class_idx = (uint8_t)class_idx; + atomic_store_explicit(&head->total_chunks, 0, memory_order_relaxed); + head->first_chunk = NULL; + head->current_chunk = NULL; + pthread_mutex_init(&head->expansion_lock, NULL); + + // Allocate initial chunk(s) + // Hot classes (1, 4, 6) get 2 initial chunks to reduce contention + int initial_chunks = 1; + + // Phase 2a: Start with 1 chunk for all classes (expansion will handle growth) + // This reduces startup memory overhead while still allowing unlimited growth + initial_chunks = 1; + + for (int i = 0; i < initial_chunks; i++) { + if (expand_superslab_head(head) < 0) { + extern __thread int g_hakmem_lock_depth; + g_hakmem_lock_depth++; + fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate initial chunk %d for class %d\n", + i, class_idx); + g_hakmem_lock_depth--; + + // Cleanup on failure + SuperSlab* chunk = head->first_chunk; + while (chunk) { + SuperSlab* next = chunk->next_chunk; + superslab_free(chunk); + chunk = next; + } + pthread_mutex_destroy(&head->expansion_lock); + free(head); + return NULL; + } + } + + extern __thread int g_hakmem_lock_depth; + g_hakmem_lock_depth++; +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, "[HAKMEM] Initialized SuperSlabHead for class %d: %zu initial chunks\n", + class_idx, atomic_load_explicit(&head->total_chunks, memory_order_relaxed)); +#endif + g_hakmem_lock_depth--; + + return head; +} + +// Expand SuperSlabHead by allocating and linking a new chunk +int expand_superslab_head(SuperSlabHead* head) { + if (!head) { + return -1; + } + + // Allocate new chunk via existing superslab_allocate + SuperSlab* new_chunk = superslab_allocate(head->class_idx); + if (!new_chunk) { +#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) + extern __thread int g_hakmem_lock_depth; + g_hakmem_lock_depth++; + fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate new chunk for class %d (system OOM)\n", + head->class_idx); + g_hakmem_lock_depth--; +#endif + return -1; // True OOM 
(system out of memory) + } + + // CRITICAL FIX: Initialize slab 0 so bitmap != 0x00000000 + // Phase 2a chunks must have at least one usable slab after allocation + size_t block_size = g_tiny_class_sizes[head->class_idx]; + // Use pthread_self() directly since tiny_self_u32() is static inline in hakmem_tiny.c + uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self(); + + superslab_init_slab(new_chunk, 0, block_size, owner_tid); + + // Initialize the next_chunk link to NULL + new_chunk->next_chunk = NULL; + + // Thread-safe linking + pthread_mutex_lock(&head->expansion_lock); + + if (head->current_chunk) { + // Find the tail of the list (optimization: could cache tail pointer) + SuperSlab* tail = head->current_chunk; + while (tail->next_chunk) { + tail = tail->next_chunk; + } + tail->next_chunk = new_chunk; + } else { + // First chunk + head->first_chunk = new_chunk; + } + + // Update current chunk to new chunk (for fast allocation) + head->current_chunk = new_chunk; + + // Increment total chunks atomically + size_t old_count = atomic_fetch_add_explicit(&head->total_chunks, 1, memory_order_relaxed); + size_t new_count = old_count + 1; + + pthread_mutex_unlock(&head->expansion_lock); + +#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) + extern __thread int g_hakmem_lock_depth; + g_hakmem_lock_depth++; + fprintf(stderr, "[HAKMEM] Expanded SuperSlabHead for class %d: %zu chunks now (bitmap=0x%08x)\n", + head->class_idx, new_count, new_chunk->slab_bitmap); + g_hakmem_lock_depth--; +#endif + + return 0; +} + +// Find which chunk a pointer belongs to +SuperSlab* find_chunk_for_ptr(void* ptr, int class_idx) { + if (!ptr || class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { + return NULL; + } + + SuperSlabHead* head = g_superslab_heads[class_idx]; + if (!head) { + return NULL; + } + + uintptr_t ptr_addr = (uintptr_t)ptr; + + // Walk the chunk list + SuperSlab* chunk = head->first_chunk; + while (chunk) { + // Check if ptr is within this chunk's memory range + // Each chunk is aligned to SUPERSLAB_SIZE (1MB or 2MB) + uintptr_t chunk_start = (uintptr_t)chunk; + size_t chunk_size = (size_t)1 << chunk->lg_size; // Use actual chunk size + uintptr_t chunk_end = chunk_start + chunk_size; + + if (ptr_addr >= chunk_start && ptr_addr < chunk_end) { + // Found the chunk + return chunk; + } + + chunk = chunk->next_chunk; + } + + return NULL; // Not found in any chunk +} diff --git a/core/superslab_slab.c b/core/superslab_slab.c new file mode 100644 index 00000000..4219f943 --- /dev/null +++ b/core/superslab_slab.c @@ -0,0 +1,210 @@ +// superslab_slab.c - Slab initialization and management +// Purpose: Slab lifecycle and bitmap management within SuperSlabs +// License: MIT +// Date: 2025-11-28 + +#include "hakmem_tiny_superslab_internal.h" + +// ============================================================================ +// Remote Drain (MPSC queue to freelist conversion) +// ============================================================================ + +// Drain remote MPSC stack into freelist (ownership already verified by caller) +void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMeta* meta) +{ + if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss) || !meta) return; + + static _Atomic uint32_t g_remote_drain_diag_once = 0; + static int g_remote_drain_diag_en = -1; + + // Atomically take the whole remote list + uintptr_t head = atomic_exchange_explicit(&ss->remote_heads[slab_idx], 0, + memory_order_acq_rel); + if (head == 0) return; + + // Convert remote stack 
(offset 0 next) into freelist encoding via Box API + // and splice in front of current freelist preserving relative order. + void* prev = meta->freelist; + int cls = (int)meta->class_idx; + HAK_CHECK_CLASS_IDX(cls, "_ss_remote_drain_to_freelist_unsafe"); + if (__builtin_expect(cls < 0 || cls >= TINY_NUM_CLASSES, 0)) { + static _Atomic int g_remote_drain_cls_oob = 0; + if (atomic_fetch_add_explicit(&g_remote_drain_cls_oob, 1, memory_order_relaxed) == 0) { + fprintf(stderr, + "[REMOTE_DRAIN_CLASS_OOB] ss=%p slab_idx=%d meta=%p cls=%d head=%#lx\n", + (void*)ss, slab_idx, (void*)meta, cls, (unsigned long)head); + } + return; + } + uintptr_t cur = head; + while (cur != 0) { + uintptr_t next = *(uintptr_t*)cur; // remote-next stored at offset 0 +#if !HAKMEM_BUILD_RELEASE + if (__builtin_expect(g_remote_drain_diag_en == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_SLL_DIAG"); + g_remote_drain_diag_en = (e && *e && *e != '0') ? 1 : 0; + } +#else + if (__builtin_expect(g_remote_drain_diag_en == -1, 0)) { + g_remote_drain_diag_en = 0; + } +#endif + if (__builtin_expect(g_remote_drain_diag_en, 0)) { + uintptr_t addr = (uintptr_t)next; + if (addr != 0 && (addr < 4096 || addr > 0x00007fffffffffffULL)) { + uint32_t shot = atomic_fetch_add_explicit(&g_remote_drain_diag_once, 1, memory_order_relaxed); + if (shot < 8) { + fprintf(stderr, + "[REMOTE_DRAIN_NEXT_INVALID] cls=%d slab=%d cur=%p next=%p head=%#lx prev=%p count=%u\n", + cls, + slab_idx, + (void*)cur, + (void*)next, + (unsigned long)head, + prev, + (unsigned)meta->used); + } + } +#if HAKMEM_TINY_HEADER_CLASSIDX + int hdr_cls = tiny_region_id_read_header((uint8_t*)cur + 1); + if (hdr_cls >= 0 && hdr_cls != cls) { + uint32_t shot = atomic_fetch_add_explicit(&g_remote_drain_diag_once, 1, memory_order_relaxed); + if (shot < 8) { + fprintf(stderr, + "[REMOTE_DRAIN_HDR_MISMATCH] cls=%d slab=%d cur=%p hdr_cls=%d meta_cls=%d head=%#lx\n", + cls, slab_idx, (void*)cur, hdr_cls, (int)meta->class_idx, (unsigned long)head); + } + } +#endif + } +#if HAKMEM_TINY_HEADER_CLASSIDX + // Cross-check header vs meta before writing next (even if diag is off) + { + int hdr_cls_pre = tiny_region_id_read_header((uint8_t*)cur + 1); + if (hdr_cls_pre >= 0 && hdr_cls_pre != cls) { + static _Atomic uint32_t g_hdr_meta_mismatch_rd = 0; + uint32_t n = atomic_fetch_add_explicit(&g_hdr_meta_mismatch_rd, 1, memory_order_relaxed); + if (n < 16) { + fprintf(stderr, + "[REMOTE_DRAIN_HDR_META_MISMATCH] cls=%d slab=%d cur=%p hdr_cls=%d meta_cls=%d\n", + cls, slab_idx, (void*)cur, hdr_cls_pre, (int)meta->class_idx); + } + } + } +#endif + // Restore header for header-classes (class 1-6) which were clobbered by remote push +#if HAKMEM_TINY_HEADER_CLASSIDX + if (cls != 0) { + uint8_t expected = (uint8_t)(HEADER_MAGIC | (cls & HEADER_CLASS_MASK)); + *(uint8_t*)(uintptr_t)cur = expected; + } +#endif + // Rewrite next pointer to Box representation for this class + tiny_next_write(cls, (void*)cur, prev); + prev = (void*)cur; + cur = next; + } + meta->freelist = prev; + // Reset remote count after full drain + atomic_store_explicit(&ss->remote_counts[slab_idx], 0, memory_order_release); + + // Update freelist/nonempty visibility bits + uint32_t bit = (1u << slab_idx); + atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); + atomic_fetch_or_explicit(&ss->nonempty_mask, bit, memory_order_release); +} + +// ============================================================================ +// Slab Initialization within SuperSlab +// 
============================================================================ + +void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_t owner_tid) +{ + if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { + return; + } + + // Phase E1-CORRECT unified geometry: + // - block_size is the TOTAL stride for this class (g_tiny_class_sizes[cls]) + // - usable bytes are determined by slab index (slab0 vs others) + // - capacity = usable / stride for ALL classes (including former C7) + size_t usable_size = (slab_idx == 0) + ? SUPERSLAB_SLAB0_USABLE_SIZE + : SUPERSLAB_SLAB_USABLE_SIZE; + size_t stride = block_size; + uint16_t capacity = (uint16_t)(usable_size / stride); + +#if !HAKMEM_BUILD_RELEASE + if (slab_idx == 0) { + fprintf(stderr, + "[SUPERSLAB_INIT] slab 0: usable_size=%zu stride=%zu capacity=%u\n", + usable_size, stride, (unsigned)capacity); + } +#endif + + TinySlabMeta* meta = &ss->slabs[slab_idx]; + meta->freelist = NULL; // NULL = linear allocation mode + meta->used = 0; + meta->active = 0; // P1.3: blocks in use by user (starts at 0) + meta->tls_cached = 0; // P2.2: blocks cached in TLS SLL (starts at 0) + meta->capacity = capacity; + meta->carved = 0; + // LARSON FIX: Use bits 8-15 instead of 0-7 since pthread TIDs are aligned to 256 bytes + meta->owner_tid_low = (uint8_t)((owner_tid >> 8) & 0xFFu); + // Fail-safe: stamp class_idx from geometry (stride → class). + // This ensures legacy/shared/legacy-refill paths all end with a correct class. + for (int i = 0; i < TINY_NUM_CLASSES; i++) { + if (g_tiny_class_sizes[i] == stride) { + meta->class_idx = (uint8_t)i; + // P1.1: Update class_map for out-of-band lookup on free path + ss->class_map[slab_idx] = (uint8_t)i; + break; + } + } + + superslab_activate_slab(ss, slab_idx); +} + +// ============================================================================ +// Slab Bitmap Management +// ============================================================================ + +void superslab_activate_slab(SuperSlab* ss, int slab_idx) { + if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { + return; + } + uint32_t mask = 1u << slab_idx; + if ((ss->slab_bitmap & mask) == 0) { + ss->slab_bitmap |= mask; + ss->active_slabs++; + + // Phase 3d-C: Update hot/cold indices after activating new slab + ss_update_hot_cold_indices(ss); + } +} + +void superslab_deactivate_slab(SuperSlab* ss, int slab_idx) { + if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { + return; + } + uint32_t mask = 1u << slab_idx; + if (ss->slab_bitmap & mask) { + ss->slab_bitmap &= ~mask; + ss->active_slabs--; + } +} + +int superslab_find_free_slab(SuperSlab* ss) { + if (!ss) return -1; + if ((int)ss->active_slabs >= ss_slabs_capacity(ss)) { + return -1; // No free slabs + } + // Find first 0 bit in bitmap + int cap = ss_slabs_capacity(ss); + for (int i = 0; i < cap; i++) { + if ((ss->slab_bitmap & (1u << i)) == 0) { + return i; + } + } + return -1; +} diff --git a/core/superslab_stats.c b/core/superslab_stats.c new file mode 100644 index 00000000..c7a56135 --- /dev/null +++ b/core/superslab_stats.c @@ -0,0 +1,166 @@ +// superslab_stats.c - Statistics and debugging for SuperSlab allocator +// Purpose: Tracking and reporting allocation statistics +// License: MIT +// Date: 2025-11-28 + +#include "hakmem_tiny_superslab_internal.h" + +// ============================================================================ +// Global Statistics +// ============================================================================ + 
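Before the statistics plumbing below, the bitmap bookkeeping from superslab_slab.c is worth seeing in isolation. The following standalone sketch is not part of the patch: DemoSuperSlab and the demo_* helpers are illustrative stand-ins, the capacity parameter stands in for ss_slabs_capacity(), and at most 32 slabs are assumed (as the uint32_t slab_bitmap implies). The ctz variant is shown only for comparison with the linear scan in superslab_find_free_slab().

#include <stdint.h>
#include <stdio.h>

// Minimal model of the slab bitmap used by superslab_activate_slab(),
// superslab_deactivate_slab() and superslab_find_free_slab().
// Field names mirror the real SuperSlab, but the struct is illustrative.
typedef struct {
    uint32_t slab_bitmap;   // bit i set => slab i is active
    uint32_t active_slabs;  // number of set bits
} DemoSuperSlab;

// Same shape as superslab_find_free_slab(): linear scan for the first 0 bit.
static int demo_find_free_slab(const DemoSuperSlab* ss, int capacity) {
    if ((int)ss->active_slabs >= capacity) return -1;
    for (int i = 0; i < capacity; i++) {
        if ((ss->slab_bitmap & (1u << i)) == 0) return i;
    }
    return -1;
}

// Possible constant-time variant: first zero bit via ctz of the inverted mask.
// Not used by the patch; shown only to make the "first free slab" intent explicit.
static int demo_find_free_slab_ctz(const DemoSuperSlab* ss, int capacity) {
    uint32_t free_bits = ~ss->slab_bitmap;
    if (capacity < 32) free_bits &= (1u << capacity) - 1u;
    return free_bits ? __builtin_ctz(free_bits) : -1;
}

int main(void) {
    DemoSuperSlab ss = { .slab_bitmap = 0x0000000Fu, .active_slabs = 4 };
    printf("scan=%d ctz=%d\n",
           demo_find_free_slab(&ss, 32),
           demo_find_free_slab_ctz(&ss, 32)); // both print 4
    return 0;
}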
+pthread_mutex_t g_superslab_lock = PTHREAD_MUTEX_INITIALIZER; +uint64_t g_superslabs_allocated = 0; // Non-static for debugging +uint64_t g_superslabs_freed = 0; // Phase 7.6: Non-static for test access +uint64_t g_bytes_allocated = 0; // Non-static for debugging + +// Debug counters +_Atomic uint64_t g_ss_active_dec_calls = 0; +_Atomic uint64_t g_hak_tiny_free_calls = 0; +_Atomic uint64_t g_ss_remote_push_calls = 0; +// Free path instrumentation (lightweight, for OOM/route diagnosis) +_Atomic uint64_t g_free_ss_enter = 0; // hak_tiny_free_superslab() entries +_Atomic uint64_t g_free_local_box_calls = 0; // same-thread freelist pushes +_Atomic uint64_t g_free_remote_box_calls = 0; // cross-thread remote pushes +// Per-class counters for gating/metrics (Tiny classes = 8) +uint64_t g_ss_alloc_by_class[8] = {0}; +uint64_t g_ss_freed_by_class[8] = {0}; + +// Global counters for debugging (non-static for external access) +_Atomic uint64_t g_ss_mmap_count = 0; +_Atomic uint64_t g_final_fallback_mmap_count = 0; + +// ============================================================================ +// Statistics Functions +// ============================================================================ + +void ss_stats_os_alloc(uint8_t size_class, size_t ss_size) { + pthread_mutex_lock(&g_superslab_lock); + g_superslabs_allocated++; + if (size_class < 8) { + g_ss_alloc_by_class[size_class]++; + } + g_bytes_allocated += ss_size; + pthread_mutex_unlock(&g_superslab_lock); +} + +void ss_stats_cache_reuse(void) { + pthread_mutex_lock(&g_superslab_lock); + g_superslabs_reused++; + pthread_mutex_unlock(&g_superslab_lock); +} + +void ss_stats_cache_store(void) { + pthread_mutex_lock(&g_superslab_lock); + g_superslabs_cached++; + pthread_mutex_unlock(&g_superslab_lock); +} + +// ============================================================================ +// Diagnostics +// ============================================================================ + +void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err) { + static int logged = 0; + if (logged) return; + logged = 1; + + // CRITICAL FIX: Increment lock depth FIRST before any LIBC calls + // fopen/fclose/getrlimit/fprintf all may call malloc internally + // Must bypass HAKMEM wrapper to avoid header mismatch crash + extern __thread int g_hakmem_lock_depth; + g_hakmem_lock_depth++; // Force wrapper to use __libc_malloc + + struct rlimit rl = {0}; + if (getrlimit(RLIMIT_AS, &rl) != 0) { + rl.rlim_cur = RLIM_INFINITY; + rl.rlim_max = RLIM_INFINITY; + } + + unsigned long vm_size_kb = 0; + unsigned long vm_rss_kb = 0; + FILE* status = fopen("/proc/self/status", "r"); + if (status) { + char line[256]; + while (fgets(line, sizeof(line), status)) { + if (strncmp(line, "VmSize:", 7) == 0) { + (void)sscanf(line + 7, "%lu", &vm_size_kb); + } else if (strncmp(line, "VmRSS:", 6) == 0) { + (void)sscanf(line + 6, "%lu", &vm_rss_kb); + } + } + fclose(status); + } + // CRITICAL FIX: Do NOT decrement lock_depth yet! 
+ // fprintf() below may call malloc for buffering + + char rl_cur_buf[32]; + char rl_max_buf[32]; + if (rl.rlim_cur == RLIM_INFINITY) { + strcpy(rl_cur_buf, "inf"); + } else { + snprintf(rl_cur_buf, sizeof(rl_cur_buf), "%llu", (unsigned long long)rl.rlim_cur); + } + if (rl.rlim_max == RLIM_INFINITY) { + strcpy(rl_max_buf, "inf"); + } else { + snprintf(rl_max_buf, sizeof(rl_max_buf), "%llu", (unsigned long long)rl.rlim_max); + } + +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, + "[SS OOM] mmap failed: err=%d ss_size=%zu alloc_size=%zu " + "alloc=%llu freed=%llu bytes=%llu " + "RLIMIT_AS(cur=%s max=%s) VmSize=%lu kB VmRSS=%lu kB\n", + err, + ss_size, + alloc_size, + (unsigned long long)g_superslabs_allocated, + (unsigned long long)g_superslabs_freed, + (unsigned long long)g_bytes_allocated, + rl_cur_buf, + rl_max_buf, + vm_size_kb, + vm_rss_kb); +#endif + + g_hakmem_lock_depth--; // Now safe to restore (all libc calls complete) +} + +// ============================================================================ +// Statistics / Debugging +// ============================================================================ + +void superslab_print_stats(SuperSlab* ss) { + if (!ss || ss->magic != SUPERSLAB_MAGIC) { + printf("Invalid SuperSlab\n"); + return; + } + + printf("=== SuperSlab Stats ===\n"); + printf("Address: %p\n", (void*)ss); + // Phase 12: per-SS size_class removed; classes are per-slab via meta->class_idx. + printf("Active slabs: %u / %d\n", ss->active_slabs, ss_slabs_capacity(ss)); + printf("Bitmap: 0x%08X\n", ss->slab_bitmap); + printf("\nPer-slab details:\n"); + for (int i = 0; i < ss_slabs_capacity(ss); i++) { + if (ss->slab_bitmap & (1u << i)) { + TinySlabMeta* meta = &ss->slabs[i]; + printf(" Slab %2d: used=%u/%u freelist=%p class=%u owner_tid_low=%u\n", + i, meta->used, meta->capacity, meta->freelist, + (unsigned)meta->class_idx, (unsigned)meta->owner_tid_low); + } + } + printf("\n"); +} + +// Global statistics +void superslab_print_global_stats(void) { + pthread_mutex_lock(&g_superslab_lock); + printf("=== Global SuperSlab Stats ===\n"); + printf("SuperSlabs allocated: %lu\n", g_superslabs_allocated); + printf("SuperSlabs freed: %lu\n", g_superslabs_freed); + printf("SuperSlabs active: %lu\n", g_superslabs_allocated - g_superslabs_freed); + printf("Total bytes allocated: %lu MB\n", g_bytes_allocated / (1024 * 1024)); + pthread_mutex_unlock(&g_superslab_lock); +} diff --git a/core/tiny_region_id.h b/core/tiny_region_id.h index f7e4e18a..8770577e 100644 --- a/core/tiny_region_id.h +++ b/core/tiny_region_id.h @@ -290,25 +290,18 @@ static inline int tiny_region_id_read_header(void* ptr) { // CRITICAL FIX (Pool TLS Phase 1): ALWAYS validate magic when Pool TLS is enabled // Reason: Pool TLS uses different magic (0xb0 vs 0xa0), MUST distinguish them! // Without this, Pool TLS allocations are wrongly routed to Tiny freelist → corruption -#if !HAKMEM_BUILD_RELEASE || defined(HAKMEM_POOL_TLS_PHASE1) - // Debug/Development OR Pool TLS: Validate magic byte to catch non-header allocations - // Reason: Mid/Large allocations don't have headers, must detect and reject them + // Always validate magic byte to catch non-header allocations (release included). + // Reason: mmap-zero or mid/large frees can otherwise be misrouted as class 0. 
uint8_t magic = header & 0xF0; - #if HAKMEM_DEBUG_VERBOSE +#if HAKMEM_DEBUG_VERBOSE static int debug_count = 0; if (debug_count < 5) { fprintf(stderr, "[TINY_READ_HEADER] ptr=%p header=0x%02x magic=0x%02x expected=0x%02x\n", ptr, header, magic, HEADER_MAGIC); debug_count++; } - #endif +#endif if (magic != HEADER_MAGIC) { - // Invalid header - likely non-header allocation (Mid/Large/Pool TLS) - #if HAKMEM_DEBUG_VERBOSE - if (debug_count < 6) { // One more after the 5 above - fprintf(stderr, "[TINY_READ_HEADER] REJECTING ptr=%p (magic mismatch)\n", ptr); - } - #endif #if !HAKMEM_BUILD_RELEASE static int invalid_count = 0; if (invalid_count < 5) { @@ -322,12 +315,6 @@ static inline int tiny_region_id_read_header(void* ptr) { if (tiny_guard_is_enabled()) tiny_guard_on_invalid(ptr, header); return -1; } -#else - // Release (without Pool TLS): Skip magic validation (save 2-3 cycles) - // Safety: Bounds check below still prevents out-of-bounds array access - // Trade-off: Mid/Large frees may corrupt TLS freelist (rare, ~0.1% of frees) - // NOTE: This optimization is DISABLED when Pool TLS is enabled (different magic bytes!) -#endif int class_idx = (int)(header & HEADER_CLASS_MASK);
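The tiny_region_id.h hunk makes the magic check unconditional: a header byte is only accepted as a Tiny header when its top nibble matches HEADER_MAGIC; anything else returns -1 instead of being misread as class 0. A minimal standalone model of that decode step follows; the 0xA0 magic, the 0x07 class mask, and the 8-class bound are assumptions inferred from the surrounding comments ("0xb0 vs 0xa0", "Tiny classes = 8"), not values copied from tiny_region_id.h.

#include <stdint.h>
#include <stdio.h>

// Assumed encoding, mirroring "HEADER_MAGIC | (cls & HEADER_CLASS_MASK)"
// from the remote-drain path. All three constants are illustrative.
#define DEMO_HEADER_MAGIC      0xA0u  // top nibble marks a Tiny header
#define DEMO_HEADER_CLASS_MASK 0x07u  // low bits carry the size-class index
#define DEMO_NUM_CLASSES       8

// Returns the class index, or -1 for anything that is not a Tiny header
// (zero-filled mmap memory, mid/large blocks, foreign magic such as Pool TLS).
static int demo_read_header(uint8_t header) {
    if ((header & 0xF0u) != DEMO_HEADER_MAGIC) return -1; // magic mismatch
    int class_idx = (int)(header & DEMO_HEADER_CLASS_MASK);
    if (class_idx >= DEMO_NUM_CLASSES) return -1;         // defensive bound
    return class_idx;
}

int main(void) {
    printf("%d\n", demo_read_header(0xA3u)); // 3  -> valid header, class 3
    printf("%d\n", demo_read_header(0x00u)); // -1 -> zero page rejected
    printf("%d\n", demo_read_header(0xB2u)); // -1 -> foreign magic (Pool TLS style)
    return 0;
}

Keeping the check on in release builds gives up the couple of cycles the deleted #else branch was saving, but it closes the window that branch documented, where a mid/large free could land on the Tiny TLS freelist.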