Phase 6-2.8: SuperSlab modular refactoring (665 lines → 104 lines)

Purpose: eliminate the bloat of hakmem_tiny_superslab.h (500+ lines)

Implementation:
1. Create superslab_types.h
   - SuperSlab structure definitions (TinySlabMeta, SuperSlab)
   - Configuration constants (SUPERSLAB_SIZE_MAX, TINY_NUM_CLASSES_SS)
   - Compile-time assertions

2. Create superslab_inline.h
   - Consolidates the hot-path inline functions
   - ss_slabs_capacity(), slab_index_for()
   - tiny_slab_base_for(), ss_remote_push()
   - _ss_remote_drain_to_freelist_unsafe()
   - Fail-fast validation helpers
   - ACE helpers (hak_now_ns, hak_tiny_superslab_next_lg)

3. Refactor hakmem_tiny_superslab.h
   - 665 lines → 104 lines (-84%)
   - Rewritten to contain only includes
   - Keeps only function declarations and extern declarations

Results:
✅ Build succeeds (libhakmem.so, larson_hakmem)
✅ Mid-Large allocator test passes (3.98M ops/s)
⚠️ The Tiny allocator freelist corruption bug remains unresolved (out of scope for this refactoring)

Notes:
- The freelist bug from Phase 6-2.6/6-2.7 still exists
- This refactoring targets maintainability only
- The bug fix will be handled in the next phase

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
Moe Charm (CI)
2025-11-07 23:05:33 +09:00
parent 3523e02e51
commit a430545820
3 changed files with 611 additions and 582 deletions
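The split is intended to be transparent to existing call sites: the umbrella header keeps its name and public declarations, and only its body moves into the two new headers. A minimal consumer sketch under that assumption (user.c is a hypothetical file; the identifiers come from the headers in this commit):

    // user.c (hypothetical) - unchanged by the Phase 6-2.8 split
    #include "hakmem_tiny_superslab.h"  // now pulls in superslab/superslab_types.h
                                        // and superslab/superslab_inline.h
    static int slab_count(const SuperSlab* ss) {
        return ss_slabs_capacity(ss);   // still visible: 16 (1MB) or 32 (2MB)
    }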

hakmem_tiny_superslab.h

@@ -2,6 +2,7 @@
// Purpose: mimalloc-inspired 2MB aligned slab allocation for fast pointer→slab lookup
// License: MIT
// Date: 2025-10-24
// Phase 6-2.8: Refactored into modular headers (types, inline)
#ifndef HAKMEM_TINY_SUPERSLAB_H
#define HAKMEM_TINY_SUPERSLAB_H
@@ -15,6 +16,12 @@
#include <signal.h>
#include <stdio.h> // For fprintf() debugging
#include <pthread.h>
// Phase 6-2.8: Modular headers (types, inline functions)
#include "superslab/superslab_types.h"
#include "superslab/superslab_inline.h"
// Legacy includes (for backward compatibility)
#include "tiny_debug_ring.h"
#include "tiny_remote.h"
#include "hakmem_tiny_superslab_constants.h" // Phase 6-2.5: Centralized layout constants
@@ -22,264 +29,9 @@
// Debug instrumentation flags (defined in hakmem_tiny.c)
extern int g_debug_remote_guard;
extern int g_tiny_safe_free_strict;
uint32_t tiny_remote_drain_threshold(void);
// ============================================================================
// SuperSlab Configuration
// ============================================================================
// Phase 8.3: ACE - Variable SuperSlab size (1MB ↔ 2MB)
#define SUPERSLAB_SIZE_MAX (2 * 1024 * 1024) // 2MB max size
#define SUPERSLAB_SIZE_MIN (1 * 1024 * 1024) // 1MB min size
#define SUPERSLAB_LG_MAX 21 // lg(2MB)
#define SUPERSLAB_LG_MIN 20 // lg(1MB)
#define SUPERSLAB_LG_DEFAULT 21 // Default: 2MB (syscall reduction, ACE will adapt)
// Phase 6-2.5: SLAB_SIZE now defined in hakmem_tiny_superslab_constants.h
// #define SLAB_SIZE (64 * 1024) // 64KB per slab (fixed)
// Legacy defines (kept for backward compatibility, use lg_size instead)
#define SUPERSLAB_SIZE SUPERSLAB_SIZE_MAX // Default to 2MB (syscall reduction)
#define SUPERSLAB_MASK (SUPERSLAB_SIZE - 1)
// IMPORTANT: Support variable-size SuperSlab (1MB=16 slabs, 2MB=32 slabs)
// Arrays below must be sized for the MAX to avoid OOB when lg_size=21 (2MB)
#define SLABS_PER_SUPERSLAB_MIN (SUPERSLAB_SIZE_MIN / SLAB_SIZE) // 16 for 1MB
#define SLABS_PER_SUPERSLAB_MAX (SUPERSLAB_SIZE_MAX / SLAB_SIZE) // 32 for 2MB
// Magic number for validation
#define SUPERSLAB_MAGIC 0x48414B4D454D5353ULL // "HAKMEMSS"
// ============================================================================
// SuperSlab Metadata Structure
// ============================================================================
// Per-slab metadata (16 bytes)
typedef struct TinySlabMeta {
void* freelist; // Freelist head (NULL = linear mode, Phase 6.24)
uint16_t used; // Blocks currently used
uint16_t capacity; // Total blocks in slab
uint32_t owner_tid; // Owner thread ID (for same-thread fast path)
// Phase 6.24: freelist == NULL → linear allocation mode (lazy init)
// Linear mode: allocate sequentially without building freelist
// Freelist mode: use freelist after first free() call
} TinySlabMeta;
// SuperSlab header (cache-line aligned, 64B)
typedef struct SuperSlab {
// Header fields (64B total)
uint64_t magic; // Magic number (0xHAKMEM_SUPERSLAB)
uint8_t size_class; // Size class (0-7 for 8-64B)
uint8_t active_slabs; // Number of active slabs (0-32 for 2MB, 0-16 for 1MB)
uint8_t lg_size; // Phase 8.3: ACE - SuperSlab size (20=1MB, 21=2MB)
uint8_t _pad0; // Padding
uint32_t slab_bitmap; // 32-bit bitmap (1=active, 0=free)
_Atomic uint32_t freelist_mask; // Bit i=1 when slab i freelist is non-empty (opt-in)
// Phase 6-2.1: ChatGPT Pro P0 optimization - O(1) non-empty slab lookup
uint32_t nonempty_mask; // Bit i = 1 if slabs[i].freelist != NULL (O(1) lookup via ctz)
// Phase 7.6: Deallocation support
atomic_uint total_active_blocks; // Total blocks in use (all slabs combined)
atomic_uint refcount; // MT-safe refcount for empty detection/free (reserved for future use)
atomic_uint listed; // 0/1: published to partial adopt ring (publish gating)
uint32_t partial_epoch; // Last partial madvise epoch (optional)
uint8_t publish_hint; // Best slab index hint for adopt (0..31), 0xFF=none
uint8_t _pad1[3]; // Padding
// Per-slab metadata (16B each)
// Sized for MAX; use ss->lg_size to bound loops at runtime
TinySlabMeta slabs[SLABS_PER_SUPERSLAB_MAX];
// Remote free queues (per slab): MPSC stack heads + counts
_Atomic(uintptr_t) remote_heads[SLABS_PER_SUPERSLAB_MAX];
_Atomic(uint32_t) remote_counts[SLABS_PER_SUPERSLAB_MAX];
// Per-slab publish state: 0/1 = not listed/listed (for slab-granular republish hints)
atomic_uint slab_listed[SLABS_PER_SUPERSLAB_MAX];
// Partial adopt overflow linkage (single-linked, best-effort)
struct SuperSlab* partial_next;
// Padding to fill remaining space (2MB - 64B - 512B)
// Note: Actual slab data starts at offset SLAB_SIZE (64KB)
} __attribute__((aligned(64))) SuperSlab;
static inline int ss_slabs_capacity(const SuperSlab* ss);
static inline int tiny_refill_failfast_level(void) {
static int g_failfast_level = -1;
if (__builtin_expect(g_failfast_level == -1, 0)) {
const char* env = getenv("HAKMEM_TINY_REFILL_FAILFAST");
if (env && *env) {
g_failfast_level = atoi(env);
} else {
g_failfast_level = 1;
}
}
return g_failfast_level;
}
static inline void tiny_failfast_log(const char* stage,
int class_idx,
SuperSlab* ss,
TinySlabMeta* meta,
const void* node,
const void* next) {
if (__builtin_expect(tiny_refill_failfast_level() < 2, 1)) return;
uintptr_t base = ss ? (uintptr_t)ss : 0;
size_t size = ss ? ((size_t)1ULL << ss->lg_size) : 0;
uintptr_t limit = base + size;
fprintf(stderr,
"[TRC_FREELIST_LOG] stage=%s cls=%d node=%p next=%p head=%p base=%p limit=%p\n",
stage ? stage : "(null)",
class_idx,
node,
next,
meta ? meta->freelist : NULL,
(void*)base,
(void*)limit);
fflush(stderr);
}
static inline void tiny_failfast_abort_ptr(const char* stage,
SuperSlab* ss,
int slab_idx,
const void* ptr,
const char* reason) {
if (__builtin_expect(tiny_refill_failfast_level() < 2, 1)) return;
uintptr_t base = ss ? (uintptr_t)ss : 0;
size_t size = ss ? ((size_t)1ULL << ss->lg_size) : 0;
uintptr_t limit = base + size;
size_t cap = 0;
uint32_t used = 0;
if (ss && slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
cap = ss->slabs[slab_idx].capacity;
used = ss->slabs[slab_idx].used;
}
size_t offset = 0;
if (ptr && base && ptr >= (void*)base) {
offset = (size_t)((uintptr_t)ptr - base);
}
fprintf(stderr,
"[TRC_FAILFAST_PTR] stage=%s cls=%d slab_idx=%d ptr=%p reason=%s base=%p limit=%p cap=%zu used=%u offset=%zu\n",
stage ? stage : "(null)",
ss ? (int)ss->size_class : -1,
slab_idx,
ptr,
reason ? reason : "(null)",
(void*)base,
(void*)limit,
cap,
used,
offset);
fflush(stderr);
abort();
}
// Compile-time assertions
_Static_assert(sizeof(TinySlabMeta) == 16, "TinySlabMeta must be 16 bytes");
// Phase 8.3: Variable-size SuperSlab assertions (1MB=16 slabs, 2MB=32 slabs)
_Static_assert((SUPERSLAB_SIZE_MIN / SLAB_SIZE) == 16, "1MB SuperSlab must have 16 slabs");
_Static_assert((SUPERSLAB_SIZE_MAX / SLAB_SIZE) == 32, "2MB SuperSlab must have 32 slabs");
_Static_assert((SUPERSLAB_SIZE & SUPERSLAB_MASK) == 0, "SUPERSLAB_SIZE must be power of 2");
// ============================================================================
// Fast Inline Functions (mimalloc-style)
// ============================================================================
// DEPRECATED (Phase 1): This function causes false positives! Use hak_super_lookup() instead.
// Problem: L2.5 allocations at 1MB boundary are misidentified as SuperSlabs
// Solution: Use registry-based hak_super_lookup() from hakmem_super_registry.h
#if 0 // DISABLED - unsafe function removed in Phase 1
static inline SuperSlab* ptr_to_superslab(void* p) {
return (SuperSlab*)((uintptr_t)p & ~(uintptr_t)SUPERSLAB_MASK);
}
#endif
// Get slab index within SuperSlab (shift operation, 0-31)
// Deprecated: Do not use for 2MB SuperSlabs (mask is 1MB). Use slab_index_for().
static inline int ptr_to_slab_index(void* p) {
uintptr_t offset = (uintptr_t)p & SUPERSLAB_MASK;
return (int)(offset >> 16); // Divide by 64KB (2^16)
}
// Runtime-safe slab count for a given SuperSlab
static inline int ss_slabs_capacity(const SuperSlab* ss) {
size_t ss_size = (size_t)1 << ss->lg_size;
return (int)(ss_size / SLAB_SIZE); // 16 or 32
}
// Safe slab index computation using SuperSlab base (supports 1MB/2MB)
static inline int slab_index_for(const SuperSlab* ss, const void* p) {
uintptr_t base = (uintptr_t)ss;
uintptr_t addr = (uintptr_t)p;
uintptr_t off = addr - base;
int idx = (int)(off >> 16); // 64KB
int cap = ss_slabs_capacity(ss);
return (idx >= 0 && idx < cap) ? idx : -1;
}
// DEPRECATED (Phase 1): Uses unsafe ptr_to_superslab() internally
// Use hak_super_lookup() + ptr_to_slab_index() instead
#if 0 // DISABLED - uses unsafe ptr_to_superslab()
static inline TinySlabMeta* ptr_to_slab_meta(void* p) {
SuperSlab* ss = ptr_to_superslab(p);
int idx = ptr_to_slab_index(p);
return &ss->slabs[idx];
}
#endif
// Get slab data start address
static inline void* slab_data_start(SuperSlab* ss, int slab_idx) {
return (char*)ss + (slab_idx * SLAB_SIZE);
}
static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx) {
uint8_t* base = (uint8_t*)slab_data_start(ss, slab_idx);
// Phase 6-2.5 FIX: Use SUPERSLAB_SLAB0_DATA_OFFSET constant
// sizeof(SuperSlab)=1088, aligned to next 1024-boundary=2048
// This ensures proper alignment for class 7 (1024-byte blocks)
if (slab_idx == 0) base += SUPERSLAB_SLAB0_DATA_OFFSET;
return base;
}
// DEPRECATED (Phase 1): Uses unsafe ptr_to_superslab() internally (false positives!)
// Use: SuperSlab* ss = hak_super_lookup(p); if (ss && ss->magic == SUPERSLAB_MAGIC) { ... }
#if 0 // DISABLED - uses unsafe ptr_to_superslab(), causes crashes on L2.5 boundaries
static inline int is_superslab_pointer(void* p) {
SuperSlab* ss = ptr_to_superslab(p);
return ss->magic == SUPERSLAB_MAGIC;
}
#endif
// Refcount helpers (used for future MT-safe empty reclamation)
static inline void superslab_ref_inc(SuperSlab* ss) {
atomic_fetch_add_explicit(&ss->refcount, 1u, memory_order_relaxed);
}
static inline unsigned superslab_ref_dec(SuperSlab* ss) {
return atomic_fetch_sub_explicit(&ss->refcount, 1u, memory_order_acq_rel) - 1u;
}
static inline unsigned superslab_ref_get(SuperSlab* ss) {
return atomic_load_explicit(&ss->refcount, memory_order_acquire);
}
// Debug counter extern declaration
extern _Atomic uint64_t g_ss_active_dec_calls;
// Active block counter helpers (saturating decrement for free operations)
static inline void ss_active_dec_one(SuperSlab* ss) {
atomic_fetch_add_explicit(&g_ss_active_dec_calls, 1, memory_order_relaxed);
uint32_t old = atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed);
while (old != 0) {
if (atomic_compare_exchange_weak_explicit(&ss->total_active_blocks, &old, old - 1u,
memory_order_relaxed, memory_order_relaxed)) {
break;
}
// CAS failed: old is reloaded by CAS intrinsic
}
}
uint32_t tiny_remote_drain_threshold(void);
// ============================================================================
// SuperSlab Management Functions
@@ -313,23 +65,6 @@ void superslab_ace_print_stats(void);
// Phase 8.3: ACE (Adaptive Cache Engine) - SuperSlab adaptive sizing
// ============================================================================
#define TINY_NUM_CLASSES_SS 8 // Same as TINY_NUM_CLASSES (avoid circular include)
// Per-class ACE state (lightweight observation + decision)
typedef struct {
uint8_t current_lg; // Current lg_size in use (20=1MB, 21=2MB)
uint8_t target_lg; // Target lg_size for next allocation (20/21)
uint16_t hot_score; // Hotness score (0-1000) for visualization
uint32_t alloc_count; // Allocs since last tick
uint32_t refill_count; // Refills since last tick
uint32_t spill_count; // Spills since last tick
uint32_t live_blocks; // Estimated live blocks (alloc-free EMA)
uint64_t last_tick_ns; // Last tick timestamp (ns)
} SuperSlabACEState;
// Global ACE state (one per tiny class)
extern SuperSlabACEState g_ss_ace[TINY_NUM_CLASSES_SS];
// ACE tick function (called periodically, ~150ms interval)
// Observes metrics and decides promotion (1MB→2MB) or demotion (2MB→1MB)
void hak_tiny_superslab_ace_tick(int class_idx, uint64_t now_ns);
@@ -337,31 +72,20 @@ void hak_tiny_superslab_ace_tick(int class_idx, uint64_t now_ns);
// Phase 8.4: ACE Observer (called from Learner thread - zero hot-path overhead)
void hak_tiny_superslab_ace_observe_all(void);
// Low-cost timestamp (nanoseconds, monotonic) - inline for hot path
static inline uint64_t hak_now_ns(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}
// Get next lg_size for new SuperSlab allocation (uses target_lg)
static inline uint8_t hak_tiny_superslab_next_lg(int class_idx) {
uint8_t lg = g_ss_ace[class_idx].target_lg ? g_ss_ace[class_idx].target_lg
: g_ss_ace[class_idx].current_lg;
return lg ? lg : SUPERSLAB_LG_DEFAULT; // Use default if uninitialized
}
// ----------------------------------------------------------------------------
// ============================================================================
// Partial SuperSlab adopt/publish (per-class single-slot)
// ----------------------------------------------------------------------------
// ============================================================================
// Publish a SuperSlab with available freelist for other threads to adopt.
void ss_partial_publish(int class_idx, SuperSlab* ss);
// Adopt published SuperSlab for the class (returns NULL if none).
SuperSlab* ss_partial_adopt(int class_idx);
// ----------------------------------------------------------------------------
// ============================================================================
// SuperSlab adopt gate (publish/adopt wiring helper)
// ----------------------------------------------------------------------------
// ============================================================================
// Environment-aware switch that keeps free/alloc sides in sync. Default:
// - Disabled until cross-thread free is observed.
// - `HAKMEM_TINY_SS_ADOPT=1` forces ON, `=0` forces OFF.
@@ -369,297 +93,11 @@ int tiny_adopt_gate_should_publish(void);
int tiny_adopt_gate_should_adopt(void);
void tiny_adopt_gate_on_remote_seen(int class_idx);
// Remote free push (MPSC stack) - returns 1 if transitioned from empty
// ============================================================================
// External variable declarations
// ============================================================================
extern _Atomic int g_ss_remote_seen; // set to 1 on first remote free observed
extern int g_debug_remote_guard;
static inline int ss_remote_push(SuperSlab* ss, int slab_idx, void* ptr) {
extern _Atomic uint64_t g_ss_remote_push_calls;
atomic_fetch_add_explicit(&g_ss_remote_push_calls, 1, memory_order_relaxed);
static _Atomic int g_remote_push_count = 0;
int count = atomic_fetch_add_explicit(&g_remote_push_count, 1, memory_order_relaxed);
if (count < 5) {
fprintf(stderr, "[DEBUG ss_remote_push] Call #%d ss=%p slab_idx=%d\n", count+1, (void*)ss, slab_idx);
fflush(stderr);
}
if (g_debug_remote_guard && count < 5) {
fprintf(stderr, "[REMOTE_PUSH] ss=%p slab_idx=%d ptr=%p count=%d\n",
(void*)ss, slab_idx, ptr, count);
}
// Unconditional sanity checks (Fail-Fast without crashing)
{
uintptr_t ptr_val = (uintptr_t)ptr;
uintptr_t base = (uintptr_t)ss;
size_t ss_size = (size_t)1ULL << ss->lg_size;
int cap = ss_slabs_capacity(ss);
int in_range = (ptr_val >= base) && (ptr_val < base + ss_size);
int aligned = ((ptr_val & (sizeof(void*) - 1)) == 0);
if (!in_range || slab_idx < 0 || slab_idx >= cap || !aligned) {
uintptr_t code = 0xB001u;
if (!in_range) code |= 0x01u;
if (!aligned) code |= 0x02u;
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID,
(uint16_t)ss->size_class,
ptr,
((uintptr_t)slab_idx << 32) | code);
return 0;
}
}
// A/B: global disable for remote MPSC — fallback to legacy freelist push
do {
static int g_disable_remote_glob = -1;
if (__builtin_expect(g_disable_remote_glob == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_DISABLE_REMOTE");
g_disable_remote_glob = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(g_disable_remote_glob, 0)) {
TinySlabMeta* meta = &ss->slabs[slab_idx];
void* prev = meta->freelist;
*(void**)ptr = prev;
meta->freelist = ptr;
// Reflect accounting (callers also decrement used; keep idempotent here)
ss_active_dec_one(ss);
if (prev == NULL) {
// first item: mark this slab visible to adopters
uint32_t bit = (1u << slab_idx);
atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release);
return 1;
}
return 0;
}
} while (0);
_Atomic(uintptr_t)* head = &ss->remote_heads[slab_idx];
uintptr_t old;
do {
old = atomic_load_explicit(head, memory_order_acquire);
if (!g_remote_side_enable) {
*(void**)ptr = (void*)old; // legacy embedding
}
} while (!atomic_compare_exchange_weak_explicit(head, &old, (uintptr_t)ptr,
memory_order_release, memory_order_relaxed));
tiny_remote_side_set(ss, slab_idx, ptr, old);
tiny_remote_track_on_remote_push(ss, slab_idx, ptr, "remote_push", 0);
if (__builtin_expect(g_debug_remote_guard, 0)) {
// One-shot verify just-written next/ptr alignment and range
uintptr_t base = (uintptr_t)ss;
size_t ss_size = (size_t)1ULL << ss->lg_size;
uintptr_t pv = (uintptr_t)ptr;
int ptr_in = (pv >= base && pv < base + ss_size);
int ptr_al = ((pv & (sizeof(void*) - 1)) == 0);
int old_in = (old == 0) || ((old >= base) && (old < base + ss_size));
int old_al = (old == 0) || ((old & (sizeof(void*) - 1)) == 0);
if (!ptr_in || !ptr_al || !old_in || !old_al) {
uintptr_t flags = ((uintptr_t)ptr_al << 3) | ((uintptr_t)ptr_in << 2) | ((uintptr_t)old_al << 1) | (uintptr_t)old_in;
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID,
(uint16_t)ss->size_class,
ptr,
0xB100u | (flags & 0xFu));
if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
}
fprintf(stderr, "[REMOTE_PUSH] cls=%u slab=%d ptr=%p old=%p transitioned=%d\n",
ss->size_class, slab_idx, ptr, (void*)old, old == 0);
// Pack: [slab_idx<<32 | bit0:old==0 | bit1:old_al | bit2:ptr_al]
uintptr_t aux = ((uintptr_t)slab_idx << 32) | ((old == 0) ? 1u : 0u) | ((old_al ? 1u : 0u) << 1) | ((ptr_al ? 1u : 0u) << 2);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_PUSH,
(uint16_t)ss->size_class,
ptr,
aux);
} else {
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_PUSH,
(uint16_t)ss->size_class,
ptr,
((uintptr_t)slab_idx << 32) | (uint32_t)(old == 0));
}
atomic_fetch_add_explicit(&ss->remote_counts[slab_idx], 1u, memory_order_relaxed);
ss_active_dec_one(ss); // Fix: Decrement active blocks on cross-thread free
atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
int transitioned = (old == 0);
// (optional hint to Ready ring moved to mailbox/aggregator to avoid header coupling)
if (transitioned) {
// First remote observed for this slab: mark slab_listed and notify publisher paths
unsigned prev = atomic_exchange_explicit(&ss->slab_listed[slab_idx], 1u, memory_order_acq_rel);
(void)prev; // best-effort
extern void tiny_publish_notify(int class_idx, struct SuperSlab* ss, int slab_idx);
tiny_publish_notify((int)ss->size_class, ss, slab_idx);
} else {
// Optional: best-effort notify if already non-empty but not listed
extern int g_remote_force_notify;
if (__builtin_expect(g_remote_force_notify, 0)) {
unsigned listed = atomic_load_explicit(&ss->slab_listed[slab_idx], memory_order_acquire);
if (listed == 0) {
unsigned prev = atomic_exchange_explicit(&ss->slab_listed[slab_idx], 1u, memory_order_acq_rel);
(void)prev;
extern void tiny_publish_notify(int class_idx, struct SuperSlab* ss, int slab_idx);
tiny_publish_notify((int)ss->size_class, ss, slab_idx);
}
}
}
return transitioned;
}
// Drain remote queue into freelist (no change to used/active; already adjusted at free)
// INTERNAL UNSAFE VERSION - Only called by slab_handle.h after ownership verified!
// DO NOT call directly - use slab_drain_remote() via SlabHandle instead.
static inline void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMeta* meta) {
do { // one-shot debug print when enabled
static int en = -1; static _Atomic int printed;
if (__builtin_expect(en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_REFILL_OPT_DEBUG");
en = (e && *e && *e != '0') ? 1 : 0;
}
if (en) {
int exp = 0; if (atomic_compare_exchange_strong(&printed, &exp, 1)) {
fprintf(stderr, "[DRAIN_OPT] chain splice active (cls=%u slab=%d)\n", ss ? ss->size_class : 0u, slab_idx);
}
}
} while (0);
_Atomic(uintptr_t)* head = &ss->remote_heads[slab_idx];
uintptr_t p = atomic_exchange_explicit(head, (uintptr_t)NULL, memory_order_acq_rel);
if (p == 0) return;
uint32_t drained = 0;
uintptr_t base = (uintptr_t)ss;
size_t ss_size = (size_t)1ULL << ss->lg_size;
uint32_t drain_tid = (uint32_t)(uintptr_t)pthread_self();
// Build a local chain then splice once into freelist to reduce writes
void* chain_head = NULL;
void* chain_tail = NULL;
while (p != 0) {
// Guard: range/alignment before deref
if (__builtin_expect(g_debug_remote_guard, 0)) {
if (p < base || p >= base + ss_size) {
uintptr_t aux = tiny_remote_pack_diag(0xA210u, base, ss_size, p);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)p, aux);
if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
break;
}
if ((p & (uintptr_t)(sizeof(void*) - 1)) != 0) {
uintptr_t aux = tiny_remote_pack_diag(0xA211u, base, ss_size, p);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)p, aux);
if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
break;
}
}
void* node = (void*)p;
uintptr_t next = tiny_remote_side_get(ss, slab_idx, node);
tiny_remote_watch_note("drain_pull", ss, slab_idx, node, 0xA238u, drain_tid, 0);
if (__builtin_expect(g_remote_side_enable, 0)) {
if (!tiny_remote_sentinel_ok(node)) {
uintptr_t aux = tiny_remote_pack_diag(0xA202u, base, ss_size, (uintptr_t)node);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, node, aux);
uintptr_t observed = atomic_load_explicit((_Atomic uintptr_t*)node, memory_order_relaxed);
tiny_remote_report_corruption("drain", node, observed);
TinySlabMeta* meta = &ss->slabs[slab_idx];
fprintf(stderr,
"[REMOTE_SENTINEL-DRAIN] cls=%u slab=%d node=%p drained=%u observed=0x%016" PRIxPTR " owner=%u used=%u freelist=%p\n",
ss->size_class,
slab_idx,
node,
drained,
observed,
meta->owner_tid,
(unsigned)meta->used,
meta->freelist);
if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
}
tiny_remote_side_clear(ss, slab_idx, node);
}
tiny_remote_watch_note("drain_link", ss, slab_idx, node, 0xA239u, drain_tid, 0);
tiny_remote_track_on_remote_drain(ss, slab_idx, node, "remote_drain", drain_tid);
if (__builtin_expect(g_debug_remote_guard && drained < 3, 0)) {
// First few nodes: record low info for triage
uintptr_t aux = ((uintptr_t)slab_idx << 32) | (uintptr_t)(drained & 0xFFFF);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_DRAIN, (uint16_t)ss->size_class, node, aux);
}
// Link into local chain (avoid touching meta->freelist per node)
if (chain_head == NULL) {
chain_head = node;
chain_tail = node;
*(void**)node = NULL;
} else {
*(void**)node = chain_head;
chain_head = node;
}
p = next;
drained++;
}
// Splice the drained chain into freelist (single meta write)
if (chain_head != NULL) {
if (chain_tail != NULL) {
*(void**)chain_tail = meta->freelist;
}
void* prev = meta->freelist;
meta->freelist = chain_head;
tiny_failfast_log("remote_drain", ss->size_class, ss, meta, chain_head, prev);
// Optional: set freelist bit when transitioning from empty
do {
static int g_mask_en = -1;
if (__builtin_expect(g_mask_en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
g_mask_en = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(g_mask_en, 0)) {
uint32_t bit = (1u << slab_idx);
atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release);
}
} while (0);
}
// Reset remote count after full drain
atomic_store_explicit(&ss->remote_counts[slab_idx], 0u, memory_order_relaxed);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_DRAIN,
(uint16_t)ss->size_class,
ss,
((uintptr_t)slab_idx << 32) | drained);
}
// Legacy wrapper for compatibility (UNSAFE - ownership NOT checked!)
// DEPRECATED: Use slab_drain_remote() via SlabHandle instead
static inline void ss_remote_drain_to_freelist(SuperSlab* ss, int slab_idx) {
TinySlabMeta* meta = &ss->slabs[slab_idx];
_ss_remote_drain_to_freelist_unsafe(ss, slab_idx, meta);
}
// Try to acquire exclusive ownership of slab (REQUIRED before draining remote queue!)
// Returns 1 on success (now own slab), 0 on failure (another thread owns it)
// CRITICAL: Only succeeds if slab is unowned (owner_tid==0) or already owned by us.
static inline int ss_owner_try_acquire(TinySlabMeta* m, uint32_t self_tid) {
uint32_t cur = __atomic_load_n(&m->owner_tid, __ATOMIC_RELAXED);
if (cur == self_tid) return 1; // Already owner - success
if (cur != 0) return 0; // Another thread owns it - FAIL immediately
// Slab is unowned (cur==0) - try to claim it
uint32_t expected = 0;
return __atomic_compare_exchange_n(&m->owner_tid, &expected, self_tid, false,
__ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}
// Drain remote queues where activity was observed (lightweight sweep).
// CRITICAL: Must acquire ownership before draining each slab!
static inline void ss_remote_drain_light(SuperSlab* ss) {
if (!ss) return;
uint32_t threshold = tiny_remote_drain_threshold();
uint32_t self_tid = (uint32_t)(uintptr_t)pthread_self();
int cap = ss_slabs_capacity(ss);
for (int s = 0; s < cap; s++) {
uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed);
if (rc <= threshold) continue;
if (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0) {
// BUGFIX: Must acquire ownership BEFORE draining!
// Without this, we can drain a slab owned by another thread → freelist corruption
TinySlabMeta* m = &ss->slabs[s];
if (!ss_owner_try_acquire(m, self_tid)) {
continue; // Failed to acquire - skip this slab
}
ss_remote_drain_to_freelist(ss, s);
}
}
}
// Best-effort CAS to transfer slab ownership (DEPRECATED - use ss_owner_try_acquire!)
static inline void ss_owner_cas(TinySlabMeta* m, uint32_t self_tid) {
(void)ss_owner_try_acquire(m, self_tid); // Ignore result (unsafe)
}
extern int g_remote_force_notify;
#endif // HAKMEM_TINY_SUPERSLAB_H
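The adopt-gate and partial publish/adopt functions retained above are declared here and defined elsewhere; a hedged wiring sketch of how they fit together, following the comments in this header (the call sites themselves are assumptions, not part of this commit):

    // Free path, on detecting a cross-thread free (sketch):
    //   tiny_adopt_gate_on_remote_seen(class_idx);             // unlock the gate
    //   if (tiny_adopt_gate_should_publish())
    //       ss_partial_publish(class_idx, ss);                 // expose freelist to other threads
    //
    // Alloc path, when the local refill comes up empty (sketch):
    //   if (tiny_adopt_gate_should_adopt()) {
    //       SuperSlab* adopted = ss_partial_adopt(class_idx);  // NULL if none published
    //       /* ... refill from the adopted SuperSlab ... */
    //   }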

superslab/superslab_inline.h

@@ -0,0 +1,487 @@
// superslab_inline.h - SuperSlab Hot Path Inline Functions
// Purpose: Performance-critical inline helpers for SuperSlab allocator
// Extracted from hakmem_tiny_superslab.h (Phase 6-2.8 Refactoring)
#ifndef SUPERSLAB_INLINE_H
#define SUPERSLAB_INLINE_H
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <signal.h>
#include <pthread.h>
#include <inttypes.h>
#include "superslab_types.h"
#include "hakmem_tiny_superslab_constants.h"
#include "tiny_debug_ring.h"
#include "tiny_remote.h"
// External declarations
extern int g_debug_remote_guard;
extern int g_tiny_safe_free_strict;
extern _Atomic uint64_t g_ss_active_dec_calls;
extern _Atomic uint64_t g_ss_remote_push_calls;
extern _Atomic int g_ss_remote_seen;
extern int g_remote_side_enable;
extern int g_remote_force_notify;
// Function declarations
uint32_t tiny_remote_drain_threshold(void);
void tiny_publish_notify(int class_idx, struct SuperSlab* ss, int slab_idx);
// ============================================================================
// Fast Path Inline Functions
// ============================================================================
// Runtime-safe slab count for a given SuperSlab (MUST BE FIRST - used by other functions)
static inline int ss_slabs_capacity(const SuperSlab* ss) {
size_t ss_size = (size_t)1 << ss->lg_size;
return (int)(ss_size / SLAB_SIZE); // 16 or 32
}
// Fail-fast validation level (0=off, 1=basic, 2=paranoid)
static inline int tiny_refill_failfast_level(void) {
static int g_failfast_level = -1;
if (__builtin_expect(g_failfast_level == -1, 0)) {
const char* env = getenv("HAKMEM_TINY_REFILL_FAILFAST");
if (env && *env) {
g_failfast_level = atoi(env);
} else {
g_failfast_level = 1;
}
}
return g_failfast_level;
}
// Fail-fast logging (level 2 only)
static inline void tiny_failfast_log(const char* stage,
int class_idx,
SuperSlab* ss,
TinySlabMeta* meta,
const void* node,
const void* next) {
if (__builtin_expect(tiny_refill_failfast_level() < 2, 1)) return;
uintptr_t base = ss ? (uintptr_t)ss : 0;
size_t size = ss ? ((size_t)1ULL << ss->lg_size) : 0;
uintptr_t limit = base + size;
fprintf(stderr,
"[TRC_FREELIST_LOG] stage=%s cls=%d node=%p next=%p head=%p base=%p limit=%p\n",
stage ? stage : "(null)",
class_idx,
node,
next,
meta ? meta->freelist : NULL,
(void*)base,
(void*)limit);
fflush(stderr);
}
// Fail-fast abort with detailed diagnostics
static inline void tiny_failfast_abort_ptr(const char* stage,
SuperSlab* ss,
int slab_idx,
const void* ptr,
const char* reason) {
if (__builtin_expect(tiny_refill_failfast_level() < 2, 1)) return;
uintptr_t base = ss ? (uintptr_t)ss : 0;
size_t size = ss ? ((size_t)1ULL << ss->lg_size) : 0;
uintptr_t limit = base + size;
size_t cap = 0;
uint32_t used = 0;
if (ss && slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
cap = ss->slabs[slab_idx].capacity;
used = ss->slabs[slab_idx].used;
}
size_t offset = 0;
if (ptr && base && ptr >= (void*)base) {
offset = (size_t)((uintptr_t)ptr - base);
}
fprintf(stderr,
"[TRC_FAILFAST_PTR] stage=%s cls=%d slab_idx=%d ptr=%p reason=%s base=%p limit=%p cap=%zu used=%u offset=%zu\n",
stage ? stage : "(null)",
ss ? (int)ss->size_class : -1,
slab_idx,
ptr,
reason ? reason : "(null)",
(void*)base,
(void*)limit,
cap,
used,
offset);
fflush(stderr);
abort();
}
// Get slab index within SuperSlab (DEPRECATED - use slab_index_for)
static inline int ptr_to_slab_index(void* p) {
uintptr_t offset = (uintptr_t)p & SUPERSLAB_MASK;
return (int)(offset >> 16); // Divide by 64KB (2^16)
}
// Safe slab index computation using SuperSlab base (supports 1MB/2MB)
static inline int slab_index_for(const SuperSlab* ss, const void* p) {
uintptr_t base = (uintptr_t)ss;
uintptr_t addr = (uintptr_t)p;
uintptr_t off = addr - base;
int idx = (int)(off >> 16); // 64KB
int cap = ss_slabs_capacity(ss);
return (idx >= 0 && idx < cap) ? idx : -1;
}
// Get slab data start address
static inline void* slab_data_start(SuperSlab* ss, int slab_idx) {
return (char*)ss + (slab_idx * SLAB_SIZE);
}
// Get slab base address (accounts for SUPERSLAB_SLAB0_DATA_OFFSET)
static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx) {
uint8_t* base = (uint8_t*)slab_data_start(ss, slab_idx);
// Phase 6-2.5 FIX: Use SUPERSLAB_SLAB0_DATA_OFFSET constant
// sizeof(SuperSlab)=1088, aligned to next 1024-boundary=2048
// This ensures proper alignment for class 7 (1024-byte blocks)
if (slab_idx == 0) base += SUPERSLAB_SLAB0_DATA_OFFSET;
return base;
}
// Refcount helpers (for future MT-safe empty reclamation)
static inline void superslab_ref_inc(SuperSlab* ss) {
atomic_fetch_add_explicit(&ss->refcount, 1u, memory_order_relaxed);
}
static inline unsigned superslab_ref_dec(SuperSlab* ss) {
return atomic_fetch_sub_explicit(&ss->refcount, 1u, memory_order_acq_rel) - 1u;
}
static inline unsigned superslab_ref_get(SuperSlab* ss) {
return atomic_load_explicit(&ss->refcount, memory_order_acquire);
}
// Active block counter helper (saturating decrement for free operations)
static inline void ss_active_dec_one(SuperSlab* ss) {
atomic_fetch_add_explicit(&g_ss_active_dec_calls, 1, memory_order_relaxed);
uint32_t old = atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed);
while (old != 0) {
if (atomic_compare_exchange_weak_explicit(&ss->total_active_blocks, &old, old - 1u,
memory_order_relaxed, memory_order_relaxed)) {
break;
}
// CAS failed: old is reloaded by CAS intrinsic
}
}
// Low-cost timestamp (nanoseconds, monotonic) - inline for hot path
static inline uint64_t hak_now_ns(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}
// Get next lg_size for new SuperSlab allocation (uses target_lg)
// Forward declaration of ACE state (defined in main header)
typedef struct {
uint8_t current_lg;
uint8_t target_lg;
uint16_t hot_score;
uint32_t alloc_count;
uint32_t refill_count;
uint32_t spill_count;
uint32_t live_blocks;
uint64_t last_tick_ns;
} SuperSlabACEState;
extern SuperSlabACEState g_ss_ace[8]; // TINY_NUM_CLASSES_SS
static inline uint8_t hak_tiny_superslab_next_lg(int class_idx) {
uint8_t lg = g_ss_ace[class_idx].target_lg ? g_ss_ace[class_idx].target_lg
: g_ss_ace[class_idx].current_lg;
return lg ? lg : SUPERSLAB_LG_DEFAULT; // Use default if uninitialized
}
// Remote free push (MPSC stack) - returns 1 if transitioned from empty
static inline int ss_remote_push(SuperSlab* ss, int slab_idx, void* ptr) {
atomic_fetch_add_explicit(&g_ss_remote_push_calls, 1, memory_order_relaxed);
static _Atomic int g_remote_push_count = 0;
int count = atomic_fetch_add_explicit(&g_remote_push_count, 1, memory_order_relaxed);
if (count < 5) {
fprintf(stderr, "[DEBUG ss_remote_push] Call #%d ss=%p slab_idx=%d\n", count+1, (void*)ss, slab_idx);
fflush(stderr);
}
if (g_debug_remote_guard && count < 5) {
fprintf(stderr, "[REMOTE_PUSH] ss=%p slab_idx=%d ptr=%p count=%d\n",
(void*)ss, slab_idx, ptr, count);
}
// Unconditional sanity checks (Fail-Fast without crashing)
{
uintptr_t ptr_val = (uintptr_t)ptr;
uintptr_t base = (uintptr_t)ss;
size_t ss_size = (size_t)1ULL << ss->lg_size;
int cap = ss_slabs_capacity(ss);
int in_range = (ptr_val >= base) && (ptr_val < base + ss_size);
int aligned = ((ptr_val & (sizeof(void*) - 1)) == 0);
if (!in_range || slab_idx < 0 || slab_idx >= cap || !aligned) {
uintptr_t code = 0xB001u;
if (!in_range) code |= 0x01u;
if (!aligned) code |= 0x02u;
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID,
(uint16_t)ss->size_class,
ptr,
((uintptr_t)slab_idx << 32) | code);
return 0;
}
}
// A/B: global disable for remote MPSC — fallback to legacy freelist push
do {
static int g_disable_remote_glob = -1;
if (__builtin_expect(g_disable_remote_glob == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_DISABLE_REMOTE");
g_disable_remote_glob = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(g_disable_remote_glob, 0)) {
TinySlabMeta* meta = &ss->slabs[slab_idx];
void* prev = meta->freelist;
*(void**)ptr = prev;
meta->freelist = ptr;
// Reflect accounting (callers also decrement used; keep idempotent here)
ss_active_dec_one(ss);
if (prev == NULL) {
// first item: mark this slab visible to adopters
uint32_t bit = (1u << slab_idx);
atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release);
return 1;
}
return 0;
}
} while (0);
_Atomic(uintptr_t)* head = &ss->remote_heads[slab_idx];
uintptr_t old;
do {
old = atomic_load_explicit(head, memory_order_acquire);
if (!g_remote_side_enable) {
*(void**)ptr = (void*)old; // legacy embedding
}
} while (!atomic_compare_exchange_weak_explicit(head, &old, (uintptr_t)ptr,
memory_order_release, memory_order_relaxed));
tiny_remote_side_set(ss, slab_idx, ptr, old);
tiny_remote_track_on_remote_push(ss, slab_idx, ptr, "remote_push", 0);
if (__builtin_expect(g_debug_remote_guard, 0)) {
// One-shot verify just-written next/ptr alignment and range
uintptr_t base = (uintptr_t)ss;
size_t ss_size = (size_t)1ULL << ss->lg_size;
uintptr_t pv = (uintptr_t)ptr;
int ptr_in = (pv >= base && pv < base + ss_size);
int ptr_al = ((pv & (sizeof(void*) - 1)) == 0);
int old_in = (old == 0) || ((old >= base) && (old < base + ss_size));
int old_al = (old == 0) || ((old & (sizeof(void*) - 1)) == 0);
if (!ptr_in || !ptr_al || !old_in || !old_al) {
uintptr_t flags = ((uintptr_t)ptr_al << 3) | ((uintptr_t)ptr_in << 2) | ((uintptr_t)old_al << 1) | (uintptr_t)old_in;
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID,
(uint16_t)ss->size_class,
ptr,
0xB100u | (flags & 0xFu));
if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
}
fprintf(stderr, "[REMOTE_PUSH] cls=%u slab=%d ptr=%p old=%p transitioned=%d\n",
ss->size_class, slab_idx, ptr, (void*)old, old == 0);
// Pack: [slab_idx<<32 | bit0:old==0 | bit1:old_al | bit2:ptr_al]
uintptr_t aux = ((uintptr_t)slab_idx << 32) | ((old == 0) ? 1u : 0u) | ((old_al ? 1u : 0u) << 1) | ((ptr_al ? 1u : 0u) << 2);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_PUSH,
(uint16_t)ss->size_class,
ptr,
aux);
} else {
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_PUSH,
(uint16_t)ss->size_class,
ptr,
((uintptr_t)slab_idx << 32) | (uint32_t)(old == 0));
}
atomic_fetch_add_explicit(&ss->remote_counts[slab_idx], 1u, memory_order_relaxed);
ss_active_dec_one(ss); // Fix: Decrement active blocks on cross-thread free
atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
int transitioned = (old == 0);
// (optional hint to Ready ring moved to mailbox/aggregator to avoid header coupling)
if (transitioned) {
// First remote observed for this slab: mark slab_listed and notify publisher paths
unsigned prev = atomic_exchange_explicit(&ss->slab_listed[slab_idx], 1u, memory_order_acq_rel);
(void)prev; // best-effort
tiny_publish_notify((int)ss->size_class, ss, slab_idx);
} else {
// Optional: best-effort notify if already non-empty but not listed
if (__builtin_expect(g_remote_force_notify, 0)) {
unsigned listed = atomic_load_explicit(&ss->slab_listed[slab_idx], memory_order_acquire);
if (listed == 0) {
unsigned prev = atomic_exchange_explicit(&ss->slab_listed[slab_idx], 1u, memory_order_acq_rel);
(void)prev;
tiny_publish_notify((int)ss->size_class, ss, slab_idx);
}
}
}
return transitioned;
}
// Drain remote queue into freelist (no change to used/active; already adjusted at free)
// INTERNAL UNSAFE VERSION - Only called by slab_handle.h after ownership verified!
// DO NOT call directly - use slab_drain_remote() via SlabHandle instead.
static inline void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMeta* meta) {
do { // one-shot debug print when enabled
static int en = -1; static _Atomic int printed;
if (__builtin_expect(en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_REFILL_OPT_DEBUG");
en = (e && *e && *e != '0') ? 1 : 0;
}
if (en) {
int exp = 0; if (atomic_compare_exchange_strong(&printed, &exp, 1)) {
fprintf(stderr, "[DRAIN_OPT] chain splice active (cls=%u slab=%d)\n", ss ? ss->size_class : 0u, slab_idx);
}
}
} while (0);
_Atomic(uintptr_t)* head = &ss->remote_heads[slab_idx];
uintptr_t p = atomic_exchange_explicit(head, (uintptr_t)NULL, memory_order_acq_rel);
if (p == 0) return;
uint32_t drained = 0;
uintptr_t base = (uintptr_t)ss;
size_t ss_size = (size_t)1ULL << ss->lg_size;
uint32_t drain_tid = (uint32_t)(uintptr_t)pthread_self();
// Build a local chain then splice once into freelist to reduce writes
void* chain_head = NULL;
void* chain_tail = NULL;
while (p != 0) {
// Guard: range/alignment before deref
if (__builtin_expect(g_debug_remote_guard, 0)) {
if (p < base || p >= base + ss_size) {
uintptr_t aux = tiny_remote_pack_diag(0xA210u, base, ss_size, p);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)p, aux);
if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
break;
}
if ((p & (uintptr_t)(sizeof(void*) - 1)) != 0) {
uintptr_t aux = tiny_remote_pack_diag(0xA211u, base, ss_size, p);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)p, aux);
if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
break;
}
}
void* node = (void*)p;
uintptr_t next = tiny_remote_side_get(ss, slab_idx, node);
tiny_remote_watch_note("drain_pull", ss, slab_idx, node, 0xA238u, drain_tid, 0);
if (__builtin_expect(g_remote_side_enable, 0)) {
if (!tiny_remote_sentinel_ok(node)) {
uintptr_t aux = tiny_remote_pack_diag(0xA202u, base, ss_size, (uintptr_t)node);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, node, aux);
uintptr_t observed = atomic_load_explicit((_Atomic uintptr_t*)node, memory_order_relaxed);
tiny_remote_report_corruption("drain", node, observed);
TinySlabMeta* meta = &ss->slabs[slab_idx];
fprintf(stderr,
"[REMOTE_SENTINEL-DRAIN] cls=%u slab=%d node=%p drained=%u observed=0x%016" PRIxPTR " owner=%u used=%u freelist=%p\n",
ss->size_class,
slab_idx,
node,
drained,
observed,
meta->owner_tid,
(unsigned)meta->used,
meta->freelist);
if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
}
tiny_remote_side_clear(ss, slab_idx, node);
}
tiny_remote_watch_note("drain_link", ss, slab_idx, node, 0xA239u, drain_tid, 0);
tiny_remote_track_on_remote_drain(ss, slab_idx, node, "remote_drain", drain_tid);
if (__builtin_expect(g_debug_remote_guard && drained < 3, 0)) {
// First few nodes: record low info for triage
uintptr_t aux = ((uintptr_t)slab_idx << 32) | (uintptr_t)(drained & 0xFFFF);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_DRAIN, (uint16_t)ss->size_class, node, aux);
}
// Link into local chain (avoid touching meta->freelist per node)
if (chain_head == NULL) {
chain_head = node;
chain_tail = node;
*(void**)node = NULL;
} else {
*(void**)node = chain_head;
chain_head = node;
}
p = next;
drained++;
}
// Splice the drained chain into freelist (single meta write)
if (chain_head != NULL) {
if (chain_tail != NULL) {
*(void**)chain_tail = meta->freelist;
}
void* prev = meta->freelist;
meta->freelist = chain_head;
tiny_failfast_log("remote_drain", ss->size_class, ss, meta, chain_head, prev);
// Optional: set freelist bit when transitioning from empty
do {
static int g_mask_en = -1;
if (__builtin_expect(g_mask_en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
g_mask_en = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(g_mask_en, 0)) {
uint32_t bit = (1u << slab_idx);
atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release);
}
} while (0);
}
// Reset remote count after full drain
atomic_store_explicit(&ss->remote_counts[slab_idx], 0u, memory_order_relaxed);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_DRAIN,
(uint16_t)ss->size_class,
ss,
((uintptr_t)slab_idx << 32) | drained);
}
// Legacy wrapper for compatibility (UNSAFE - ownership NOT checked!)
// DEPRECATED: Use slab_drain_remote() via SlabHandle instead
static inline void ss_remote_drain_to_freelist(SuperSlab* ss, int slab_idx) {
TinySlabMeta* meta = &ss->slabs[slab_idx];
_ss_remote_drain_to_freelist_unsafe(ss, slab_idx, meta);
}
// Try to acquire exclusive ownership of slab (REQUIRED before draining remote queue!)
// Returns 1 on success (now own slab), 0 on failure (another thread owns it)
// CRITICAL: Only succeeds if slab is unowned (owner_tid==0) or already owned by us.
static inline int ss_owner_try_acquire(TinySlabMeta* m, uint32_t self_tid) {
uint32_t cur = __atomic_load_n(&m->owner_tid, __ATOMIC_RELAXED);
if (cur == self_tid) return 1; // Already owner - success
if (cur != 0) return 0; // Another thread owns it - FAIL immediately
// Slab is unowned (cur==0) - try to claim it
uint32_t expected = 0;
return __atomic_compare_exchange_n(&m->owner_tid, &expected, self_tid, false,
__ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}
// Drain remote queues where activity was observed (lightweight sweep).
// CRITICAL: Must acquire ownership before draining each slab!
static inline void ss_remote_drain_light(SuperSlab* ss) {
if (!ss) return;
uint32_t threshold = tiny_remote_drain_threshold();
uint32_t self_tid = (uint32_t)(uintptr_t)pthread_self();
int cap = ss_slabs_capacity(ss);
for (int s = 0; s < cap; s++) {
uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed);
if (rc <= threshold) continue;
if (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0) {
// BUGFIX: Must acquire ownership BEFORE draining!
// Without this, we can drain a slab owned by another thread → freelist corruption
TinySlabMeta* m = &ss->slabs[s];
if (!ss_owner_try_acquire(m, self_tid)) {
continue; // Failed to acquire - skip this slab
}
ss_remote_drain_to_freelist(ss, s);
}
}
}
// Best-effort CAS to transfer slab ownership (DEPRECATED - use ss_owner_try_acquire!)
static inline void ss_owner_cas(TinySlabMeta* m, uint32_t self_tid) {
(void)ss_owner_try_acquire(m, self_tid); // Ignore result (unsafe)
}
#endif // SUPERSLAB_INLINE_H
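Taken together, the remote-free helpers above form a push/drain protocol. A hedged usage sketch (only the helper names and the ownership rule come from this header; the surrounding call sites are assumptions):

    // Freeing thread (not the slab owner), sketch:
    //   int first = ss_remote_push(ss, idx, ptr);   // MPSC push; returns 1 if queue was empty
    //
    // Owning/adopting thread, sketch:
    //   uint32_t self = (uint32_t)(uintptr_t)pthread_self();
    //   if (ss_owner_try_acquire(&ss->slabs[idx], self)) {
    //       _ss_remote_drain_to_freelist_unsafe(ss, idx, &ss->slabs[idx]);
    //   }
    //
    // ss_remote_drain_light(ss) applies the same acquire-then-drain rule to every
    // slab whose remote_counts[] exceeds tiny_remote_drain_threshold().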

superslab/superslab_types.h

@@ -0,0 +1,104 @@
// superslab_types.h - SuperSlab Configuration & Data Structures
// Purpose: Core types and configuration for SuperSlab allocator
// Extracted from hakmem_tiny_superslab.h (Phase 6-2.8 Refactoring)
#ifndef SUPERSLAB_TYPES_H
#define SUPERSLAB_TYPES_H
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>
#include "hakmem_tiny_superslab_constants.h" // SLAB_SIZE, SUPERSLAB_SLAB0_DATA_OFFSET
// ============================================================================
// SuperSlab Configuration
// ============================================================================
// Phase 8.3: ACE - Variable SuperSlab size (1MB ↔ 2MB)
#define SUPERSLAB_SIZE_MAX (2 * 1024 * 1024) // 2MB max size
#define SUPERSLAB_SIZE_MIN (1 * 1024 * 1024) // 1MB min size
#define SUPERSLAB_LG_MAX 21 // lg(2MB)
#define SUPERSLAB_LG_MIN 20 // lg(1MB)
#define SUPERSLAB_LG_DEFAULT 21 // Default: 2MB (syscall reduction, ACE will adapt)
// Number of tiny size classes (same as TINY_NUM_CLASSES to avoid circular include)
#define TINY_NUM_CLASSES_SS 8 // 8-64 bytes (8, 16, 24, 32, 40, 48, 56, 64)
// Legacy defines (kept for backward compatibility, use lg_size instead)
#define SUPERSLAB_SIZE SUPERSLAB_SIZE_MAX // Default to 2MB (syscall reduction)
#define SUPERSLAB_MASK (SUPERSLAB_SIZE - 1)
// IMPORTANT: Support variable-size SuperSlab (1MB=16 slabs, 2MB=32 slabs)
// Arrays below must be sized for the MAX to avoid OOB when lg_size=21 (2MB)
#define SLABS_PER_SUPERSLAB_MIN (SUPERSLAB_SIZE_MIN / SLAB_SIZE) // 16 for 1MB
#define SLABS_PER_SUPERSLAB_MAX (SUPERSLAB_SIZE_MAX / SLAB_SIZE) // 32 for 2MB
// Magic number for validation
#define SUPERSLAB_MAGIC 0x48414B4D454D5353ULL // "HAKMEMSS"
// ============================================================================
// SuperSlab Metadata Structure
// ============================================================================
// Per-slab metadata (16 bytes)
typedef struct TinySlabMeta {
void* freelist; // Freelist head (NULL = linear mode, Phase 6.24)
uint16_t used; // Blocks currently used
uint16_t capacity; // Total blocks in slab
uint32_t owner_tid; // Owner thread ID (for same-thread fast path)
// Phase 6.24: freelist == NULL → linear allocation mode (lazy init)
// Linear mode: allocate sequentially without building freelist
// Freelist mode: use freelist after first free() call
} TinySlabMeta;
// SuperSlab header (cache-line aligned, 64B)
typedef struct SuperSlab {
// Header fields (64B total)
uint64_t magic; // Magic number (0xHAKMEM_SUPERSLAB)
uint8_t size_class; // Size class (0-7 for 8-64B)
uint8_t active_slabs; // Number of active slabs (0-32 for 2MB, 0-16 for 1MB)
uint8_t lg_size; // Phase 8.3: ACE - SuperSlab size (20=1MB, 21=2MB)
uint8_t _pad0; // Padding
uint32_t slab_bitmap; // 32-bit bitmap (1=active, 0=free)
_Atomic uint32_t freelist_mask; // Bit i=1 when slab i freelist is non-empty (opt-in)
// Phase 6-2.1: ChatGPT Pro P0 optimization - O(1) non-empty slab lookup
uint32_t nonempty_mask; // Bit i = 1 if slabs[i].freelist != NULL (O(1) lookup via ctz)
// Phase 7.6: Deallocation support
atomic_uint total_active_blocks; // Total blocks in use (all slabs combined)
atomic_uint refcount; // MT-safe refcount for empty detection/free (reserved for future use)
atomic_uint listed; // 0/1: published to partial adopt ring (publish gating)
uint32_t partial_epoch; // Last partial madvise epoch (optional)
uint8_t publish_hint; // Best slab index hint for adopt (0..31), 0xFF=none
uint8_t _pad1[3]; // Padding
// Per-slab metadata (16B each)
// Sized for MAX; use ss->lg_size to bound loops at runtime
TinySlabMeta slabs[SLABS_PER_SUPERSLAB_MAX];
// Remote free queues (per slab): MPSC stack heads + counts
_Atomic(uintptr_t) remote_heads[SLABS_PER_SUPERSLAB_MAX];
_Atomic(uint32_t) remote_counts[SLABS_PER_SUPERSLAB_MAX];
// Per-slab publish state: 0/1 = not listed/listed (for slab-granular republish hints)
atomic_uint slab_listed[SLABS_PER_SUPERSLAB_MAX];
// Partial adopt overflow linkage (single-linked, best-effort)
struct SuperSlab* partial_next;
// Padding to fill remaining space (2MB - 64B - 512B)
// Note: Actual slab data starts at offset SLAB_SIZE (64KB)
} __attribute__((aligned(64))) SuperSlab;
// Compile-time assertions
_Static_assert(sizeof(TinySlabMeta) == 16, "TinySlabMeta must be 16 bytes");
// Phase 8.3: Variable-size SuperSlab assertions (1MB=16 slabs, 2MB=32 slabs)
_Static_assert((SUPERSLAB_SIZE_MIN / SLAB_SIZE) == 16, "1MB SuperSlab must have 16 slabs");
_Static_assert((SUPERSLAB_SIZE_MAX / SLAB_SIZE) == 32, "2MB SuperSlab must have 32 slabs");
_Static_assert((SUPERSLAB_SIZE & SUPERSLAB_MASK) == 0, "SUPERSLAB_SIZE must be power of 2");
#endif // SUPERSLAB_TYPES_H
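To make the size constants concrete, a short worked example that follows directly from the definitions above (it assumes SLAB_SIZE == 64 KiB, as the comments in this commit state):

    // Capacity: a 2MB SuperSlab (lg_size = 21) holds 2MB / 64KB = 32 slabs;
    //           a 1MB SuperSlab (lg_size = 20) holds 1MB / 64KB = 16 slabs.
    //
    // Pointer -> slab index (what slab_index_for() computes, before bounds-checking
    // against ss_slabs_capacity()):
    //   uintptr_t off = (uintptr_t)p - (uintptr_t)ss;  // e.g. 0x25000
    //   int idx = (int)(off >> 16);                    // 0x25000 >> 16 == 2
    //
    // Slab 0 is special: its blocks start SUPERSLAB_SLAB0_DATA_OFFSET bytes past the
    // SuperSlab header (see tiny_slab_base_for()), which the Phase 6-2.5 comments note
    // keeps class 7's 1024-byte blocks properly aligned.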