hakmem/core/hakmem_tiny_globals_box.inc
Refactor: Extract 5 Box modules from hakmem_tiny.c (-52% size reduction)

Split hakmem_tiny.c (2081 lines) into focused modules for better maintainability.

## Changes

**hakmem_tiny.c**: 2081 → 995 lines (-1086 lines, -52% reduction)

## Extracted Modules (5 boxes)

1. **config_box** (211 lines)
   - Size class tables, integrity counters
   - Debug flags, benchmark macros
   - HAK_RET_ALLOC/HAK_STAT_FREE instrumentation
2. **publish_box** (419 lines)
   - Publish/Adopt counters and statistics
   - Bench mailbox, partial ring
   - Live cap/Hot slot management
   - TLS helper functions (tiny_tls_default_*)
3. **globals_box** (256 lines)
   - Global variable declarations (~70 variables)
   - TinyPool instance and initialization flag
   - TLS variables (g_tls_lists, g_fast_head, g_fast_count)
   - SuperSlab configuration (partial ring, empty reserves)
   - Adopt gate functions
4. **phase6_wrappers_box** (122 lines)
   - Phase 6 Box Theory wrapper layer
   - hak_tiny_alloc_fast_wrapper()
   - hak_tiny_free_fast_wrapper()
   - Diagnostic instrumentation
5. **ace_guard_box** (100 lines)
   - ACE Learning Layer (hkm_ace_set_drain_threshold)
   - FastCache API (tiny_fc_room, tiny_fc_push_bulk)
   - Tiny Guard debugging system (5 functions)

## Benefits

- **Readability**: Giant 2k file → focused 1k core + 5 coherent modules
- **Maintainability**: Each box has a clear responsibility and boundaries
- **Build**: All modules compile successfully ✅

## Technical Details

- Phase 1: ChatGPT extracted config_box + publish_box (-625 lines)
- Phase 2-4: Claude extracted globals_box + phase6_wrappers_box + ace_guard_box (-461 lines)
- All extractions use .inc files (same translation unit, preserves static/TLS linkage)
- Fixed Makefile: Added tiny_sizeclass_hist_box.o to OBJS_BASE and BENCH_HAKMEM_OBJS_BASE

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 01:16:45 +09:00
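A minimal sketch of the `.inc` approach the commit describes: each box is textually included into hakmem_tiny.c, so everything stays in one translation unit and `static`/`__thread` definitions keep their internal linkage. Apart from this file, the include names below are assumptions for illustration:

```c
/* hakmem_tiny.c (sketch): box files are included, not compiled separately */
#include "hakmem_tiny_config_box.inc"          /* assumed name */
#include "hakmem_tiny_publish_box.inc"         /* assumed name */
#include "hakmem_tiny_globals_box.inc"         /* this file */
#include "hakmem_tiny_phase6_wrappers_box.inc" /* assumed name */
#include "hakmem_tiny_ace_guard_box.inc"       /* assumed name */
```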
// ============================================================================
// Global State
// ============================================================================
// Global pool instance (extern declared in hakmem_tiny.h)
TinyPool g_tiny_pool;
int g_tiny_initialized = 0; // Not static (extern in header for inline access)
// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
//
// Results:
// Phase 1 (Push - deferred free): +1 instruction, zero benefit
// Phase 2 (Pull - background refill): +77 instructions, -3% performance
//
// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
// - TLS Magazine capacity: 2048 items
// - Benchmark working set: 100 items
// - Magazine hit rate: 100% after warmup
// - Slow path never executed!
//
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
// - glibc: ~40 instructions/op (5-7× faster)
// - Gap is architectural (bitmap vs free-list, research features)
//
// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
// Decision: Enable by default (proven production-ready)
static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!)
// Optional: allow limited trylock-based refill during wrapper calls
static int g_wrap_tiny_refill = 0;
// Remote-free drain controls
static int g_remote_drain_thresh = 32; // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)
// ACE Learning Layer: Per-class remote drain thresholds
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};
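// Illustrative sketch (not in the original file): the tryrate is typically
// applied as a 1/N sampling gate in front of the threshold check, so most
// frees skip the drain attempt entirely. The helper and counter names here
// are hypothetical:
//
//   static __thread uint32_t s_drain_tick = 0;
//   static inline int tiny_should_try_drain(int cls, int pending) {
//       if ((++s_drain_tick % (uint32_t)g_remote_drain_tryrate) != 0) return 0;
//       return pending >= g_remote_drain_thresh_per_class[cls];
//   }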
// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
// Old: XOR RNG sampling (10-15 ns overhead)
// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)
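// Illustrative sketch (assumption; the real batching lives in
// hakmem_tiny_stats.h): per-thread counters accumulate locally and are
// flushed to the shared atomics every N events, which is how per-event
// overhead drops from ~10-15 ns (XOR RNG sampling) to ~0.5 ns:
//
//   static __thread uint32_t s_local_allocs = 0;  // hypothetical
//   // on each alloc:
//   //   if ((++s_local_allocs & 255) == 0) stats_flush_to_global(256);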
// Step 2: Slab Registry (Hash Table)
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];
PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];
// Registry lock
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;
// Phase 6.14: Runtime toggle for Registry ON/OFF
// O(N) sequential access is faster than O(1) random access for small N (8-32 slabs)
// Reason: L1 cache hit rate 95%+ (sequential) vs 50-70% (random hash)
static int g_use_registry = 1; // Default ON for thread-safety
// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
static int g_tiny_refill_max = 64; // HAKMEM_TINY_REFILL_MAX (default 64)
static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)
// hakmem_tiny_tls_list.h already included at top
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
static int g_tls_list_enable = 0; // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static int g_fast_enable = 1;
static int g_fastcache_enable = 1; // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];
typedef void* (*TinyHotAllocFn)(void);
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
static __thread void* g_fast_head[TINY_NUM_CLASSES];
static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
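// Illustrative sketch (not in the original file): g_fast_head/g_fast_count
// form a per-class TLS free list threaded through the blocks themselves, so
// a fast-path hit is a couple of loads and stores with no locks or atomics
// ("cls" is a hypothetical class index):
//
//   void* p = g_fast_head[cls];
//   if (p) {
//       g_fast_head[cls] = *(void**)p;  // next pointer stored in the block
//       g_fast_count[cls]--;
//       return p;
//   }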
static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);
uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];
// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
// NEW: Per-thread active slabs (up to 2 per class)
static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];
static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
    TinySlab* cand = g_tls_active_slab_a[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    cand = g_tls_active_slab_b[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    return NULL;
}
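// Usage sketch (assumption, not in the original file): on free, this keeps
// the owner check on the fast path. If the pointer falls inside one of the
// two TLS-active slabs, the block can be returned locally without consulting
// the registry:
//
//   TinySlab* owner = tls_active_owner_for_ptr(cls, ptr);
//   if (owner) { /* push onto owner's local freelist, skip registry lookup */ }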
// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
// Runtime toggle (global, defined in hakmem_config.c). Default is ON for the Box Refactor line.
extern int g_use_superslab;
#if !HAKMEM_BUILD_RELEASE
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
    if (!ptr) return;
    // ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE (ptr is non-NULL here)
    void* base_ptr = (void*)((uint8_t*)ptr - 1);
    if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
            tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
        } else {
            int slab_idx = slab_index_for(ss, base_ptr);
            if (slab_idx < 0) {
                tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
            } else {
                // Fail-Fast: class vs SuperSlab size_class must be consistent.
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                if (meta->class_idx != (uint8_t)cls) {
                    tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
                }
                size_t blk = g_tiny_class_sizes[cls];
                uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
                uintptr_t delta = (uintptr_t)base_ptr - base;
                if (blk == 0 || (delta % blk) != 0) {
                    tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
                } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) {
                    tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
                }
            }
        }
    }
    if (!__builtin_expect(g_debug_remote_guard, 0)) return;
    if (!g_use_superslab) return;
    SuperSlab* ss = hak_super_lookup(ptr);
    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
    int slab_idx = slab_index_for(ss, base_ptr);
    if (slab_idx >= 0) {
        tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
    }
}
#else
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
#endif
// Debug counters for SuperSlab investigation
#if HAKMEM_DEBUG_COUNTERS
int g_superslab_alloc_count = 0;
int g_superslab_fail_count = 0;
int g_superslab_free_count = 0; // Phase 7.6: Track SuperSlab frees
int g_empty_superslab_count = 0; // Phase 7.6: Track empty SuperSlabs detected
int g_magazine_push_count = 0; // Phase 7.6: Track Magazine pushes
int g_tiny_free_with_slab_count = 0; // Phase 7.6: Track tiny_free_with_slab calls
#endif
// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
#define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default)
static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class
static int g_empty_counts[TINY_NUM_CLASSES] = {0}; // Count of empty SuperSlabs
static int g_empty_reserve = -1; // Env: HAKMEM_TINY_SS_RESERVE (default=1)
static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default
static uint32_t g_ss_partial_interval = 4;
static _Atomic uint32_t g_ss_partial_epoch = 0;
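// Illustrative sketch (not in the original file): the epoch counter gates how
// often partial release is attempted, roughly once every g_ss_partial_interval
// ticks:
//
//   uint32_t e = atomic_fetch_add_explicit(&g_ss_partial_epoch, 1,
//                                          memory_order_relaxed);
//   if (g_ss_partial_enable && (e % g_ss_partial_interval) == 0) {
//       /* try to release one partially-empty SuperSlab */
//   }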
// Phase 6.24: Unified TLS slab cache (Medium fix)
// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
// Phase E4: 64B alignment for L1 cache optimization
__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64)));
static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];
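// Illustrative sketch (assumption, not in the original file): a seqlock-style
// broadcast. A tuner bumps g_tls_param_seq[cls] after updating the targets;
// each thread compares its cached g_tls_param_seen[cls] and re-reads the
// targets only when the sequence has moved ("tls->cap" is a hypothetical
// field):
//
//   uint32_t seq = atomic_load_explicit(&g_tls_param_seq[cls],
//                                       memory_order_acquire);
//   if (seq != g_tls_param_seen[cls]) {
//       tls->cap = atomic_load_explicit(&g_tls_target_cap[cls],
//                                       memory_order_relaxed);
//       g_tls_param_seen[cls] = seq;
//   }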
// ----------------------------------------------------------------------------
// Per-class partial SuperSlab ring (publish/adopt)
// ----------------------------------------------------------------------------
// Small ring of partial SuperSlabs per class, plus a single overflow slot
#ifndef SS_PARTIAL_RING
#define SS_PARTIAL_RING 64
#endif
static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];
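// Illustrative sketch (not in the original file): publish typically takes a
// round-robin slot via g_ss_partial_rr and CASes the SuperSlab into an empty
// ring entry, falling back to the overflow slot on contention ("cls" and
// "ss" are hypothetical locals):
//
//   uint32_t i = atomic_fetch_add_explicit(&g_ss_partial_rr[cls], 1,
//                    memory_order_relaxed) % SS_PARTIAL_RING;
//   SuperSlab* expected = NULL;
//   if (!atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[cls][i],
//           &expected, ss, memory_order_release, memory_order_relaxed)) {
//       atomic_store_explicit(&g_ss_partial_over[cls], ss, memory_order_release);
//   }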
// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};
_Atomic int g_ss_remote_seen = 0; // becomes 1 when any remote free occurs
static int g_ss_adopt_env = -2; // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
static _Atomic int g_ss_adopt_runtime = 0; // 0=inactive, 1=active
static _Atomic int g_ss_adopt_log_once = 0;
static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
    if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
        fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
                reason ? reason : "unknown", class_idx);
    }
}
static inline void tiny_adopt_gate_parse_env(void) {
    if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
        const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
        if (!env || *env == '\0') {
            g_ss_adopt_env = 0; // auto
        } else if (*env == '0') {
            g_ss_adopt_env = -1; // forced OFF
            atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
        } else {
            g_ss_adopt_env = 1; // forced ON
            atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
            tiny_adopt_gate_log_activation("env", -1);
        }
    }
}
int tiny_adopt_gate_should_publish(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}
int tiny_adopt_gate_should_adopt(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}
void tiny_adopt_gate_on_remote_seen(int class_idx) {
    tiny_adopt_gate_parse_env();
    atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
    if (g_ss_adopt_env == -1) return;
    int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
    if (prev == 0) {
        tiny_adopt_gate_log_activation("remote", class_idx);
    }
}
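// Usage sketch (assumption, not in the original file): producers consult the
// gate before paying the publish cost, and the first observed remote free
// arms it process-wide unless the env forced it OFF:
//
//   if (tiny_adopt_gate_should_publish()) { /* push ss into partial ring */ }
//   ...
//   tiny_adopt_gate_on_remote_seen(cls);   // called from the remote-free path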
// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers