## Summary

- Fixed the setenv segfault in bench_profile.h (with help from ChatGPT) by switching to going through RTLD_NEXT
- Added core/box/pool_zero_mode_box.h: unified ZERO_MODE management via an env cache
- core/hakmem_pool.c: memset control keyed on the zero mode (FULL/header/off)
- A/B test result: +15.34% improvement with ZERO_MODE=header (1M iterations, C6-heavy)

## Files Modified

- core/box/pool_api.inc.h: include pool_zero_mode_box.h
- core/bench_profile.h: glibc setenv → malloc+putenv (avoids the segfault)
- core/hakmem_pool.c: zero-mode lookup and control logic
- core/box/pool_zero_mode_box.h (new): enum/getter
- CURRENT_TASK.md: recorded Phase ML1 results

## Test Results

| Iterations | ZERO_MODE=full | ZERO_MODE=header | Improvement |
|-----------|----------------|------------------|-------------|
| 10K       | 3.06 M ops/s   | 3.17 M ops/s     | +3.65%      |
| 1M        | 23.71 M ops/s  | 27.34 M ops/s    | **+15.34%** |

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
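As a rough illustration of the env-cached zero-mode box described above, the sketch below shows one plausible shape for the enum/getter and the memset branch in the pool allocation path. The actual pool_zero_mode_box.h and hakmem_pool.c are not reproduced here; the type and function names, the header-size parameter, and the exact environment-variable spelling are assumptions for illustration only.

```c
#include <stdlib.h>
#include <string.h>

/* Hypothetical sketch only - names are assumed, not taken from the real header. */
typedef enum {
    POOL_ZERO_FULL = 0,   /* clear the whole block on allocation (safest, slowest)       */
    POOL_ZERO_HEADER,     /* clear only the block header (+15.34% in the 1M A/B run)     */
    POOL_ZERO_OFF         /* no clearing; the caller takes responsibility                */
} PoolZeroMode;

/* Parse the environment once and cache the result, so the hot path
 * only pays for an integer comparison. */
static inline PoolZeroMode pool_zero_mode(void) {
    static int cached = -1;  /* -1 = not parsed yet */
    if (cached < 0) {
        const char* env = getenv("ZERO_MODE");  /* spelled as in the A/B table above; may differ in the real header */
        if (env && strcmp(env, "header") == 0)   cached = POOL_ZERO_HEADER;
        else if (env && strcmp(env, "off") == 0) cached = POOL_ZERO_OFF;
        else                                     cached = POOL_ZERO_FULL;
    }
    return (PoolZeroMode)cached;
}

/* How the allocation path might branch on the mode (illustrative only). */
static inline void pool_zero_block(void* block, size_t block_size, size_t header_size) {
    switch (pool_zero_mode()) {
        case POOL_ZERO_FULL:   memset(block, 0, block_size);  break;
        case POOL_ZERO_HEADER: memset(block, 0, header_size); break;
        case POOL_ZERO_OFF:    /* skip */                     break;
    }
}
```

The env-cache framing in the summary is what makes the header-only variant pay off: the getenv/strcmp cost is incurred once rather than on every allocation, leaving only the smaller memset on the hot path.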
// ============================================================================
// Global State
// ============================================================================

// Global pool instance (extern declared in hakmem_tiny.h)
TinyPool g_tiny_pool;
int g_tiny_initialized = 0; // Not static (extern in header for inline access)
// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
//
// Results:
// Phase 1 (Push - deferred free): +1 instruction, zero benefit
// Phase 2 (Pull - background refill): +77 instructions, -3% performance
//
// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
// - TLS Magazine capacity: 2048 items
// - Benchmark working set: 100 items
// - Magazine hit rate: 100% after warmup
// - Slow path never executed!
//
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
// - glibc: ~40 instructions/op (5-7× faster)
// - Gap is architectural (bitmap vs free-list, research features)
//
// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
// Decision: Enable by default (proven production-ready)
static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!)
// Optional: allow limited trylock-based refill during wrapper calls
static int g_wrap_tiny_refill = 0;
// Remote-free drain controls
static int g_remote_drain_thresh = 32;  // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)

// ACE Learning Layer: Per-class remote drain thresholds
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};
// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
// Old: XOR RNG sampling (10-15 ns overhead)
// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)

// Step 2: Slab Registry (Hash Table)
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];

PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];

// Registry lock
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;

// Phase 6.14: Runtime toggle for Registry ON/OFF
// O(N) Sequential Access is faster than O(1) Random Access for Small-N (8-32 slabs)
// Reason: L1 cache hit rate 95%+ (Sequential) vs 50-70% (Random Hash)
static int g_use_registry = 1; // Default ON for thread-safety

// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
static int g_tiny_refill_max = 64;      // HAKMEM_TINY_REFILL_MAX (default 64)
static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)

// hakmem_tiny_tls_list.h already included at top
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
static int g_tls_list_enable = 0; // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static int g_fast_enable = 1;
static int g_fastcache_enable = 1; // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];


typedef void* (*TinyHotAllocFn)(void);
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
static __thread void* g_fast_head[TINY_NUM_CLASSES];
static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);

uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];

// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
// NEW: Per-thread active slabs (up to 2 per class)
static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];

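// Return whichever of this thread's two active slabs (A/B) for class_idx
// contains ptr (i.e. ptr falls inside [base, base + TINY_SLAB_SIZE)),
// or NULL if neither does.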
static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
    TinySlab* cand = g_tls_active_slab_a[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    cand = g_tls_active_slab_b[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    return NULL;
}

// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line.
extern int g_use_superslab;

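// Debug-build validation of a freshly returned allocation. With fail-fast
// level >= 2 it checks that the pointer resolves to a live SuperSlab and that
// the slab index, size class, block alignment, and capacity range all agree,
// aborting on any mismatch. When the remote-free guard is enabled it also
// records the allocation for remote-free tracking. Release builds compile
// this helper to a no-op.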
#if !HAKMEM_BUILD_RELEASE
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
    if (!ptr) return;
    // ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE
    void* base_ptr = ptr ? (void*)((uint8_t*)ptr - 1) : NULL;
    if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
            tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
        } else {
            int slab_idx = slab_index_for(ss, base_ptr);
            if (slab_idx < 0) {
                tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
            } else {
                // Fail-Fast: class vs SuperSlab size_class must be consistent.
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                if (meta->class_idx != (uint8_t)cls) {
                    tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
                }
                size_t blk = g_tiny_class_sizes[cls];
                uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
                uintptr_t delta = (uintptr_t)base_ptr - base;
                if (blk == 0 || (delta % blk) != 0) {
                    tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
                } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) {
                    tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
                }
            }
        }
    }
    if (!__builtin_expect(g_debug_remote_guard, 0)) return;
    if (!g_use_superslab) return;
    SuperSlab* ss = hak_super_lookup(ptr);
    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
    int slab_idx = slab_index_for(ss, base_ptr);
    if (slab_idx >= 0) {
        tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
    }
}
#else
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
#endif

// Debug counters for SuperSlab investigation
#if HAKMEM_DEBUG_COUNTERS
int g_superslab_alloc_count = 0;
int g_superslab_fail_count = 0;
int g_superslab_free_count = 0;       // Phase 7.6: Track SuperSlab frees
int g_empty_superslab_count = 0;      // Phase 7.6: Track empty SuperSlabs detected
int g_magazine_push_count = 0;        // Phase 7.6: Track Magazine pushes
int g_tiny_free_with_slab_count = 0;  // Phase 7.6: Track tiny_free_with_slab calls
#endif

// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
#define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default)
static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class
static int g_empty_counts[TINY_NUM_CLASSES] = {0};      // Count of empty SuperSlabs
static int g_empty_reserve = -1;                        // Env: HAKMEM_TINY_SS_RESERVE (default=1)
static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default
static uint32_t g_ss_partial_interval = 4;
static _Atomic uint32_t g_ss_partial_epoch = 0;

// Phase 6.24: Unified TLS slab cache (Medium fix)
// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
// Phase E4: 64B alignment for L1 cache optimization
__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64)));
static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];

// ----------------------------------------------------------------------------
// Per-class partial SuperSlab slot (single-slot publish/adopt)
// ----------------------------------------------------------------------------
// Small ring of partial SuperSlabs per class (publish/adopt)
#ifndef SS_PARTIAL_RING
#define SS_PARTIAL_RING 64
#endif
static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];

// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};
_Atomic int g_ss_remote_seen = 0;           // becomes 1 when any remote free occurs
static int g_ss_adopt_env = -2;             // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
static _Atomic int g_ss_adopt_runtime = 0;  // 0=inactive, 1=active
static _Atomic int g_ss_adopt_log_once = 0;

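// Adopt gate: publishing/adopting partial SuperSlabs stays inactive until
// either HAKMEM_TINY_SS_ADOPT forces it on (any value not starting with '0';
// "0" forces it off permanently) or the first remote free is observed via
// tiny_adopt_gate_on_remote_seen(). The first activation is logged once.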
static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
    if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
        fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
                reason ? reason : "unknown", class_idx);
    }
}

static inline void tiny_adopt_gate_parse_env(void) {
    if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
        const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
        if (!env || *env == '\0') {
            g_ss_adopt_env = 0; // auto
        } else if (*env == '0') {
            g_ss_adopt_env = -1; // forced OFF
            atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
        } else {
            g_ss_adopt_env = 1; // forced ON
            atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
            tiny_adopt_gate_log_activation("env", -1);
        }
    }
}

int tiny_adopt_gate_should_publish(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}

int tiny_adopt_gate_should_adopt(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}

void tiny_adopt_gate_on_remote_seen(int class_idx) {
    tiny_adopt_gate_parse_env();
    atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
    if (g_ss_adopt_env == -1) return;
    int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
    if (prev == 0) {
        tiny_adopt_gate_log_activation("remote", class_idx);
    }
}

// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers