// ============================================================================
// Global State
// ============================================================================

// Global pool instance (extern declared in hakmem_tiny.h)
TinyPool g_tiny_pool;
int g_tiny_initialized = 0;  // Not static (extern in header for inline access)

// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
//
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
//
// Results:
//   Phase 1 (Push - deferred free):      +1 instruction, zero benefit
//   Phase 2 (Pull - background refill): +77 instructions, -3% performance
//
// Root cause: Both optimized the SLOW PATH (bitmap scan), but the benchmark
// hits the FAST PATH 99.9% of the time:
//   - TLS Magazine capacity: 2048 items
//   - Benchmark working set: 100 items
//   - Magazine hit rate: 100% after warmup
//   - Slow path never executed!
//
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
//   - glibc: ~40 instructions/op (5-7× faster)
//   - Gap is architectural (bitmap vs free-list, research features)
//
// Phase 7.4: getenv fix achieved an 86% speedup → now FASTER than glibc!
//   Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
//   Decision: Enable by default (proven production-ready)
static int g_wrap_tiny_enabled = 1;  // ON by default (faster than glibc!)

// Optional: allow limited trylock-based refill during wrapper calls
static int g_wrap_tiny_refill = 0;

// Remote-free drain controls (illustrative sketch below, after the fast-path
// declarations)
static int g_remote_drain_thresh = 32;   // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
static int g_remote_drain_tryrate = 16;  // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)

// ACE Learning Layer: Per-class remote drain thresholds
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};

// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
//   Old: XOR RNG sampling (10-15 ns overhead)
//   New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
static int g_tiny_count_sample_exp = 8;  // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)

// Step 2: Slab Registry (Hash Table)
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];
PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];

// Registry lock
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;

// Phase 6.14: Runtime toggle for Registry ON/OFF
// O(N) sequential access is faster than O(1) random access for small N (8-32 slabs).
// Reason: L1 cache hit rate 95%+ (sequential) vs 50-70% (random hash).
static int g_use_registry = 1;  // Default ON for thread-safety

// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h

// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
static int g_tiny_refill_max = 64;       // HAKMEM_TINY_REFILL_MAX (default 64)
static int g_tiny_refill_max_hot = 192;  // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)

// hakmem_tiny_tls_list.h already included at top
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
static int g_tls_list_enable = 0;  // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);

static int g_fast_enable = 1;
static int g_fastcache_enable = 1;  // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
static int g_ultra_bump_shadow = 0;  // HAKMEM_TINY_BUMP_SHADOW=1
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];
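// Illustrative sketch (hypothetical helper, NOT the real drain path): how the
// remote-free drain threshold and 1/N try-rate declared above are meant to
// combine. The xorshift RNG and the `pending` argument are assumptions made
// purely for illustration.
//
//   static inline int example_should_try_remote_drain(int class_idx, int pending) {
//       if (pending < g_remote_drain_thresh_per_class[class_idx]) return 0;  // below threshold
//       static __thread uint32_t rng = 0x9e3779b9u;
//       rng ^= rng << 13; rng ^= rng >> 17; rng ^= rng << 5;                 // xorshift32
//       return (rng % (uint32_t)g_remote_drain_tryrate) == 0;                // ~1 in N calls
//   }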
typedef void* (*TinyHotAllocFn)(void);
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
static __thread void* g_fast_head[TINY_NUM_CLASSES];
static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);

uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];

// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h

// NEW: Per-thread active slabs (up to 2 per class)
static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];

static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
    TinySlab* cand = g_tls_active_slab_a[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    cand = g_tls_active_slab_b[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    return NULL;
}
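// Illustrative sketch (hypothetical call site, NOT the real free path): how the
// two per-thread active slabs are intended to be consulted before any registry
// or SuperSlab lookup. `example_try_owner_free` and `free_into_slab` are
// assumptions for illustration; only tls_active_owner_for_ptr above is real.
//
//   static inline int example_try_owner_free(int class_idx, void* ptr) {
//       TinySlab* owner = tls_active_owner_for_ptr(class_idx, ptr);
//       if (!owner) return 0;            // caller falls back to registry/SuperSlab lookup
//       free_into_slab(owner, ptr);      // hypothetical: return the block to its owning slab
//       return 1;                        // handled without touching shared structures
//   }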
// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line.
extern int g_use_superslab;

#if !HAKMEM_BUILD_RELEASE
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
    if (!ptr) return;
    // ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE
    void* base_ptr = ptr ? (void*)((uint8_t*)ptr - 1) : NULL;
    if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
            tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
        } else {
            int slab_idx = slab_index_for(ss, base_ptr);
            if (slab_idx < 0) {
                tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
            } else {
                // Fail-Fast: class vs SuperSlab size_class must be consistent.
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                if (meta->class_idx != (uint8_t)cls) {
                    tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
                }
                size_t blk = g_tiny_class_sizes[cls];
                uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
                uintptr_t delta = (uintptr_t)base_ptr - base;
                if (blk == 0 || (delta % blk) != 0) {
                    tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
                } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) {
                    tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
                }
            }
        }
    }
    if (!__builtin_expect(g_debug_remote_guard, 0)) return;
    if (!g_use_superslab) return;
    SuperSlab* ss = hak_super_lookup(ptr);
    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
    int slab_idx = slab_index_for(ss, base_ptr);
    if (slab_idx >= 0) {
        tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
    }
}
#else
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
#endif

// Debug counters for SuperSlab investigation
#if HAKMEM_DEBUG_COUNTERS
int g_superslab_alloc_count = 0;
int g_superslab_fail_count = 0;
int g_superslab_free_count = 0;        // Phase 7.6: Track SuperSlab frees
int g_empty_superslab_count = 0;       // Phase 7.6: Track empty SuperSlabs detected
int g_magazine_push_count = 0;         // Phase 7.6: Track Magazine pushes
int g_tiny_free_with_slab_count = 0;   // Phase 7.6: Track tiny_free_with_slab calls
#endif

// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
#define EMPTY_SUPERSLAB_RESERVE 0  // Keep up to N empty SuperSlabs per class (default)
static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES];  // One empty SuperSlab per class
static int g_empty_counts[TINY_NUM_CLASSES] = {0};       // Count of empty SuperSlabs
static int g_empty_reserve = -1;                         // Env: HAKMEM_TINY_SS_RESERVE (default=1)
static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_ss_partial_enable = 1;  // Enable partial SuperSlab release by default
static uint32_t g_ss_partial_interval = 4;
static _Atomic uint32_t g_ss_partial_epoch = 0;

// Phase 6.24: Unified TLS slab cache (Medium fix)
// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
// Phase E4: 64B alignment for L1 cache optimization
__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64)));
static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];

// ----------------------------------------------------------------------------
// Per-class partial SuperSlab slot (single-slot publish/adopt)
// ----------------------------------------------------------------------------
// Small ring of partial SuperSlabs per class (publish/adopt)
#ifndef SS_PARTIAL_RING
#define SS_PARTIAL_RING 64
#endif
static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];
static __thread int g_tls_adopt_cd[TINY_NUM_CLASSES];
static int g_adopt_cool_period = -1;  // env: HAKMEM_TINY_SS_ADOPT_COOLDOWN
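// Illustrative sketch (assumption, NOT the real publisher): one plausible way a
// partially used SuperSlab could be published into the per-class ring declared
// above, using the round-robin cursor and the single overflow slot.
// `example_publish_partial` is hypothetical; the real publish/adopt helpers
// live elsewhere.
//
//   static void example_publish_partial(int cls, SuperSlab* ss) {
//       uint32_t start = atomic_fetch_add_explicit(&g_ss_partial_rr[cls], 1,
//                                                  memory_order_relaxed);
//       for (uint32_t i = 0; i < SS_PARTIAL_RING; i++) {
//           uint32_t slot = (start + i) & (SS_PARTIAL_RING - 1);   // ring size is a power of two
//           SuperSlab* expected = NULL;
//           if (atomic_compare_exchange_strong_explicit(
//                   &g_ss_partial_ring[cls][slot], &expected, ss,
//                   memory_order_release, memory_order_relaxed)) {
//               return;                                            // claimed an empty slot
//           }
//       }
//       // Ring full: park it in the per-class overflow slot (may replace an older entry).
//       atomic_store_explicit(&g_ss_partial_over[cls], ss, memory_order_release);
//   }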
// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};

_Atomic int g_ss_remote_seen = 0;  // becomes 1 when any remote free occurs
static int g_ss_adopt_env = -2;    // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
static _Atomic int g_ss_adopt_runtime = 0;  // 0=inactive, 1=active
static _Atomic int g_ss_adopt_log_once = 0;

static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
    if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
        fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
                reason ? reason : "unknown", class_idx);
    }
}

static inline void tiny_adopt_gate_parse_env(void) {
    if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
        const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
        if (!env || *env == '\0') {
            g_ss_adopt_env = 0;   // auto
        } else if (*env == '0') {
            g_ss_adopt_env = -1;  // forced OFF
            atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
        } else {
            g_ss_adopt_env = 1;   // forced ON
            atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
            tiny_adopt_gate_log_activation("env", -1);
        }
    }
}

int tiny_adopt_gate_should_publish(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}

int tiny_adopt_gate_should_adopt(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}

void tiny_adopt_gate_on_remote_seen(int class_idx) {
    tiny_adopt_gate_parse_env();
    atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
    if (g_ss_adopt_env == -1) return;
    int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
    if (prev == 0) {
        tiny_adopt_gate_log_activation("remote", class_idx);
    }
}

// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers