#include "hakmem_tiny.h" #include "hakmem_tiny_config.h" // Centralized configuration #include "hakmem_phase7_config.h" // Phase 7: Task 3 constants (PREWARM_COUNT, etc.) #include "hakmem_tiny_superslab.h" // Phase 6.22: SuperSlab allocator #include "hakmem_super_registry.h" // Phase 8.2: SuperSlab registry for memory profiling #include "hakmem_internal.h" #include "hakmem_syscall.h" // Phase 6.X P0 Fix: Box 3 syscall layer (bypasses LD_PRELOAD) #include "hakmem_tiny_magazine.h" // Phase 1 modules (must come AFTER hakmem_tiny.h for TinyPool definition) #include "hakmem_tiny_batch_refill.h" // Phase 1: Batch refill/spill for mini-magazine #include "hakmem_tiny_stats.h" // Phase 1: Batched statistics (replaces XOR RNG) // Phase 2B modules #include "tiny_api.h" // Consolidated: stats_api, query_api, rss_api, registry_api #include "tiny_tls.h" #include "tiny_debug.h" #include "tiny_mmap_gate.h" #include "tiny_debug_ring.h" #include "tiny_route.h" #include "tiny_tls_guard.h" #include "tiny_ready.h" #include "hakmem_tiny_tls_list.h" #include "hakmem_tiny_remote_target.h" // Phase 2C-1: Remote target queue #include "hakmem_tiny_bg_spill.h" // Phase 2C-2: Background spill queue #include "tiny_adaptive_sizing.h" // Phase 2b: Adaptive TLS cache sizing // NOTE: hakmem_tiny_tls_ops.h included later (after type definitions) #include "tiny_system.h" // Consolidated: stdio, stdlib, string, etc. #include "hakmem_prof.h" #include "hakmem_trace.h" // Optional USDT (perf) tracepoints extern uint64_t g_bytes_allocated; // from hakmem_tiny_superslab.c // ============================================================================ // Size class table (Box 3 dependency) // ============================================================================ // Definition for g_tiny_class_sizes (declared in hakmem_tiny_config.h) // Used by Box 3 (tiny_box_geometry.h) for stride calculations const size_t g_tiny_class_sizes[TINY_NUM_CLASSES] = { 8, // Class 0: 8 bytes 16, // Class 1: 16 bytes 32, // Class 2: 32 bytes 64, // Class 3: 64 bytes 128, // Class 4: 128 bytes 256, // Class 5: 256 bytes 512, // Class 6: 512 bytes 1024 // Class 7: 1024 bytes }; // Build-time gate for debug counters (path/ultra). Default OFF. #ifndef HAKMEM_DEBUG_COUNTERS #define HAKMEM_DEBUG_COUNTERS 0 #endif int g_debug_fast0 = 0; int g_debug_remote_guard = 0; int g_remote_force_notify = 0; // Tiny free safety (debug) int g_tiny_safe_free = 1; // ULTRATHINK FIX: Enable by default to catch double-frees. env: HAKMEM_SAFE_FREE=1 int g_tiny_safe_free_strict = 0; // env: HAKMEM_SAFE_FREE_STRICT=1 int g_tiny_force_remote = 0; // env: HAKMEM_TINY_FORCE_REMOTE=1 // Build-time gate: Minimal Tiny front (bench-only) static inline int superslab_trace_enabled(void) { static int g_ss_trace_flag = -1; if (__builtin_expect(g_ss_trace_flag == -1, 0)) { const char* tr = getenv("HAKMEM_TINY_SUPERSLAB_TRACE"); g_ss_trace_flag = (tr && atoi(tr) != 0) ? 
                          1 : 0;
    }
    return g_ss_trace_flag;
}

// When enabled, physically excludes optional front tiers from the hot path
// (UltraFront/Quick/Frontend/HotMag/SS-try/BumpShadow), leaving:
//   SLL → TLS Magazine → SuperSlab → (remaining slow path)
#ifndef HAKMEM_TINY_MINIMAL_FRONT
#define HAKMEM_TINY_MINIMAL_FRONT 0
#endif

// Strict front: compile-out optional front tiers but keep baseline structure intact
#ifndef HAKMEM_TINY_STRICT_FRONT
#define HAKMEM_TINY_STRICT_FRONT 0
#endif

// Bench-only fast path knobs (defaults)
#ifndef HAKMEM_TINY_BENCH_REFILL
#define HAKMEM_TINY_BENCH_REFILL 8
#endif
// Optional per-class overrides (bench-only)
#ifndef HAKMEM_TINY_BENCH_REFILL8
#define HAKMEM_TINY_BENCH_REFILL8 HAKMEM_TINY_BENCH_REFILL
#endif
#ifndef HAKMEM_TINY_BENCH_REFILL16
#define HAKMEM_TINY_BENCH_REFILL16 HAKMEM_TINY_BENCH_REFILL
#endif
#ifndef HAKMEM_TINY_BENCH_REFILL32
#define HAKMEM_TINY_BENCH_REFILL32 HAKMEM_TINY_BENCH_REFILL
#endif
#ifndef HAKMEM_TINY_BENCH_REFILL64
#define HAKMEM_TINY_BENCH_REFILL64 HAKMEM_TINY_BENCH_REFILL
#endif
// Bench-only warmup amounts (pre-fill TLS SLL on first alloc per class)
#ifndef HAKMEM_TINY_BENCH_WARMUP8
#define HAKMEM_TINY_BENCH_WARMUP8 64
#endif
#ifndef HAKMEM_TINY_BENCH_WARMUP16
#define HAKMEM_TINY_BENCH_WARMUP16 96
#endif
#ifndef HAKMEM_TINY_BENCH_WARMUP32
#define HAKMEM_TINY_BENCH_WARMUP32 160
#endif
#ifndef HAKMEM_TINY_BENCH_WARMUP64
#define HAKMEM_TINY_BENCH_WARMUP64 192
#endif

#ifdef HAKMEM_TINY_BENCH_FASTPATH
static __thread unsigned char g_tls_bench_warm_done[4];
#endif

#if HAKMEM_DEBUG_COUNTERS
#define HAK_PATHDBG_INC(arr, idx) do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0)
#else
#define HAK_PATHDBG_INC(arr, idx) do { (void)(idx); } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0)
#endif

// Simple scalar debug increment (no-op when HAKMEM_DEBUG_COUNTERS=0)
#if HAKMEM_DEBUG_COUNTERS
#define HAK_DBG_INC(var) do { (var)++; } while(0)
#else
#define HAK_DBG_INC(var) do { (void)0; } while(0)
#endif

// Return helper: record tiny alloc stat (guarded) then return pointer
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr);

// ========== HAK_RET_ALLOC: Single Definition Point ==========
// Choose implementation based on HAKMEM_TINY_HEADER_CLASSIDX
// - Phase 7 enabled: Write header and return user pointer
// - Phase 7 disabled: Legacy behavior (stats + route + return)
#if HAKMEM_TINY_HEADER_CLASSIDX
// Phase 7: Write class_idx to header before returning
#define HAK_RET_ALLOC(cls, ptr) return tiny_region_id_write_header((ptr), (cls))
#else
// Legacy: Stats and routing before return
#ifdef HAKMEM_ENABLE_STATS
// Optional: sampling (enabled at build time). The hot path uses direct inline calls (no indirect branches).
#ifdef HAKMEM_TINY_STAT_SAMPLING
static __thread unsigned g_tls_stat_accum_alloc[TINY_NUM_CLASSES];
static int g_stat_rate_lg = 0; // 0 = record every alloc, otherwise record every 2^lg allocs
static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) {
    if (__builtin_expect(g_stat_rate_lg == 0, 1)) { stats_record_alloc(cls); return; }
    unsigned m = (1u << g_stat_rate_lg) - 1u;
    if (((++g_tls_stat_accum_alloc[cls]) & m) == 0u) stats_record_alloc(cls);
}
#else
static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { stats_record_alloc(cls); }
#endif
#define HAK_RET_ALLOC(cls, ptr) do { \
        tiny_debug_track_alloc_ret((cls), (ptr)); \
        hkm_stat_alloc((cls)); \
        ROUTE_COMMIT((cls), 0x7F); \
        return (ptr); \
    } while(0)
#else
#define HAK_RET_ALLOC(cls, ptr) do { \
        tiny_debug_track_alloc_ret((cls), (ptr)); \
ROUTE_COMMIT((cls), 0x7F); \ return (ptr); \ } while(0) #endif #endif // HAKMEM_TINY_HEADER_CLASSIDX // Free-side stats: compile-time zero when stats disabled #ifdef HAKMEM_ENABLE_STATS #define HAK_STAT_FREE(cls) do { stats_record_free((cls)); } while(0) #else #define HAK_STAT_FREE(cls) do { } while(0) #endif // Forward declarations for static helpers used before definition struct TinySlab; // forward static void move_to_free_list(int class_idx, struct TinySlab* target_slab); static void move_to_full_list(int class_idx, struct TinySlab* target_slab); static void release_slab(struct TinySlab* slab); static TinySlab* allocate_new_slab(int class_idx); static void tiny_tls_cache_drain(int class_idx); static void tiny_apply_mem_diet(void); // Phase 6.23: SuperSlab allocation forward declaration static inline void* hak_tiny_alloc_superslab(int class_idx); static inline void* superslab_tls_bump_fast(int class_idx); static SuperSlab* superslab_refill(int class_idx); static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx); static inline uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap); // Forward decl: used by tiny_spec_pop_path before its definition // Phase 6-1.7: Export for box refactor (Box 5 needs access from hakmem.c) // Note: Remove 'inline' to provide linkable definition for LTO // P0 Fix: When P0 is enabled, use sll_refill_batch_from_ss instead #if HAKMEM_TINY_P0_BATCH_REFILL // P0 enabled: use batch refill static inline int sll_refill_batch_from_ss(int class_idx, int max_take); #else // P0 disabled: use original refill #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR int sll_refill_small_from_ss(int class_idx, int max_take); #else static inline int sll_refill_small_from_ss(int class_idx, int max_take); #endif #endif static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss); static void* __attribute__((cold, noinline)) tiny_slow_alloc_fast(int class_idx); static inline void tiny_remote_drain_owner(struct TinySlab* slab); static void tiny_remote_drain_locked(struct TinySlab* slab); // Ultra-fast try-only variant: attempt a direct SuperSlab bump/freelist pop // without any refill or slow-path work. Returns NULL on miss. 
/* moved below TinyTLSSlab definition */ // Step 3d: Forced inlining for readability + performance (306M target) __attribute__((always_inline)) static inline void* hak_tiny_alloc_wrapper(int class_idx); // Helpers for SuperSlab active block accounting (atomic, saturating dec) static inline __attribute__((always_inline)) void ss_active_add(SuperSlab* ss, uint32_t n) { atomic_fetch_add_explicit(&ss->total_active_blocks, n, memory_order_relaxed); } static inline __attribute__((always_inline)) void ss_active_inc(SuperSlab* ss) { atomic_fetch_add_explicit(&ss->total_active_blocks, 1u, memory_order_relaxed); } // EXTRACTED: ss_active_dec_one() moved to hakmem_tiny_superslab.h (Phase 2C-2) // Front refill count global config (declare before init.inc uses them) extern int g_refill_count_global; extern int g_refill_count_hot; extern int g_refill_count_mid; extern int g_refill_count_class[TINY_NUM_CLASSES]; // Step 3d: Forced inlining for slow path (maintain monolithic performance) // Phase 6-1.7: Export for box refactor (Box 5 needs access from hakmem.c) #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class_idx); #else static void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class_idx); #endif // --------------------------------------------------------------------------- // Box: adopt_gate_try (implementation moved from header for robust linkage) // --------------------------------------------------------------------------- #include "box/adopt_gate_box.h" extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; extern int g_super_reg_class_size[TINY_NUM_CLASSES]; extern unsigned long long g_adopt_gate_calls[]; extern unsigned long long g_adopt_gate_success[]; extern unsigned long long g_reg_scan_attempts[]; extern unsigned long long g_reg_scan_hits[]; SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) { g_adopt_gate_calls[class_idx]++; ROUTE_MARK(13); SuperSlab* ss = tiny_refill_try_fast(class_idx, tls); if (ss) { g_adopt_gate_success[class_idx]++; return ss; } g_reg_scan_attempts[class_idx]++; int reg_size = g_super_reg_class_size[class_idx]; int scan_limit = tiny_reg_scan_max(); if (scan_limit > reg_size) scan_limit = reg_size; uint32_t self_tid = tiny_self_u32(); // Local helper (mirror adopt_bind_if_safe) to avoid including alloc inline here auto int adopt_bind_if_safe_local(TinyTLSSlab* tls_l, SuperSlab* ss, int slab_idx, int class_idx_l) { uint32_t self_tid = tiny_self_u32(); SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid); if (!slab_is_valid(&h)) return 0; slab_drain_remote_full(&h); if (__builtin_expect(slab_is_safe_to_bind(&h), 1)) { tiny_tls_bind_slab(tls_l, h.ss, h.slab_idx); slab_release(&h); return 1; } slab_release(&h); return 0; } for (int i = 0; i < scan_limit; i++) { SuperSlab* cand = g_super_reg_by_class[class_idx][i]; if (!(cand && cand->magic == SUPERSLAB_MAGIC)) continue; // Fast path: use nonempty_mask / freelist_mask to locate candidates in O(1) uint32_t mask = cand->nonempty_mask; // Fallback to atomic freelist_mask for cross-thread visibility if (mask == 0) { mask = atomic_load_explicit(&cand->freelist_mask, memory_order_acquire); } if (mask == 0) continue; // No visible freelists in this SS int cap = ss_slabs_capacity(cand); while (mask) { int sidx = __builtin_ctz(mask); mask &= (mask - 1); if (sidx >= cap) continue; if (adopt_bind_if_safe_local(tls, cand, sidx, class_idx)) { g_adopt_gate_success[class_idx]++; g_reg_scan_hits[class_idx]++; 
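                    // At this point adopt_bind_if_safe_local() has already drained the
                    // candidate slab's remote-free list and re-bound this thread's TLS
                    // slab cache to (cand, sidx), so the caller can continue refilling
                    // from the adopted SuperSlab rather than mapping a new one.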
                ROUTE_MARK(14);
                ROUTE_COMMIT(class_idx, 0x07);
                return cand;
            }
        }
    }
    return NULL;
}

// ============================================================================
// Global State
// ============================================================================

// Global pool instance (extern declared in hakmem_tiny.h)
TinyPool g_tiny_pool;
int g_tiny_initialized = 0; // Not static (extern in header for inline access)

// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
//
// Results:
//   Phase 1 (Push - deferred free):     +1 instruction, zero benefit
//   Phase 2 (Pull - background refill): +77 instructions, -3% performance
//
// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
//   - TLS Magazine capacity: 2048 items
//   - Benchmark working set: 100 items
//   - Magazine hit rate: 100% after warmup
//   - Slow path never executed!
//
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
//   - glibc: ~40 instructions/op (5-7× faster)
//   - Gap is architectural (bitmap vs free-list, research features)
//
// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
// Decision: Enable by default (proven production-ready)
static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!)
// Optional: allow limited trylock-based refill during wrapper calls
static int g_wrap_tiny_refill = 0;

// Remote-free drain controls
static int g_remote_drain_thresh = 32;  // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)

// ACE Learning Layer: Per-class remote drain thresholds
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};

// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
// Old: XOR RNG sampling (10-15 ns overhead)
// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)

// Step 2: Slab Registry (Hash Table)
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];
PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];

// Registry lock
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;

// Phase 6.14: Runtime toggle for Registry ON/OFF (currently default ON)
// O(N) Sequential Access is faster than O(1) Random Access for Small-N (8-32 slabs)
// Reason: L1 cache hit rate 95%+ (Sequential) vs 50-70% (Random Hash)
static int g_use_registry = 1; // Default ON for thread-safety

// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h

// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
static int g_tiny_refill_max = 64;      // HAKMEM_TINY_REFILL_MAX (default 64)
static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)

// hakmem_tiny_tls_list.h already included at top
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
static int g_tls_list_enable = 1;
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static int g_fast_enable = 1;
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];
typedef void* (*TinyHotAllocFn)(void);
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
static __thread void*
g_fast_head[TINY_NUM_CLASSES]; static __thread uint16_t g_fast_count[TINY_NUM_CLASSES]; static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls); static uint64_t g_tls_hit_count[TINY_NUM_CLASSES]; static uint64_t g_tls_miss_count[TINY_NUM_CLASSES]; static uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES]; static uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES]; static uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES]; static uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES]; // Legacy magazine definitions have been moved to hakmem_tiny_magazine.h // NEW: Per-thread active slabs (up to 2 per class) static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES]; static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES]; static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) { TinySlab* cand = g_tls_active_slab_a[class_idx]; if (cand) { uintptr_t base = (uintptr_t)cand->base; if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) { return cand; } } cand = g_tls_active_slab_b[class_idx]; if (cand) { uintptr_t base = (uintptr_t)cand->base; if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) { return cand; } } return NULL; } // Phase 6.23: SuperSlab support (mimalloc-style fast allocation) // Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line. extern int g_use_superslab; #if !HAKMEM_BUILD_RELEASE static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { if (!ptr) return; if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) { SuperSlab* ss = hak_super_lookup(ptr); if (!(ss && ss->magic == SUPERSLAB_MAGIC)) { tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail"); } else { int slab_idx = slab_index_for(ss, ptr); if (slab_idx < 0) { tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch"); } else { // Fail-Fast: class vs SuperSlab size_class must be consistent. 
if (ss->size_class != cls) { tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch"); } size_t blk = g_tiny_class_sizes[cls]; uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx); uintptr_t delta = (uintptr_t)ptr - base; if (blk == 0 || (delta % blk) != 0) { tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned"); } else if (delta / blk >= ss->slabs[slab_idx].capacity) { tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity"); } } } } if (!__builtin_expect(g_debug_remote_guard, 0)) return; if (!g_use_superslab) return; SuperSlab* ss = hak_super_lookup(ptr); if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return; int slab_idx = slab_index_for(ss, ptr); if (slab_idx >= 0) { tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0); } } #else static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; } #endif // Debug counters for SuperSlab investigation #if HAKMEM_DEBUG_COUNTERS int g_superslab_alloc_count = 0; int g_superslab_fail_count = 0; int g_superslab_free_count = 0; // Phase 7.6: Track SuperSlab frees int g_empty_superslab_count = 0; // Phase 7.6: Track empty SuperSlabs detected int g_magazine_push_count = 0; // Phase 7.6: Track Magazine pushes int g_tiny_free_with_slab_count = 0; // Phase 7.6: Track tiny_free_with_slab calls #endif // Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve // Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact) // Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress) #define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default) static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class static int g_empty_counts[TINY_NUM_CLASSES] = {0}; // Count of empty SuperSlabs static int g_empty_reserve = -1; // Env: HAKMEM_TINY_SS_RESERVE (default=1) static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER; static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default static uint32_t g_ss_partial_interval = 4; static _Atomic uint32_t g_ss_partial_epoch = 0; // Phase 6.24: Unified TLS slab cache (Medium fix) // Reduces TLS reads from 3 to 1 (cache-line aligned for performance) static __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES]; static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES]; static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES]; static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES]; static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES]; static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES]; static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES]; // ---------------------------------------------------------------------------- // Per-class partial SuperSlab slot (single-slot publish/adopt) // ---------------------------------------------------------------------------- // Small ring of partial SuperSlabs per class (publish/adopt) #ifndef SS_PARTIAL_RING #define SS_PARTIAL_RING 64 #endif static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING]; static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES]; static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES]; static __thread int g_tls_adopt_cd[TINY_NUM_CLASSES]; static int g_adopt_cool_period = -1; // env: HAKMEM_TINY_SS_ADOPT_COOLDOWN // Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS) unsigned long long 
g_ss_publish_dbg[TINY_NUM_CLASSES] = {0}; unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0}; _Atomic int g_ss_remote_seen = 0; // becomes 1 when any remote free occurs static int g_ss_adopt_env = -2; // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON static _Atomic int g_ss_adopt_runtime = 0; // 0=inactive, 1=active static _Atomic int g_ss_adopt_log_once = 0; static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) { if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) { fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n", reason ? reason : "unknown", class_idx); } } static inline void tiny_adopt_gate_parse_env(void) { if (__builtin_expect(g_ss_adopt_env == -2, 0)) { const char* env = getenv("HAKMEM_TINY_SS_ADOPT"); if (!env || *env == '\0') { g_ss_adopt_env = 0; // auto } else if (*env == '0') { g_ss_adopt_env = -1; // forced OFF atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release); } else { g_ss_adopt_env = 1; // forced ON atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release); tiny_adopt_gate_log_activation("env", -1); } } } int tiny_adopt_gate_should_publish(void) { tiny_adopt_gate_parse_env(); if (g_ss_adopt_env == 1) return 1; if (g_ss_adopt_env == -1) return 0; return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0; } int tiny_adopt_gate_should_adopt(void) { tiny_adopt_gate_parse_env(); if (g_ss_adopt_env == 1) return 1; if (g_ss_adopt_env == -1) return 0; return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0; } void tiny_adopt_gate_on_remote_seen(int class_idx) { tiny_adopt_gate_parse_env(); atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed); if (g_ss_adopt_env == -1) return; int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel); if (prev == 0) { tiny_adopt_gate_log_activation("remote", class_idx); } } // TLS hint: last adopted SuperSlab/slab to avoid rescans #include "tiny_sticky.h" // Mailbox box #include "box/mailbox_box.h" // Publish pipeline counters (visibility) unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0}; unsigned long long g_pub_same_empty[TINY_NUM_CLASSES] = {0}; unsigned long long g_remote_transitions[TINY_NUM_CLASSES] = {0}; unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0}; unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0}; // Slab-ring counters (debug) unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0}; unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES] = {0}; unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0}; unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES] = {0}; // Slab entry encoding helpers (used by Bench/Slab-ring paths) static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) { return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu); } static inline SuperSlab* slab_entry_ss(uintptr_t ent) { // SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u)); } static inline int slab_entry_idx(uintptr_t ent) { return (int)(ent & 0x3Fu); } // ---------------------------------------------------------------------------- // Bench Mode Publish Mailbox (single-slot per class) // ---------------------------------------------------------------------------- static int g_bench_mode = -1; // env: HAKMEM_TINY_BENCH_MODE=1 static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES]; #ifndef BENCH_MAILBOX_WIDTH #define 
BENCH_MAILBOX_WIDTH 16 #endif static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH]; static inline int bench_mode_enabled(void) { if (__builtin_expect(g_bench_mode == -1, 0)) { const char* b = getenv("HAKMEM_TINY_BENCH_MODE"); g_bench_mode = (b && atoi(b) != 0) ? 1 : 0; } return g_bench_mode; } static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) { if (!bench_mode_enabled()) return; uintptr_t ent = slab_entry_make(ss, slab_idx); uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed); idx &= (BENCH_MAILBOX_WIDTH - 1); atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release); } static inline uintptr_t bench_pub_pop(int class_idx) { if (!bench_mode_enabled()) return (uintptr_t)0; for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) { uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel); if (ent) return ent; } return 0; } // ---------------------------------------------------------------------------- // Slab-Granular Partial Publish/Adopt (encoded entries) // ---------------------------------------------------------------------------- #ifndef SLAB_PARTIAL_RING #define SLAB_PARTIAL_RING 128 #endif static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING]; static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES]; // ---------------------------------------------------------------------------- // Refill-stage counters (per class) // ---------------------------------------------------------------------------- unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES] = {0}; // Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0) unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0}; // Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE) unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0}; // Refill item source breakdown (freelist vs carve) unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0}; static int g_rf_trace_en = -1; static inline int rf_trace_enabled(void) { if (__builtin_expect(g_rf_trace_en == -1, 0)) { const char* e = getenv("HAKMEM_TINY_RF_TRACE"); g_rf_trace_en = (e && atoi(e) != 0) ? 
1 : 0; } return g_rf_trace_en; } static inline unsigned long long rf_now_ns(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec; } // moved to tiny_sticky.c // moved to tiny_remote.c // moved to tiny_mailbox.c // Publish-side counters (debug) unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0}; unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES] = {0}; unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES] = {0}; // Free pipeline counters unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0}; // Front Gate Breakdown (debug counters) unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0}; unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0}; unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0}; unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0}; // Free-side trigger counters unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0}; unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0}; // Adopt/Registry gate counters unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0}; unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0}; unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0}; unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_push_hits[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_push_full[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_lookup_none = 0; // ---------------------------------------------------------------------------- // Live Superslab cap (must-adopt-before-mmap support) // ---------------------------------------------------------------------------- static int g_live_cap_env = -2; // -2=unparsed, -1=disabled, >=0=cap value __thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0}; static inline int live_cap_for_class(int class_idx) { if (__builtin_expect(g_live_cap_env == -2, 0)) { const char* s = getenv("HAKMEM_SS_LIVE_CAP"); if (!s) g_live_cap_env = -1; else { int v = atoi(s); g_live_cap_env = (v>0? 
v : -1); } } (void)class_idx; return g_live_cap_env; } // ---------------------------------------------------------------------------- // Hot Slot (global simple path) // ---------------------------------------------------------------------------- static int g_hot_slot_en = -1; // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot) static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES]; static inline int hot_slot_enabled(void) { if (__builtin_expect(g_hot_slot_en == -1, 0)) { const char* s = getenv("HAKMEM_HOT_SLOT"); g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0; } return g_hot_slot_en || bench_mode_enabled(); } static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) { if (!hot_slot_enabled()) return; uintptr_t ent = slab_entry_make(ss, slab_idx); atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release); } static inline uintptr_t hot_slot_pop(int class_idx) { if (!hot_slot_enabled()) return (uintptr_t)0; return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel); } // moved to tiny_publish.c static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) { if (!ss) return; uintptr_t ent = slab_entry_make(ss, slab_idx); for (int i = 0; i < SLAB_PARTIAL_RING; i++) { uintptr_t expected = 0; if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent, memory_order_release, memory_order_relaxed)) { g_slab_publish_dbg[class_idx]++; return; } } // Ring full: round-robin replace and try to requeue the displaced entry into another empty slot uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING; uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel); if (old) { for (int t = 0; t < 8; t++) { uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING; uintptr_t expected = 0; if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old, memory_order_release, memory_order_relaxed)) { g_slab_requeue_dbg[class_idx]++; old = 0; break; } } } g_slab_publish_dbg[class_idx]++; } static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) { for (int i = 0; i < SLAB_PARTIAL_RING; i++) { uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel); if (ent) return ent; } return 0; } void ss_partial_publish(int class_idx, SuperSlab* ss) { if (!ss) return; // Gate by listed flag to avoid repeated publishes of the same SS unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel); if (prev != 0u) return; // already listed // CRITICAL: Release ownership of all slabs so adopters can claim them! // Without this, ss_owner_try_acquire() will fail (requires owner_tid==0). // The publishing thread must stop using this SS after publishing. int cap_pub = ss_slabs_capacity(ss); for (int s = 0; s < cap_pub; s++) { uint32_t prev = __atomic_exchange_n(&ss->slabs[s].owner_tid, 0u, __ATOMIC_RELEASE); if (__builtin_expect(g_debug_remote_guard && prev != 0u, 0)) { uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev; tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE, (uint16_t)ss->size_class, &ss->slabs[s], aux); } } // CRITICAL: Unbind current thread's TLS if it points to this SS! 
// Otherwise, the publishing thread will continue allocating from the published SS, // racing with adopters who acquire ownership. extern __thread TinyTLSSlab g_tls_slabs[]; if (g_tls_slabs[class_idx].ss == ss) { g_tls_slabs[class_idx].ss = NULL; g_tls_slabs[class_idx].meta = NULL; g_tls_slabs[class_idx].slab_base = NULL; g_tls_slabs[class_idx].slab_idx = 0; } // Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1) int best = -1; uint32_t best_score = 0; for (int s = 0; s < cap_pub; s++) { TinySlabMeta* m = &ss->slabs[s]; uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed); int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0); unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u; uint32_t score = rc + (m->freelist ? (1u<<30) : 0u) + (listed ? (1u<<29) : 0u) + (has_remote ? 1u : 0u); if (score > best_score) { best_score = score; best = s; } } if (best >= 0 && best < 256) { ss->publish_hint = (uint8_t)best; // Box: Ready push — provide slab-level candidate to adopters tiny_ready_push(class_idx, ss, best); } else { ss->publish_hint = 0xFF; } for (int i = 0; i < SS_PARTIAL_RING; i++) { SuperSlab* expected = NULL; if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss, memory_order_release, memory_order_relaxed)) { g_ss_publish_dbg[class_idx]++; return; // published } } // Ring full: replace one entry in round-robin to avoid dropping supply uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed); idx %= SS_PARTIAL_RING; SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel); if (old) { // NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads // that just adopted from it. Draining without ownership checks causes freelist corruption. // The adopter will drain when needed (with proper ownership checks in tiny_refill.h). // // Previous code (UNSAFE): // for (int s = 0; s < cap; s++) { // ss_remote_drain_to_freelist(old, s); // ← Race with concurrent adopter! 
        // }

        // Keep listed=1 while in overflow so it stays eligible for adopt
        // Push old into the overflow stack (holding area for later adoption)
        SuperSlab* head;
        do {
            head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
            old->partial_next = head;
        } while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old,
                                                        memory_order_release, memory_order_relaxed));
    }
    g_ss_publish_dbg[class_idx]++;
}

SuperSlab* ss_partial_adopt(int class_idx) {
    for (int i = 0; i < SS_PARTIAL_RING; i++) {
        SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel);
        if (ss) {
            // Clear listed flag on adopt to allow future publish of this SS
            atomic_store_explicit(&ss->listed, 0u, memory_order_release);
            g_ss_adopt_dbg[class_idx]++;
            return ss;
        }
    }
    // Fallback: adopt from overflow stack (LIFO)
    while (1) {
        SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
        if (!head) break;
        SuperSlab* next = head->partial_next;
        if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next,
                                                  memory_order_acq_rel, memory_order_relaxed)) {
            atomic_store_explicit(&head->listed, 0u, memory_order_release);
            g_ss_adopt_dbg[class_idx]++;
            return head;
        }
    }
    return NULL;
}

static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) {
    // Canonical binding:
    //   - ss->size_class defines block size for this SuperSlab
    //   - slab_idx is the owning slab index within ss
    //   - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx)
    tls->ss = ss;
    tls->slab_idx = (uint8_t)slab_idx;
    tls->meta = &ss->slabs[slab_idx];
    tls->slab_base = tiny_slab_base_for(ss, slab_idx);
}

static inline uint32_t tiny_tls_default_refill(uint32_t cap) {
    if (cap == 0u) return 8u;
    uint32_t low = (cap >= 32u) ?
(cap / 4u) : 8u; if (low < 4u) low = 4u; return low; } static inline uint32_t tiny_tls_default_spill(uint32_t cap) { if (cap == 0u) return 0u; uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u); if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP; if (spill < cap) spill = cap; return (uint32_t)spill; } static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) { atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release); atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed); atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed); atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release); } static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) { atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release); atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release); } static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) { uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire); if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) { return; } uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire); if (target_cap != 0u && tls->cap != target_cap) { tls->cap = target_cap; uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed); if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap); tls->refill_low = target_refill; uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed); if (target_spill < target_cap) target_spill = target_cap; tls->spill_high = target_spill; } uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire); if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) { g_tls_trim_seen[class_idx] = trim_epoch; if (tls->count > tls->cap) { tls_list_spill_excess(class_idx, tls); } } g_tls_param_seen[class_idx] = seq; } // ============================================================================ // EXTRACTED TO hakmem_tiny_fastcache.inc.h (Phase 2D-1) // ============================================================================ // Functions: tiny_fast_pop(), tiny_fast_push() - 28 lines (lines 377-404) // Forward declarations for functions defined in hakmem_tiny_fastcache.inc.h static inline void* tiny_fast_pop(int class_idx); static inline int tiny_fast_push(int class_idx, void* ptr); // ============================================================================ // EXTRACTED TO hakmem_tiny_hot_pop.inc.h (Phase 2D-1) // ============================================================================ // Functions: tiny_hot_pop_class0(), tiny_hot_pop_class1(), tiny_hot_pop_class2(), tiny_hot_pop_class3() // 88 lines (lines 407-494) static __attribute__((cold, noinline, unused)) void* tiny_slow_alloc_fast(int class_idx) { int tls_enabled = g_tls_list_enable; TinyTLSList* tls = &g_tls_lists[class_idx]; pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; pthread_mutex_lock(lock); TinySlab* slab = g_tiny_pool.free_slabs[class_idx]; if (slab) { g_tiny_pool.free_slabs[class_idx] = slab->next; } else { slab = allocate_new_slab(class_idx); if (!slab) { pthread_mutex_unlock(lock); return NULL; } } slab->next = NULL; if (atomic_load_explicit(&slab->remote_head, memory_order_acquire)) { tiny_remote_drain_locked(slab); } 
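    // Remote frees (if any) have just been drained back into the slab under the class
    // lock; next, scan the slab bitmap for a free block. If the scan fails, the slab is
    // pushed back onto the free list and this slow path gives up (returns NULL).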
int block_idx = hak_tiny_find_free_block(slab); if (block_idx < 0) { slab->next = g_tiny_pool.free_slabs[class_idx]; g_tiny_pool.free_slabs[class_idx] = slab; pthread_mutex_unlock(lock); return NULL; } hak_tiny_set_used(slab, block_idx); slab->free_count--; size_t block_size = g_tiny_class_sizes[class_idx]; uint8_t* base = (uint8_t*)slab->base; void* ret = (void*)(base + ((size_t)block_idx * block_size)); g_tiny_pool.alloc_count[class_idx]++; uint16_t cap = g_fast_cap_defaults[class_idx]; uint16_t count = g_fast_count[class_idx]; uint16_t fast_need = (cap > count) ? (uint16_t)(cap - count) : 0; if (fast_need > slab->free_count) fast_need = (uint16_t)slab->free_count; uint32_t tls_need = 0; if (tls_enabled && tls_list_needs_refill(tls)) { uint32_t target = tls_list_refill_threshold(tls); if (tls->count < target) { tls_need = target - tls->count; } } uint32_t remaining = slab->free_count; if (fast_need > remaining) fast_need = (uint16_t)remaining; remaining -= fast_need; if (tls_need > remaining) tls_need = remaining; while (fast_need > 0) { int extra_idx = hak_tiny_find_free_block(slab); if (extra_idx < 0) break; hak_tiny_set_used(slab, extra_idx); slab->free_count--; void* extra = (void*)(base + ((size_t)extra_idx * block_size)); if (!tiny_fast_push(class_idx, extra)) { if (tls_enabled) { tiny_tls_list_guard_push(class_idx, tls, extra); tls_list_push(tls, extra); } } fast_need--; } while (tls_enabled && tls_need > 0) { int extra_idx = hak_tiny_find_free_block(slab); if (extra_idx < 0) break; hak_tiny_set_used(slab, extra_idx); slab->free_count--; void* extra = (void*)(base + ((size_t)extra_idx * block_size)); tiny_tls_list_guard_push(class_idx, tls, extra); tls_list_push(tls, extra); tls_need--; } if (slab->free_count == 0) { move_to_full_list(class_idx, slab); } else { slab->next = g_tiny_pool.free_slabs[class_idx]; g_tiny_pool.free_slabs[class_idx] = slab; } pthread_mutex_unlock(lock); return ret; } // ============================================================================ // EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1) // ============================================================================ // Function: tiny_fast_refill_and_take() - 39 lines (lines 584-622) // Hot-path cheap sampling counter to avoid rand() in allocation path // Phase 9.4: TLS single-linked freelist (mimalloc-inspired) for hottest classes (≤128B/≤256B) int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable // Phase 6-1.7: Export TLS variables for box refactor (Box 5/6 need access from hakmem.c) // CRITICAL FIX: Explicit initializers prevent SEGV from uninitialized TLS in worker threads #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR __thread void* g_tls_sll_head[TINY_NUM_CLASSES] = {0}; __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0}; #else static __thread void* g_tls_sll_head[TINY_NUM_CLASSES] = {0}; static __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0}; #endif static int g_tiny_ultra = 0; // HAKMEM_TINY_ULTRA=1 for SLL-only ultra mode static int g_ultra_validate = 0; // HAKMEM_TINY_ULTRA_VALIDATE=1 to enable per-pop validation // Ultra debug counters #if HAKMEM_DEBUG_COUNTERS static __attribute__((unused)) uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0}; static uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES] = {0}; static __attribute__((unused)) uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0}; #endif // Path counters (normal mode visibility): lightweight, for debugging/bench only #if HAKMEM_DEBUG_COUNTERS static __attribute__((unused)) uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = 
{0};
static __attribute__((unused)) uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0};
// New: slow/bitmap/bump/bin instrumentation
static __attribute__((unused)) uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0};
#endif
static int g_path_debug_enabled = 0;

// Spill hysteresis (keeps getenv out of the free hot path)
static int g_spill_hyst = 32; // default margin (configured at init; never getenv on hot path)

// Optional per-class refill batch overrides (0=use global defaults)
static int g_refill_max_c[TINY_NUM_CLASSES] = {0};
static int g_refill_max_hot_c[TINY_NUM_CLASSES] = {0};
static inline __attribute__((always_inline)) int tiny_refill_max_for_class(int class_idx) {
    int v = g_refill_max_c[class_idx];
    if (v > 0) return v;
    if (class_idx <= 3) {
        int hv = g_refill_max_hot_c[class_idx];
        if (hv > 0) return hv;
        return g_tiny_refill_max_hot;
    }
    return g_tiny_refill_max;
}

// Phase 9.5: Frontend/Backend split - Tiny FastCache (array stack)
// Enabled via HAKMEM_TINY_FASTCACHE=1 (default: 0)
// Compile-out: define HAKMEM_TINY_NO_FRONT_CACHE=1 to exclude this path
#define TINY_FASTCACHE_CAP 128
typedef struct __attribute__((aligned(64))) {
    void* items[TINY_FASTCACHE_CAP];
    int top;
    int _pad[15];
} TinyFastCache;
static int g_fastcache_enable = 0; // HAKMEM_TINY_FASTCACHE=1
static __thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
static int g_frontend_enable = 0; // HAKMEM_TINY_FRONTEND=1 (experimental ultra-fast frontend)
// SLL capacity multiplier for hot tiny classes (env: HAKMEM_SLL_MULTIPLIER)
static int g_sll_multiplier = 2;

// Cached thread id (uint32) to avoid repeated pthread_self() in hot paths
static __thread uint32_t g_tls_tid32;
static __thread int g_tls_tid32_inited;
// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
#else
static inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
#endif
    if (__builtin_expect(!g_tls_tid32_inited, 0)) {
        g_tls_tid32 = (uint32_t)(uintptr_t)pthread_self();
        g_tls_tid32_inited = 1;
    }
    return g_tls_tid32;
}

// Cached pthread_t as-is for APIs that require pthread_t comparison
static __thread pthread_t g_tls_pt_self;
static __thread int g_tls_pt_inited;
// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
#else
static inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
#endif
    if (__builtin_expect(!g_tls_pt_inited, 0)) {
        g_tls_pt_self = pthread_self();
        g_tls_pt_inited = 1;
    }
    return g_tls_pt_self;
}

#include "tiny_refill.h"
// tiny_mmap_gate.h already included at top
#include "tiny_publish.h"
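// Worked example for tiny_refill_max_for_class() (defined earlier in this file), using
// the defaults above; per-class env overrides change these numbers. With g_refill_max_c
// and g_refill_max_hot_c left at 0, a hot class such as 2 (32B) refills up to
// g_tiny_refill_max_hot = 192 blocks per batch, while a colder class such as 5 (256B)
// falls back to g_tiny_refill_max = 64.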
"tiny_publish.h" static int g_sll_cap_override[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_SLL_CAP_C{0..7} // Optional prefetch on SLL pop (guarded by env: HAKMEM_TINY_PREFETCH=1) static int g_tiny_prefetch = 0; // Small-class magazine pre-initialization (to avoid cap==0 checks on hot path) // Hot-class small TLS magazine(実体とスイッチ) typedef struct { void* slots[128]; uint16_t top; // 0..128 uint16_t cap; // =128 } TinyHotMag; static int g_hotmag_cap_default = 128; // default capacity (env override) static int g_hotmag_refill_default = 32; // default refill batch (env override) static int g_hotmag_enable = 0; // 既定OFF(A/B用)。envでON可。 static uint16_t g_hotmag_cap_current[TINY_NUM_CLASSES]; static uint8_t g_hotmag_cap_locked[TINY_NUM_CLASSES]; static uint16_t g_hotmag_refill_current[TINY_NUM_CLASSES]; static uint8_t g_hotmag_refill_locked[TINY_NUM_CLASSES]; static uint8_t g_hotmag_class_en[TINY_NUM_CLASSES]; // 0=disabled for class, 1=enabled static __thread TinyHotMag g_tls_hot_mag[TINY_NUM_CLASSES]; // Inline helpers #include "box/tls_sll_box.h" // Box TLS-SLL: Safe SLL operations API (needed by hotmag) #include "hakmem_tiny_hotmag.inc.h" // Size-specialized tiny alloc (32B/64B) via function pointers (A/B用) // TinyQuickSlot: 1 cache line per class (quick 6 items + small metadata) // Opt-in via HAKMEM_TINY_QUICK=1 // NOTE: This type definition must come BEFORE the Phase 2D-1 includes below typedef struct __attribute__((aligned(64))) { void* items[6]; // 48B uint8_t top; // 1B (0..6) uint8_t _pad1; // 1B uint16_t _pad2; // 2B uint32_t _pad3; // 4B (padding to 64B) } TinyQuickSlot; static int g_quick_enable = 0; // HAKMEM_TINY_QUICK=1 static __thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES]; // compile-out via guards below // Phase 2D-1: Hot-path inline function extractions // NOTE: These includes require TinyFastCache, TinyQuickSlot, and TinyTLSSlab to be fully defined #include "hakmem_tiny_hot_pop.inc.h" // 4 functions: tiny_hot_pop_class{0..3} #include "hakmem_tiny_fastcache.inc.h" // 5 functions: tiny_fast_pop/push, fastcache_pop/push, quick_pop #include "hakmem_tiny_refill.inc.h" // 8 functions: refill operations // Phase 7 Task 3: Pre-warm TLS cache at init // Pre-allocate blocks to reduce first-allocation miss penalty #if HAKMEM_TINY_PREWARM_TLS void hak_tiny_prewarm_tls_cache(void) { // Pre-warm each class with HAKMEM_TINY_PREWARM_COUNT blocks // This reduces the first-allocation miss penalty by populating TLS cache for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) { // CRITICAL: C7 (1KB) is headerless - skip TLS SLL refill, but create SuperSlab if (class_idx == 7) { // Create C7 SuperSlab explicitly (refill functions skip C7) // Note: superslab_refill is already declared in hakmem_tiny_refill.inc.h (void)superslab_refill(class_idx); continue; } int count = HAKMEM_TINY_PREWARM_COUNT; // Default: 16 blocks per class // Trigger refill to populate TLS cache // P0 Fix: Use appropriate refill function based on P0 status #if HAKMEM_TINY_P0_BATCH_REFILL sll_refill_batch_from_ss(class_idx, count); #else sll_refill_small_from_ss(class_idx, count); #endif } } #endif // Ultra-Simple front (small per-class stack) — combines tiny front to minimize // instructions and memory touches on alloc/free. Uses existing TLS bump shadow // (g_tls_bcur/bend) when enabled to avoid per-alloc header writes. 
// UltraFront capacity for 32/64B fast pop
#ifndef ULTRA_FRONT_CAP
#define ULTRA_FRONT_CAP 64
#endif
typedef struct __attribute__((aligned(64))) {
    void*    slots[ULTRA_FRONT_CAP];
    uint16_t top; // 0..ULTRA_FRONT_CAP
    uint16_t _pad;
} TinyUltraFront;
static int g_ultra_simple = 0; // HAKMEM_TINY_ULTRA_SIMPLE=1
static __thread TinyUltraFront g_tls_ultra[TINY_NUM_CLASSES];
// Inline helpers
#include "hakmem_tiny_ultra_front.inc.h"

// Ultra-Bump TLS shadow (bench/opt-in): keep a TLS-only bump window
// to avoid per-alloc header writes. Header is updated per-chunk reservation.
// NOTE: Non-static because used in hakmem_tiny_refill.inc.h
int g_bump_chunk = 32; // HAKMEM_TINY_BUMP_CHUNK (blocks)
__thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES] = {0};
__thread uint8_t* g_tls_bend[TINY_NUM_CLASSES] = {0};

// SLL small refill batch for specialized class (32/64B)
// Specialized order toggle: 1 = mag-first, 0 = sll-first

// HotMag helpers (for classes 0..3)
static inline int is_hot_class(int class_idx) { return class_idx >= 0 && class_idx <= 3; }

// Optional front (Ultra/HotMag) push helper: compile-out in release builds
static inline int tiny_optional_push(int class_idx, void* ptr) {
#if HAKMEM_BUILD_RELEASE
    (void)class_idx; (void)ptr;
    return 0;
#else
    if (__builtin_expect(g_ultra_simple && is_hot_class(class_idx), 0)) {
        if (__builtin_expect(ultra_push(class_idx, ptr), 0)) {
            return 1;
        }
    }
    if (__builtin_expect(is_hot_class(class_idx), 0)) {
        if (__builtin_expect(hotmag_push(class_idx, ptr), 0)) {
            return 1;
        }
    }
    return 0;
#endif
}

// Ultra-Simple helpers

// Phase 9.6: Deferred Intelligence (event queue + background)
// Extended event for FLINT Intelligence (lightweight; recorded off hot path only)
// Observability, ACE, and intelligence helpers
#include "hakmem_tiny_intel.inc"

// ============================================================================
// EXTRACTED TO hakmem_tiny_rss.c (Phase 2B-2)
// ============================================================================
// EXTRACTED: static int get_rss_kb_self(void) {
// EXTRACTED:     FILE* f = fopen("/proc/self/status", "r");
// EXTRACTED:     if (!f) return 0;
// EXTRACTED:     char buf[256];
// EXTRACTED:     int kb = 0;
// EXTRACTED:     while (fgets(buf, sizeof(buf), f)) {
// EXTRACTED:         if (strncmp(buf, "VmRSS:", 6) == 0) {
// EXTRACTED:             char* p = buf;
// EXTRACTED:             while (*p && (*p < '0' || *p > '9')) {
// EXTRACTED:                 p++;
// EXTRACTED:             }
// EXTRACTED:             kb = atoi(p);
// EXTRACTED:             break;
// EXTRACTED:         }
// EXTRACTED:     }
// EXTRACTED:     fclose(f);
// EXTRACTED:     return kb;
// EXTRACTED: }

// Option: on a magazine miss, do not bulk-refill; grab a single block and return it immediately.
// Env: HAKMEM_TINY_REFILL_ONE_ON_MISS=1 to enable (default: 0)
int g_refill_one_on_miss = 0;

// Frontend fill target per class (adaptive)
// NOTE: Non-static because used in hakmem_tiny_refill.inc.h
_Atomic uint32_t g_frontend_fill_target[TINY_NUM_CLASSES];

// Forward declarations for helpers referenced by frontend_refill_fc
static inline int ultra_batch_for_class(int class_idx);

enum { HAK_TIER_SLL=1, HAK_TIER_MAG=2, HAK_TIER_SLAB=3, HAK_TIER_SUPER=4, HAK_TIER_FRONT=5 };

static inline uint16_t hak_thread_id16(void) {
    // best-effort compress cached thread id to 16 bits
    uint32_t tid = tiny_self_u32();
    return (uint16_t)(tid ^ (tid >> 16));
}

static inline void eventq_push_ex(int class_idx, uint32_t size, uint8_t tier, uint8_t flags,
                                  uint32_t site_id, uint16_t lat_bucket) {
    (void)flags; (void)lat_bucket; (void)site_id;
    if (!g_int_engine) return;
    // Lightweight sampling: if mask set, log 1 out of 2^N
    unsigned m = g_int_sample_mask;
    if (m != 0) {
        unsigned x =
g_tls_ev_seq++; if ((x & m) != 0) return; } uint32_t t = atomic_fetch_add_explicit(&g_ev_tail, 1u, memory_order_relaxed); AllocEvent ev; ev.ts_ns = g_int_event_ts ? hak_now_ns() : 0; ev.size = size; ev.site_id = 0; // keep minimal ev.latency_bucket = 0; ev.tier_hit = tier; ev.flags = 0; ev.class_idx = (uint16_t)class_idx; ev.thread_id = 0; g_ev_ring[t & EVENTQ_MASK] = ev; // best-effort overwrite on overflow } // Background refill workers and intelligence engine #include "hakmem_tiny_background.inc" // ============================================================================ // EXTRACTED TO hakmem_tiny_fastcache.inc.h (Phase 2D-1) // ============================================================================ // Functions: fastcache_pop(), fastcache_push(), quick_pop() - 25 lines (lines 873-896) // Ultra-fast try-only variant: attempt a direct SuperSlab bump/freelist pop // without any refill or slow-path work. Returns NULL on miss. static inline void* hak_tiny_alloc_superslab_try_fast(int class_idx) { if (!g_use_superslab) return NULL; TinyTLSSlab* tls = &g_tls_slabs[class_idx]; TinySlabMeta* meta = tls->meta; if (!meta) return NULL; // Try linear (bump) allocation first when freelist is empty if (meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { size_t block_size = g_tiny_class_sizes[tls->ss->size_class]; void* block = tls->slab_base + ((size_t)meta->used * block_size); meta->used++; // Track active blocks in SuperSlab for conservative reclamation ss_active_inc(tls->ss); return block; } // Do not pop freelist here (keep magazine/SLL handling consistent) return NULL; } // ============================================================================ // EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1) // ============================================================================ // Functions: quick_refill_from_sll(), quick_refill_from_mag() - 31 lines (lines 918-949) // ============================================================================ // EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1) // ============================================================================ // Function: sll_refill_small_from_ss() - 45 lines (lines 952-996) // Phase 2C-3: TLS operations module (included after helper function definitions) #include "hakmem_tiny_tls_ops.h" // New TLS list refill: owner-only bulk take from TLS-cached SuperSlab slab // ============================================================================ // EXTRACTED TO hakmem_tiny_tls_ops.h (Phase 2C-3) // ============================================================================ // Function: tls_refill_from_tls_slab() - 101 lines // Hot path refill operation, moved to inline function in header // ============================================================================ // EXTRACTED TO hakmem_tiny_tls_ops.h (Phase 2C-3) // ============================================================================ // Function: tls_list_spill_excess() - 97 lines // Hot path spill operation, moved to inline function in header // ============================================================================ // EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1) // ============================================================================ // Function: superslab_tls_bump_fast() - 45 lines (lines 1016-1060) // ============================================================================ // EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1) // ============================================================================ // 
// ============================================================================
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
// ============================================================================
// Functions: quick_refill_from_sll(), quick_refill_from_mag() - 31 lines (lines 918-949)

// ============================================================================
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
// ============================================================================
// Function: sll_refill_small_from_ss() - 45 lines (lines 952-996)

// Phase 2C-3: TLS operations module (included after helper function definitions)
#include "hakmem_tiny_tls_ops.h"

// New TLS list refill: owner-only bulk take from the TLS-cached SuperSlab slab
// ============================================================================
// EXTRACTED TO hakmem_tiny_tls_ops.h (Phase 2C-3)
// ============================================================================
// Function: tls_refill_from_tls_slab() - 101 lines
//   Hot-path refill operation, moved to an inline function in the header

// ============================================================================
// EXTRACTED TO hakmem_tiny_tls_ops.h (Phase 2C-3)
// ============================================================================
// Function: tls_list_spill_excess() - 97 lines
//   Hot-path spill operation, moved to an inline function in the header

// ============================================================================
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
// ============================================================================
// Function: superslab_tls_bump_fast() - 45 lines (lines 1016-1060)

// ============================================================================
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
// ============================================================================
// Function: frontend_refill_fc() - 44 lines (lines 1063-1106)

// SLL capacity policy: hot tiny classes (0..3) may grow the SLL up to
// multiplier * mag_cap; classes >= 4 keep a conservative half of mag_cap
// to limit footprint.
static inline uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap) {
    // Absolute override
    if (g_sll_cap_override[class_idx] > 0) {
        uint32_t cap = (uint32_t)g_sll_cap_override[class_idx];
        if (cap > TINY_TLS_MAG_CAP) cap = TINY_TLS_MAG_CAP;
        return cap;
    }
    uint32_t cap = mag_cap;
    if (class_idx <= 3) {
        uint32_t mult = (g_sll_multiplier > 0 ? (uint32_t)g_sll_multiplier : 1u);
        uint64_t want = (uint64_t)cap * (uint64_t)mult;
        if (want > (uint64_t)TINY_TLS_MAG_CAP) cap = TINY_TLS_MAG_CAP;
        else cap = (uint32_t)want;
    } else if (class_idx >= 4) {
        cap = (mag_cap > 1u ? (mag_cap / 2u) : 1u);
    }
    return cap;
}
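// Worked example (illustrative inputs; TINY_TLS_MAG_CAP still applies as the ceiling):
//
//     // mag_cap = 128, g_sll_multiplier = 4, no per-class override
//     sll_cap_for_class(2, 128);   // hot class:  128 * 4 = 512 (clamped to TINY_TLS_MAG_CAP)
//     sll_cap_for_class(5, 128);   // cold class: 128 / 2 = 64
//
// A positive g_sll_cap_override[class] bypasses both rules, but is still clamped
// to TINY_TLS_MAG_CAP.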
// ============================================================================
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
// ============================================================================
// Function: bulk_mag_to_sll_if_room() - 22 lines (lines 1133-1154)

// Ultra helpers forward declarations (defined later)
static inline int ultra_sll_cap_for_class(int class_idx);
static inline int ultra_validate_sll_head(int class_idx, void* head);

// Ultra-mode (SLL-only) helpers
// Ultra batch overrides via env: HAKMEM_TINY_ULTRA_BATCH_C{0..7}
static int g_ultra_batch_override[TINY_NUM_CLASSES] = {0};
static int g_ultra_sll_cap_override[TINY_NUM_CLASSES] = {0};

static inline int ultra_batch_for_class(int class_idx) {
    int ov = g_ultra_batch_override[class_idx];
    if (ov > 0) return ov;
    switch (class_idx) {
        case 0:  return 64;   // 8B
        case 1:  return 96;   // 16B (best in A/B tests)
        case 2:  return 96;   // 32B (best in A/B tests)
        case 3:  return 224;  // 64B (best in A/B tests)
        case 4:  return 96;   // 128B (promote front refill a bit)
        case 5:  return 64;   // 256B (promote front refill)
        case 6:  return 64;   // 512B (promote front refill)
        default: return 32;   // 1024B and others
    }
}

// ============================================================================
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
// ============================================================================
// Function: ultra_refill_sll() - 56 lines (lines 1178-1233)

#include "hakmem_tiny_remote.inc"

// ============================================================================
// Internal Helpers
// ============================================================================
// Step 2: Slab Registry Operations
// Hash function for slab_base (64KB aligned)
// ============================================================================
// EXTRACTED TO hakmem_tiny_registry.c (Phase 2B-3)
// ============================================================================
// EXTRACTED: static inline int registry_hash(uintptr_t slab_base) {
// EXTRACTED:     return (slab_base >> 16) & SLAB_REGISTRY_MASK;
// EXTRACTED: }

// Register slab in hash table (returns 1 on success, 0 on failure)
// EXTRACTED: static int registry_register(uintptr_t slab_base, TinySlab* owner) {
// EXTRACTED:     pthread_mutex_lock(&g_tiny_registry_lock);
// EXTRACTED:     int hash = registry_hash(slab_base);
// EXTRACTED:
// EXTRACTED:     // Linear probing (max 8 attempts)
// EXTRACTED:     for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) {
// EXTRACTED:         int idx = (hash + i) & SLAB_REGISTRY_MASK;
// EXTRACTED:         SlabRegistryEntry* entry = &g_slab_registry[idx];
// EXTRACTED:
// EXTRACTED:         if (entry->slab_base == 0) {
// EXTRACTED:             // Empty slot found
// EXTRACTED:             entry->slab_base = slab_base;
// EXTRACTED:             atomic_store_explicit(&entry->owner, owner, memory_order_release);
// EXTRACTED:             pthread_mutex_unlock(&g_tiny_registry_lock);
// EXTRACTED:             return 1;
// EXTRACTED:         }
// EXTRACTED:     }
// EXTRACTED:
// EXTRACTED:     // Registry full (collision limit exceeded)
// EXTRACTED:     pthread_mutex_unlock(&g_tiny_registry_lock);
// EXTRACTED:     return 0;
// EXTRACTED: }

// Unregister slab from hash table
// EXTRACTED: static void registry_unregister(uintptr_t slab_base) {
// EXTRACTED:     pthread_mutex_lock(&g_tiny_registry_lock);
// EXTRACTED:     int hash = registry_hash(slab_base);
// EXTRACTED:
// EXTRACTED:     // Linear probing search
// EXTRACTED:     for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) {
// EXTRACTED:         int idx = (hash + i) & SLAB_REGISTRY_MASK;
// EXTRACTED:         SlabRegistryEntry* entry = &g_slab_registry[idx];
// EXTRACTED:
// EXTRACTED:         if (entry->slab_base == slab_base) {
// EXTRACTED:             // Found - clear entry (atomic store prevents TOCTOU race)
// EXTRACTED:             atomic_store_explicit(&entry->owner, NULL, memory_order_release);
// EXTRACTED:             entry->slab_base = 0;
// EXTRACTED:             pthread_mutex_unlock(&g_tiny_registry_lock);
// EXTRACTED:             return;
// EXTRACTED:         }
// EXTRACTED:
// EXTRACTED:         if (entry->slab_base == 0) {
// EXTRACTED:             // Empty slot - not found
// EXTRACTED:             pthread_mutex_unlock(&g_tiny_registry_lock);
// EXTRACTED:             return;
// EXTRACTED:         }
// EXTRACTED:     }
// EXTRACTED:     pthread_mutex_unlock(&g_tiny_registry_lock);
// EXTRACTED: }

// Lookup slab by base address (O(1) average)
static TinySlab* registry_lookup(uintptr_t slab_base) {
    // Lock-free read with atomic owner access (MT-safe)
    int hash = registry_hash(slab_base);
    // Linear probing search
    for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) {
        int idx = (hash + i) & SLAB_REGISTRY_MASK;
        SlabRegistryEntry* entry = &g_slab_registry[idx];
        if (entry->slab_base == slab_base) {
            // Atomic load to prevent a TOCTOU race with registry_unregister()
            TinySlab* owner = atomic_load_explicit(&entry->owner, memory_order_acquire);
            if (!owner) return NULL;  // Entry cleared by unregister
            return owner;
        }
        if (entry->slab_base == 0) {
            return NULL;  // Empty slot - not found
        }
    }
    return NULL;  // Not found after max probes
}
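// Illustrative sketch: how a user pointer maps to a registry bucket. The hash matches
// the extracted registry_hash() above; TINY_SLAB_SIZE is the 64KB slab granule.
//
//     uintptr_t base = (uintptr_t)ptr & ~(TINY_SLAB_SIZE - 1);     // 64KB-aligned slab base
//     int       idx  = (int)((base >> 16) & SLAB_REGISTRY_MASK);   // first probe slot
//     TinySlab* s    = registry_lookup(base);                      // NULL => not a Tiny slab
//
// registry_lookup() probes at most SLAB_REGISTRY_MAX_PROBE slots from idx, so a miss
// costs only a handful of reads and takes no locks.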
// ============================================================================
// EXTRACTED TO hakmem_tiny_slab_mgmt.inc (Phase 2D-4 FINAL)
// ============================================================================
// Function: allocate_new_slab() - 79 lines (lines 952-1030)
//   Allocate new slab for a class
// Function: release_slab() - 23 lines (lines 1033-1055)
//   Release a slab back to the system

// Step 2: Find slab owner by pointer (O(1) via hash table registry, or O(N) fallback)
TinySlab* hak_tiny_owner_slab(void* ptr) {
    if (!ptr || !g_tiny_initialized) return NULL;

    // Phase 6.14: Runtime toggle between Registry (O(1)) and List (O(N))
    if (g_use_registry) {
        // O(1) lookup via hash table
        uintptr_t slab_base = (uintptr_t)ptr & ~(TINY_SLAB_SIZE - 1);
        TinySlab* slab = registry_lookup(slab_base);
        if (!slab) return NULL;
        // SAFETY: validate membership (ptr must be inside [base, base+64KB))
        uintptr_t start = (uintptr_t)slab->base;
        uintptr_t end   = start + TINY_SLAB_SIZE;
        if ((uintptr_t)ptr < start || (uintptr_t)ptr >= end) {
            return NULL;  // false positive from registry → treat as non-Tiny
        }
        return slab;
    } else {
        // O(N) fallback: linear search through all slab lists (lock per class)
        for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) {
            pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
            pthread_mutex_lock(lock);
            // Search free slabs
            for (TinySlab* slab = g_tiny_pool.free_slabs[class_idx]; slab; slab = slab->next) {
                uintptr_t slab_start = (uintptr_t)slab->base;
                uintptr_t slab_end   = slab_start + TINY_SLAB_SIZE;
                if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) {
                    pthread_mutex_unlock(lock);
                    return slab;
                }
            }
            // Search full slabs
            for (TinySlab* slab = g_tiny_pool.full_slabs[class_idx]; slab; slab = slab->next) {
                uintptr_t slab_start = (uintptr_t)slab->base;
                uintptr_t slab_end   = slab_start + TINY_SLAB_SIZE;
                if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) {
                    pthread_mutex_unlock(lock);
                    return slab;
                }
            }
            pthread_mutex_unlock(lock);
        }
        return NULL;  // Not found
    }
}

// Function: move_to_full_list() - 20 lines (lines 1104-1123)
//   Move slab to the full list
// Function: move_to_free_list() - 20 lines (lines 1126-1145)
//   Move slab to the free list

// ============================================================================
// Public API
// ============================================================================

// ============================================================================
// Phase 2D-2: Initialization function (extracted to hakmem_tiny_init.inc)
// ============================================================================
#include "hakmem_tiny_init.inc"

// ============================================================================
// 3-Layer Architecture (2025-11-01 Simplification)
// ============================================================================
// Layer 1: TLS Bump Allocator (ultra-fast, 2-3 instructions/op)
#include "hakmem_tiny_bump.inc.h"
// Layer 2: TLS Small Magazine (fast, 5-10 instructions/op)
#include "hakmem_tiny_smallmag.inc.h"

// ============================================================================
// Phase 6 Fast Path Options (mutually exclusive)
// ============================================================================
// Choose ONE of the following Phase 6 optimizations:
//
// Phase 6-1.5: Alignment Guessing (LEGACY - committed 2025-11-02)
//   - Enable:  -DHAKMEM_TINY_PHASE6_ULTRA_SIMPLE=1
//   - Speed:   235 M ops/sec
//   - Memory:  0% overhead
//   - Method:  Guess size class from pointer alignment (__builtin_ctzl)
//   - Risk:    Alignment assumptions may break with future changes
//
// Phase 6-1.6: Metadata Header (NEW - recommended for production)
//   - Enable:  -DHAKMEM_TINY_PHASE6_METADATA=1
//   - Speed:   450-480 M ops/sec (expected, Phase 6-1 level)
//   - Memory:  ~6-12% overhead (8 bytes/allocation)
//   - Method:  Store pool_type + size_class in an 8-byte header
//   - Benefit: Extends to ALL pools (Tiny/Mid/L25/Whale)
//   - Eliminates: Registry lookups, mid_lookup, owner checks
// ============================================================================
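// Build sketch (assumes a plain gcc invocation and that this translation unit is
// hakmem_tiny.c; adapt to the project's actual build system):
//
//     # Phase 6-1.6 metadata header (recommended)
//     gcc -O2 -DHAKMEM_TINY_PHASE6_METADATA=1 -c hakmem_tiny.c
//
//     # Phase 6-1.5 alignment guessing (legacy); mutually exclusive with the above,
//     # enforced by the #error checks below
//     gcc -O2 -DHAKMEM_TINY_PHASE6_ULTRA_SIMPLE=1 -c hakmem_tiny.c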
// Forward declarations for Phase 6 alloc/free functions
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
void* hak_tiny_alloc_ultra_simple(size_t size);
void  hak_tiny_free_ultra_simple(void* ptr);
#endif

#if defined(HAKMEM_TINY_PHASE6_METADATA) && defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
#error "Cannot enable both PHASE6_METADATA and PHASE6_ULTRA_SIMPLE"
#endif

// Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check
#if HAKMEM_TINY_PHASE6_BOX_REFACTOR
#if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
#error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options"
#endif

// Box 1: Atomic Operations (Layer 0 - Foundation)
#include "tiny_atomic.h"
// Box 5: Allocation Fast Path (Layer 1 - 3-4 instructions)
#include "tiny_alloc_fast.inc.h"
// Box 6: Free Fast Path (Layer 2 - 2-3 instructions)
#include "tiny_free_fast.inc.h"

// ---------------- Refill count (Front) global config ----------------
// Parsed once at init; the hot path reads plain ints (no getenv).
int g_refill_count_global = 0;                     // HAKMEM_TINY_REFILL_COUNT
int g_refill_count_hot = 0;                        // HAKMEM_TINY_REFILL_COUNT_HOT
int g_refill_count_mid = 0;                        // HAKMEM_TINY_REFILL_COUNT_MID
int g_refill_count_class[TINY_NUM_CLASSES] = {0};  // HAKMEM_TINY_REFILL_COUNT_C{0..7}

// Export wrapper functions for hakmem.c to call
// Phase 6-1.7 Optimization: Remove diagnostic overhead, rely on LTO for inlining
void* hak_tiny_alloc_fast_wrapper(size_t size) {
    // Diagnostic removed - use HAKMEM_TINY_FRONT_DIAG in tiny_alloc_fast_pop if needed
    return tiny_alloc_fast(size);
}

void hak_tiny_free_fast_wrapper(void* ptr) {
    tiny_free_fast(ptr);
}

#elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
// Phase 6-1.5: Alignment guessing (legacy)

// Refill count globals (needed for compatibility)
int g_refill_count_global = 0;
int g_refill_count_hot = 0;
int g_refill_count_mid = 0;
int g_refill_count_class[TINY_NUM_CLASSES] = {0};

#include "hakmem_tiny_ultra_simple.inc"

// Wrapper functions for hakmem.c compatibility (not used in ULTRA_SIMPLE but needed for linking)
void* hak_tiny_alloc_fast_wrapper(size_t size) {
    return hak_tiny_alloc_ultra_simple(size);
}

void hak_tiny_free_fast_wrapper(void* ptr) {
    hak_tiny_free_ultra_simple(ptr);
}

#elif defined(HAKMEM_TINY_PHASE6_METADATA)
// Phase 6-1.6: Metadata header (recommended)
#include "hakmem_tiny_metadata.inc"
#endif

// Layer 1-3: Main allocation function (simplified)
// Build-time configurable via: -DHAKMEM_TINY_USE_NEW_3LAYER=1
#ifndef HAKMEM_TINY_USE_NEW_3LAYER
#define HAKMEM_TINY_USE_NEW_3LAYER 0  // default OFF (legacy path)
#endif

#if HAKMEM_TINY_USE_NEW_3LAYER
#include "hakmem_tiny_alloc_new.inc"
#else
// Old 6-7 layer architecture (backup)
#include "hakmem_tiny_alloc.inc"
#endif

#include "hakmem_tiny_slow.inc"

// Free path implementations
#include "hakmem_tiny_free.inc"

// ============================================================================
// EXTRACTED TO hakmem_tiny_lifecycle.inc (Phase 2D-3)
// ============================================================================
// Function: hak_tiny_trim() - 116 lines (lines 1164-1279)
//   Public trim and cleanup operation for lifecycle management

// Forward decl for internal registry lookup used by ultra safety validation
static TinySlab* registry_lookup(uintptr_t slab_base);

// Ultra helpers: per-class SLL cap and pointer validation
static inline int ultra_sll_cap_for_class(int class_idx) {
    int ov = g_ultra_sll_cap_override[class_idx];
    if (ov > 0) return ov;
    switch (class_idx) {
        case 0:  return 256;  // 8B
        case 1:  return 384;  // 16B (best in A/B tests)
        case 2:  return 384;  // 32B (best in A/B tests)
        case 3:  return 768;  // 64B (best in A/B tests)
        case 4:  return 256;  // 128B
        default: return 128;  // others
    }
}

static inline int ultra_validate_sll_head(int class_idx, void* head) {
    uintptr_t base = ((uintptr_t)head) & ~(TINY_SLAB_SIZE - 1);
    TinySlab* owner = registry_lookup(base);
    if (!owner) return 0;
    uintptr_t start = (uintptr_t)owner->base;
    if ((uintptr_t)head < start || (uintptr_t)head >= start + TINY_SLAB_SIZE) return 0;
    return (owner->class_idx == class_idx);
}
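// Usage sketch (illustrative; the TLS accessor name is hypothetical): an ultra-mode pop
// can reject a cached SLL head that no longer belongs to a live slab of the same class
// before dereferencing it:
//
//     void* head = tls_sll_head_for(cls);               // hypothetical accessor
//     if (head && !ultra_validate_sll_head(cls, head)) {
//         head = NULL;  // drop the foreign/stale list instead of chasing it
//     }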
// Optional: wrapper TLS guard (detect wrapper re-entry with a TLS counter)
#ifndef HAKMEM_WRAPPER_TLS_GUARD
#define HAKMEM_WRAPPER_TLS_GUARD 0
#endif
#if HAKMEM_WRAPPER_TLS_GUARD
extern __thread int g_tls_in_wrapper;
#endif

// ============================================================================
// EXTRACTED TO hakmem_tiny_lifecycle.inc (Phase 2D-3)
// ============================================================================
// Function: tiny_tls_cache_drain() - 90 lines (lines 1314-1403)
//   Static function for draining TLS caches
//
// Function: tiny_apply_mem_diet() - 20 lines (lines 1405-1424)
//   Static function for memory diet mode application
//
// Phase 2D-3: Lifecycle management functions (226 lines total)
#include "hakmem_tiny_lifecycle.inc"

// Phase 2D-4 (FINAL): Slab management functions (142 lines total)
#include "hakmem_tiny_slab_mgmt.inc"

// ============================================================================
// ACE Learning Layer: Runtime parameter setters
// ============================================================================
void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold) {
    // Validate inputs
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) {
        return;
    }
    if (threshold < 16 || threshold > 2048) {
        return;
    }
    // Set per-class threshold (used by remote free drain logic)
    g_remote_drain_thresh_per_class[class_idx] = (int)threshold;
}

#include "tiny_fc_api.h"

int tiny_fc_room(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
    TinyFastCache* fc = &g_fast_cache[class_idx];
    // The effective per-class cap comes from g_fast_cap (env-tunable),
    // clamped by the static storage capacity TINY_FASTCACHE_CAP.
    uint16_t eff_cap = g_fast_cap[class_idx];
    if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP;
    int room = (int)eff_cap - fc->top;
    return room > 0 ? room : 0;
}

int tiny_fc_push_bulk(int class_idx, void** arr, int n) {
    if (!arr || n <= 0) return 0;
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
    TinyFastCache* fc = &g_fast_cache[class_idx];
    uint16_t eff_cap = g_fast_cap[class_idx];
    if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP;
    int room = (int)eff_cap - fc->top;
    if (room <= 0) return 0;
    int take = n < room ? n : room;
    // Forward fill with light unrolling to reduce branch overhead
    int i = 0;
    for (; i + 3 < take; i += 4) {
        fc->items[fc->top++] = arr[i];
        fc->items[fc->top++] = arr[i + 1];
        fc->items[fc->top++] = arr[i + 2];
        fc->items[fc->top++] = arr[i + 3];
    }
    for (; i < take; i++) {
        fc->items[fc->top++] = arr[i];
    }
    return take;
}
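// Usage sketch (illustrative; caller-side names `batch`, `n`, and `cls` are hypothetical):
// a spill path can size its hand-off to the FastCache with tiny_fc_room() and push at
// most that many pointers:
//
//     int room = tiny_fc_room(cls);
//     if (room > 0) {
//         int pushed = tiny_fc_push_bulk(cls, batch, n);  // returns how many it accepted
//         // return the remaining (n - pushed) blocks to the slower tier
//     }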