From 6b6ad69acae1f766e196771e9ebb0ab7b70f6938 Mon Sep 17 00:00:00 2001 From: "Moe Charm (CI)" Date: Fri, 21 Nov 2025 01:16:45 +0900 Subject: [PATCH] Refactor: Extract 5 Box modules from hakmem_tiny.c (-52% size reduction) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split hakmem_tiny.c (2081 lines) into focused modules for better maintainability. ## Changes **hakmem_tiny.c**: 2081 → 995 lines (-1086 lines, -52% reduction) ## Extracted Modules (5 boxes) 1. **config_box** (211 lines) - Size class tables, integrity counters - Debug flags, benchmark macros - HAK_RET_ALLOC/HAK_STAT_FREE instrumentation 2. **publish_box** (419 lines) - Publish/Adopt counters and statistics - Bench mailbox, partial ring - Live cap/Hot slot management - TLS helper functions (tiny_tls_default_*) 3. **globals_box** (256 lines) - Global variable declarations (~70 variables) - TinyPool instance and initialization flag - TLS variables (g_tls_lists, g_fast_head, g_fast_count) - SuperSlab configuration (partial ring, empty reserves) - Adopt gate functions 4. **phase6_wrappers_box** (122 lines) - Phase 6 Box Theory wrapper layer - hak_tiny_alloc_fast_wrapper() - hak_tiny_free_fast_wrapper() - Diagnostic instrumentation 5. **ace_guard_box** (100 lines) - ACE Learning Layer (hkm_ace_set_drain_threshold) - FastCache API (tiny_fc_room, tiny_fc_push_bulk) - Tiny Guard debugging system (5 functions) ## Benefits - **Readability**: Giant 2k file → focused 1k core + 5 coherent modules - **Maintainability**: Each box has clear responsibility and boundaries - **Build**: All modules compile successfully ✅ ## Technical Details - Phase 1: ChatGPT extracted config_box + publish_box (-625 lines) - Phase 2-4: Claude extracted globals_box + phase6_wrappers_box + ace_guard_box (-461 lines) - All extractions use .inc files (same translation unit, preserves static/TLS linkage) - Fixed Makefile: Added tiny_sizeclass_hist_box.o to OBJS_BASE and BENCH_HAKMEM_OBJS_BASE 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- Makefile | 4 +- core/hakmem_tiny.c | 1112 +--------------------- core/hakmem_tiny_ace_guard_box.inc | 100 ++ core/hakmem_tiny_config_box.inc | 211 ++++ core/hakmem_tiny_globals_box.inc | 256 +++++ core/hakmem_tiny_phase6_wrappers_box.inc | 122 +++ core/hakmem_tiny_publish_box.inc | 419 ++++++++ 7 files changed, 1123 insertions(+), 1101 deletions(-) create mode 100644 core/hakmem_tiny_ace_guard_box.inc create mode 100644 core/hakmem_tiny_config_box.inc create mode 100644 core/hakmem_tiny_globals_box.inc create mode 100644 core/hakmem_tiny_phase6_wrappers_box.inc create mode 100644 core/hakmem_tiny_publish_box.inc diff --git a/Makefile b/Makefile index bbd02d91..7a0715c1 100644 --- a/Makefile +++ b/Makefile @@ -190,7 +190,7 @@ LDFLAGS += $(EXTRA_LDFLAGS) # Targets TARGET = test_hakmem -OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o 
hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o +OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o OBJS = $(OBJS_BASE) # Shared library @@ -222,7 +222,7 @@ endif # Benchmark targets BENCH_HAKMEM = bench_allocators_hakmem BENCH_SYSTEM = bench_allocators_system -BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o 
core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o +BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE) ifeq ($(POOL_TLS_PHASE1),1) BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o diff --git a/core/hakmem_tiny.c b/core/hakmem_tiny.c index 663f4303..76e62bce 100644 --- a/core/hakmem_tiny.c +++ b/core/hakmem_tiny.c @@ -37,212 +37,8 @@ extern uint64_t g_bytes_allocated; // from hakmem_tiny_superslab.c -// ============================================================================ -// Size class table (Box 3 dependency) -// ============================================================================ -// Phase E1-CORRECT: ALL classes have 1-byte header -// These sizes represent TOTAL BLOCK SIZE (stride) = [Header 1B][Data N-1B] -// Usable data = stride - 1 (implicit) -const size_t g_tiny_class_sizes[TINY_NUM_CLASSES] = { - 8, // Class 0: 8B total = [Header 1B][Data 7B] - 16, // Class 1: 16B total = [Header 1B][Data 15B] - 32, // Class 2: 32B total = [Header 1B][Data 31B] - 64, // Class 3: 64B total = [Header 1B][Data 63B] - 128, // Class 4: 128B total = [Header 1B][Data 127B] - 256, // Class 5: 256B total = [Header 1B][Data 255B] - 512, // Class 6: 512B total = [Header 1B][Data 511B] - 1024 // Class 7: 1024B total = [Header 1B][Data 1023B] -}; - -// ============================================================================ -// Phase 16: Dynamic Tiny Max Size (ENV: HAKMEM_TINY_MAX_CLASS) -// Phase 17-1: Auto-adjust when Small-Mid enabled -// ============================================================================ - -// Forward declaration for Small-Mid check -extern bool smallmid_is_enabled(void); - -// Get 
dynamic max size for Tiny allocator based on ENV configuration -// Default: 1023B (C0-C7), can be reduced to 255B (C0-C5) -// Phase 17-1: Auto-reduces to 255B when Small-Mid is enabled -size_t tiny_get_max_size(void) { - static int g_max_class = -1; - if (__builtin_expect(g_max_class == -1, 0)) { - const char* env = getenv("HAKMEM_TINY_MAX_CLASS"); - if (env && *env) { - int max_class = atoi(env); - if (max_class >= 0 && max_class < TINY_NUM_CLASSES) { - g_max_class = max_class; - } else { - g_max_class = 7; // Default: all classes (C0-C7) - } - } else { - g_max_class = 7; // Default: all classes - } - } - - // Phase 17-1: Auto-adjust when Small-Mid enabled - // Small-Mid handles 256B-1KB, so Tiny should only handle 0-255B - int effective_class = g_max_class; - if (smallmid_is_enabled() && effective_class > 5) { - effective_class = 5; // Limit to C0-C5 (0-255B) - } - - // Map class to max usable size (stride - 1) - // C0=8B, C1=16B, C2=32B, C3=64B, C4=128B, C5=256B, C6=512B, C7=1024B - static const size_t class_to_max_size[TINY_NUM_CLASSES] = { - 7, 15, 31, 63, 127, 255, 511, 1023 - }; - return class_to_max_size[effective_class]; -} - -// ============================================================================ -// PRIORITY 1-4: Integrity Check Counters -// ============================================================================ -_Atomic uint64_t g_integrity_check_class_bounds = 0; -_Atomic uint64_t g_integrity_check_freelist = 0; -_Atomic uint64_t g_integrity_check_canary = 0; -_Atomic uint64_t g_integrity_check_header = 0; - -// Build-time gate for debug counters (path/ultra). Default OFF. -#ifndef HAKMEM_DEBUG_COUNTERS -#define HAKMEM_DEBUG_COUNTERS 0 -#endif - -int g_debug_fast0 = 0; -int g_debug_remote_guard = 0; -int g_remote_force_notify = 0; -// Tiny free safety (debug) -int g_tiny_safe_free = 0; // Default OFF for performance; env: HAKMEM_SAFE_FREE=1 でON -int g_tiny_safe_free_strict = 0; // env: HAKMEM_SAFE_FREE_STRICT=1 -int g_tiny_force_remote = 0; // env: HAKMEM_TINY_FORCE_REMOTE=1 - -// Build-time gate: Minimal Tiny front (bench-only) - -static inline int superslab_trace_enabled(void) { - static int g_ss_trace_flag = -1; - if (__builtin_expect(g_ss_trace_flag == -1, 0)) { - const char* tr = getenv("HAKMEM_TINY_SUPERSLAB_TRACE"); - g_ss_trace_flag = (tr && atoi(tr) != 0) ? 
1 : 0; - } - return g_ss_trace_flag; -} -// When enabled, physically excludes optional front tiers from the hot path -// (UltraFront/Quick/Frontend/HotMag/SS-try/BumpShadow), leaving: -// SLL → TLS Magazine → SuperSlab → (remaining slow path) -#ifndef HAKMEM_TINY_MINIMAL_FRONT -#define HAKMEM_TINY_MINIMAL_FRONT 1 -#endif -// Strict front: compile-out optional front tiers but keep baseline structure intact -#ifndef HAKMEM_TINY_STRICT_FRONT -#define HAKMEM_TINY_STRICT_FRONT 0 -#endif - -// Bench-only fast path knobs (defaults) -#ifndef HAKMEM_TINY_BENCH_REFILL -#define HAKMEM_TINY_BENCH_REFILL 8 -#endif -// Optional per-class overrides (bench-only) -#ifndef HAKMEM_TINY_BENCH_REFILL8 -#define HAKMEM_TINY_BENCH_REFILL8 HAKMEM_TINY_BENCH_REFILL -#endif -#ifndef HAKMEM_TINY_BENCH_REFILL16 -#define HAKMEM_TINY_BENCH_REFILL16 HAKMEM_TINY_BENCH_REFILL -#endif -#ifndef HAKMEM_TINY_BENCH_REFILL32 -#define HAKMEM_TINY_BENCH_REFILL32 HAKMEM_TINY_BENCH_REFILL -#endif -#ifndef HAKMEM_TINY_BENCH_REFILL64 -#define HAKMEM_TINY_BENCH_REFILL64 HAKMEM_TINY_BENCH_REFILL -#endif - -// Bench-only warmup amounts (pre-fill TLS SLL on first alloc per class) -#ifndef HAKMEM_TINY_BENCH_WARMUP8 -#define HAKMEM_TINY_BENCH_WARMUP8 64 -#endif -#ifndef HAKMEM_TINY_BENCH_WARMUP16 -#define HAKMEM_TINY_BENCH_WARMUP16 96 -#endif -#ifndef HAKMEM_TINY_BENCH_WARMUP32 -#define HAKMEM_TINY_BENCH_WARMUP32 160 -#endif -#ifndef HAKMEM_TINY_BENCH_WARMUP64 -#define HAKMEM_TINY_BENCH_WARMUP64 192 -#endif - -#ifdef HAKMEM_TINY_BENCH_FASTPATH -static __thread unsigned char g_tls_bench_warm_done[4]; -#endif - -#if HAKMEM_DEBUG_COUNTERS -#define HAK_PATHDBG_INC(arr, idx) do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0) -#define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0) -#else -#define HAK_PATHDBG_INC(arr, idx) do { (void)(idx); } while(0) -#define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0) -#endif -// Simple scalar debug increment (no-op when HAKMEM_DEBUG_COUNTERS=0) -#if HAKMEM_DEBUG_COUNTERS -#define HAK_DBG_INC(var) do { (var)++; } while(0) -#else -#define HAK_DBG_INC(var) do { (void)0; } while(0) -#endif -// Return helper: record tiny alloc stat (guarded) then return pointer -static inline void tiny_debug_track_alloc_ret(int cls, void* ptr); - -// ========== HAK_RET_ALLOC: Single Definition Point ========== -// Choose implementation based on HAKMEM_TINY_HEADER_CLASSIDX -// - Phase 7 enabled: Write header and return user pointer -// - Phase 7 disabled: Legacy behavior (stats + route + return) - -#if HAKMEM_TINY_HEADER_CLASSIDX - #if HAKMEM_BUILD_RELEASE - // Phase E1-CORRECT: ALL classes have 1-byte headers (including C7) - // Ultra-fast inline macro (3-4 instructions) - #define HAK_RET_ALLOC(cls, base_ptr) do { \ - *(uint8_t*)(base_ptr) = HEADER_MAGIC | ((cls) & HEADER_CLASS_MASK); \ - return (void*)((uint8_t*)(base_ptr) + 1); \ - } while(0) - #else - // Debug: Keep full validation via tiny_region_id_write_header() - #define HAK_RET_ALLOC(cls, ptr) return tiny_region_id_write_header((ptr), (cls)) - #endif -#else - // Legacy: Stats and routing before return - #ifdef HAKMEM_ENABLE_STATS - // Optional: sampling(ビルド時に有効化)。ホットパスは直接インライン呼び出し(間接分岐なし)。 - #ifdef HAKMEM_TINY_STAT_SAMPLING - static __thread unsigned g_tls_stat_accum_alloc[TINY_NUM_CLASSES]; - static int g_stat_rate_lg = 0; // 0=毎回、それ以外=2^lgごと - static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { - if (__builtin_expect(g_stat_rate_lg == 0, 1)) { stats_record_alloc(cls); return; } - unsigned m = (1u << g_stat_rate_lg) 
- 1u; - if (((++g_tls_stat_accum_alloc[cls]) & m) == 0u) stats_record_alloc(cls); - } - #else - static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { stats_record_alloc(cls); } - #endif - #define HAK_RET_ALLOC(cls, ptr) do { \ - tiny_debug_track_alloc_ret((cls), (ptr)); \ - hkm_stat_alloc((cls)); \ - ROUTE_COMMIT((cls), 0x7F); \ - return (ptr); \ - } while(0) - #else - #define HAK_RET_ALLOC(cls, ptr) do { \ - tiny_debug_track_alloc_ret((cls), (ptr)); \ - ROUTE_COMMIT((cls), 0x7F); \ - return (ptr); \ - } while(0) - #endif -#endif // HAKMEM_TINY_HEADER_CLASSIDX - -// Free-side stats: compile-time zero when stats disabled -#ifdef HAKMEM_ENABLE_STATS -#define HAK_STAT_FREE(cls) do { stats_record_free((cls)); } while(0) -#else -#define HAK_STAT_FREE(cls) do { } while(0) -#endif +// Tiny allocator configuration, debug counters, and return helpers +#include "hakmem_tiny_config_box.inc" // Forward declarations for static helpers used before definition struct TinySlab; // forward @@ -367,683 +163,13 @@ SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) { return NULL; } + // ============================================================================ -// Global State +// Global State - EXTRACTED to hakmem_tiny_globals_box.inc // ============================================================================ +#include "hakmem_tiny_globals_box.inc" -// Global pool instance (extern declared in hakmem_tiny.h) -TinyPool g_tiny_pool; -int g_tiny_initialized = 0; // Not static (extern in header for inline access) -// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers -// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED -// -// Results: -// Phase 1 (Push - deferred free): +1 instruction, zero benefit -// Phase 2 (Pull - background refill): +77 instructions, -3% performance -// -// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9% -// - TLS Magazine capacity: 2048 items -// - Benchmark working set: 100 items -// - Magazine hit rate: 100% after warmup -// - Slow path never executed! -// -// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op -// - glibc: ~40 instructions/op (5-7× faster) -// - Gap is architectural (bitmap vs free-list, research features) -// -// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc! -// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅ -// Decision: Enable by default (proven production-ready) -static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!) 
-// Optional: allow limited trylock-based refill during wrapper calls -static int g_wrap_tiny_refill = 0; -// Remote-free drain controls -static int g_remote_drain_thresh = 32; // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback) -static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability) - -// ACE Learning Layer: Per-class remote drain thresholds -int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32}; -// Sampled counter updates (Phase 3: Replaced with batched TLS counters) -// Old: XOR RNG sampling (10-15 ns overhead) -// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead) -static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility) - -// Step 2: Slab Registry (Hash Table) -SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE]; - -PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES]; - -// Registry lock -pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER; - -// Phase 6.14: Runtime toggle for Registry ON/OFF (default OFF) -// O(N) Sequential Access is faster than O(1) Random Access for Small-N (8-32 slabs) -// Reason: L1 cache hit率 95%+ (Sequential) vs 50-70% (Random Hash) -static int g_use_registry = 1; // Default ON for thread-safety - -// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h -// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable) -static int g_tiny_refill_max = 64; // HAKMEM_TINY_REFILL_MAX (default 64) -static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192) - -// hakmem_tiny_tls_list.h already included at top -static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES]; -static int g_tls_list_enable = 0; // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1 -static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want); -static int g_fast_enable = 1; -static int g_fastcache_enable = 1; // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0 -static uint16_t g_fast_cap[TINY_NUM_CLASSES]; -static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1 -static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES]; - - -typedef void* (*TinyHotAllocFn)(void); -static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES]; -static __thread void* g_fast_head[TINY_NUM_CLASSES]; -static __thread uint16_t g_fast_count[TINY_NUM_CLASSES]; -static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls); - -uint64_t g_tls_hit_count[TINY_NUM_CLASSES]; -uint64_t g_tls_miss_count[TINY_NUM_CLASSES]; -uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES]; -uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES]; -uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES]; -uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES]; - -// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h -// NEW: Per-thread active slabs (up to 2 per class) -static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES]; -static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES]; - -static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) { - TinySlab* cand = g_tls_active_slab_a[class_idx]; - if (cand) { - uintptr_t base = (uintptr_t)cand->base; - if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) { - return cand; - } - } - cand = g_tls_active_slab_b[class_idx]; - if (cand) { - uintptr_t base = (uintptr_t)cand->base; - if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base 
+ (uintptr_t)TINY_SLAB_SIZE) { - return cand; - } - } - return NULL; -} - -// Phase 6.23: SuperSlab support (mimalloc-style fast allocation) -// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line. -extern int g_use_superslab; - -#if !HAKMEM_BUILD_RELEASE -static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { - if (!ptr) return; - // ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE - void* base_ptr = ptr ? (void*)((uint8_t*)ptr - 1) : NULL; - if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) { - SuperSlab* ss = hak_super_lookup(ptr); - if (!(ss && ss->magic == SUPERSLAB_MAGIC)) { - tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail"); - } else { - int slab_idx = slab_index_for(ss, base_ptr); - if (slab_idx < 0) { - tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch"); - } else { - // Fail-Fast: class vs SuperSlab size_class must be consistent. - TinySlabMeta* meta = &ss->slabs[slab_idx]; - if (meta->class_idx != (uint8_t)cls) { - tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch"); - } - size_t blk = g_tiny_class_sizes[cls]; - uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx); - uintptr_t delta = (uintptr_t)base_ptr - base; - if (blk == 0 || (delta % blk) != 0) { - tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned"); - } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) { - tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity"); - } - } - } - } - if (!__builtin_expect(g_debug_remote_guard, 0)) return; - if (!g_use_superslab) return; - SuperSlab* ss = hak_super_lookup(ptr); - if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return; - int slab_idx = slab_index_for(ss, base_ptr); - if (slab_idx >= 0) { - tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0); - } -} -#else -static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; } -#endif - -// Debug counters for SuperSlab investigation -#if HAKMEM_DEBUG_COUNTERS -int g_superslab_alloc_count = 0; -int g_superslab_fail_count = 0; -int g_superslab_free_count = 0; // Phase 7.6: Track SuperSlab frees -int g_empty_superslab_count = 0; // Phase 7.6: Track empty SuperSlabs detected -int g_magazine_push_count = 0; // Phase 7.6: Track Magazine pushes -int g_tiny_free_with_slab_count = 0; // Phase 7.6: Track tiny_free_with_slab calls -#endif - -// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve -// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact) -// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress) -#define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default) -static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class -static int g_empty_counts[TINY_NUM_CLASSES] = {0}; // Count of empty SuperSlabs -static int g_empty_reserve = -1; // Env: HAKMEM_TINY_SS_RESERVE (default=1) -static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER; -static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default -static uint32_t g_ss_partial_interval = 4; -static _Atomic uint32_t g_ss_partial_epoch = 0; - -// Phase 6.24: Unified TLS slab cache (Medium fix) -// Reduces TLS reads from 3 to 1 (cache-line aligned for performance) -// Phase E4: 64B alignment for L1 cache optimization -__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] 
__attribute__((aligned(64))); -static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES]; -static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES]; -static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES]; -static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES]; -static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES]; -static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES]; -static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES]; - -// ---------------------------------------------------------------------------- -// Per-class partial SuperSlab slot (single-slot publish/adopt) -// ---------------------------------------------------------------------------- -// Small ring of partial SuperSlabs per class (publish/adopt) -#ifndef SS_PARTIAL_RING -#define SS_PARTIAL_RING 64 -#endif -static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING]; -static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES]; -static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES]; -static __thread int g_tls_adopt_cd[TINY_NUM_CLASSES]; -static int g_adopt_cool_period = -1; // env: HAKMEM_TINY_SS_ADOPT_COOLDOWN - -// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS) -unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0}; -unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0}; -_Atomic int g_ss_remote_seen = 0; // becomes 1 when any remote free occurs -static int g_ss_adopt_env = -2; // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON -static _Atomic int g_ss_adopt_runtime = 0; // 0=inactive, 1=active -static _Atomic int g_ss_adopt_log_once = 0; - -static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) { - if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) { - fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n", - reason ? 
reason : "unknown", class_idx); - } -} - -static inline void tiny_adopt_gate_parse_env(void) { - if (__builtin_expect(g_ss_adopt_env == -2, 0)) { - const char* env = getenv("HAKMEM_TINY_SS_ADOPT"); - if (!env || *env == '\0') { - g_ss_adopt_env = 0; // auto - } else if (*env == '0') { - g_ss_adopt_env = -1; // forced OFF - atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release); - } else { - g_ss_adopt_env = 1; // forced ON - atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release); - tiny_adopt_gate_log_activation("env", -1); - } - } -} - -int tiny_adopt_gate_should_publish(void) { - tiny_adopt_gate_parse_env(); - if (g_ss_adopt_env == 1) return 1; - if (g_ss_adopt_env == -1) return 0; - return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0; -} - -int tiny_adopt_gate_should_adopt(void) { - tiny_adopt_gate_parse_env(); - if (g_ss_adopt_env == 1) return 1; - if (g_ss_adopt_env == -1) return 0; - return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0; -} - -void tiny_adopt_gate_on_remote_seen(int class_idx) { - tiny_adopt_gate_parse_env(); - atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed); - if (g_ss_adopt_env == -1) return; - int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel); - if (prev == 0) { - tiny_adopt_gate_log_activation("remote", class_idx); - } -} - -// TLS hint: last adopted SuperSlab/slab to avoid rescans -#include "tiny_sticky.h" - -// Mailbox box -#include "box/mailbox_box.h" - -// Publish pipeline counters (visibility) -unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0}; -unsigned long long g_pub_same_empty[TINY_NUM_CLASSES] = {0}; -unsigned long long g_remote_transitions[TINY_NUM_CLASSES] = {0}; -unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0}; -unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0}; - -// Slab-ring counters (debug) -unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0}; -unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES] = {0}; -unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0}; -unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES] = {0}; - -// Slab entry encoding helpers (used by Bench/Slab-ring paths) -static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) { - return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu); -} -static inline SuperSlab* slab_entry_ss(uintptr_t ent) { - // SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base - return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u)); -} -static inline int slab_entry_idx(uintptr_t ent) { - return (int)(ent & 0x3Fu); -} - -// ---------------------------------------------------------------------------- -// Bench Mode Publish Mailbox (single-slot per class) -// ---------------------------------------------------------------------------- -static int g_bench_mode = -1; // env: HAKMEM_TINY_BENCH_MODE=1 -static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES]; -#ifndef BENCH_MAILBOX_WIDTH -#define BENCH_MAILBOX_WIDTH 16 -#endif -static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH]; - -static inline int bench_mode_enabled(void) { - if (__builtin_expect(g_bench_mode == -1, 0)) { - const char* b = getenv("HAKMEM_TINY_BENCH_MODE"); - g_bench_mode = (b && atoi(b) != 0) ? 
1 : 0; - } - return g_bench_mode; -} - -static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) { - if (!bench_mode_enabled()) return; - uintptr_t ent = slab_entry_make(ss, slab_idx); - uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed); - idx &= (BENCH_MAILBOX_WIDTH - 1); - atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release); -} - -static inline uintptr_t bench_pub_pop(int class_idx) { - if (!bench_mode_enabled()) return (uintptr_t)0; - for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) { - uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel); - if (ent) return ent; - } - return 0; -} - -// ---------------------------------------------------------------------------- -// Slab-Granular Partial Publish/Adopt (encoded entries) -// ---------------------------------------------------------------------------- -#ifndef SLAB_PARTIAL_RING -#define SLAB_PARTIAL_RING 128 -#endif -static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING]; -static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES]; - -// ---------------------------------------------------------------------------- -// Refill-stage counters (per class) -// ---------------------------------------------------------------------------- -unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES] = {0}; - -// Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0) -unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0}; - -// Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE) -unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0}; - -// Refill item source breakdown (freelist vs carve) -unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0}; -unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0}; - -static int g_rf_trace_en = -1; -static inline int rf_trace_enabled(void) { - if (__builtin_expect(g_rf_trace_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_RF_TRACE"); - g_rf_trace_en = (e && atoi(e) != 0) ? 
1 : 0; - } - return g_rf_trace_en; -} - -static inline unsigned long long rf_now_ns(void) { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec; -} - -// moved to tiny_sticky.c - -// moved to tiny_remote.c - -// moved to tiny_mailbox.c - -// Publish-side counters (debug) -unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0}; -unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES] = {0}; -unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES] = {0}; - -// Free pipeline counters -unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0}; -unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0}; -unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0}; -unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0}; - -// Front Gate Breakdown (debug counters) -unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0}; -unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0}; -unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0}; -unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0}; - -// Free-side trigger counters -unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0}; -unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0}; - -// Adopt/Registry gate counters -unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0}; -unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0}; -unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0}; -unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0}; -unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0}; -unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_push_hits[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_push_full[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES] = {0}; -unsigned long long g_fast_lookup_none = 0; - -// ---------------------------------------------------------------------------- -// Live Superslab cap (must-adopt-before-mmap support) -// ---------------------------------------------------------------------------- -static int g_live_cap_env = -2; // -2=unparsed, -1=disabled, >=0=cap value -__thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0}; -static inline int live_cap_for_class(int class_idx) { - if (__builtin_expect(g_live_cap_env == -2, 0)) { - const char* s = getenv("HAKMEM_SS_LIVE_CAP"); - if (!s) g_live_cap_env = -1; else { int v = atoi(s); g_live_cap_env = (v>0? 
v : -1); } - } - (void)class_idx; - return g_live_cap_env; -} - -// ---------------------------------------------------------------------------- -// Hot Slot (global simple path) -// ---------------------------------------------------------------------------- -static int g_hot_slot_en = -1; // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot) -static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES]; -static inline int hot_slot_enabled(void) { - if (__builtin_expect(g_hot_slot_en == -1, 0)) { - const char* s = getenv("HAKMEM_HOT_SLOT"); - g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0; - } - return g_hot_slot_en || bench_mode_enabled(); -} -static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) { - if (!hot_slot_enabled()) return; - uintptr_t ent = slab_entry_make(ss, slab_idx); - atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release); -} -static inline uintptr_t hot_slot_pop(int class_idx) { - if (!hot_slot_enabled()) return (uintptr_t)0; - return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel); -} - -// moved to tiny_publish.c - -static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) { - if (!ss) return; - uintptr_t ent = slab_entry_make(ss, slab_idx); - for (int i = 0; i < SLAB_PARTIAL_RING; i++) { - uintptr_t expected = 0; - if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent, - memory_order_release, memory_order_relaxed)) { - g_slab_publish_dbg[class_idx]++; - return; - } - } - // Ring full: round-robin replace and try to requeue the displaced entry into another empty slot - uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING; - uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel); - if (old) { - for (int t = 0; t < 8; t++) { - uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING; - uintptr_t expected = 0; - if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old, - memory_order_release, memory_order_relaxed)) { - g_slab_requeue_dbg[class_idx]++; - old = 0; break; - } - } - } - g_slab_publish_dbg[class_idx]++; -} - -static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) { - for (int i = 0; i < SLAB_PARTIAL_RING; i++) { - uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel); - if (ent) return ent; - } - return 0; -} - -void ss_partial_publish(int class_idx, SuperSlab* ss) { - if (!ss) return; - // Gate by listed flag to avoid repeated publishes of the same SS - unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel); - if (prev != 0u) return; // already listed - - // CRITICAL: Release ownership of all slabs so adopters can claim them! - // Without this, ss_owner_try_acquire() will fail (requires owner_tid==0). - // The publishing thread must stop using this SS after publishing. 
- int cap_pub = ss_slabs_capacity(ss); - for (int s = 0; s < cap_pub; s++) { - // TODO Phase 3d-B: Add atomic accessor when implementing Hot/Cold split - TinySlabMeta* meta = ss_slab_meta_ptr(ss, s); - uint8_t prev = __atomic_exchange_n(&meta->owner_tid_low, 0u, __ATOMIC_RELEASE); - if (__builtin_expect(g_debug_remote_guard && prev != 0u, 0)) { - uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev; - tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE, - (uint16_t)ss_slab_meta_class_idx_get(ss, s), - meta, - aux); - } - } - - // CRITICAL: Unbind current thread's TLS if it points to this SS! - // Otherwise, the publishing thread will continue allocating from the published SS, - // racing with adopters who acquire ownership. - extern __thread TinyTLSSlab g_tls_slabs[]; - if (g_tls_slabs[class_idx].ss == ss) { - g_tls_slabs[class_idx].ss = NULL; - g_tls_slabs[class_idx].meta = NULL; - g_tls_slabs[class_idx].slab_base = NULL; - g_tls_slabs[class_idx].slab_idx = 0; - } - - // Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1) - int best = -1; uint32_t best_score = 0; - for (int s = 0; s < cap_pub; s++) { - TinySlabMeta* m = &ss->slabs[s]; - uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed); - int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0); - unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u; - uint32_t score = rc - + (m->freelist ? (1u<<30) : 0u) - + (listed ? (1u<<29) : 0u) - + (has_remote ? 1u : 0u); - if (score > best_score) { best_score = score; best = s; } - } - if (best >= 0 && best < 256) { - ss->publish_hint = (uint8_t)best; - // Box: Ready push — provide slab-level candidate to adopters - tiny_ready_push(class_idx, ss, best); - } else { - ss->publish_hint = 0xFF; - } - for (int i = 0; i < SS_PARTIAL_RING; i++) { - SuperSlab* expected = NULL; - if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss, - memory_order_release, memory_order_relaxed)) { - g_ss_publish_dbg[class_idx]++; - return; // published - } - } - // Ring full: replace one entry in round-robin to avoid dropping supply - uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed); - idx %= SS_PARTIAL_RING; - SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel); - if (old) { - // NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads - // that just adopted from it. Draining without ownership checks causes freelist corruption. - // The adopter will drain when needed (with proper ownership checks in tiny_refill.h). - // - // Previous code (UNSAFE): - // for (int s = 0; s < cap; s++) { - // ss_remote_drain_to_freelist(old, s); // ← Race with concurrent adopter! 
- // } - - // Keep listed=1 while in overflow so it stays eligible for adopt - // Push old into overflow stack (待機箱) - SuperSlab* head; - do { - head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire); - old->partial_next = head; - } while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old, - memory_order_release, memory_order_relaxed)); - } - g_ss_publish_dbg[class_idx]++; -} - -SuperSlab* ss_partial_adopt(int class_idx) { - for (int i = 0; i < SS_PARTIAL_RING; i++) { - SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel); - if (ss) { - // Clear listed flag on adopt to allow future publish of this SS - atomic_store_explicit(&ss->listed, 0u, memory_order_release); - g_ss_adopt_dbg[class_idx]++; - return ss; - } - } - // Fallback: adopt from overflow stack (LIFO) - while (1) { - SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire); - if (!head) break; - SuperSlab* next = head->partial_next; - if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next, - memory_order_acq_rel, memory_order_relaxed)) { - atomic_store_explicit(&head->listed, 0u, memory_order_release); - g_ss_adopt_dbg[class_idx]++; - return head; - } - } - return NULL; -} - -static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) { - // Canonical binding under Phase 12: - // - Per-slab TinySlabMeta.class_idx defines class for this slab - // - slab_idx is the owning slab index within ss - // - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx) - tls->ss = ss; - tls->slab_idx = (uint8_t)slab_idx; - tls->meta = &ss->slabs[slab_idx]; - tls->slab_base = tiny_slab_base_for(ss, slab_idx); -} - -static inline uint32_t tiny_tls_default_refill(uint32_t cap) { - if (cap == 0u) return 8u; - uint32_t low = (cap >= 32u) ? 
(cap / 4u) : 8u; - if (low < 4u) low = 4u; - return low; -} - -static inline uint32_t tiny_tls_default_spill(uint32_t cap) { - if (cap == 0u) return 0u; - uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u); - if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP; - if (spill < cap) spill = cap; - return (uint32_t)spill; -} - -static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) { - atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release); - atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed); - atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed); - atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release); -} - -static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) { - atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release); - atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release); -} - -static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) { - uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire); - if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) { - return; - } - uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire); - if (target_cap != 0u && tls->cap != target_cap) { - tls->cap = target_cap; - uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed); - if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap); - tls->refill_low = target_refill; - uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed); - if (target_spill < target_cap) target_spill = target_cap; - tls->spill_high = target_spill; - } - uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire); - if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) { - g_tls_trim_seen[class_idx] = trim_epoch; - if (tls->count > tls->cap) { - tls_list_spill_excess(class_idx, tls); - } - } - g_tls_param_seen[class_idx] = seq; -} +#include "hakmem_tiny_publish_box.inc" // ============================================================================ // EXTRACTED TO hakmem_tiny_fastcache.inc.h (Phase 2D-1) @@ -1774,128 +900,12 @@ TinySlab* hak_tiny_owner_slab(void* ptr) { #error "Cannot enable both PHASE6_METADATA and PHASE6_ULTRA_SIMPLE" #endif - // Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check -#if HAKMEM_TINY_PHASE6_BOX_REFACTOR - #if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) - #error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options" - #endif - // Box 1: Atomic Operations (Layer 0 - Foundation) - #include "tiny_atomic.h" +// ============================================================================ +// Phase 6 Wrapper Functions - EXTRACTED to hakmem_tiny_phase6_wrappers_box.inc +// ============================================================================ +#include "hakmem_tiny_phase6_wrappers_box.inc" - // Box 5: Allocation Fast Path (Layer 1 - 3-4 instructions) - #include "tiny_alloc_fast.inc.h" - - // Box 6: Free Fast Path (Layer 2 - 2-3 instructions) - #include "tiny_free_fast.inc.h" - - // ---------------- Refill count (Front) global config ---------------- - // Parsed once at init; hot path reads plain ints (no getenv). 
- int g_refill_count_global = 0; // HAKMEM_TINY_REFILL_COUNT - int g_refill_count_hot = 0; // HAKMEM_TINY_REFILL_COUNT_HOT - int g_refill_count_mid = 0; // HAKMEM_TINY_REFILL_COUNT_MID - int g_refill_count_class[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_REFILL_COUNT_C{0..7} - - // Export wrapper functions for hakmem.c to call - // Phase 6-1.7 Optimization: Remove diagnostic overhead, rely on LTO for inlining - void* hak_tiny_alloc_fast_wrapper(size_t size) { - // Phase E5: Ultra fast path (8-instruction alloc, bypasses all layers) - // Enable with: HAKMEM_ULTRA_FAST_PATH=1 (compile-time) - #if HAKMEM_ULTRA_FAST_PATH - void* ret = tiny_alloc_fast_ultra(size); - if (ret) return ret; - // Miss → fallback to full fast path - #endif - - // Bench-only ultra-short path: bypass diagnostics and pointer tracking - // Enable with: HAKMEM_BENCH_FAST_FRONT=1 - static int g_bench_fast_front = -1; - if (__builtin_expect(g_bench_fast_front == -1, 0)) { - const char* e = getenv("HAKMEM_BENCH_FAST_FRONT"); - g_bench_fast_front = (e && *e && *e != '0') ? 1 : 0; - } - if (__builtin_expect(g_bench_fast_front, 0)) { - return tiny_alloc_fast(size); - } - - static _Atomic uint64_t wrapper_call_count = 0; - uint64_t call_num = atomic_fetch_add(&wrapper_call_count, 1); - - // Pointer tracking init (first call only) - PTR_TRACK_INIT(); - - // PRIORITY 3: Periodic canary validation (every 1000 ops) - periodic_canary_check(call_num, "hak_tiny_alloc_fast_wrapper"); - - // Box I: Periodic full integrity check (every 5000 ops) - #if HAKMEM_INTEGRITY_LEVEL >= 3 - if ((call_num % 5000) == 0) { - extern void integrity_periodic_full_check(const char*); - integrity_periodic_full_check("periodic check in alloc wrapper"); - } - #endif - - #if !HAKMEM_BUILD_RELEASE - if (call_num > 14250 && call_num < 14280 && size <= 1024) { - fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu size=%zu\n", call_num, size); - fflush(stderr); - } - #endif - - void* result = tiny_alloc_fast(size); - - #if !HAKMEM_BUILD_RELEASE - if (call_num > 14250 && call_num < 14280 && size <= 1024) { - fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu returned %p\n", call_num, result); - fflush(stderr); - } - #endif - return result; - } - - void hak_tiny_free_fast_wrapper(void* ptr) { - // Phase E5: Ultra fast path (6-8 instruction free) - #if HAKMEM_ULTRA_FAST_PATH - tiny_free_fast_ultra(ptr); - return; - #endif - - static _Atomic uint64_t free_call_count = 0; - uint64_t call_num = atomic_fetch_add(&free_call_count, 1); - if (call_num > 14135 && call_num < 14145) { - fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu ptr=%p\n", call_num, ptr); - fflush(stderr); - } - tiny_free_fast(ptr); - if (call_num > 14135 && call_num < 14145) { - fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu completed\n", call_num); - fflush(stderr); - } - } - -#elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) - // Phase 6-1.5: Alignment guessing (legacy) - - // Refill count globals (needed for compatibility) - int g_refill_count_global = 0; - int g_refill_count_hot = 0; - int g_refill_count_mid = 0; - int g_refill_count_class[TINY_NUM_CLASSES] = {0}; - - #include "hakmem_tiny_ultra_simple.inc" - - // Wrapper functions for hakmem.c compatibility (not used in ULTRA_SIMPLE but needed for linking) - void* hak_tiny_alloc_fast_wrapper(size_t size) { - return hak_tiny_alloc_ultra_simple(size); - } - - void hak_tiny_free_fast_wrapper(void* ptr) { - hak_tiny_free_ultra_simple(ptr); - } -#elif defined(HAKMEM_TINY_PHASE6_METADATA) - // Phase 6-1.6: Metadata header (recommended) 
- #include "hakmem_tiny_metadata.inc" -#endif // Layer 1-3: Main allocation function (simplified) // Build-time configurable via: -DHAKMEM_TINY_USE_NEW_3LAYER=1 @@ -1978,104 +988,8 @@ extern __thread int g_tls_in_wrapper; // Phase 2D-4 (FINAL): Slab management functions (142 lines total) #include "hakmem_tiny_slab_mgmt.inc" + // ============================================================================ -// ACE Learning Layer: Runtime parameter setters +// ACE Learning Layer & Tiny Guard - EXTRACTED to hakmem_tiny_ace_guard_box.inc // ============================================================================ - -void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold) { - // Validate inputs - if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) { - return; - } - if (threshold < 16 || threshold > 2048) { - return; - } - - // Set per-class threshold (used by remote free drain logic) - g_remote_drain_thresh_per_class[class_idx] = (int)threshold; -} -#include "tiny_fc_api.h" -int tiny_fc_room(int class_idx) { - if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0; - TinyFastCache* fc = &g_fast_cache[class_idx]; - // Effective per-class cap comes from g_fast_cap (env-tunable), - // clamped by the static storage capacity TINY_FASTCACHE_CAP. - uint16_t eff_cap = g_fast_cap[class_idx]; - if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP; - int room = (int)eff_cap - fc->top; - return room > 0 ? room : 0; -} - -int tiny_fc_push_bulk(int class_idx, void** arr, int n) { - if (!arr || n <= 0) return 0; - if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0; - TinyFastCache* fc = &g_fast_cache[class_idx]; - uint16_t eff_cap = g_fast_cap[class_idx]; - if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP; - int room = (int)eff_cap - fc->top; - if (room <= 0) return 0; - int take = n < room ? n : room; - // Forward fill with light unrolling to reduce branch overhead - int i = 0; - for (; i + 3 < take; i += 4) { - fc->items[fc->top++] = arr[i]; - fc->items[fc->top++] = arr[i + 1]; - fc->items[fc->top++] = arr[i + 2]; - fc->items[fc->top++] = arr[i + 3]; - } - for (; i < take; i++) { - fc->items[fc->top++] = arr[i]; - } - return take; -} - -// ========= Tiny Guard (targeted debug; low overhead when disabled) ========= -static int g_tiny_guard_enabled = -1; -static int g_tiny_guard_class = 2; -static int g_tiny_guard_limit = 8; -static __thread int g_tiny_guard_seen = 0; - -static inline int tiny_guard_enabled_runtime(void) { - if (__builtin_expect(g_tiny_guard_enabled == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_GUARD"); - g_tiny_guard_enabled = (e && *e && *e != '0') ? 
1 : 0; - const char* ec = getenv("HAKMEM_TINY_GUARD_CLASS"); - if (ec && *ec) g_tiny_guard_class = atoi(ec); - const char* el = getenv("HAKMEM_TINY_GUARD_MAX"); - if (el && *el) g_tiny_guard_limit = atoi(el); - if (g_tiny_guard_limit <= 0) g_tiny_guard_limit = 8; - } - return g_tiny_guard_enabled; -} - -int tiny_guard_is_enabled(void) { return tiny_guard_enabled_runtime(); } - -static void tiny_guard_dump_bytes(const char* tag, const uint8_t* p, size_t n) { - fprintf(stderr, "[TGUARD] %s:", tag); - for (size_t i = 0; i < n; i++) fprintf(stderr, " %02x", p[i]); - fprintf(stderr, "\n"); -} - -void tiny_guard_on_alloc(int cls, void* base, void* user, size_t stride) { - if (!tiny_guard_enabled_runtime() || cls != g_tiny_guard_class) return; - if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return; - uint8_t* b = (uint8_t*)base; - uint8_t* u = (uint8_t*)user; - fprintf(stderr, "[TGUARD] alloc cls=%d base=%p user=%p stride=%zu hdr=%02x\n", - cls, base, user, stride, b[0]); - // 隣接ヘッダ可視化(前後) - tiny_guard_dump_bytes("around_base", b, (stride >= 8 ? 8 : stride)); - tiny_guard_dump_bytes("next_header", b + stride, 4); -} - -void tiny_guard_on_invalid(void* user_ptr, uint8_t hdr) { - if (!tiny_guard_enabled_runtime()) return; - if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return; - uint8_t* u = (uint8_t*)user_ptr; - fprintf(stderr, "[TGUARD] invalid header at user=%p hdr=%02x prev=%02x next=%02x\n", - user_ptr, hdr, *(u - 2), *(u)); - tiny_guard_dump_bytes("dump_before", u - 8, 8); - tiny_guard_dump_bytes("dump_after", u, 8); -} - - +#include "hakmem_tiny_ace_guard_box.inc" diff --git a/core/hakmem_tiny_ace_guard_box.inc b/core/hakmem_tiny_ace_guard_box.inc new file mode 100644 index 00000000..02821773 --- /dev/null +++ b/core/hakmem_tiny_ace_guard_box.inc @@ -0,0 +1,100 @@ +// ============================================================================ +// ACE Learning Layer: Runtime parameter setters +// ============================================================================ + +void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold) { + // Validate inputs + if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) { + return; + } + if (threshold < 16 || threshold > 2048) { + return; + } + + // Set per-class threshold (used by remote free drain logic) + g_remote_drain_thresh_per_class[class_idx] = (int)threshold; +} +#include "tiny_fc_api.h" +int tiny_fc_room(int class_idx) { + if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0; + TinyFastCache* fc = &g_fast_cache[class_idx]; + // Effective per-class cap comes from g_fast_cap (env-tunable), + // clamped by the static storage capacity TINY_FASTCACHE_CAP. + uint16_t eff_cap = g_fast_cap[class_idx]; + if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP; + int room = (int)eff_cap - fc->top; + return room > 0 ? room : 0; +} + +int tiny_fc_push_bulk(int class_idx, void** arr, int n) { + if (!arr || n <= 0) return 0; + if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0; + TinyFastCache* fc = &g_fast_cache[class_idx]; + uint16_t eff_cap = g_fast_cap[class_idx]; + if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP; + int room = (int)eff_cap - fc->top; + if (room <= 0) return 0; + int take = n < room ? 
n : room;
+ // Forward fill with light unrolling to reduce branch overhead
+ int i = 0;
+ for (; i + 3 < take; i += 4) {
+ fc->items[fc->top++] = arr[i];
+ fc->items[fc->top++] = arr[i + 1];
+ fc->items[fc->top++] = arr[i + 2];
+ fc->items[fc->top++] = arr[i + 3];
+ }
+ for (; i < take; i++) {
+ fc->items[fc->top++] = arr[i];
+ }
+ return take;
+}
+
+// ========= Tiny Guard (targeted debug; low overhead when disabled) =========
+static int g_tiny_guard_enabled = -1;
+static int g_tiny_guard_class = 2;
+static int g_tiny_guard_limit = 8;
+static __thread int g_tiny_guard_seen = 0;
+
+static inline int tiny_guard_enabled_runtime(void) {
+ if (__builtin_expect(g_tiny_guard_enabled == -1, 0)) {
+ const char* e = getenv("HAKMEM_TINY_GUARD");
+ g_tiny_guard_enabled = (e && *e && *e != '0') ? 1 : 0;
+ const char* ec = getenv("HAKMEM_TINY_GUARD_CLASS");
+ if (ec && *ec) g_tiny_guard_class = atoi(ec);
+ const char* el = getenv("HAKMEM_TINY_GUARD_MAX");
+ if (el && *el) g_tiny_guard_limit = atoi(el);
+ if (g_tiny_guard_limit <= 0) g_tiny_guard_limit = 8;
+ }
+ return g_tiny_guard_enabled;
+}
+
+int tiny_guard_is_enabled(void) { return tiny_guard_enabled_runtime(); }
+
+static void tiny_guard_dump_bytes(const char* tag, const uint8_t* p, size_t n) {
+ fprintf(stderr, "[TGUARD] %s:", tag);
+ for (size_t i = 0; i < n; i++) fprintf(stderr, " %02x", p[i]);
+ fprintf(stderr, "\n");
+}
+
+void tiny_guard_on_alloc(int cls, void* base, void* user, size_t stride) {
+ if (!tiny_guard_enabled_runtime() || cls != g_tiny_guard_class) return;
+ if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return;
+ uint8_t* b = (uint8_t*)base;
+ uint8_t* u = (uint8_t*)user;
+ fprintf(stderr, "[TGUARD] alloc cls=%d base=%p user=%p stride=%zu hdr=%02x\n",
+ cls, base, user, stride, b[0]);
+ // Visualize the adjacent headers (before/after the block)
+ tiny_guard_dump_bytes("around_base", b, (stride >= 8 ? 8 : stride));
+ tiny_guard_dump_bytes("next_header", b + stride, 4);
+}
+
+void tiny_guard_on_invalid(void* user_ptr, uint8_t hdr) {
+ if (!tiny_guard_enabled_runtime()) return;
+ if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return;
+ uint8_t* u = (uint8_t*)user_ptr;
+ fprintf(stderr, "[TGUARD] invalid header at user=%p hdr=%02x prev=%02x next=%02x\n",
+ user_ptr, hdr, *(u - 2), *(u));
+ tiny_guard_dump_bytes("dump_before", u - 8, 8);
+ tiny_guard_dump_bytes("dump_after", u, 8);
+}
+
diff --git a/core/hakmem_tiny_config_box.inc b/core/hakmem_tiny_config_box.inc
new file mode 100644
index 00000000..5907a16c
--- /dev/null
+++ b/core/hakmem_tiny_config_box.inc
@@ -0,0 +1,211 @@
+// hakmem_tiny_config_box.inc
+// Box: Tiny allocator configuration, debug counters, and return helpers.
+// Extracted from hakmem_tiny.c to reduce file size and isolate config logic.
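As a usage sketch for the FastCache bulk API extracted above (tiny_fc_room / tiny_fc_push_bulk): a spill or refill path asks for the available room first, hands over at most that many pointers, and keeps whatever the cache did not accept. This is a minimal caller sketch, assuming tiny_fc_api.h is in scope; drain_batch_into_fastcache() is a hypothetical helper, not part of this patch.

static int drain_batch_into_fastcache(int class_idx, void** batch, int n) {
    // Hypothetical caller (illustration only). Ownership of the accepted
    // pointers moves to the FastCache; the caller must route the remaining
    // n - pushed pointers elsewhere (TLS list, magazine, or slab freelist).
    int room = tiny_fc_room(class_idx);   // 0 when the class has no spare capacity
    if (room <= 0) return 0;
    int want = (n < room) ? n : room;     // tiny_fc_push_bulk() clamps again internally
    return tiny_fc_push_bulk(class_idx, batch, want);
}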
+ +// ============================================================================ +// Size class table (Box 3 dependency) +// ============================================================================ +// Phase E1-CORRECT: ALL classes have 1-byte header +// These sizes represent TOTAL BLOCK SIZE (stride) = [Header 1B][Data N-1B] +// Usable data = stride - 1 (implicit) +const size_t g_tiny_class_sizes[TINY_NUM_CLASSES] = { + 8, // Class 0: 8B total = [Header 1B][Data 7B] + 16, // Class 1: 16B total = [Header 1B][Data 15B] + 32, // Class 2: 32B total = [Header 1B][Data 31B] + 64, // Class 3: 64B total = [Header 1B][Data 63B] + 128, // Class 4: 128B total = [Header 1B][Data 127B] + 256, // Class 5: 256B total = [Header 1B][Data 255B] + 512, // Class 6: 512B total = [Header 1B][Data 511B] + 1024 // Class 7: 1024B total = [Header 1B][Data 1023B] +}; + +// ============================================================================ +// Phase 16: Dynamic Tiny Max Size (ENV: HAKMEM_TINY_MAX_CLASS) +// Phase 17-1: Auto-adjust when Small-Mid enabled +// ============================================================================ + +// Forward declaration for Small-Mid check +extern bool smallmid_is_enabled(void); + +// Get dynamic max size for Tiny allocator based on ENV configuration +// Default: 1023B (C0-C7), can be reduced to 255B (C0-C5) +// Phase 17-1: Auto-reduces to 255B when Small-Mid is enabled +size_t tiny_get_max_size(void) { + static int g_max_class = -1; + if (__builtin_expect(g_max_class == -1, 0)) { + const char* env = getenv("HAKMEM_TINY_MAX_CLASS"); + if (env && *env) { + int max_class = atoi(env); + if (max_class >= 0 && max_class < TINY_NUM_CLASSES) { + g_max_class = max_class; + } else { + g_max_class = 7; // Default: all classes (C0-C7) + } + } else { + g_max_class = 7; // Default: all classes + } + } + + // Phase 17-1: Auto-adjust when Small-Mid enabled + // Small-Mid handles 256B-1KB, so Tiny should only handle 0-255B + int effective_class = g_max_class; + if (smallmid_is_enabled() && effective_class > 5) { + effective_class = 5; // Limit to C0-C5 (0-255B) + } + + // Map class to max usable size (stride - 1) + // C0=8B, C1=16B, C2=32B, C3=64B, C4=128B, C5=256B, C6=512B, C7=1024B + static const size_t class_to_max_size[TINY_NUM_CLASSES] = { + 7, 15, 31, 63, 127, 255, 511, 1023 + }; + return class_to_max_size[effective_class]; +} + +// ============================================================================ +// PRIORITY 1-4: Integrity Check Counters +// ============================================================================ +_Atomic uint64_t g_integrity_check_class_bounds = 0; +_Atomic uint64_t g_integrity_check_freelist = 0; +_Atomic uint64_t g_integrity_check_canary = 0; +_Atomic uint64_t g_integrity_check_header = 0; + +// Build-time gate for debug counters (path/ultra). Default OFF. 
+#ifndef HAKMEM_DEBUG_COUNTERS +#define HAKMEM_DEBUG_COUNTERS 0 +#endif + +int g_debug_fast0 = 0; +int g_debug_remote_guard = 0; +int g_remote_force_notify = 0; +// Tiny free safety (debug) +int g_tiny_safe_free = 0; // Default OFF for performance; env: HAKMEM_SAFE_FREE=1 でON +int g_tiny_safe_free_strict = 0; // env: HAKMEM_SAFE_FREE_STRICT=1 +int g_tiny_force_remote = 0; // env: HAKMEM_TINY_FORCE_REMOTE=1 + +// Build-time gate: Minimal Tiny front (bench-only) + +static inline int superslab_trace_enabled(void) { + static int g_ss_trace_flag = -1; + if (__builtin_expect(g_ss_trace_flag == -1, 0)) { + const char* tr = getenv("HAKMEM_TINY_SUPERSLAB_TRACE"); + g_ss_trace_flag = (tr && atoi(tr) != 0) ? 1 : 0; + } + return g_ss_trace_flag; +} +// When enabled, physically excludes optional front tiers from the hot path +// (UltraFront/Quick/Frontend/HotMag/SS-try/BumpShadow), leaving: +// SLL → TLS Magazine → SuperSlab → (remaining slow path) +#ifndef HAKMEM_TINY_MINIMAL_FRONT +#define HAKMEM_TINY_MINIMAL_FRONT 1 +#endif +// Strict front: compile-out optional front tiers but keep baseline structure intact +#ifndef HAKMEM_TINY_STRICT_FRONT +#define HAKMEM_TINY_STRICT_FRONT 0 +#endif + +// Bench-only fast path knobs (defaults) +#ifndef HAKMEM_TINY_BENCH_REFILL +#define HAKMEM_TINY_BENCH_REFILL 8 +#endif +// Optional per-class overrides (bench-only) +#ifndef HAKMEM_TINY_BENCH_REFILL8 +#define HAKMEM_TINY_BENCH_REFILL8 HAKMEM_TINY_BENCH_REFILL +#endif +#ifndef HAKMEM_TINY_BENCH_REFILL16 +#define HAKMEM_TINY_BENCH_REFILL16 HAKMEM_TINY_BENCH_REFILL +#endif +#ifndef HAKMEM_TINY_BENCH_REFILL32 +#define HAKMEM_TINY_BENCH_REFILL32 HAKMEM_TINY_BENCH_REFILL +#endif +#ifndef HAKMEM_TINY_BENCH_REFILL64 +#define HAKMEM_TINY_BENCH_REFILL64 HAKMEM_TINY_BENCH_REFILL +#endif + +// Bench-only warmup amounts (pre-fill TLS SLL on first alloc per class) +#ifndef HAKMEM_TINY_BENCH_WARMUP8 +#define HAKMEM_TINY_BENCH_WARMUP8 64 +#endif +#ifndef HAKMEM_TINY_BENCH_WARMUP16 +#define HAKMEM_TINY_BENCH_WARMUP16 96 +#endif +#ifndef HAKMEM_TINY_BENCH_WARMUP32 +#define HAKMEM_TINY_BENCH_WARMUP32 160 +#endif +#ifndef HAKMEM_TINY_BENCH_WARMUP64 +#define HAKMEM_TINY_BENCH_WARMUP64 192 +#endif + +#ifdef HAKMEM_TINY_BENCH_FASTPATH +static __thread unsigned char g_tls_bench_warm_done[4]; +#endif + +#if HAKMEM_DEBUG_COUNTERS +#define HAK_PATHDBG_INC(arr, idx) do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0) +#define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0) +#else +#define HAK_PATHDBG_INC(arr, idx) do { (void)(idx); } while(0) +#define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0) +#endif +// Simple scalar debug increment (no-op when HAKMEM_DEBUG_COUNTERS=0) +#if HAKMEM_DEBUG_COUNTERS +#define HAK_DBG_INC(var) do { (var)++; } while(0) +#else +#define HAK_DBG_INC(var) do { (void)0; } while(0) +#endif +// Return helper: record tiny alloc stat (guarded) then return pointer +static inline void tiny_debug_track_alloc_ret(int cls, void* ptr); + +// ========== HAK_RET_ALLOC: Single Definition Point ========== +// Choose implementation based on HAKMEM_TINY_HEADER_CLASSIDX +// - Phase 7 enabled: Write header and return user pointer +// - Phase 7 disabled: Legacy behavior (stats + route + return) + +#if HAKMEM_TINY_HEADER_CLASSIDX + #if HAKMEM_BUILD_RELEASE + // Phase E1-CORRECT: ALL classes have 1-byte headers (including C7) + // Ultra-fast inline macro (3-4 instructions) + #define HAK_RET_ALLOC(cls, base_ptr) do { \ + *(uint8_t*)(base_ptr) = HEADER_MAGIC | ((cls) & HEADER_CLASS_MASK); \ + return 
(void*)((uint8_t*)(base_ptr) + 1); \
+ } while(0)
+ #else
+ // Debug: Keep full validation via tiny_region_id_write_header()
+ #define HAK_RET_ALLOC(cls, ptr) return tiny_region_id_write_header((ptr), (cls))
+ #endif
+#else
+ // Legacy: Stats and routing before return
+ #ifdef HAKMEM_ENABLE_STATS
+ // Optional: sampling (enabled at build time). The hot path uses direct inline calls (no indirect branches).
+ #ifdef HAKMEM_TINY_STAT_SAMPLING
+ static __thread unsigned g_tls_stat_accum_alloc[TINY_NUM_CLASSES];
+ static int g_stat_rate_lg = 0; // 0 = record every time, otherwise once per 2^lg events
+ static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) {
+ if (__builtin_expect(g_stat_rate_lg == 0, 1)) { stats_record_alloc(cls); return; }
+ unsigned m = (1u << g_stat_rate_lg) - 1u;
+ if (((++g_tls_stat_accum_alloc[cls]) & m) == 0u) stats_record_alloc(cls);
+ }
+ #else
+ static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { stats_record_alloc(cls); }
+ #endif
+ #define HAK_RET_ALLOC(cls, ptr) do { \
+ tiny_debug_track_alloc_ret((cls), (ptr)); \
+ hkm_stat_alloc((cls)); \
+ ROUTE_COMMIT((cls), 0x7F); \
+ return (ptr); \
+ } while(0)
+ #else
+ #define HAK_RET_ALLOC(cls, ptr) do { \
+ tiny_debug_track_alloc_ret((cls), (ptr)); \
+ ROUTE_COMMIT((cls), 0x7F); \
+ return (ptr); \
+ } while(0)
+ #endif
+#endif // HAKMEM_TINY_HEADER_CLASSIDX
+
+// Free-side stats: compile-time zero when stats disabled
+#ifdef HAKMEM_ENABLE_STATS
+#define HAK_STAT_FREE(cls) do { stats_record_free((cls)); } while(0)
+#else
+#define HAK_STAT_FREE(cls) do { } while(0)
+#endif
+
diff --git a/core/hakmem_tiny_globals_box.inc b/core/hakmem_tiny_globals_box.inc
new file mode 100644
index 00000000..dc056d00
--- /dev/null
+++ b/core/hakmem_tiny_globals_box.inc
@@ -0,0 +1,256 @@
+// ============================================================================
+// Global State
+// ============================================================================
+
+// Global pool instance (extern declared in hakmem_tiny.h)
+TinyPool g_tiny_pool;
+int g_tiny_initialized = 0; // Not static (extern in header for inline access)
+// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
+// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
+//
+// Results:
+// Phase 1 (Push - deferred free): +1 instruction, zero benefit
+// Phase 2 (Pull - background refill): +77 instructions, -3% performance
+//
+// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
+// - TLS Magazine capacity: 2048 items
+// - Benchmark working set: 100 items
+// - Magazine hit rate: 100% after warmup
+// - Slow path never executed!
+//
+// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
+// - glibc: ~40 instructions/op (5-7× faster)
+// - Gap is architectural (bitmap vs free-list, research features)
+//
+// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
+// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
+// Decision: Enable by default (proven production-ready)
+static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!)
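The HAK_RET_ALLOC macro above (release build) stamps the class index into the single header byte and hands the caller base+1, so every tiny block is self-describing. For illustration, the inverse lookup a free path would perform; this assumes the HEADER_MAGIC bits are disjoint from HEADER_CLASS_MASK, as the release macro implies, and tiny_class_from_user() is a hypothetical helper, not part of this patch.

static inline int tiny_class_from_user(void* user, void** base_out) {
    // Hypothetical inverse of HAK_RET_ALLOC: recover the base pointer and
    // class index from a user pointer, or return -1 if the header byte does
    // not carry the tiny magic. Layout: [Header 1B][Data N-1B], user = base + 1.
    uint8_t* base = (uint8_t*)user - 1;
    uint8_t hdr = *base;
    if ((uint8_t)(hdr & ~HEADER_CLASS_MASK) != (uint8_t)HEADER_MAGIC) return -1;
    *base_out = base;
    return (int)(hdr & HEADER_CLASS_MASK);   // index into g_tiny_class_sizes
}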
+// Optional: allow limited trylock-based refill during wrapper calls
+static int g_wrap_tiny_refill = 0;
+// Remote-free drain controls
+static int g_remote_drain_thresh = 32; // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
+static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)
+
+// ACE Learning Layer: Per-class remote drain thresholds
+int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};
+// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
+// Old: XOR RNG sampling (10-15 ns overhead)
+// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
+static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)
+
+// Step 2: Slab Registry (Hash Table)
+SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];
+
+PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];
+
+// Registry lock
+pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;
+
+// Phase 6.14: Runtime toggle for Registry ON/OFF (default ON)
+// O(N) Sequential Access is faster than O(1) Random Access for Small-N (8-32 slabs)
+// Reason: L1 cache hit rate 95%+ (Sequential) vs 50-70% (Random Hash)
+static int g_use_registry = 1; // Default ON for thread-safety
+
+// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
+// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
+static int g_tiny_refill_max = 64; // HAKMEM_TINY_REFILL_MAX (default 64)
+static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)
+
+// hakmem_tiny_tls_list.h already included at top
+static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
+static int g_tls_list_enable = 0; // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
+static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
+static int g_fast_enable = 1;
+static int g_fastcache_enable = 1; // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
+static uint16_t g_fast_cap[TINY_NUM_CLASSES];
+static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1
+static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];
+
+
+typedef void* (*TinyHotAllocFn)(void);
+static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
+static __thread void* g_fast_head[TINY_NUM_CLASSES];
+static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
+static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);
+
+uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
+uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
+uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
+uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
+uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
+uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];
+
+// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
+// NEW: Per-thread active slabs (up to 2 per class)
+static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
+static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];
+
+static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
+ TinySlab* cand = g_tls_active_slab_a[class_idx];
+ if (cand) {
+ uintptr_t base = (uintptr_t)cand->base;
+ if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
+ return cand;
+ }
+ }
+ cand = g_tls_active_slab_b[class_idx];
+ if (cand) {
+ uintptr_t base = (uintptr_t)cand->base;
+ if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base
+ (uintptr_t)TINY_SLAB_SIZE) { + return cand; + } + } + return NULL; +} + +// Phase 6.23: SuperSlab support (mimalloc-style fast allocation) +// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line. +extern int g_use_superslab; + +#if !HAKMEM_BUILD_RELEASE +static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { + if (!ptr) return; + // ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE + void* base_ptr = ptr ? (void*)((uint8_t*)ptr - 1) : NULL; + if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) { + SuperSlab* ss = hak_super_lookup(ptr); + if (!(ss && ss->magic == SUPERSLAB_MAGIC)) { + tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail"); + } else { + int slab_idx = slab_index_for(ss, base_ptr); + if (slab_idx < 0) { + tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch"); + } else { + // Fail-Fast: class vs SuperSlab size_class must be consistent. + TinySlabMeta* meta = &ss->slabs[slab_idx]; + if (meta->class_idx != (uint8_t)cls) { + tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch"); + } + size_t blk = g_tiny_class_sizes[cls]; + uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx); + uintptr_t delta = (uintptr_t)base_ptr - base; + if (blk == 0 || (delta % blk) != 0) { + tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned"); + } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) { + tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity"); + } + } + } + } + if (!__builtin_expect(g_debug_remote_guard, 0)) return; + if (!g_use_superslab) return; + SuperSlab* ss = hak_super_lookup(ptr); + if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return; + int slab_idx = slab_index_for(ss, base_ptr); + if (slab_idx >= 0) { + tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0); + } +} +#else +static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; } +#endif + +// Debug counters for SuperSlab investigation +#if HAKMEM_DEBUG_COUNTERS +int g_superslab_alloc_count = 0; +int g_superslab_fail_count = 0; +int g_superslab_free_count = 0; // Phase 7.6: Track SuperSlab frees +int g_empty_superslab_count = 0; // Phase 7.6: Track empty SuperSlabs detected +int g_magazine_push_count = 0; // Phase 7.6: Track Magazine pushes +int g_tiny_free_with_slab_count = 0; // Phase 7.6: Track tiny_free_with_slab calls +#endif + +// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve +// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact) +// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress) +#define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default) +static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class +static int g_empty_counts[TINY_NUM_CLASSES] = {0}; // Count of empty SuperSlabs +static int g_empty_reserve = -1; // Env: HAKMEM_TINY_SS_RESERVE (default=1) +static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER; +static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default +static uint32_t g_ss_partial_interval = 4; +static _Atomic uint32_t g_ss_partial_epoch = 0; + +// Phase 6.24: Unified TLS slab cache (Medium fix) +// Reduces TLS reads from 3 to 1 (cache-line aligned for performance) +// Phase E4: 64B alignment for L1 cache optimization +__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] 
__attribute__((aligned(64))); +static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES]; +static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES]; +static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES]; +static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES]; +static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES]; +static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES]; +static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES]; + +// ---------------------------------------------------------------------------- +// Per-class partial SuperSlab slot (single-slot publish/adopt) +// ---------------------------------------------------------------------------- +// Small ring of partial SuperSlabs per class (publish/adopt) +#ifndef SS_PARTIAL_RING +#define SS_PARTIAL_RING 64 +#endif +static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING]; +static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES]; +static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES]; +static __thread int g_tls_adopt_cd[TINY_NUM_CLASSES]; +static int g_adopt_cool_period = -1; // env: HAKMEM_TINY_SS_ADOPT_COOLDOWN + +// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS) +unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0}; +unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0}; +_Atomic int g_ss_remote_seen = 0; // becomes 1 when any remote free occurs +static int g_ss_adopt_env = -2; // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON +static _Atomic int g_ss_adopt_runtime = 0; // 0=inactive, 1=active +static _Atomic int g_ss_adopt_log_once = 0; + +static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) { + if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) { + fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n", + reason ? 
reason : "unknown", class_idx); + } +} + +static inline void tiny_adopt_gate_parse_env(void) { + if (__builtin_expect(g_ss_adopt_env == -2, 0)) { + const char* env = getenv("HAKMEM_TINY_SS_ADOPT"); + if (!env || *env == '\0') { + g_ss_adopt_env = 0; // auto + } else if (*env == '0') { + g_ss_adopt_env = -1; // forced OFF + atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release); + } else { + g_ss_adopt_env = 1; // forced ON + atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release); + tiny_adopt_gate_log_activation("env", -1); + } + } +} + +int tiny_adopt_gate_should_publish(void) { + tiny_adopt_gate_parse_env(); + if (g_ss_adopt_env == 1) return 1; + if (g_ss_adopt_env == -1) return 0; + return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0; +} + +int tiny_adopt_gate_should_adopt(void) { + tiny_adopt_gate_parse_env(); + if (g_ss_adopt_env == 1) return 1; + if (g_ss_adopt_env == -1) return 0; + return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0; +} + +void tiny_adopt_gate_on_remote_seen(int class_idx) { + tiny_adopt_gate_parse_env(); + atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed); + if (g_ss_adopt_env == -1) return; + int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel); + if (prev == 0) { + tiny_adopt_gate_log_activation("remote", class_idx); + } +} + +// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers diff --git a/core/hakmem_tiny_phase6_wrappers_box.inc b/core/hakmem_tiny_phase6_wrappers_box.inc new file mode 100644 index 00000000..a7ce59d4 --- /dev/null +++ b/core/hakmem_tiny_phase6_wrappers_box.inc @@ -0,0 +1,122 @@ + // Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check +#if HAKMEM_TINY_PHASE6_BOX_REFACTOR + #if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) + #error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options" + #endif + + // Box 1: Atomic Operations (Layer 0 - Foundation) + #include "tiny_atomic.h" + + // Box 5: Allocation Fast Path (Layer 1 - 3-4 instructions) + #include "tiny_alloc_fast.inc.h" + + // Box 6: Free Fast Path (Layer 2 - 2-3 instructions) + #include "tiny_free_fast.inc.h" + + // ---------------- Refill count (Front) global config ---------------- + // Parsed once at init; hot path reads plain ints (no getenv). + int g_refill_count_global = 0; // HAKMEM_TINY_REFILL_COUNT + int g_refill_count_hot = 0; // HAKMEM_TINY_REFILL_COUNT_HOT + int g_refill_count_mid = 0; // HAKMEM_TINY_REFILL_COUNT_MID + int g_refill_count_class[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_REFILL_COUNT_C{0..7} + + // Export wrapper functions for hakmem.c to call + // Phase 6-1.7 Optimization: Remove diagnostic overhead, rely on LTO for inlining + void* hak_tiny_alloc_fast_wrapper(size_t size) { + // Phase E5: Ultra fast path (8-instruction alloc, bypasses all layers) + // Enable with: HAKMEM_ULTRA_FAST_PATH=1 (compile-time) + #if HAKMEM_ULTRA_FAST_PATH + void* ret = tiny_alloc_fast_ultra(size); + if (ret) return ret; + // Miss → fallback to full fast path + #endif + + // Bench-only ultra-short path: bypass diagnostics and pointer tracking + // Enable with: HAKMEM_BENCH_FAST_FRONT=1 + static int g_bench_fast_front = -1; + if (__builtin_expect(g_bench_fast_front == -1, 0)) { + const char* e = getenv("HAKMEM_BENCH_FAST_FRONT"); + g_bench_fast_front = (e && *e && *e != '0') ? 
1 : 0; + } + if (__builtin_expect(g_bench_fast_front, 0)) { + return tiny_alloc_fast(size); + } + + static _Atomic uint64_t wrapper_call_count = 0; + uint64_t call_num = atomic_fetch_add(&wrapper_call_count, 1); + + // Pointer tracking init (first call only) + PTR_TRACK_INIT(); + + // PRIORITY 3: Periodic canary validation (every 1000 ops) + periodic_canary_check(call_num, "hak_tiny_alloc_fast_wrapper"); + + // Box I: Periodic full integrity check (every 5000 ops) + #if HAKMEM_INTEGRITY_LEVEL >= 3 + if ((call_num % 5000) == 0) { + extern void integrity_periodic_full_check(const char*); + integrity_periodic_full_check("periodic check in alloc wrapper"); + } + #endif + + #if !HAKMEM_BUILD_RELEASE + if (call_num > 14250 && call_num < 14280 && size <= 1024) { + fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu size=%zu\n", call_num, size); + fflush(stderr); + } + #endif + + void* result = tiny_alloc_fast(size); + + #if !HAKMEM_BUILD_RELEASE + if (call_num > 14250 && call_num < 14280 && size <= 1024) { + fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu returned %p\n", call_num, result); + fflush(stderr); + } + #endif + return result; + } + + void hak_tiny_free_fast_wrapper(void* ptr) { + // Phase E5: Ultra fast path (6-8 instruction free) + #if HAKMEM_ULTRA_FAST_PATH + tiny_free_fast_ultra(ptr); + return; + #endif + + static _Atomic uint64_t free_call_count = 0; + uint64_t call_num = atomic_fetch_add(&free_call_count, 1); + if (call_num > 14135 && call_num < 14145) { + fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu ptr=%p\n", call_num, ptr); + fflush(stderr); + } + tiny_free_fast(ptr); + if (call_num > 14135 && call_num < 14145) { + fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu completed\n", call_num); + fflush(stderr); + } + } + +#elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) + // Phase 6-1.5: Alignment guessing (legacy) + + // Refill count globals (needed for compatibility) + int g_refill_count_global = 0; + int g_refill_count_hot = 0; + int g_refill_count_mid = 0; + int g_refill_count_class[TINY_NUM_CLASSES] = {0}; + + #include "hakmem_tiny_ultra_simple.inc" + + // Wrapper functions for hakmem.c compatibility (not used in ULTRA_SIMPLE but needed for linking) + void* hak_tiny_alloc_fast_wrapper(size_t size) { + return hak_tiny_alloc_ultra_simple(size); + } + + void hak_tiny_free_fast_wrapper(void* ptr) { + hak_tiny_free_ultra_simple(ptr); + } +#elif defined(HAKMEM_TINY_PHASE6_METADATA) + // Phase 6-1.6: Metadata header (recommended) + #include "hakmem_tiny_metadata.inc" +#endif diff --git a/core/hakmem_tiny_publish_box.inc b/core/hakmem_tiny_publish_box.inc new file mode 100644 index 00000000..5ee2896e --- /dev/null +++ b/core/hakmem_tiny_publish_box.inc @@ -0,0 +1,419 @@ +// hakmem_tiny_publish_box.inc +// Box: Publish/adopt instrumentation, bench mailboxes, and TLS target helpers. +// Extracted from hakmem_tiny.c to keep hot-path logic focused. 
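The Phase 6 wrapper layer above is the seam that hakmem.c calls through; the ultra path, bench front, and diagnostics are all negotiated inside the wrapper itself. A minimal call-site sketch under that assumption; hak_malloc_small() is illustrative only and not part of this patch.

static void* hak_malloc_small(size_t size) {
    // Hypothetical front-end routing (illustration only): let the tiny layer
    // serve anything up to its dynamic max size, otherwise report "not mine"
    // so the caller can fall through to the small-mid / pool allocators.
    if (size > 0 && size <= tiny_get_max_size()) {
        void* p = hak_tiny_alloc_fast_wrapper(size);
        if (p) return p;
    }
    return NULL;
}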
+ +// TLS hint: last adopted SuperSlab/slab to avoid rescans +#include "tiny_sticky.h" + +// Mailbox box +#include "box/mailbox_box.h" + +// Publish pipeline counters (visibility) +unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0}; +unsigned long long g_pub_same_empty[TINY_NUM_CLASSES] = {0}; +unsigned long long g_remote_transitions[TINY_NUM_CLASSES] = {0}; +unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0}; +unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0}; + +// Slab-ring counters (debug) +unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0}; +unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES] = {0}; +unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0}; +unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES] = {0}; + +// Slab entry encoding helpers (used by Bench/Slab-ring paths) +static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) { + return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu); +} +static inline SuperSlab* slab_entry_ss(uintptr_t ent) { + // SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base + return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u)); +} +static inline int slab_entry_idx(uintptr_t ent) { + return (int)(ent & 0x3Fu); +} + +// ---------------------------------------------------------------------------- +// Bench Mode Publish Mailbox (single-slot per class) +// ---------------------------------------------------------------------------- +static int g_bench_mode = -1; // env: HAKMEM_TINY_BENCH_MODE=1 +static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES]; +#ifndef BENCH_MAILBOX_WIDTH +#define BENCH_MAILBOX_WIDTH 16 +#endif +static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH]; + +static inline int bench_mode_enabled(void) { + if (__builtin_expect(g_bench_mode == -1, 0)) { + const char* b = getenv("HAKMEM_TINY_BENCH_MODE"); + g_bench_mode = (b && atoi(b) != 0) ? 
1 : 0; + } + return g_bench_mode; +} + +static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) { + if (!bench_mode_enabled()) return; + uintptr_t ent = slab_entry_make(ss, slab_idx); + uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed); + idx &= (BENCH_MAILBOX_WIDTH - 1); + atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release); +} + +static inline uintptr_t bench_pub_pop(int class_idx) { + if (!bench_mode_enabled()) return (uintptr_t)0; + for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) { + uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel); + if (ent) return ent; + } + return 0; +} + +// ---------------------------------------------------------------------------- +// Slab-Granular Partial Publish/Adopt (encoded entries) +// ---------------------------------------------------------------------------- +#ifndef SLAB_PARTIAL_RING +#define SLAB_PARTIAL_RING 128 +#endif +static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING]; +static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES]; + +// ---------------------------------------------------------------------------- +// Refill-stage counters (per class) +// ---------------------------------------------------------------------------- +unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES] = {0}; + +// Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0) +unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0}; + +// Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE) +unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0}; + +// Refill item source breakdown (freelist vs carve) +unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0}; + +static int g_rf_trace_en = -1; +static inline int rf_trace_enabled(void) { + if (__builtin_expect(g_rf_trace_en == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_RF_TRACE"); + g_rf_trace_en = (e && atoi(e) != 0) ? 
1 : 0; + } + return g_rf_trace_en; +} + +static inline unsigned long long rf_now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec; +} + +// Publish-side counters (debug) +unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0}; +unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES] = {0}; +unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES] = {0}; + +// Free pipeline counters +unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0}; +unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0}; +unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0}; +unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0}; + +// Front Gate Breakdown (debug counters) +unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0}; +unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0}; +unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0}; +unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0}; + +// Free-side trigger counters +unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0}; +unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0}; + +// Adopt/Registry gate counters +unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0}; +unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0}; +unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0}; +unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0}; +unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0}; +unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_push_hits[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_push_full[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES] = {0}; +unsigned long long g_fast_lookup_none = 0; + +// ---------------------------------------------------------------------------- +// Live Superslab cap (must-adopt-before-mmap support) +// ---------------------------------------------------------------------------- +static int g_live_cap_env = -2; // -2=unparsed, -1=disabled, >=0=cap value +__thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0}; +static inline int live_cap_for_class(int class_idx) { + if (__builtin_expect(g_live_cap_env == -2, 0)) { + const char* s = getenv("HAKMEM_SS_LIVE_CAP"); + if (!s) g_live_cap_env = -1; else { int v = atoi(s); g_live_cap_env = (v>0? 
v : -1); } + } + (void)class_idx; + return g_live_cap_env; +} + +// ---------------------------------------------------------------------------- +// Hot Slot (global simple path) +// ---------------------------------------------------------------------------- +static int g_hot_slot_en = -1; // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot) +static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES]; +static inline int hot_slot_enabled(void) { + if (__builtin_expect(g_hot_slot_en == -1, 0)) { + const char* s = getenv("HAKMEM_HOT_SLOT"); + g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0; + } + return g_hot_slot_en || bench_mode_enabled(); +} +static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) { + if (!hot_slot_enabled()) return; + uintptr_t ent = slab_entry_make(ss, slab_idx); + atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release); +} +static inline uintptr_t hot_slot_pop(int class_idx) { + if (!hot_slot_enabled()) return (uintptr_t)0; + return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel); +} + +static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) { + if (!ss) return; + uintptr_t ent = slab_entry_make(ss, slab_idx); + for (int i = 0; i < SLAB_PARTIAL_RING; i++) { + uintptr_t expected = 0; + if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent, + memory_order_release, memory_order_relaxed)) { + g_slab_publish_dbg[class_idx]++; + return; + } + } + // Ring full: round-robin replace and try to requeue the displaced entry into another empty slot + uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING; + uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel); + if (old) { + for (int t = 0; t < 8; t++) { + uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING; + uintptr_t expected = 0; + if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old, + memory_order_release, memory_order_relaxed)) { + g_slab_requeue_dbg[class_idx]++; + old = 0; break; + } + } + } + g_slab_publish_dbg[class_idx]++; +} + +static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) { + for (int i = 0; i < SLAB_PARTIAL_RING; i++) { + uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel); + if (ent) return ent; + } + return 0; +} + +void ss_partial_publish(int class_idx, SuperSlab* ss) { + if (!ss) return; + // Gate by listed flag to avoid repeated publishes of the same SS + unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel); + if (prev != 0u) return; // already listed + + // CRITICAL: Release ownership of all slabs so adopters can claim them! + // Without this, ss_owner_try_acquire() will fail (requires owner_tid==0). + // The publishing thread must stop using this SS after publishing. 
+ int cap_pub = ss_slabs_capacity(ss); + for (int s = 0; s < cap_pub; s++) { + // TODO Phase 3d-B: Add atomic accessor when implementing Hot/Cold split + TinySlabMeta* meta = ss_slab_meta_ptr(ss, s); + uint8_t prev = __atomic_exchange_n(&meta->owner_tid_low, 0u, __ATOMIC_RELEASE); + if (__builtin_expect(g_debug_remote_guard && prev != 0u, 0)) { + uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev; + tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE, + (uint16_t)ss_slab_meta_class_idx_get(ss, s), + meta, + aux); + } + } + + // CRITICAL: Unbind current thread's TLS if it points to this SS! + // Otherwise, the publishing thread will continue allocating from the published SS, + // racing with adopters who acquire ownership. + extern __thread TinyTLSSlab g_tls_slabs[]; + if (g_tls_slabs[class_idx].ss == ss) { + g_tls_slabs[class_idx].ss = NULL; + g_tls_slabs[class_idx].meta = NULL; + g_tls_slabs[class_idx].slab_base = NULL; + g_tls_slabs[class_idx].slab_idx = 0; + } + + // Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1) + int best = -1; uint32_t best_score = 0; + for (int s = 0; s < cap_pub; s++) { + TinySlabMeta* m = &ss->slabs[s]; + uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed); + int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0); + unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u; + uint32_t score = rc + + (m->freelist ? (1u<<30) : 0u) + + (listed ? (1u<<29) : 0u) + + (has_remote ? 1u : 0u); + if (score > best_score) { best_score = score; best = s; } + } + if (best >= 0 && best < 256) { + ss->publish_hint = (uint8_t)best; + // Box: Ready push — provide slab-level candidate to adopters + tiny_ready_push(class_idx, ss, best); + } else { + ss->publish_hint = 0xFF; + } + for (int i = 0; i < SS_PARTIAL_RING; i++) { + SuperSlab* expected = NULL; + if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss, + memory_order_release, memory_order_relaxed)) { + g_ss_publish_dbg[class_idx]++; + return; // published + } + } + // Ring full: replace one entry in round-robin to avoid dropping supply + uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed); + idx %= SS_PARTIAL_RING; + SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel); + if (old) { + // NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads + // that just adopted from it. Draining without ownership checks causes freelist corruption. + // The adopter will drain when needed (with proper ownership checks in tiny_refill.h). + // + // Previous code (UNSAFE): + // for (int s = 0; s < cap; s++) { + // ss_remote_drain_to_freelist(old, s); // ← Race with concurrent adopter! 
+ // } + + // Keep listed=1 while in overflow so it stays eligible for adopt + // Push old into overflow stack (待機箱) + SuperSlab* head; + do { + head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire); + old->partial_next = head; + } while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old, + memory_order_release, memory_order_relaxed)); + } + g_ss_publish_dbg[class_idx]++; +} + +SuperSlab* ss_partial_adopt(int class_idx) { + for (int i = 0; i < SS_PARTIAL_RING; i++) { + SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel); + if (ss) { + // Clear listed flag on adopt to allow future publish of this SS + atomic_store_explicit(&ss->listed, 0u, memory_order_release); + g_ss_adopt_dbg[class_idx]++; + return ss; + } + } + // Fallback: adopt from overflow stack (LIFO) + while (1) { + SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire); + if (!head) break; + SuperSlab* next = head->partial_next; + if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next, + memory_order_acq_rel, memory_order_relaxed)) { + atomic_store_explicit(&head->listed, 0u, memory_order_release); + g_ss_adopt_dbg[class_idx]++; + return head; + } + } + return NULL; +} + +static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) { + // Canonical binding under Phase 12: + // - Per-slab TinySlabMeta.class_idx defines class for this slab + // - slab_idx is the owning slab index within ss + // - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx) + tls->ss = ss; + tls->slab_idx = (uint8_t)slab_idx; + tls->meta = &ss->slabs[slab_idx]; + tls->slab_base = tiny_slab_base_for(ss, slab_idx); +} + +static inline uint32_t tiny_tls_default_refill(uint32_t cap) { + if (cap == 0u) return 8u; + uint32_t low = (cap >= 32u) ? 
(cap / 4u) : 8u; + if (low < 4u) low = 4u; + return low; +} + +static inline uint32_t tiny_tls_default_spill(uint32_t cap) { + if (cap == 0u) return 0u; + uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u); + if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP; + if (spill < cap) spill = cap; + return (uint32_t)spill; +} + +static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) { + atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release); + atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed); + atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed); + atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release); +} + +static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) { + atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release); + atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release); +} + +static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) { + uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire); + if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) { + return; + } + uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire); + if (target_cap != 0u && tls->cap != target_cap) { + tls->cap = target_cap; + uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed); + if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap); + tls->refill_low = target_refill; + uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed); + if (target_spill < target_cap) target_spill = target_cap; + tls->spill_high = target_spill; + } + uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire); + if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) { + g_tls_trim_seen[class_idx] = trim_epoch; + if (tls->count > tls->cap) { + tls_list_spill_excess(class_idx, tls); + } + } + g_tls_param_seen[class_idx] = seq; +} +
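The target/seq pattern that closes this box behaves like a one-way seqlock: a tuner stores new per-class cap/refill/spill targets and then bumps g_tls_param_seq with release ordering, while each worker thread calls tiny_tls_refresh_params() on its own path and only reloads the targets when the sequence differs from its cached g_tls_param_seen. A writer-side sketch, assuming some controller (for example the ACE layer) drives the tuning; tune_tiny_tls_cap() is a hypothetical helper, not part of this patch.

static void tune_tiny_tls_cap(int class_idx, uint32_t new_cap, uint64_t trim_epoch) {
    // Hypothetical controller hook (illustration only): publish a new TLS list
    // capacity (refill/spill targets are derived inside) and optionally request
    // a trim; workers pick both up lazily via tiny_tls_refresh_params().
    tiny_tls_publish_targets(class_idx, new_cap);
    if (trim_epoch != 0) {
        tiny_tls_request_trim(class_idx, trim_epoch);
    }
}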