// ============================================================================
// Performance Measurement: TLS SLL Hit Rate (ENV-gated)
// ============================================================================
// Global atomic counters for TLS SLL performance measurement
// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF)
#include <stdatomic.h>
#include <stdint.h>
#include <pthread.h>
#include "box/tiny_heap_env_box.h"  // TinyHeap/C7 gate for TLS SLL skips

_Atomic uint64_t g_tls_sll_push_count_global = 0;
_Atomic uint64_t g_tls_sll_pop_count_global = 0;
_Atomic uint64_t g_tls_sll_pop_empty_count_global = 0;

// Hot-path cheap sampling counter to avoid rand() in the allocation path
// Phase 9.4: TLS singly-linked freelist (mimalloc-inspired) for the hottest classes (≤128B/≤256B)
int g_tls_sll_enable = 1;  // HAKMEM_TINY_TLS_SLL=0 to disable

// Phase 6-1.7: Export TLS variables for box refactor (Box 5/6 need access from hakmem.c)
// CRITICAL FIX: Explicit initializers prevent SEGV from uninitialized TLS in worker threads
// PRIORITY 3: TLS Canaries - Add canaries around TLS arrays to detect buffer overruns
#define TLS_CANARY_MAGIC 0xDEADBEEFDEADBEEFULL

// Phase 3d-B: Unified TLS SLL (head+count in same cache line for +12-18% cache hit rate)
#include "front/tiny_heap_v2.h"

__thread uint64_t g_tls_canary_before_sll = TLS_CANARY_MAGIC;
__thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_tls_canary_after_sll = TLS_CANARY_MAGIC;
__thread const char* g_tls_sll_last_writer[TINY_NUM_CLASSES] = {0};
__thread TinyHeapV2Mag g_tiny_heap_v2_mag[TINY_NUM_CLASSES] = {0};
__thread TinyHeapV2Stats g_tiny_heap_v2_stats[TINY_NUM_CLASSES] = {0};
__thread int g_tls_heap_v2_initialized = 0;

// Phase 1: TLS SuperSlab Hint Box for Headerless mode
// Size: 112 bytes per thread (4 slots * 24 bytes + 16 bytes overhead)
#if HAKMEM_TINY_SS_TLS_HINT
#include "box/tls_ss_hint_box.h"
__thread TlsSsHintCache g_tls_ss_hint = {0};
#endif

static int g_tiny_ultra = 0;      // HAKMEM_TINY_ULTRA=1 for SLL-only ultra mode
static int g_ultra_validate = 0;  // HAKMEM_TINY_ULTRA_VALIDATE=1 to enable per-pop validation

// Ultra debug counters
#if HAKMEM_DEBUG_COUNTERS
static __attribute__((unused)) uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0};
static uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0};
#endif

// Path counters (normal-mode visibility): lightweight, for debugging/bench only
#if HAKMEM_DEBUG_COUNTERS
static __attribute__((unused)) uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0};
// New: slow/bitmap/bump/bin instrumentation
static __attribute__((unused)) uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0};
#endif
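
// The canary pair bracketing g_tls_sll above turns a linear overrun of the
// array into a detectable sentinel corruption. The probe below is an
// illustrative sketch only (where the project actually checks the canaries,
// e.g. on free or at thread exit, is not shown in this file); it documents
// the intended invariant:
static inline __attribute__((unused)) int tls_sll_canaries_intact_sketch(void) {
    return g_tls_canary_before_sll == TLS_CANARY_MAGIC &&
           g_tls_canary_after_sll == TLS_CANARY_MAGIC;
}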
static int g_path_debug_enabled = 0;

// Spill hysteresis (getenv() removed from the free hot path)
static int g_spill_hyst = 32;  // default margin (configured at init; never getenv on hot path)

// Optional per-class refill batch overrides (0 = use global defaults)
static int g_refill_max_c[TINY_NUM_CLASSES] = {0};
static int g_refill_max_hot_c[TINY_NUM_CLASSES] = {0};

static inline __attribute__((always_inline)) int tiny_refill_max_for_class(int class_idx) {
    int v = g_refill_max_c[class_idx];
    if (v > 0) return v;
    if (class_idx <= 3) {
        int hv = g_refill_max_hot_c[class_idx];
        if (hv > 0) return hv;
        return g_tiny_refill_max_hot;
    }
    return g_tiny_refill_max;
}

// Phase 9.5: Frontend/Backend split - Tiny Front modules (QuickSlot / FastCache)
#include "front/quick_slot.h"
#include "front/fast_cache.h"
__thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
static int g_frontend_enable = 0;  // HAKMEM_TINY_FRONTEND=1 (experimental ultra-fast frontend)

// SLL capacity multiplier for hot tiny classes (env: HAKMEM_SLL_MULTIPLIER)
int g_sll_multiplier = 2;

// Cached thread id (uint32) to avoid repeated pthread_self() in hot paths
static __thread uint32_t g_tls_tid32;
static __thread int g_tls_tid32_inited;

// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
#else
static inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
#endif
    if (__builtin_expect(!g_tls_tid32_inited, 0)) {
        g_tls_tid32 = (uint32_t)(uintptr_t)pthread_self();
        g_tls_tid32_inited = 1;
    }
    return g_tls_tid32;
}

// Cached pthread_t as-is for APIs that require pthread_t comparison
static __thread pthread_t g_tls_pt_self;
static __thread int g_tls_pt_inited;

// Frontend FastCache hit/miss counters (Small diagnostics)
unsigned long long g_front_fc_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_fc_miss[TINY_NUM_CLASSES] = {0};
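
// Sketch (not an exported API): the g_front_fc_hit/g_front_fc_miss counters
// above reduce to a per-class hit rate as hits / (hits + misses). Assuming
// they are only read off the hot path, a diagnostics helper could look like:
static inline __attribute__((unused)) double front_fc_hit_rate_sketch(int class_idx) {
    unsigned long long h = g_front_fc_hit[class_idx];
    unsigned long long m = g_front_fc_miss[class_idx];
    return (h + m) ? (double)h / (double)(h + m) : 0.0;
}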

// TLS SLL class mask: bit i = 1 allows SLL for class i. Default: all 8 classes enabled.
int g_tls_sll_class_mask = 0xFF;

// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
static inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
    if (__builtin_expect(!g_tls_pt_inited, 0)) {
        g_tls_pt_self = pthread_self();
        g_tls_pt_inited = 1;
    }
    return g_tls_pt_self;
}

#include "tiny_refill.h"
// tiny_mmap_gate.h already included at top
#include "tiny_publish.h"

// Optional prefetch on SLL pop (guarded by env: HAKMEM_TINY_PREFETCH=1)
static int g_tiny_prefetch = 0;

// Small-class magazine pre-initialization (to avoid cap==0 checks on hot path)
// Hot-class small TLS magazine (storage and switches)
typedef struct {
    void* slots[128];
    uint16_t top;  // 0..128
    uint16_t cap;  // =128
} TinyHotMag;

static int g_hotmag_cap_default = 128;    // default capacity (fixed)
static int g_hotmag_refill_default = 32;  // default refill batch (fixed)
static int g_hotmag_enable = 0;           // OFF by default (ENV toggle removed)
static uint16_t g_hotmag_cap_current[TINY_NUM_CLASSES];
static uint8_t g_hotmag_cap_locked[TINY_NUM_CLASSES];
static uint16_t g_hotmag_refill_current[TINY_NUM_CLASSES];
static uint8_t g_hotmag_refill_locked[TINY_NUM_CLASSES];
static uint8_t g_hotmag_class_en[TINY_NUM_CLASSES];  // 0=disabled for class, 1=enabled
static __thread TinyHotMag g_tls_hot_mag[TINY_NUM_CLASSES];

// Inline helpers
#include "box/tls_sll_box.h"  // Box TLS-SLL: Safe SLL operations API (needed by hotmag)
#include "hakmem_tiny_hotmag.inc.h"

// Diagnostics: invalid TLS SLL pointers detected (range check failures)
_Atomic uint64_t g_tls_sll_invalid_head[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_tls_sll_invalid_push[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_tls_sll_pop_counter[TINY_NUM_CLASSES] = {0};

// Size-specialized tiny alloc (32B/64B) via function pointers (for A/B testing)
// TinyQuickSlot: 1 cache line per class (quick 6 items + small metadata)
// Opt-in via HAKMEM_TINY_QUICK=1
// NOTE: This type definition must come BEFORE the Phase 2D-1 includes below
int g_quick_enable = 0;  // HAKMEM_TINY_QUICK=1
__thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES];  // compiled out via guards below

// Phase 2D-1: Hot-path inline function extractions (Front)
// NOTE: TinyFastCache/TinyQuickSlot are already defined in front/
#include "hakmem_tiny_hot_pop.inc.h"  // 4 functions: tiny_hot_pop_class{0..3}
#include "hakmem_tiny_refill.inc.h"   // 8 functions: refill operations
#if HAKMEM_TINY_P0_BATCH_REFILL
#include "hakmem_tiny_refill_p0.inc.h"  // P0 batch refill: refills FastCache directly
#endif

// Phase 7 Task 3: Pre-warm TLS cache at init
// Pre-allocate blocks to reduce the first-allocation miss penalty
#if HAKMEM_TINY_PREWARM_TLS
void hak_tiny_prewarm_tls_cache(void) {
    // Pre-warm each class with HAKMEM_TINY_PREWARM_COUNT blocks
    // This reduces the first-allocation miss penalty by populating the TLS cache
    // Phase E1-CORRECT: ALL classes (including C7) now use TLS SLL
    for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) {
        // When the TinyHeap front is ON, do not stack routed classes onto the
        // TLS SLL; leave them to TinyHeapBox.
        if (tiny_heap_class_route_enabled(class_idx)) {
            continue;
        }
        int count = HAKMEM_TINY_PREWARM_COUNT;  // Default: 16 blocks per class
        // Trigger refill to populate the TLS cache
        // P0 Fix: Use the appropriate refill function based on P0 status
#if HAKMEM_TINY_P0_BATCH_REFILL
        sll_refill_batch_from_ss(class_idx, count);
#else
        sll_refill_small_from_ss(class_idx, count);
#endif
    }
}
#endif

// Ultra-Simple front - REMOVED (dead code cleanup 2025-11-27)

// HotMag helpers (for classes 0..3)
static inline int is_hot_class(int class_idx) {
    return class_idx >= 0 && class_idx <= 3;
}
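
// Illustrative sketch of the LIFO contract a TinyHotMag obeys (the real
// push/pop come from hakmem_tiny_hotmag.inc.h above; this copy exists only
// to document the top/cap discipline and is not called anywhere):
static inline __attribute__((unused)) int hotmag_push_sketch(TinyHotMag* mag, void* ptr) {
    if (mag->top >= mag->cap) return 0;  // full: caller falls back to TLS SLL / backend
    mag->slots[mag->top++] = ptr;
    return 1;
}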

// Optional front (HotMag) push helper: compiled out in release builds
static inline int tiny_optional_push(int class_idx, void* ptr) {
#if HAKMEM_BUILD_RELEASE
    (void)class_idx;
    (void)ptr;
    return 0;
#else
    if (__builtin_expect(is_hot_class(class_idx), 0)) {
        if (__builtin_expect(hotmag_push(class_idx, ptr), 0)) {
            return 1;
        }
    }
    return 0;
#endif
}

// Phase 9.6: Deferred Intelligence (event queue + background)
// Extended event for FLINT Intelligence (lightweight; recorded off hot path only)
// Observability, ACE, and intelligence helpers
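
// Sketch only: the ENV-gated counters declared at the top of this file
// (g_tls_sll_*_count_global) support a hit-rate report of the form
// pops / (pops + empty pops). A reporting helper is assumed to look roughly
// like this; it is not part of the allocator's public surface:
static __attribute__((unused)) double tls_sll_global_hit_rate_sketch(void) {
    uint64_t pops  = atomic_load_explicit(&g_tls_sll_pop_count_global, memory_order_relaxed);
    uint64_t empty = atomic_load_explicit(&g_tls_sll_pop_empty_count_global, memory_order_relaxed);
    return (pops + empty) ? (double)pops / (double)(pops + empty) : 0.0;
}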