diff --git a/core/hakmem_tiny.c b/core/hakmem_tiny.c index 76e62bce..378d8264 100644 --- a/core/hakmem_tiny.c +++ b/core/hakmem_tiny.c @@ -187,331 +187,23 @@ static inline int fastcache_push(int class_idx, void* ptr); // Functions: tiny_hot_pop_class0(), tiny_hot_pop_class1(), tiny_hot_pop_class2(), tiny_hot_pop_class3() // 88 lines (lines 407-494) -static __attribute__((cold, noinline, unused)) void* tiny_slow_alloc_fast(int class_idx) { - int tls_enabled = g_tls_list_enable; - TinyTLSList* tls = &g_tls_lists[class_idx]; - pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; - pthread_mutex_lock(lock); - TinySlab* slab = g_tiny_pool.free_slabs[class_idx]; - if (slab) { - g_tiny_pool.free_slabs[class_idx] = slab->next; - } else { - slab = allocate_new_slab(class_idx); - if (!slab) { - pthread_mutex_unlock(lock); - return NULL; - } - } - slab->next = NULL; +// ============================================================================ +// Legacy Slow Allocation Path - EXTRACTED to hakmem_tiny_legacy_slow_box.inc +// ============================================================================ +#include "hakmem_tiny_legacy_slow_box.inc" - if (atomic_load_explicit(&slab->remote_head, memory_order_acquire)) { - tiny_remote_drain_locked(slab); - } - - int block_idx = hak_tiny_find_free_block(slab); - if (block_idx < 0) { - slab->next = g_tiny_pool.free_slabs[class_idx]; - g_tiny_pool.free_slabs[class_idx] = slab; - pthread_mutex_unlock(lock); - return NULL; - } - - hak_tiny_set_used(slab, block_idx); - slab->free_count--; - size_t block_size = g_tiny_class_sizes[class_idx]; - uint8_t* base = (uint8_t*)slab->base; - void* ret = (void*)(base + ((size_t)block_idx * block_size)); - g_tiny_pool.alloc_count[class_idx]++; - - uint16_t cap = g_fast_cap_defaults[class_idx]; - uint16_t count = g_fast_count[class_idx]; - uint16_t fast_need = (cap > count) ? 
(uint16_t)(cap - count) : 0; - if (fast_need > slab->free_count) fast_need = (uint16_t)slab->free_count; - - uint32_t tls_need = 0; - if (tls_enabled && tls_list_needs_refill(tls)) { - uint32_t target = tls_list_refill_threshold(tls); - if (tls->count < target) { - tls_need = target - tls->count; - } - } - uint32_t remaining = slab->free_count; - if (fast_need > remaining) fast_need = (uint16_t)remaining; - remaining -= fast_need; - if (tls_need > remaining) tls_need = remaining; - - while (fast_need > 0) { - int extra_idx = hak_tiny_find_free_block(slab); - if (extra_idx < 0) break; - hak_tiny_set_used(slab, extra_idx); - slab->free_count--; - void* extra = (void*)(base + ((size_t)extra_idx * block_size)); - int pushed = 0; - if (__builtin_expect(g_fastcache_enable && class_idx <= 3, 1)) { - pushed = fastcache_push(class_idx, extra); - } else { - pushed = tiny_fast_push(class_idx, extra); - } - if (!pushed) { - if (tls_enabled) { - tiny_tls_list_guard_push(class_idx, tls, extra); - tls_list_push(tls, extra, class_idx); - } - } - fast_need--; - } - - while (tls_enabled && tls_need > 0) { - int extra_idx = hak_tiny_find_free_block(slab); - if (extra_idx < 0) break; - hak_tiny_set_used(slab, extra_idx); - slab->free_count--; - void* extra = (void*)(base + ((size_t)extra_idx * block_size)); - tiny_tls_list_guard_push(class_idx, tls, extra); - tls_list_push(tls, extra, class_idx); - tls_need--; - } - - if (slab->free_count == 0) { - move_to_full_list(class_idx, slab); - } else { - slab->next = g_tiny_pool.free_slabs[class_idx]; - g_tiny_pool.free_slabs[class_idx] = slab; - } - - pthread_mutex_unlock(lock); - return ret; -} // ============================================================================ // EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1) // ============================================================================ // Function: tiny_fast_refill_and_take() - 39 lines (lines 584-622) -// Hot-path cheap sampling counter to avoid rand() in allocation path -// Phase 9.4: TLS single-linked freelist (mimalloc-inspired) for hottest classes (≤128B/≤256B) -int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable -int g_tiny_hotpath_class5 = 0; // HAKMEM_TINY_HOTPATH_CLASS5=1 to enable class 5 hotpath -// Phase 6-1.7: Export TLS variables for box refactor (Box 5/6 need access from hakmem.c) -// CRITICAL FIX: Explicit initializers prevent SEGV from uninitialized TLS in worker threads -// PRIORITY 3: TLS Canaries - Add canaries around TLS arrays to detect buffer overruns -#define TLS_CANARY_MAGIC 0xDEADBEEFDEADBEEFULL -// Phase 3d-B: Unified TLS SLL (head+count in same cache line for +12-18% cache hit rate) -__thread uint64_t g_tls_canary_before_sll = TLS_CANARY_MAGIC; -__thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES] __attribute__((aligned(64))) = {0}; -__thread uint64_t g_tls_canary_after_sll = TLS_CANARY_MAGIC; -static int g_tiny_ultra = 0; // HAKMEM_TINY_ULTRA=1 for SLL-only ultra mode -static int g_ultra_validate = 0; // HAKMEM_TINY_ULTRA_VALIDATE=1 to enable per-pop validation -// Ultra debug counters -#if HAKMEM_DEBUG_COUNTERS -static __attribute__((unused)) uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0}; -static uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES] = {0}; -static __attribute__((unused)) uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0}; -#endif -// Path counters (normal mode visibility): lightweight, for debugging/bench only -#if HAKMEM_DEBUG_COUNTERS -static __attribute__((unused)) uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0}; -static __attribute__((unused)) 
uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0};
-// New: slow/bitmap/bump/bin instrumentation
-static __attribute__((unused)) uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0};
-static __attribute__((unused)) uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0};
-#endif
-static int g_path_debug_enabled = 0;
+// ============================================================================
+// TLS/Frontend State & Configuration - EXTRACTED to hakmem_tiny_tls_state_box.inc
+// ============================================================================
+#include "hakmem_tiny_tls_state_box.inc"

-// Spill hysteresis (eliminates getenv from the free hot path)
-static int g_spill_hyst = 32; // default margin (configured at init; never getenv on hot path)
-
-// Optional per-class refill batch overrides (0 = use global defaults)
-static int g_refill_max_c[TINY_NUM_CLASSES] = {0};
-static int g_refill_max_hot_c[TINY_NUM_CLASSES] = {0};
-static inline __attribute__((always_inline)) int tiny_refill_max_for_class(int class_idx) {
-    int v = g_refill_max_c[class_idx];
-    if (v > 0) return v;
-    if (class_idx <= 3) {
-        int hv = g_refill_max_hot_c[class_idx];
-        if (hv > 0) return hv;
-        return g_tiny_refill_max_hot;
-    }
-    return g_tiny_refill_max;
-}
-
-// Phase 9.5: Frontend/Backend split - Tiny Front modules (QuickSlot / FastCache)
-#include "front/quick_slot.h"
-#include "front/fast_cache.h"
-__thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
-static int g_frontend_enable = 0; // HAKMEM_TINY_FRONTEND=1 (experimental ultra-fast frontend)
-// SLL capacity multiplier for hot tiny classes (env: HAKMEM_SLL_MULTIPLIER)
-int g_sll_multiplier = 2;
-// Cached thread id (uint32) to avoid repeated pthread_self() in hot paths
-static __thread uint32_t g_tls_tid32;
-static __thread int g_tls_tid32_inited;
-// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
-#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
-inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
-#else
-static inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
-#endif
-    if (__builtin_expect(!g_tls_tid32_inited, 0)) {
-        g_tls_tid32 = (uint32_t)(uintptr_t)pthread_self();
-        g_tls_tid32_inited = 1;
-    }
-    return g_tls_tid32;
-}
-// Cached pthread_t as-is for APIs that require pthread_t comparison
-static __thread pthread_t g_tls_pt_self;
-static __thread int g_tls_pt_inited;
-
-// Frontend FastCache hit/miss counters (Small diagnostics)
-unsigned long long g_front_fc_hit[TINY_NUM_CLASSES] = {0};
-unsigned long long g_front_fc_miss[TINY_NUM_CLASSES] = {0};
-// TLS SLL class mask: bit i = 1 allows SLL for class i. Default: all 8 classes enabled.
-int g_tls_sll_class_mask = 0xFF;
-// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
-#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
-inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
-#else
-static inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
-#endif
-    if (__builtin_expect(!g_tls_pt_inited, 0)) {
-        g_tls_pt_self = pthread_self();
-        g_tls_pt_inited = 1;
-    }
-    return g_tls_pt_self;
-}
-
-#include "tiny_refill.h"
-// tiny_mmap_gate.h already included at top
-#include "tiny_publish.h"
-
-int g_sll_cap_override[TINY_NUM_CLASSES] = {0}; // LEGACY (not referenced since Phase 12; kept as a compatibility dummy)
-// Optional prefetch on SLL pop (guarded by env: HAKMEM_TINY_PREFETCH=1)
-static int g_tiny_prefetch = 0;
-
-// Small-class magazine pre-initialization (to avoid cap==0 checks on hot path)
-
-
-
-// Hot-class small TLS magazine (storage and switches)
-typedef struct {
-    void* slots[128];
-    uint16_t top; // 0..128
-    uint16_t cap; // =128
-} TinyHotMag;
-static int g_hotmag_cap_default = 128;   // default capacity (env override)
-static int g_hotmag_refill_default = 32; // default refill batch (env override)
-static int g_hotmag_enable = 0; // default OFF (for A/B testing); can be enabled via env
-static uint16_t g_hotmag_cap_current[TINY_NUM_CLASSES];
-static uint8_t g_hotmag_cap_locked[TINY_NUM_CLASSES];
-static uint16_t g_hotmag_refill_current[TINY_NUM_CLASSES];
-static uint8_t g_hotmag_refill_locked[TINY_NUM_CLASSES];
-static uint8_t g_hotmag_class_en[TINY_NUM_CLASSES]; // 0=disabled for class, 1=enabled
-static __thread TinyHotMag g_tls_hot_mag[TINY_NUM_CLASSES];
-// Inline helpers
-
-#include "box/tls_sll_box.h" // Box TLS-SLL: Safe SLL operations API (needed by hotmag)
-#include "hakmem_tiny_hotmag.inc.h"
-
-// Size-specialized tiny alloc (32B/64B) via function pointers (for A/B testing)
-// TinyQuickSlot: 1 cache line per class (quick 6 items + small metadata)
-// Opt-in via HAKMEM_TINY_QUICK=1
-// NOTE: This type definition must come BEFORE the Phase 2D-1 includes below
-int g_quick_enable = 0; // HAKMEM_TINY_QUICK=1
-__thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES]; // compile-out via guards below
-
-// Phase 2D-1: Hot-path inline function extractions (Front)
-// NOTE: TinyFastCache/TinyQuickSlot are already defined in front/
-#include "hakmem_tiny_hot_pop.inc.h" // 4 functions: tiny_hot_pop_class{0..3}
-#include "hakmem_tiny_refill.inc.h" // 8 functions: refill operations
-#if HAKMEM_TINY_P0_BATCH_REFILL
-#include "hakmem_tiny_refill_p0.inc.h" // P0 batch refill: refills FastCache directly
-#endif
-
-// Phase 7 Task 3: Pre-warm TLS cache at init
-// Pre-allocate blocks to reduce first-allocation miss penalty
-#if HAKMEM_TINY_PREWARM_TLS
-void hak_tiny_prewarm_tls_cache(void) {
-    // Pre-warm each class with HAKMEM_TINY_PREWARM_COUNT blocks
-    // This reduces the first-allocation miss penalty by populating TLS cache
-    // Phase E1-CORRECT: ALL classes (including C7) now use TLS SLL
-    for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) {
-        int count = HAKMEM_TINY_PREWARM_COUNT; // Default: 16 blocks per class
-
-        // Trigger refill to populate TLS cache
-        // P0 Fix: Use appropriate refill function based on P0 status
-#if HAKMEM_TINY_P0_BATCH_REFILL
-        sll_refill_batch_from_ss(class_idx, count);
-#else
-        sll_refill_small_from_ss(class_idx, count);
-#endif
-    }
-}
-#endif
-
-// Ultra-Simple front (small per-class stack): combines the tiny fronts to minimize
-// instructions and memory touches on alloc/free. Uses the existing TLS bump shadow
-// (g_tls_bcur/bend) when enabled to avoid per-alloc header writes.
-// UltraFront capacity for 32/64B fast pop -#ifndef ULTRA_FRONT_CAP -#define ULTRA_FRONT_CAP 64 -#endif -typedef struct __attribute__((aligned(64))) { - void* slots[ULTRA_FRONT_CAP]; - uint16_t top; // 0..ULTRA_FRONT_CAP - uint16_t _pad; -} TinyUltraFront; -static int g_ultra_simple = 0; // HAKMEM_TINY_ULTRA_SIMPLE=1 -static __thread TinyUltraFront g_tls_ultra[TINY_NUM_CLASSES]; -// Inline helpers -#include "hakmem_tiny_ultra_front.inc.h" - -// Ultra-Bump TLS shadow (bench/opt-in): keep a TLS-only bump window -// to avoid per-alloc header writes. Header is updated per-chunk reservation. -// NOTE: Non-static because used in hakmem_tiny_refill.inc.h -int g_bump_chunk = 32; // HAKMEM_TINY_BUMP_CHUNK (blocks) -__thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES] = {0}; -__thread uint8_t* g_tls_bend[TINY_NUM_CLASSES] = {0}; - -// SLL small refill batch for specialized class (32/64B) -// Specialized order toggle: 1 = mag-first, 0 = sll-first -// HotMag helpers (for classes 0..3) -static inline int is_hot_class(int class_idx) { return class_idx >= 0 && class_idx <= 3; } - -// Optional front (Ultra/HotMag) push helper: compile-out in release builds -static inline int tiny_optional_push(int class_idx, void* ptr) { -#if HAKMEM_BUILD_RELEASE - (void)class_idx; - (void)ptr; - return 0; -#else - if (__builtin_expect(g_ultra_simple && is_hot_class(class_idx), 0)) { - if (__builtin_expect(ultra_push(class_idx, ptr), 0)) { - return 1; - } - } - if (__builtin_expect(is_hot_class(class_idx), 0)) { - if (__builtin_expect(hotmag_push(class_idx, ptr), 0)) { - return 1; - } - } - return 0; -#endif -} - -// Ultra-Simple helpers - -// Phase 9.6: Deferred Intelligence (event queue + background) -// Extended event for FLINT Intelligence (lightweight; recorded off hot path only) -// Observability, ACE, and intelligence helpers #include "hakmem_tiny_intel.inc" // ============================================================================ @@ -767,83 +459,12 @@ static inline int ultra_batch_for_class(int class_idx) { // EXTRACTED: } // Lookup slab by base address (O(1) average) -static TinySlab* registry_lookup(uintptr_t slab_base) { - // Lock-free read with atomic owner access (MT-safe) - int hash = registry_hash(slab_base); - - // Linear probing search - for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) { - int idx = (hash + i) & SLAB_REGISTRY_MASK; - SlabRegistryEntry* entry = &g_slab_registry[idx]; - - if (entry->slab_base == slab_base) { - // Atomic load to prevent TOCTOU race with registry_unregister() - TinySlab* owner = atomic_load_explicit(&entry->owner, memory_order_acquire); - if (!owner) return NULL; // Entry cleared by unregister - return owner; - } - - if (entry->slab_base == 0) { - return NULL; // Empty slot - not found - } - } - return NULL; // Not found after max probes -} // ============================================================================ -// EXTRACTED TO hakmem_tiny_slab_mgmt.inc (Phase 2D-4 FINAL) +// Registry Lookup & Owner Slab Discovery - EXTRACTED to hakmem_tiny_slab_lookup_box.inc // ============================================================================ -// Function: allocate_new_slab() - 79 lines (lines 952-1030) -// Allocate new slab for a class +#include "hakmem_tiny_slab_lookup_box.inc" -// Function: release_slab() - 23 lines (lines 1033-1055) -// Release a slab back to system - -// Step 2: Find slab owner by pointer (O(1) via hash table registry, or O(N) fallback) -TinySlab* hak_tiny_owner_slab(void* ptr) { - if (!ptr || !g_tiny_initialized) return NULL; - - // Phase 6.14: 
Runtime toggle between Registry (O(1)) and List (O(N)) - if (g_use_registry) { - // O(1) lookup via hash table - uintptr_t slab_base = (uintptr_t)ptr & ~(TINY_SLAB_SIZE - 1); - TinySlab* slab = registry_lookup(slab_base); - if (!slab) return NULL; - // SAFETY: validate membership (ptr must be inside [base, base+64KB)) - uintptr_t start = (uintptr_t)slab->base; - uintptr_t end = start + TINY_SLAB_SIZE; - if ((uintptr_t)ptr < start || (uintptr_t)ptr >= end) { - return NULL; // false positive from registry → treat as non-Tiny - } - return slab; - } else { - // O(N) fallback: linear search through all slab lists (lock per class) - for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) { - pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; - pthread_mutex_lock(lock); - // Search free slabs - for (TinySlab* slab = g_tiny_pool.free_slabs[class_idx]; slab; slab = slab->next) { - uintptr_t slab_start = (uintptr_t)slab->base; - uintptr_t slab_end = slab_start + TINY_SLAB_SIZE; - if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) { - pthread_mutex_unlock(lock); - return slab; - } - } - // Search full slabs - for (TinySlab* slab = g_tiny_pool.full_slabs[class_idx]; slab; slab = slab->next) { - uintptr_t slab_start = (uintptr_t)slab->base; - uintptr_t slab_end = slab_start + TINY_SLAB_SIZE; - if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) { - pthread_mutex_unlock(lock); - return slab; - } - } - pthread_mutex_unlock(lock); - } - return NULL; // Not found - } -} // Function: move_to_full_list() - 20 lines (lines 1104-1123) // Move slab to full list diff --git a/core/hakmem_tiny_legacy_slow_box.inc b/core/hakmem_tiny_legacy_slow_box.inc new file mode 100644 index 00000000..8bf6893b --- /dev/null +++ b/core/hakmem_tiny_legacy_slow_box.inc @@ -0,0 +1,96 @@ +static __attribute__((cold, noinline, unused)) void* tiny_slow_alloc_fast(int class_idx) { + int tls_enabled = g_tls_list_enable; + TinyTLSList* tls = &g_tls_lists[class_idx]; + pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; + pthread_mutex_lock(lock); + + TinySlab* slab = g_tiny_pool.free_slabs[class_idx]; + if (slab) { + g_tiny_pool.free_slabs[class_idx] = slab->next; + } else { + slab = allocate_new_slab(class_idx); + if (!slab) { + pthread_mutex_unlock(lock); + return NULL; + } + } + slab->next = NULL; + + if (atomic_load_explicit(&slab->remote_head, memory_order_acquire)) { + tiny_remote_drain_locked(slab); + } + + int block_idx = hak_tiny_find_free_block(slab); + if (block_idx < 0) { + slab->next = g_tiny_pool.free_slabs[class_idx]; + g_tiny_pool.free_slabs[class_idx] = slab; + pthread_mutex_unlock(lock); + return NULL; + } + + hak_tiny_set_used(slab, block_idx); + slab->free_count--; + size_t block_size = g_tiny_class_sizes[class_idx]; + uint8_t* base = (uint8_t*)slab->base; + void* ret = (void*)(base + ((size_t)block_idx * block_size)); + g_tiny_pool.alloc_count[class_idx]++; + + uint16_t cap = g_fast_cap_defaults[class_idx]; + uint16_t count = g_fast_count[class_idx]; + uint16_t fast_need = (cap > count) ? 
(uint16_t)(cap - count) : 0; + if (fast_need > slab->free_count) fast_need = (uint16_t)slab->free_count; + + uint32_t tls_need = 0; + if (tls_enabled && tls_list_needs_refill(tls)) { + uint32_t target = tls_list_refill_threshold(tls); + if (tls->count < target) { + tls_need = target - tls->count; + } + } + uint32_t remaining = slab->free_count; + if (fast_need > remaining) fast_need = (uint16_t)remaining; + remaining -= fast_need; + if (tls_need > remaining) tls_need = remaining; + + while (fast_need > 0) { + int extra_idx = hak_tiny_find_free_block(slab); + if (extra_idx < 0) break; + hak_tiny_set_used(slab, extra_idx); + slab->free_count--; + void* extra = (void*)(base + ((size_t)extra_idx * block_size)); + int pushed = 0; + if (__builtin_expect(g_fastcache_enable && class_idx <= 3, 1)) { + pushed = fastcache_push(class_idx, extra); + } else { + pushed = tiny_fast_push(class_idx, extra); + } + if (!pushed) { + if (tls_enabled) { + tiny_tls_list_guard_push(class_idx, tls, extra); + tls_list_push(tls, extra, class_idx); + } + } + fast_need--; + } + + while (tls_enabled && tls_need > 0) { + int extra_idx = hak_tiny_find_free_block(slab); + if (extra_idx < 0) break; + hak_tiny_set_used(slab, extra_idx); + slab->free_count--; + void* extra = (void*)(base + ((size_t)extra_idx * block_size)); + tiny_tls_list_guard_push(class_idx, tls, extra); + tls_list_push(tls, extra, class_idx); + tls_need--; + } + + if (slab->free_count == 0) { + move_to_full_list(class_idx, slab); + } else { + slab->next = g_tiny_pool.free_slabs[class_idx]; + g_tiny_pool.free_slabs[class_idx] = slab; + } + + pthread_mutex_unlock(lock); + return ret; +} diff --git a/core/hakmem_tiny_slab_lookup_box.inc b/core/hakmem_tiny_slab_lookup_box.inc new file mode 100644 index 00000000..2a5684a1 --- /dev/null +++ b/core/hakmem_tiny_slab_lookup_box.inc @@ -0,0 +1,77 @@ +static TinySlab* registry_lookup(uintptr_t slab_base) { + // Lock-free read with atomic owner access (MT-safe) + int hash = registry_hash(slab_base); + + // Linear probing search + for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) { + int idx = (hash + i) & SLAB_REGISTRY_MASK; + SlabRegistryEntry* entry = &g_slab_registry[idx]; + + if (entry->slab_base == slab_base) { + // Atomic load to prevent TOCTOU race with registry_unregister() + TinySlab* owner = atomic_load_explicit(&entry->owner, memory_order_acquire); + if (!owner) return NULL; // Entry cleared by unregister + return owner; + } + + if (entry->slab_base == 0) { + return NULL; // Empty slot - not found + } + } + return NULL; // Not found after max probes +} + +// ============================================================================ +// EXTRACTED TO hakmem_tiny_slab_mgmt.inc (Phase 2D-4 FINAL) +// ============================================================================ +// Function: allocate_new_slab() - 79 lines (lines 952-1030) +// Allocate new slab for a class + +// Function: release_slab() - 23 lines (lines 1033-1055) +// Release a slab back to system + +// Step 2: Find slab owner by pointer (O(1) via hash table registry, or O(N) fallback) +TinySlab* hak_tiny_owner_slab(void* ptr) { + if (!ptr || !g_tiny_initialized) return NULL; + + // Phase 6.14: Runtime toggle between Registry (O(1)) and List (O(N)) + if (g_use_registry) { + // O(1) lookup via hash table + uintptr_t slab_base = (uintptr_t)ptr & ~(TINY_SLAB_SIZE - 1); + TinySlab* slab = registry_lookup(slab_base); + if (!slab) return NULL; + // SAFETY: validate membership (ptr must be inside [base, base+64KB)) + uintptr_t start = 
(uintptr_t)slab->base; + uintptr_t end = start + TINY_SLAB_SIZE; + if ((uintptr_t)ptr < start || (uintptr_t)ptr >= end) { + return NULL; // false positive from registry → treat as non-Tiny + } + return slab; + } else { + // O(N) fallback: linear search through all slab lists (lock per class) + for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) { + pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; + pthread_mutex_lock(lock); + // Search free slabs + for (TinySlab* slab = g_tiny_pool.free_slabs[class_idx]; slab; slab = slab->next) { + uintptr_t slab_start = (uintptr_t)slab->base; + uintptr_t slab_end = slab_start + TINY_SLAB_SIZE; + if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) { + pthread_mutex_unlock(lock); + return slab; + } + } + // Search full slabs + for (TinySlab* slab = g_tiny_pool.full_slabs[class_idx]; slab; slab = slab->next) { + uintptr_t slab_start = (uintptr_t)slab->base; + uintptr_t slab_end = slab_start + TINY_SLAB_SIZE; + if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) { + pthread_mutex_unlock(lock); + return slab; + } + } + pthread_mutex_unlock(lock); + } + return NULL; // Not found + } +} diff --git a/core/hakmem_tiny_tls_state_box.inc b/core/hakmem_tiny_tls_state_box.inc new file mode 100644 index 00000000..9dabc8d5 --- /dev/null +++ b/core/hakmem_tiny_tls_state_box.inc @@ -0,0 +1,224 @@ +// Hot-path cheap sampling counter to avoid rand() in allocation path +// Phase 9.4: TLS single-linked freelist (mimalloc-inspired) for hottest classes (≤128B/≤256B) +int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable +int g_tiny_hotpath_class5 = 0; // HAKMEM_TINY_HOTPATH_CLASS5=1 to enable class 5 hotpath +// Phase 6-1.7: Export TLS variables for box refactor (Box 5/6 need access from hakmem.c) +// CRITICAL FIX: Explicit initializers prevent SEGV from uninitialized TLS in worker threads +// PRIORITY 3: TLS Canaries - Add canaries around TLS arrays to detect buffer overruns +#define TLS_CANARY_MAGIC 0xDEADBEEFDEADBEEFULL +// Phase 3d-B: Unified TLS SLL (head+count in same cache line for +12-18% cache hit rate) +__thread uint64_t g_tls_canary_before_sll = TLS_CANARY_MAGIC; +__thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES] __attribute__((aligned(64))) = {0}; +__thread uint64_t g_tls_canary_after_sll = TLS_CANARY_MAGIC; +static int g_tiny_ultra = 0; // HAKMEM_TINY_ULTRA=1 for SLL-only ultra mode +static int g_ultra_validate = 0; // HAKMEM_TINY_ULTRA_VALIDATE=1 to enable per-pop validation +// Ultra debug counters +#if HAKMEM_DEBUG_COUNTERS +static __attribute__((unused)) uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0}; +static uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0}; +#endif + +// Path counters (normal mode visibility): lightweight, for debugging/bench only +#if HAKMEM_DEBUG_COUNTERS +static __attribute__((unused)) uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0}; +// New: slow/bitmap/bump/bin instrumentation +static __attribute__((unused)) uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0}; +static 
__attribute__((unused)) uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0};
+static __attribute__((unused)) uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0};
+static __attribute__((unused)) uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0};
+static __attribute__((unused)) uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0};
+static __attribute__((unused)) uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0};
+static __attribute__((unused)) uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0};
+#endif
+static int g_path_debug_enabled = 0;
+
+// Spill hysteresis (eliminates getenv from the free hot path)
+static int g_spill_hyst = 32; // default margin (configured at init; never getenv on hot path)
+
+// Optional per-class refill batch overrides (0 = use global defaults)
+static int g_refill_max_c[TINY_NUM_CLASSES] = {0};
+static int g_refill_max_hot_c[TINY_NUM_CLASSES] = {0};
+static inline __attribute__((always_inline)) int tiny_refill_max_for_class(int class_idx) {
+    int v = g_refill_max_c[class_idx];
+    if (v > 0) return v;
+    if (class_idx <= 3) {
+        int hv = g_refill_max_hot_c[class_idx];
+        if (hv > 0) return hv;
+        return g_tiny_refill_max_hot;
+    }
+    return g_tiny_refill_max;
+}
+
+// Phase 9.5: Frontend/Backend split - Tiny Front modules (QuickSlot / FastCache)
+#include "front/quick_slot.h"
+#include "front/fast_cache.h"
+__thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
+static int g_frontend_enable = 0; // HAKMEM_TINY_FRONTEND=1 (experimental ultra-fast frontend)
+// SLL capacity multiplier for hot tiny classes (env: HAKMEM_SLL_MULTIPLIER)
+int g_sll_multiplier = 2;
+// Cached thread id (uint32) to avoid repeated pthread_self() in hot paths
+static __thread uint32_t g_tls_tid32;
+static __thread int g_tls_tid32_inited;
+// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
+#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
+inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
+#else
+static inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
+#endif
+    if (__builtin_expect(!g_tls_tid32_inited, 0)) {
+        g_tls_tid32 = (uint32_t)(uintptr_t)pthread_self();
+        g_tls_tid32_inited = 1;
+    }
+    return g_tls_tid32;
+}
+// Cached pthread_t as-is for APIs that require pthread_t comparison
+static __thread pthread_t g_tls_pt_self;
+static __thread int g_tls_pt_inited;
+
+// Frontend FastCache hit/miss counters (Small diagnostics)
+unsigned long long g_front_fc_hit[TINY_NUM_CLASSES] = {0};
+unsigned long long g_front_fc_miss[TINY_NUM_CLASSES] = {0};
+// TLS SLL class mask: bit i = 1 allows SLL for class i. Default: all 8 classes enabled.
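+// Illustrative example (an assumption for documentation, not a shipped
+// configuration): a mask of 0x0F would confine the TLS SLL to the four
+// hottest classes (bits 0-3) and route classes 4-7 through the
+// magazine/slab paths instead.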
+int g_tls_sll_class_mask = 0xFF;
+// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
+#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
+inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
+#else
+static inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
+#endif
+    if (__builtin_expect(!g_tls_pt_inited, 0)) {
+        g_tls_pt_self = pthread_self();
+        g_tls_pt_inited = 1;
+    }
+    return g_tls_pt_self;
+}
+
+#include "tiny_refill.h"
+// tiny_mmap_gate.h already included at top
+#include "tiny_publish.h"
+
+int g_sll_cap_override[TINY_NUM_CLASSES] = {0}; // LEGACY (not referenced since Phase 12; kept as a compatibility dummy)
+// Optional prefetch on SLL pop (guarded by env: HAKMEM_TINY_PREFETCH=1)
+static int g_tiny_prefetch = 0;
+
+// Small-class magazine pre-initialization (to avoid cap==0 checks on hot path)
+
+
+
+// Hot-class small TLS magazine (storage and switches)
+typedef struct {
+    void* slots[128];
+    uint16_t top; // 0..128
+    uint16_t cap; // =128
+} TinyHotMag;
+static int g_hotmag_cap_default = 128;   // default capacity (env override)
+static int g_hotmag_refill_default = 32; // default refill batch (env override)
+static int g_hotmag_enable = 0; // default OFF (for A/B testing); can be enabled via env
+static uint16_t g_hotmag_cap_current[TINY_NUM_CLASSES];
+static uint8_t g_hotmag_cap_locked[TINY_NUM_CLASSES];
+static uint16_t g_hotmag_refill_current[TINY_NUM_CLASSES];
+static uint8_t g_hotmag_refill_locked[TINY_NUM_CLASSES];
+static uint8_t g_hotmag_class_en[TINY_NUM_CLASSES]; // 0=disabled for class, 1=enabled
+static __thread TinyHotMag g_tls_hot_mag[TINY_NUM_CLASSES];
+// Inline helpers
+
+#include "box/tls_sll_box.h" // Box TLS-SLL: Safe SLL operations API (needed by hotmag)
+#include "hakmem_tiny_hotmag.inc.h"
+
+// Size-specialized tiny alloc (32B/64B) via function pointers (for A/B testing)
+// TinyQuickSlot: 1 cache line per class (quick 6 items + small metadata)
+// Opt-in via HAKMEM_TINY_QUICK=1
+// NOTE: This type definition must come BEFORE the Phase 2D-1 includes below
+int g_quick_enable = 0; // HAKMEM_TINY_QUICK=1
+__thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES]; // compile-out via guards below
+
+// Phase 2D-1: Hot-path inline function extractions (Front)
+// NOTE: TinyFastCache/TinyQuickSlot are already defined in front/
+#include "hakmem_tiny_hot_pop.inc.h" // 4 functions: tiny_hot_pop_class{0..3}
+#include "hakmem_tiny_refill.inc.h" // 8 functions: refill operations
+#if HAKMEM_TINY_P0_BATCH_REFILL
+#include "hakmem_tiny_refill_p0.inc.h" // P0 batch refill: refills FastCache directly
+#endif
+
+// Phase 7 Task 3: Pre-warm TLS cache at init
+// Pre-allocate blocks to reduce first-allocation miss penalty
+#if HAKMEM_TINY_PREWARM_TLS
+void hak_tiny_prewarm_tls_cache(void) {
+    // Pre-warm each class with HAKMEM_TINY_PREWARM_COUNT blocks
+    // This reduces the first-allocation miss penalty by populating TLS cache
+    // Phase E1-CORRECT: ALL classes (including C7) now use TLS SLL
+    for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) {
+        int count = HAKMEM_TINY_PREWARM_COUNT; // Default: 16 blocks per class
+
+        // Trigger refill to populate TLS cache
+        // P0 Fix: Use appropriate refill function based on P0 status
+#if HAKMEM_TINY_P0_BATCH_REFILL
+        sll_refill_batch_from_ss(class_idx, count);
+#else
+        sll_refill_small_from_ss(class_idx, count);
+#endif
+    }
+}
+#endif
+
+// Ultra-Simple front (small per-class stack): combines the tiny fronts to minimize
+// instructions and memory touches on alloc/free. Uses the existing TLS bump shadow
+// (g_tls_bcur/bend) when enabled to avoid per-alloc header writes.
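+//
+// A minimal sketch of the pop discipline this front aims for (illustration
+// only, under the assumption of the TinyUltraFront layout defined below; the
+// real helpers live in hakmem_tiny_ultra_front.inc.h):
+//
+//     static inline void* ultra_pop(int class_idx) {
+//         TinyUltraFront* f = &g_tls_ultra[class_idx];
+//         return f->top ? f->slots[--f->top] : NULL; // LIFO, TLS-only, no locks
+//     }
+//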
+// UltraFront capacity for 32/64B fast pop +#ifndef ULTRA_FRONT_CAP +#define ULTRA_FRONT_CAP 64 +#endif +typedef struct __attribute__((aligned(64))) { + void* slots[ULTRA_FRONT_CAP]; + uint16_t top; // 0..ULTRA_FRONT_CAP + uint16_t _pad; +} TinyUltraFront; +static int g_ultra_simple = 0; // HAKMEM_TINY_ULTRA_SIMPLE=1 +static __thread TinyUltraFront g_tls_ultra[TINY_NUM_CLASSES]; +// Inline helpers +#include "hakmem_tiny_ultra_front.inc.h" + +// Ultra-Bump TLS shadow (bench/opt-in): keep a TLS-only bump window +// to avoid per-alloc header writes. Header is updated per-chunk reservation. +// NOTE: Non-static because used in hakmem_tiny_refill.inc.h +int g_bump_chunk = 32; // HAKMEM_TINY_BUMP_CHUNK (blocks) +__thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES] = {0}; +__thread uint8_t* g_tls_bend[TINY_NUM_CLASSES] = {0}; + +// SLL small refill batch for specialized class (32/64B) +// Specialized order toggle: 1 = mag-first, 0 = sll-first +// HotMag helpers (for classes 0..3) +static inline int is_hot_class(int class_idx) { return class_idx >= 0 && class_idx <= 3; } + +// Optional front (Ultra/HotMag) push helper: compile-out in release builds +static inline int tiny_optional_push(int class_idx, void* ptr) { +#if HAKMEM_BUILD_RELEASE + (void)class_idx; + (void)ptr; + return 0; +#else + if (__builtin_expect(g_ultra_simple && is_hot_class(class_idx), 0)) { + if (__builtin_expect(ultra_push(class_idx, ptr), 0)) { + return 1; + } + } + if (__builtin_expect(is_hot_class(class_idx), 0)) { + if (__builtin_expect(hotmag_push(class_idx, ptr), 0)) { + return 1; + } + } + return 0; +#endif +} + +// Ultra-Simple helpers + +// Phase 9.6: Deferred Intelligence (event queue + background) +// Extended event for FLINT Intelligence (lightweight; recorded off hot path only) +// Observability, ACE, and intelligence helpers
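+
+// Usage sketch for tiny_optional_push() above (hypothetical caller shown for
+// clarity; the actual free path lives in hakmem_tiny.c). In debug/bench builds
+// the optional fronts (Ultra/HotMag) are tried first; in release builds the
+// call compiles down to 0, so the fallback always runs:
+//
+//     if (!tiny_optional_push(class_idx, ptr)) {
+//         tiny_fast_push(class_idx, ptr); // assumed fallback into the TLS front
+//     }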