diff --git a/core/box/external_guard_box.h b/core/box/external_guard_box.h index e613797c..e85f47eb 100644 --- a/core/box/external_guard_box.h +++ b/core/box/external_guard_box.h @@ -25,6 +25,7 @@ #include #include #include "front_gate_v2.h" // Phase 15: For fg_classification_t types +#include "ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary // ENV control: mincore enable/disable static inline int external_guard_mincore_enabled(void) { @@ -108,7 +109,7 @@ static inline int external_guard_try_free(void* ptr) { fprintf(stderr, "[ExternalGuard] hak_super_lookup(ptr) = %p\n", (void*)ss); if (ss) { fprintf(stderr, "[ExternalGuard] HAKMEM SS FOUND! ptr=%p ss=%p magic=0x%x class=%d\n", - ptr, (void*)ss, ss->magic, ss->slabs ? ss->slabs[0].class_idx : -1); + ptr, (void*)ss, ss->magic, ss->slabs ? ss_slab_meta_class_idx_get(ss, 0) : -1); } // Debug: Check FrontGate classification (types defined in front_gate_v2.h) diff --git a/core/box/ss_allocation_box.c b/core/box/ss_allocation_box.c new file mode 100644 index 00000000..484a9410 --- /dev/null +++ b/core/box/ss_allocation_box.c @@ -0,0 +1,414 @@ +// Box: Core Allocation +// Purpose: SuperSlab allocation/deallocation and slab initialization + +#include "ss_allocation_box.h" +#include "ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary +#include "ss_os_acquire_box.h" +#include "ss_cache_box.h" +#include "ss_stats_box.h" +#include "ss_ace_box.h" +#include "ss_slab_management_box.h" +#include "hakmem_super_registry.h" +#include "hakmem_tiny_config.h" +#include "hakmem_policy.h" // Phase E3-1: Access FrozenPolicy for never-free policy +#include "tiny_region_id.h" +#include "box/tiny_next_ptr_box.h" +#include +#include +#include +#include +#include + +// Global statistics (defined in ss_stats_box.c, declared here for access) +extern pthread_mutex_t g_superslab_lock; +extern uint64_t g_superslabs_freed; +extern uint64_t g_bytes_allocated; + +// g_ss_force_lg is defined in ss_ace_box.c but needs external linkage +extern int g_ss_force_lg; + +// g_ss_populate_once controls MAP_POPULATE flag +static _Atomic int g_ss_populate_once = 0; + +// ============================================================================ +// Remote Drain Helper +// ============================================================================ + +// Drain remote MPSC stack into freelist (ownership already verified by caller) +void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMeta* meta) +{ + if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss) || !meta) return; + + // Atomically take the whole remote list + uintptr_t head = atomic_exchange_explicit(&ss->remote_heads[slab_idx], 0, + memory_order_acq_rel); + if (head == 0) return; + + // Convert remote stack (offset 0 next) into freelist encoding via Box API + // and splice in front of current freelist preserving relative order. 
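+    // Example: for a remote stack H -> B -> C (H = most recently pushed) and an
+    // existing freelist F, the walk below writes H.next = F, B.next = H,
+    // C.next = B and finishes with meta->freelist = C, giving C -> B -> H -> F:
+    // the LIFO remote stack ends up spliced in front of F in original free order.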
+ void* prev = meta->freelist; + int cls = (int)meta->class_idx; + uintptr_t cur = head; + while (cur != 0) { + uintptr_t next = *(uintptr_t*)cur; // remote-next stored at offset 0 + // Restore header for header-classes (class 1-6) which were clobbered by remote push +#if HAKMEM_TINY_HEADER_CLASSIDX + if (cls != 0 && cls != 7) { + uint8_t expected = (uint8_t)(HEADER_MAGIC | (cls & HEADER_CLASS_MASK)); + *(uint8_t*)(uintptr_t)cur = expected; + } +#endif + // Rewrite next pointer to Box representation for this class + tiny_next_write(cls, (void*)cur, prev); + prev = (void*)cur; + cur = next; + } + meta->freelist = prev; + // Reset remote count after full drain + atomic_store_explicit(&ss->remote_counts[slab_idx], 0, memory_order_release); + + // Update freelist/nonempty visibility bits + uint32_t bit = (1u << slab_idx); + atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); + atomic_fetch_or_explicit(&ss->nonempty_mask, bit, memory_order_release); +} + +// ============================================================================ +// SuperSlab Allocation (ACE-Aware) +// ============================================================================ + +SuperSlab* superslab_allocate(uint8_t size_class) { + // Optional fault injection for testing: HAKMEM_TINY_SS_FAULT_RATE=N → 1/N で失敗 + static int fault_rate = -1; // -1=unparsed, 0=disabled, >0=rate + static __thread unsigned long fault_tick = 0; + if (__builtin_expect(fault_rate == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_SS_FAULT_RATE"); + if (e && *e) { + int v = atoi(e); if (v < 0) v = 0; fault_rate = v; + } else { + fault_rate = 0; + } + } + if (fault_rate > 0) { + unsigned long t = ++fault_tick; + if ((t % (unsigned long)fault_rate) == 0ul) { + return NULL; // simulate OOM + } + } + // Optional env clamp for SuperSlab size + static int env_parsed = 0; + // Allow full ACE range [MIN..MAX] by default so 1MB/2MB の二択学習が有効になる。 + static uint8_t g_ss_min_lg_env = SUPERSLAB_LG_MIN; + static uint8_t g_ss_max_lg_env = SUPERSLAB_LG_MAX; + if (!env_parsed) { + char* maxmb = getenv("HAKMEM_TINY_SS_MAX_MB"); + if (maxmb) { + int m = atoi(maxmb); if (m == 1) g_ss_max_lg_env = 20; else if (m == 2) g_ss_max_lg_env = 21; + } + char* minmb = getenv("HAKMEM_TINY_SS_MIN_MB"); + if (minmb) { + int m = atoi(minmb); if (m == 1) g_ss_min_lg_env = 20; else if (m == 2) g_ss_min_lg_env = 21; + } + if (g_ss_min_lg_env > g_ss_max_lg_env) g_ss_min_lg_env = g_ss_max_lg_env; + const char* force_lg_env = getenv("HAKMEM_TINY_SS_FORCE_LG"); + if (force_lg_env && *force_lg_env) { + int v = atoi(force_lg_env); + if (v >= SUPERSLAB_LG_MIN && v <= SUPERSLAB_LG_MAX) { + g_ss_force_lg = v; + g_ss_min_lg_env = g_ss_max_lg_env = v; + } + } + size_t precharge_default = 0; + const char* precharge_env = getenv("HAKMEM_TINY_SS_PRECHARGE"); + if (precharge_env && *precharge_env) { + long v = atol(precharge_env); + if (v < 0) v = 0; + precharge_default = (size_t)v; + if (v > 0) { + atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); + } + } + size_t cache_default = 0; + const char* cache_env = getenv("HAKMEM_TINY_SS_CACHE"); + if (cache_env && *cache_env) { + long v = atol(cache_env); + if (v < 0) v = 0; + cache_default = (size_t)v; + } + // Initialize cache/precharge via direct manipulation (box API doesn't need init function) + for (int i = 0; i < 8; i++) { + extern size_t g_ss_cache_cap[8]; + extern size_t g_ss_precharge_target[8]; + g_ss_cache_cap[i] = cache_default; + g_ss_precharge_target[i] = precharge_default; + } + + for (int i = 0; i < 
8; i++) { + char name[64]; + snprintf(name, sizeof(name), "HAKMEM_TINY_SS_CACHE_C%d", i); + char* cap_env = getenv(name); + if (cap_env && *cap_env) { + long v = atol(cap_env); + if (v < 0) v = 0; + tiny_ss_cache_set_class_cap(i, (size_t)v); + } + snprintf(name, sizeof(name), "HAKMEM_TINY_SS_PRECHARGE_C%d", i); + char* pre_env = getenv(name); + if (pre_env && *pre_env) { + long v = atol(pre_env); + if (v < 0) v = 0; + tiny_ss_precharge_set_class_target(i, (size_t)v); + } + } + const char* populate_env = getenv("HAKMEM_TINY_SS_POPULATE_ONCE"); + if (populate_env && atoi(populate_env) != 0) { + atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); + } + env_parsed = 1; + } + + uint8_t lg = (g_ss_force_lg >= 0) ? (uint8_t)g_ss_force_lg : hak_tiny_superslab_next_lg(size_class); + if (lg < g_ss_min_lg_env) lg = g_ss_min_lg_env; + if (lg > g_ss_max_lg_env) lg = g_ss_max_lg_env; + size_t ss_size = (size_t)1 << lg; // 2^20 = 1MB, 2^21 = 2MB + uintptr_t ss_mask = ss_size - 1; + int from_cache = 0; + void* ptr = NULL; + + // Debug logging flag (lazy init) + static __thread int dbg = -1; + if (__builtin_expect(dbg == -1, 0)) { + const char* e = getenv("HAKMEM_SS_PREWARM_DEBUG"); + dbg = (e && *e && *e != '0') ? 1 : 0; + } + + // Phase 9: Try LRU cache first (lazy deallocation) + SuperSlab* cached_ss = hak_ss_lru_pop(size_class); + if (cached_ss) { + ptr = (void*)cached_ss; + from_cache = 1; + // Debug logging for REFILL from LRU + if (dbg == 1) { + fprintf(stderr, "[REFILL] class=%d from_lru=1 ss=%p\n", + size_class, (void*)cached_ss); + } + // Skip old cache path - LRU cache takes priority + } else { + // Fallback to old cache (will be deprecated) + ss_cache_precharge(size_class, ss_size, ss_mask); + void* old_cached = ss_cache_pop(size_class); + if (old_cached) { + ptr = old_cached; + from_cache = 1; + // Debug logging for REFILL from prewarm (old cache is essentially prewarm) + if (dbg == 1) { + fprintf(stderr, "[REFILL] class=%d from_prewarm=1 ss=%p\n", + size_class, ptr); + } + } + } + + if (!ptr) { + int populate = atomic_exchange_explicit(&g_ss_populate_once, 0, memory_order_acq_rel); + ptr = ss_os_acquire(size_class, ss_size, ss_mask, populate); + if (!ptr) { + return NULL; + } + // Debug logging for REFILL with new allocation + if (dbg == 1) { + fprintf(stderr, "[REFILL] class=%d new_alloc=1 ss=%p\n", + size_class, (void*)ptr); + } + } + + // Initialize SuperSlab header (Phase 12: no global size_class field) + SuperSlab* ss = (SuperSlab*)ptr; + ss->magic = SUPERSLAB_MAGIC; + ss->active_slabs = 0; + ss->lg_size = lg; // Phase 8.3: Use ACE-determined lg_size (20=1MB, 21=2MB) + ss->slab_bitmap = 0; + ss->nonempty_mask = 0; // Phase 6-2.1: ChatGPT Pro P0 - init nonempty mask + ss->partial_epoch = 0; + ss->publish_hint = 0xFF; + + // Initialize atomics explicitly + atomic_store_explicit(&ss->total_active_blocks, 0, memory_order_relaxed); + atomic_store_explicit(&ss->refcount, 0, memory_order_relaxed); + atomic_store_explicit(&ss->listed, 0, memory_order_relaxed); + ss->partial_next = NULL; + + // Phase 9: Initialize LRU fields + ss->last_used_ns = 0; + ss->generation = 0; + ss->lru_prev = NULL; + ss->lru_next = NULL; + + // Initialize all slab metadata (only up to max slabs for this size) + int max_slabs = (int)(ss_size / SLAB_SIZE); + + // DEFENSIVE FIX: Zero all slab metadata arrays to prevent ANY uninitialized pointers + // This catches the 0xa2a2a2a2a2a2a2a2 pattern bug (ASan/debug fill pattern) + // Even though mmap should return zeroed pages, sanitizers may fill with debug 
patterns + memset(ss->slabs, 0, max_slabs * sizeof(TinySlabMeta)); + memset(ss->remote_heads, 0, max_slabs * sizeof(uintptr_t)); + memset(ss->remote_counts, 0, max_slabs * sizeof(uint32_t)); + memset(ss->slab_listed, 0, max_slabs * sizeof(uint32_t)); + + for (int i = 0; i < max_slabs; i++) { + ss_slab_meta_freelist_set(ss, i, NULL); // Explicit NULL (redundant after memset, but clear intent) + ss_slab_meta_used_set(ss, i, 0); + ss_slab_meta_capacity_set(ss, i, 0); + ss_slab_meta_owner_tid_low_set(ss, i, 0); + + // Initialize remote queue atomics (memset already zeroed, but use proper atomic init) + atomic_store_explicit(&ss->remote_heads[i], 0, memory_order_relaxed); + atomic_store_explicit(&ss->remote_counts[i], 0, memory_order_relaxed); + atomic_store_explicit(&ss->slab_listed[i], 0, memory_order_relaxed); + } + + if (from_cache) { + ss_stats_cache_reuse(); + } + + // Phase 8.3: Update ACE current_lg to match allocated size + g_ss_ace[size_class].current_lg = lg; + + // Phase 1: Register SuperSlab in global registry for fast lookup + // CRITICAL: Register AFTER full initialization (ss structure is ready) + uintptr_t base = (uintptr_t)ss; + if (!hak_super_register(base, ss)) { + // Registry full - this is a fatal error + fprintf(stderr, "HAKMEM FATAL: SuperSlab registry full, cannot register %p\n", ss); + // Still return ss to avoid memory leak, but lookups may fail + } + + return ss; +} + +// ============================================================================ +// SuperSlab Deallocation +// ============================================================================ + +void superslab_free(SuperSlab* ss) { + if (!ss || ss->magic != SUPERSLAB_MAGIC) { + return; // Invalid SuperSlab + } + + // ADD DEBUG LOGGING + static __thread int dbg = -1; + if (__builtin_expect(dbg == -1, 0)) { + const char* e = getenv("HAKMEM_SS_FREE_DEBUG"); + dbg = (e && *e && *e != '0') ? 
1 : 0; + } + if (dbg == 1) { + fprintf(stderr, "[SS_FREE] CALLED: ss=%p lg_size=%d active_slabs=%u\n", + (void*)ss, ss->lg_size, ss->active_slabs); + } + + // Phase 9: Lazy Deallocation - try to cache in LRU instead of munmap + size_t ss_size = (size_t)1 << ss->lg_size; + + // Phase 1: Unregister SuperSlab from registry FIRST + // CRITICAL: Must unregister BEFORE adding to LRU cache + // Reason: Cached SuperSlabs should NOT be found by lookups + uintptr_t base = (uintptr_t)ss; + hak_super_unregister(base); + + // Memory fence to ensure unregister is visible + atomic_thread_fence(memory_order_release); + + // Phase 9: Try LRU cache first (lazy deallocation) + // NOTE: LRU cache keeps magic=SUPERSLAB_MAGIC for validation + // Magic will be cleared on eviction or reuse + int lru_cached = hak_ss_lru_push(ss); + if (dbg == 1) { + fprintf(stderr, "[SS_FREE] hak_ss_lru_push() returned %d\n", lru_cached); + } + if (lru_cached) { + // Successfully cached in LRU - defer munmap + return; + } + + // LRU cache full or disabled - try old cache using head class_idx (if known) + int old_cached = ss_cache_push(0, ss); + if (old_cached) { + ss_stats_cache_store(); + return; + } + + // Phase E3-1: Check never-free policy before munmap + // If policy forbids Tiny SuperSlab munmap, skip deallocation (leak is intentional) + const FrozenPolicy* pol = hkm_policy_get(); + if (pol && pol->tiny_ss_never_free_global) { + // Policy forbids munmap - keep SuperSlab allocated (intentional "leak") + // Watermark enforcement will be added in Phase E3-2 +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, "[SS_POLICY_SKIP] Skipping munmap (never_free policy) ss=%p size=%zu\n", + (void*)ss, ss_size); +#endif + return; + } + + // Both caches full - immediately free to OS (eager deallocation) + // Clear magic to prevent use-after-free + ss->magic = 0; + +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, "[DEBUG ss_os_release] Freeing SuperSlab ss=%p size=%zu active=%u (LRU full)\n", + (void*)ss, ss_size, + atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed)); +#endif + + munmap(ss, ss_size); + + // Update statistics for actual release to OS + pthread_mutex_lock(&g_superslab_lock); + g_superslabs_freed++; + // Phase 12: we no longer track per-SS size_class on header; skip g_ss_freed_by_class here + g_bytes_allocated -= ss_size; + pthread_mutex_unlock(&g_superslab_lock); + +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, "[DEBUG ss_os_release] g_superslabs_freed now = %llu\n", + (unsigned long long)g_superslabs_freed); +#endif +} + +// ============================================================================ +// Slab Initialization within SuperSlab +// ============================================================================ + +void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_t owner_tid) +{ + if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { + return; + } + + // Phase E1-CORRECT unified geometry: + // - block_size is the TOTAL stride for this class (g_tiny_class_sizes[cls]) + // - usable bytes are determined by slab index (slab0 vs others) + // - capacity = usable / stride for ALL classes (including former C7) + size_t usable_size = (slab_idx == 0) + ? 
SUPERSLAB_SLAB0_USABLE_SIZE + : SUPERSLAB_SLAB_USABLE_SIZE; + size_t stride = block_size; + uint16_t capacity = (uint16_t)(usable_size / stride); + +#if !HAKMEM_BUILD_RELEASE + if (slab_idx == 0) { + fprintf(stderr, + "[SUPERSLAB_INIT] slab 0: usable_size=%zu stride=%zu capacity=%u\n", + usable_size, stride, (unsigned)capacity); + } +#endif + + TinySlabMeta* meta = &ss->slabs[slab_idx]; + meta->freelist = NULL; // NULL = linear allocation mode + meta->used = 0; + meta->capacity = capacity; + meta->carved = 0; + meta->owner_tid_low = (uint8_t)(owner_tid & 0xFFu); + // meta->class_idx is set by the caller (shared_pool / refill path) + + superslab_activate_slab(ss, slab_idx); +} diff --git a/core/box/ss_slab_meta_box.h b/core/box/ss_slab_meta_box.h new file mode 100644 index 00000000..8463d426 --- /dev/null +++ b/core/box/ss_slab_meta_box.h @@ -0,0 +1,107 @@ +#ifndef SS_SLAB_META_BOX_H +#define SS_SLAB_META_BOX_H + +// ============================================================================ +// Box: SlabMeta Access Layer (Phase 3d-A) +// ============================================================================ +// Purpose: Encapsulate SuperSlab metadata field access +// Boundary: SuperSlab internal layout (slabs[] array) +// Benefits: +// - Single point of change for future layout optimizations +// - Enables Hot/Cold split without touching call sites +// - Supports A/B testing via compile-time flags +// +// Design: Thin inline wrappers (zero overhead, unchanged behavior) +// ============================================================================ + +#include "../superslab/superslab_types.h" + +// ---------------------------------------------------------------------------- +// HOT field accessors (frequent access on alloc/free paths) +// ---------------------------------------------------------------------------- + +// Get freelist pointer (HOT field) +static inline void* ss_slab_meta_freelist_get(SuperSlab* ss, int slab_idx) { + return ss->slabs[slab_idx].freelist; +} + +// Set freelist pointer (HOT field) +static inline void ss_slab_meta_freelist_set(SuperSlab* ss, int slab_idx, void* ptr) { + ss->slabs[slab_idx].freelist = ptr; +} + +// Get used count (HOT field) +static inline uint16_t ss_slab_meta_used_get(SuperSlab* ss, int slab_idx) { + return ss->slabs[slab_idx].used; +} + +// Set used count (HOT field) +static inline void ss_slab_meta_used_set(SuperSlab* ss, int slab_idx, uint16_t val) { + ss->slabs[slab_idx].used = val; +} + +// Increment used count (HOT field, common operation) +static inline void ss_slab_meta_used_inc(SuperSlab* ss, int slab_idx) { + ss->slabs[slab_idx].used++; +} + +// Decrement used count (HOT field, common operation) +static inline void ss_slab_meta_used_dec(SuperSlab* ss, int slab_idx) { + ss->slabs[slab_idx].used--; +} + +// Get capacity (HOT field) +static inline uint16_t ss_slab_meta_capacity_get(SuperSlab* ss, int slab_idx) { + return ss->slabs[slab_idx].capacity; +} + +// Set capacity (HOT field, set once at init) +static inline void ss_slab_meta_capacity_set(SuperSlab* ss, int slab_idx, uint16_t val) { + ss->slabs[slab_idx].capacity = val; +} + +// ---------------------------------------------------------------------------- +// COLD field accessors (rare access: init, debug, stats) +// ---------------------------------------------------------------------------- + +// Get class_idx (COLD field) +static inline uint8_t ss_slab_meta_class_idx_get(SuperSlab* ss, int slab_idx) { + return ss->slabs[slab_idx].class_idx; +} + +// Set class_idx (COLD 
field, set once at init) +static inline void ss_slab_meta_class_idx_set(SuperSlab* ss, int slab_idx, uint8_t val) { + ss->slabs[slab_idx].class_idx = val; +} + +// Get carved (COLD field) +static inline uint8_t ss_slab_meta_carved_get(SuperSlab* ss, int slab_idx) { + return ss->slabs[slab_idx].carved; +} + +// Set carved (COLD field) +static inline void ss_slab_meta_carved_set(SuperSlab* ss, int slab_idx, uint8_t val) { + ss->slabs[slab_idx].carved = val; +} + +// Get owner_tid_low (COLD field, debug only) +static inline uint8_t ss_slab_meta_owner_tid_low_get(SuperSlab* ss, int slab_idx) { + return ss->slabs[slab_idx].owner_tid_low; +} + +// Set owner_tid_low (COLD field, debug only) +static inline void ss_slab_meta_owner_tid_low_set(SuperSlab* ss, int slab_idx, uint8_t val) { + ss->slabs[slab_idx].owner_tid_low = val; +} + +// ---------------------------------------------------------------------------- +// Legacy direct pointer access (for gradual migration) +// ---------------------------------------------------------------------------- + +// Get pointer to TinySlabMeta (for code that needs direct struct access) +// TODO Phase 3d-B: Migrate all users to field-specific accessors above +static inline TinySlabMeta* ss_slab_meta_ptr(SuperSlab* ss, int slab_idx) { + return &ss->slabs[slab_idx]; +} + +#endif // SS_SLAB_META_BOX_H diff --git a/core/hakmem_shared_pool.c b/core/hakmem_shared_pool.c index fb4684ff..bda1522e 100644 --- a/core/hakmem_shared_pool.c +++ b/core/hakmem_shared_pool.c @@ -1,7 +1,10 @@ #include "hakmem_shared_pool.h" #include "hakmem_tiny_superslab.h" #include "hakmem_tiny_superslab_constants.h" +#include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary #include "box/pagefault_telemetry_box.h" // Box PageFaultTelemetry (PF_BUCKET_SS_META) +#include "box/tls_sll_drain_box.h" // Box TLS SLL Drain (tiny_tls_sll_drain) +#include "hakmem_policy.h" // FrozenPolicy (learning layer) #include #include @@ -48,6 +51,142 @@ static void __attribute__((destructor)) lock_stats_report(void) { fprintf(stderr, "release_slab(): %lu (%.1f%%)\n", release_path, 100.0 * release_path / (acquires ? acquires : 1)); fprintf(stderr, "===================================\n"); + fflush(stderr); +} + +// ============================================================================ +// SP Acquire Stage Statistics (Stage1/2/3 breakdown) +// ============================================================================ +static _Atomic uint64_t g_sp_stage1_hits[TINY_NUM_CLASSES_SS]; +static _Atomic uint64_t g_sp_stage2_hits[TINY_NUM_CLASSES_SS]; +static _Atomic uint64_t g_sp_stage3_hits[TINY_NUM_CLASSES_SS]; +// Data collection gate (0=off, 1=on). 学習層からも有効化される。 +static int g_sp_stage_stats_enabled = 0; +// Logging gate for destructor(ENV: HAKMEM_SHARED_POOL_STAGE_STATS) +static int g_sp_stage_stats_log_enabled = -1; // -1=uninitialized, 0=off, 1=on + +static inline void sp_stage_stats_init(void) { + if (__builtin_expect(g_sp_stage_stats_log_enabled == -1, 0)) { + const char* env = getenv("HAKMEM_SHARED_POOL_STAGE_STATS"); + g_sp_stage_stats_log_enabled = (env && *env && *env != '0') ? 
1 : 0; + if (g_sp_stage_stats_log_enabled == 1) { + // ログが有効なら計測も必ず有効化する。 + g_sp_stage_stats_enabled = 1; + } + } +} + +static void __attribute__((destructor)) sp_stage_stats_report(void) { + if (g_sp_stage_stats_log_enabled != 1) { + return; + } + + fprintf(stderr, "\n=== SHARED POOL STAGE STATISTICS ===\n"); + fprintf(stderr, "Per-class acquire_slab() stage hits (Stage1=EMPTY, Stage2=UNUSED, Stage3=new SS)\n"); + + for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) { + uint64_t s1 = atomic_load(&g_sp_stage1_hits[cls]); + uint64_t s2 = atomic_load(&g_sp_stage2_hits[cls]); + uint64_t s3 = atomic_load(&g_sp_stage3_hits[cls]); + uint64_t total = s1 + s2 + s3; + if (total == 0) continue; // Skip unused classes + + double p1 = 100.0 * (double)s1 / (double)total; + double p2 = 100.0 * (double)s2 / (double)total; + double p3 = 100.0 * (double)s3 / (double)total; + + fprintf(stderr, + "Class %d: total=%llu S1=%llu (%.1f%%) S2=%llu (%.1f%%) S3=%llu (%.1f%%)\n", + cls, + (unsigned long long)total, + (unsigned long long)s1, p1, + (unsigned long long)s2, p2, + (unsigned long long)s3, p3); + } + fprintf(stderr, "====================================\n"); + fflush(stderr); + } + +// Snapshot Tiny-related backend metrics for learner / observability. +void +shared_pool_tiny_metrics_snapshot(uint64_t stage1[TINY_NUM_CLASSES_SS], + uint64_t stage2[TINY_NUM_CLASSES_SS], + uint64_t stage3[TINY_NUM_CLASSES_SS], + uint32_t active_slots[TINY_NUM_CLASSES_SS]) +{ + // Ensure env-based logging設定の初期化だけ先に済ませる。 + sp_stage_stats_init(); + // 学習層から呼ばれた場合は、計測自体は常に有効化する(ログは env で制御)。 + g_sp_stage_stats_enabled = 1; + + for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) { + if (stage1) { + stage1[cls] = atomic_load_explicit(&g_sp_stage1_hits[cls], + memory_order_relaxed); + } + if (stage2) { + stage2[cls] = atomic_load_explicit(&g_sp_stage2_hits[cls], + memory_order_relaxed); + } + if (stage3) { + stage3[cls] = atomic_load_explicit(&g_sp_stage3_hits[cls], + memory_order_relaxed); + } + if (active_slots) { + active_slots[cls] = g_shared_pool.class_active_slots[cls]; + } + } +} + +// Helper: return per-class active slot limit from FrozenPolicy.tiny_cap[] +// Semantics: +// - tiny_cap[class] == 0 → no limit (unbounded) +// - otherwise: soft cap on ACTIVE slots managed by shared pool for this class. +static inline uint32_t sp_class_active_limit(int class_idx) { + const FrozenPolicy* pol = hkm_policy_get(); + if (!pol) { + return 0; // no limit + } + if (class_idx < 0 || class_idx >= 8) { + return 0; + } + return (uint32_t)pol->tiny_cap[class_idx]; +} + +// ============================================================================ +// Superslab L0 Cache (per-thread, per-class hot slot) +// ============================================================================ +// +// Goal: +// - Avoid calling shared_pool_acquire_slab()'s full 3-stage logic on every +// allocation when the same (ss, slab_idx) still has room. +// - Keep Box boundaries: slot ownership/state is still managed by SP-SLOT, +// L0 では「既に ACTIVE な slot を再利用するだけ」(UNUSED/EMPTY には触れない)。 +// +// Design: +// - Per-thread TLS for each tiny class (0..TINY_NUM_CLASSES_SS-1): +// - SharedSSMeta* meta +// - uint8_t slot_idx +// - Stage 0 in shared_pool_acquire_slab(): +// - If L0 entry exists and meta->ss is non-NULL and +// ss->slabs[slot_idx] is still bound to this class, +// return (ss, slot_idx) directly without touching locks or lists. 
+// - If SuperSlab has been freed (meta->ss == NULL) or slot reused,
+//   the L0 entry is dropped and we fall back to the normal Stage 1-3 path.
+//
+// Env:
+//   - HAKMEM_SS_L0=0 → disable L0
+//   - HAKMEM_SS_L0=1 → enable L0 (default)
+
+static __thread SharedSSMeta* g_sp_l0_meta[TINY_NUM_CLASSES_SS];
+static __thread uint8_t g_sp_l0_slot[TINY_NUM_CLASSES_SS];
+
+// NOTE: L0 is still experimental, so it is kept disabled in the current build.
+//       Re-enabling it in a future stable release requires redoing the
+//       implementation and its validation.
+static inline int sp_l0_enabled(void) {
+    (void)g_sp_l0_meta;
+    (void)g_sp_l0_slot;
+    return 0; // Disabled for now
 }
 
 // ============================================================================
@@ -58,12 +197,35 @@ static void __attribute__((destructor)) lock_stats_report(void) {
 FreeSlotNode g_free_node_pool[TINY_NUM_CLASSES_SS][MAX_FREE_NODES_PER_CLASS];
 _Atomic uint32_t g_node_alloc_index[TINY_NUM_CLASSES_SS] = {0};
 
+// Recycle list for FreeSlotNode (per class, lock-free LIFO).
+// node_alloc() first tries to reuse a node from this list and only carves a
+// new node from the pool when the list is empty.
+static _Atomic(FreeSlotNode*) g_node_free_head[TINY_NUM_CLASSES_SS] = {
+    [0 ... TINY_NUM_CLASSES_SS-1] = ATOMIC_VAR_INIT(NULL)
+};
+
 // Allocate a node from pool (lock-free fast path, may fall back to legacy path)
 static inline FreeSlotNode* node_alloc(int class_idx) {
     if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
         return NULL;
     }
 
+    // First, try to pop from recycle list (nodes returned by pop_lockfree).
+    FreeSlotNode* free_head = atomic_load_explicit(
+        &g_node_free_head[class_idx],
+        memory_order_acquire);
+    while (free_head != NULL) {
+        FreeSlotNode* next = free_head->next;
+        if (atomic_compare_exchange_weak_explicit(
+                &g_node_free_head[class_idx],
+                &free_head,
+                next,
+                memory_order_acq_rel,
+                memory_order_acquire)) {
+            return free_head; // Recycled node
+        }
+        // CAS failed: free_head is updated; retry with new head.
+    }
+
     uint32_t idx = atomic_fetch_add(&g_node_alloc_index[class_idx], 1);
     if (idx >= MAX_FREE_NODES_PER_CLASS) {
         // Pool exhausted - should be rare. Caller must fall back to legacy
@@ -445,9 +607,19 @@ static int sp_freelist_pop_lockfree(int class_idx, SharedSSMeta** out_meta, int*
     *out_meta = old_head->meta;
     *out_slot_idx = old_head->slot_idx;
 
-    // NOTE: We do NOT free the node back to pool (no node recycling yet)
-    // This is acceptable because MAX_FREE_NODES_PER_CLASS (512) is generous
-    // and workloads typically don't push/pop the same slot repeatedly
+    // Recycle node back into per-class free list so that long-running workloads
+    // do not permanently consume new nodes on every EMPTY event.
+    FreeSlotNode* free_head = atomic_load_explicit(
+        &g_node_free_head[class_idx],
+        memory_order_acquire);
+    do {
+        old_head->next = free_head;
+    } while (!atomic_compare_exchange_weak_explicit(
+        &g_node_free_head[class_idx],
+        &free_head,
+        old_head,
+        memory_order_release,
+        memory_order_acquire));
 
     return 1; // Success
 }
@@ -491,7 +663,7 @@ shared_pool_allocate_superslab_unlocked(void)
     // For shared-pool semantics we normalize all slab class_idx to UNASSIGNED.
     int max_slabs = ss_slabs_capacity(ss);
     for (int i = 0; i < max_slabs; i++) {
-        ss->slabs[i].class_idx = 255; // UNASSIGNED
+        ss_slab_meta_class_idx_set(ss, i, 255); // UNASSIGNED
     }
 
     if (g_shared_pool.total_count >= g_shared_pool.capacity) {
@@ -556,13 +728,48 @@ shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out)
 
     shared_pool_init();
 
-    // Debug logging
+    // Debug logging / stage stats
     static int dbg_acquire = -1;
     if (__builtin_expect(dbg_acquire == -1, 0)) {
         const char* e = getenv("HAKMEM_SS_ACQUIRE_DEBUG");
         dbg_acquire = (e && *e && *e != '0') ? 1 : 0;
     }
+    sp_stage_stats_init();
 
+    // ========== Stage 0: Per-thread hot slot (L0) reuse ==========
+    //
+    // Only when the slot is already ACTIVE, its class_idx matches, and it still
+    // has spare capacity do we return (ss, slab_idx) as-is. Slot state
+    // transitions and locks are never touched here.
+    if (sp_l0_enabled()) {
+        SharedSSMeta* meta = g_sp_l0_meta[class_idx];
+        int l0_idx = (int)g_sp_l0_slot[class_idx];
+        if (meta && l0_idx >= 0) {
+            SuperSlab* ss = atomic_load_explicit(&meta->ss, memory_order_acquire);
+            if (ss && l0_idx < ss_slabs_capacity(ss)) {
+                TinySlabMeta* slab_meta = &ss->slabs[l0_idx];
+                if (slab_meta->class_idx == (uint8_t)class_idx &&
+                    slab_meta->capacity > 0 &&
+                    slab_meta->used < slab_meta->capacity) {
+                    if (dbg_acquire == 1) {
+                        fprintf(stderr,
+                                "[SP_ACQUIRE_STAGE0_L0] class=%d reuse hot slot (ss=%p slab=%d used=%u cap=%u)\n",
+                                class_idx,
+                                (void*)ss,
+                                l0_idx,
+                                (unsigned)slab_meta->used,
+                                (unsigned)slab_meta->capacity);
+                    }
+                    *ss_out = ss;
+                    *slab_idx_out = l0_idx;
+                    return 0;
+                }
+            }
+            // Hot slot is no longer valid - clear it and take the normal path.
+            g_sp_l0_meta[class_idx] = NULL;
+        }
+    }
+
+stage1_retry_after_tension_drain:
     // ========== Stage 1 (Lock-Free): Try to reuse EMPTY slots ==========
     // P0-4: Lock-free pop from per-class free list (no mutex needed!)
     // Best case: Same class freed a slot, reuse immediately (cache-hot)
@@ -606,17 +813,27 @@ shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out)
 
         // Update SuperSlab metadata
         ss->slab_bitmap |= (1u << reuse_slot_idx);
-        ss->slabs[reuse_slot_idx].class_idx = (uint8_t)class_idx;
+        ss_slab_meta_class_idx_set(ss, reuse_slot_idx, (uint8_t)class_idx);
 
         if (ss->active_slabs == 0) {
             // Was empty, now active again
             ss->active_slabs = 1;
             g_shared_pool.active_count++;
         }
+        // Track per-class active slots (approximate, under alloc_lock)
+        if (class_idx < TINY_NUM_CLASSES_SS) {
+            g_shared_pool.class_active_slots[class_idx]++;
+        }
 
         // Update hint
         g_shared_pool.class_hints[class_idx] = ss;
 
+        // Update per-thread hot slot (L0)
+        if (sp_l0_enabled()) {
+            g_sp_l0_meta[class_idx] = reuse_meta;
+            g_sp_l0_slot[class_idx] = (uint8_t)reuse_slot_idx;
+        }
+
         *ss_out = ss;
         *slab_idx_out = reuse_slot_idx;
 
@@ -624,6 +841,9 @@ shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out)
             atomic_fetch_add(&g_lock_release_count, 1);
         }
         pthread_mutex_unlock(&g_shared_pool.alloc_lock);
+        if (g_sp_stage_stats_enabled) {
+            atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
+        }
         return 0; // ✅ Stage 1 (lock-free) success
     }
 
@@ -674,16 +894,25 @@ stage2_fallback:
 
             // Update SuperSlab metadata under mutex
             ss->slab_bitmap |= (1u << claimed_idx);
-            ss->slabs[claimed_idx].class_idx = (uint8_t)class_idx;
+            ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);
 
             if (ss->active_slabs == 0) {
                 ss->active_slabs = 1;
                 g_shared_pool.active_count++;
            }
+            if (class_idx < TINY_NUM_CLASSES_SS) {
+                g_shared_pool.class_active_slots[class_idx]++;
+            }
 
             // Update hint
             g_shared_pool.class_hints[class_idx] = ss;
 
+            // Update per-thread hot slot (L0)
+            if (sp_l0_enabled()) {
+                g_sp_l0_meta[class_idx] = meta;
+                g_sp_l0_slot[class_idx] = (uint8_t)claimed_idx;
+            }
+
             *ss_out = ss;
             *slab_idx_out = claimed_idx;
 
@@ -691,12 +920,55 @@ stage2_fallback:
                 atomic_fetch_add(&g_lock_release_count, 1);
             }
             pthread_mutex_unlock(&g_shared_pool.alloc_lock);
+            if (g_sp_stage_stats_enabled) {
+                atomic_fetch_add(&g_sp_stage2_hits[class_idx], 1);
+            }
             return 0; // ✅ Stage 2 (lock-free) success
        }
 
        // Claim failed (no UNUSED slots in this meta) - continue to next SuperSlab
    }
 
+    // ========== Tension-Based Drain: Try to create EMPTY slots before Stage 3 ==========
+    // If TLS SLL has accumulated blocks, drain them to enable EMPTY slot detection
+    // This can avoid allocating new SuperSlabs by reusing EMPTY slots in Stage 1
+    // ENV: HAKMEM_TINY_TENSION_DRAIN_ENABLE=0 to disable (default=1)
+    // ENV: HAKMEM_TINY_TENSION_DRAIN_THRESHOLD=N to set threshold (default=1024)
+    {
+        static int tension_drain_enabled = -1;
+        static uint32_t tension_threshold = 1024;
+
+        if (tension_drain_enabled < 0) {
+            const char* env = getenv("HAKMEM_TINY_TENSION_DRAIN_ENABLE");
+            tension_drain_enabled = (env == NULL || atoi(env) != 0) ? 1 : 0;
+
+            const char* thresh_env = getenv("HAKMEM_TINY_TENSION_DRAIN_THRESHOLD");
+            if (thresh_env) {
+                tension_threshold = (uint32_t)atoi(thresh_env);
+                if (tension_threshold < 64) tension_threshold = 64;
+                if (tension_threshold > 65536) tension_threshold = 65536;
+            }
+        }
+
+        if (tension_drain_enabled) {
+            extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
+            extern uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size);
+
+            uint32_t sll_count = (class_idx < TINY_NUM_CLASSES) ? g_tls_sll_count[class_idx] : 0;
+
+            if (sll_count >= tension_threshold) {
+                // Drain all blocks to maximize EMPTY slot creation
+                uint32_t drained = tiny_tls_sll_drain(class_idx, 0); // 0 = drain all
+
+                if (drained > 0) {
+                    // Retry Stage 1 (EMPTY reuse) after drain
+                    // Some slabs might have become EMPTY (meta->used == 0)
+                    goto stage1_retry_after_tension_drain;
+                }
+            }
+        }
+    }
+
     // ========== Stage 3: Mutex-protected fallback (new SuperSlab allocation) ==========
     // All existing SuperSlabs have no UNUSED slots → need new SuperSlab
     // P0 instrumentation: count lock acquisitions
@@ -736,6 +1008,21 @@ stage2_fallback:
         return -1; // ❌ Out of memory
     }
 
+    // Before creating a new SuperSlab, consult learning-layer soft cap.
+    // If current active slots for this class already exceed the policy cap,
+    // fail early so caller can fall back to legacy backend.
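+    // Example: tiny_cap[cls] == 0 means "no limit" and skips this check
+    // entirely; with, say, tiny_cap[2] == 64, once 64 class-2 slots are active
+    // this acquire returns -1 instead of allocating another SuperSlab, and the
+    // caller falls back to the legacy backend.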
+ uint32_t limit = sp_class_active_limit(class_idx); + if (limit > 0) { + uint32_t cur = g_shared_pool.class_active_slots[class_idx]; + if (cur >= limit) { + if (g_lock_stats_enabled == 1) { + atomic_fetch_add(&g_lock_release_count, 1); + } + pthread_mutex_unlock(&g_shared_pool.alloc_lock); + return -1; // Soft cap reached for this class + } + } + // Create metadata for this new SuperSlab SharedSSMeta* new_meta = sp_meta_find_or_create(new_ss); if (!new_meta) { @@ -758,13 +1045,22 @@ stage2_fallback: // Update SuperSlab metadata new_ss->slab_bitmap |= (1u << first_slot); - new_ss->slabs[first_slot].class_idx = (uint8_t)class_idx; + ss_slab_meta_class_idx_set(new_ss, first_slot, (uint8_t)class_idx); new_ss->active_slabs = 1; g_shared_pool.active_count++; + if (class_idx < TINY_NUM_CLASSES_SS) { + g_shared_pool.class_active_slots[class_idx]++; + } // Update hint g_shared_pool.class_hints[class_idx] = new_ss; + // Update per-thread hot slot (L0) + if (sp_l0_enabled()) { + g_sp_l0_meta[class_idx] = new_meta; + g_sp_l0_slot[class_idx] = (uint8_t)first_slot; + } + *ss_out = new_ss; *slab_idx_out = first_slot; @@ -772,6 +1068,9 @@ stage2_fallback: atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); + if (g_sp_stage_stats_enabled) { + atomic_fetch_add(&g_sp_stage3_hits[class_idx], 1); + } return 0; // ✅ Stage 3 success } @@ -869,6 +1168,10 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx) g_shared_pool.active_count--; } } + if (class_idx < TINY_NUM_CLASSES_SS && + g_shared_pool.class_active_slots[class_idx] > 0) { + g_shared_pool.class_active_slots[class_idx]--; + } } // P0-4: Push to lock-free per-class free list (enables reuse by same class) diff --git a/core/hakmem_tiny.c b/core/hakmem_tiny.c index 31f66f8e..54cf598f 100644 --- a/core/hakmem_tiny.c +++ b/core/hakmem_tiny.c @@ -1,7 +1,8 @@ #include "hakmem_tiny.h" #include "hakmem_tiny_config.h" // Centralized configuration #include "hakmem_phase7_config.h" // Phase 7: Task 3 constants (PREWARM_COUNT, etc.) 
-#include "hakmem_tiny_superslab.h" // Phase 6.22: SuperSlab allocator +#include "hakmem_tiny_superslab.h" +#include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary // Phase 6.22: SuperSlab allocator #include "hakmem_super_registry.h" // Phase 8.2: SuperSlab registry for memory profiling #include "hakmem_internal.h" #include "hakmem_syscall.h" // Phase 6.X P0 Fix: Box 3 syscall layer (bypasses LD_PRELOAD) @@ -29,6 +30,11 @@ #include "hakmem_prof.h" #include "hakmem_trace.h" // Optional USDT (perf) tracepoints +// Phase E5: Ultra fast path (8-instruction alloc/free) +#if HAKMEM_ULTRA_FAST_PATH +#include "tiny_ultra_fast.inc.h" +#endif + extern uint64_t g_bytes_allocated; // from hakmem_tiny_superslab.c // ============================================================================ @@ -111,12 +117,6 @@ int g_tiny_safe_free = 0; // Default OFF for performance; env: HAKMEM_SA int g_tiny_safe_free_strict = 0; // env: HAKMEM_SAFE_FREE_STRICT=1 int g_tiny_force_remote = 0; // env: HAKMEM_TINY_FORCE_REMOTE=1 -// Hot-class optimization: enable dedicated class5 (256B) TLS fast path -// Env: HAKMEM_TINY_HOTPATH_CLASS5=1/0 (default: 0 for stability; enable explicitly to A/B) -int g_tiny_hotpath_class5 = 0; - -// (moved) tiny_class5_stats_dump is defined later, after TLS vars - // Build-time gate: Minimal Tiny front (bench-only) static inline int superslab_trace_enabled(void) { @@ -501,7 +501,7 @@ static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { uintptr_t delta = (uintptr_t)base_ptr - base; if (blk == 0 || (delta % blk) != 0) { tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned"); - } else if (delta / blk >= ss->slabs[slab_idx].capacity) { + } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) { tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity"); } } @@ -544,7 +544,8 @@ static _Atomic uint32_t g_ss_partial_epoch = 0; // Phase 6.24: Unified TLS slab cache (Medium fix) // Reduces TLS reads from 3 to 1 (cache-line aligned for performance) -__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; +// Phase E4: 64B alignment for L1 cache optimization +__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64))); static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES]; static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES]; static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES]; @@ -879,12 +880,14 @@ void ss_partial_publish(int class_idx, SuperSlab* ss) { // The publishing thread must stop using this SS after publishing. 
int cap_pub = ss_slabs_capacity(ss); for (int s = 0; s < cap_pub; s++) { - uint8_t prev = __atomic_exchange_n(&ss->slabs[s].owner_tid_low, 0u, __ATOMIC_RELEASE); + // TODO Phase 3d-B: Add atomic accessor when implementing Hot/Cold split + TinySlabMeta* meta = ss_slab_meta_ptr(ss, s); + uint8_t prev = __atomic_exchange_n(&meta->owner_tid_low, 0u, __ATOMIC_RELEASE); if (__builtin_expect(g_debug_remote_guard && prev != 0u, 0)) { uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev; tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE, - (uint16_t)ss->slabs[s].class_idx, - &ss->slabs[s], + (uint16_t)ss_slab_meta_class_idx_get(ss, s), + meta, aux); } } @@ -1168,17 +1171,17 @@ int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable #define TLS_CANARY_MAGIC 0xDEADBEEFDEADBEEFULL __thread uint64_t g_tls_canary_before_sll_head = TLS_CANARY_MAGIC; #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR -__thread void* g_tls_sll_head[TINY_NUM_CLASSES] = {0}; +__thread void* g_tls_sll_head[TINY_NUM_CLASSES] __attribute__((aligned(64))) = {0}; #else -static __thread void* g_tls_sll_head[TINY_NUM_CLASSES] = {0}; +static __thread void* g_tls_sll_head[TINY_NUM_CLASSES] __attribute__((aligned(64))) = {0}; #endif __thread uint64_t g_tls_canary_after_sll_head = TLS_CANARY_MAGIC; __thread uint64_t g_tls_canary_before_sll_count = TLS_CANARY_MAGIC; #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR -__thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0}; +__thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] __attribute__((aligned(64))) = {0}; #else -static __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0}; +static __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] __attribute__((aligned(64))) = {0}; #endif __thread uint64_t g_tls_canary_after_sll_count = TLS_CANARY_MAGIC; static int g_tiny_ultra = 0; // HAKMEM_TINY_ULTRA=1 for SLL-only ultra mode @@ -1309,14 +1312,6 @@ static __thread TinyHotMag g_tls_hot_mag[TINY_NUM_CLASSES]; int g_quick_enable = 0; // HAKMEM_TINY_QUICK=1 __thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES]; // compile-out via guards below -// Phase 13: Tiny Heap v2 - Forward declarations -// NOTE: TLS storage declarations moved to after tiny_heap_v2.h include (Line ~1770) -// Reason: tiny_heap_v2.h must be included AFTER tiny_alloc_fast.inc.h -static inline int tiny_heap_v2_enabled(void); -static inline int tiny_heap_v2_class_enabled(int class_idx); -static inline int tiny_heap_v2_refill_mag(int class_idx); -static inline void* tiny_heap_v2_alloc(size_t size); - // Phase 2D-1: Hot-path inline function extractions(Front) // NOTE: TinyFastCache/TinyQuickSlot は front/ で定義済み #include "hakmem_tiny_hot_pop.inc.h" // 4 functions: tiny_hot_pop_class{0..3} @@ -1324,7 +1319,6 @@ static inline void* tiny_heap_v2_alloc(size_t size); #if HAKMEM_TINY_P0_BATCH_REFILL #include "hakmem_tiny_refill_p0.inc.h" // P0 batch refill → FastCache 直補充 #endif -#include "refill/ss_refill_fc.h" // NEW: Direct SS→FC refill // Phase 7 Task 3: Pre-warm TLS cache at init // Pre-allocate blocks to reduce first-allocation miss penalty @@ -1790,7 +1784,7 @@ TinySlab* hak_tiny_owner_slab(void* ptr) { #error "Cannot enable both PHASE6_METADATA and PHASE6_ULTRA_SIMPLE" #endif -// Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check + // Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check #if HAKMEM_TINY_PHASE6_BOX_REFACTOR #if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) #error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options" @@ -1802,17 +1796,6 @@ TinySlab* 
hak_tiny_owner_slab(void* ptr) { // Box 5: Allocation Fast Path (Layer 1 - 3-4 instructions) #include "tiny_alloc_fast.inc.h" - // Phase 13: Tiny Heap v2 front (must come AFTER tiny_alloc_fast.inc.h) - #include "front/tiny_heap_v2.h" - - // Phase 13: Tiny Heap v2 - TLS storage (types defined in tiny_heap_v2.h above) - __thread TinyHeapV2Mag g_tiny_heap_v2_mag[TINY_NUM_CLASSES]; - __thread TinyHeapV2Stats g_tiny_heap_v2_stats[TINY_NUM_CLASSES]; - - // Phase 14: TinyUltraHot - Ultra-fast C1/C2 path (L1 dcache miss reduction) - #include "front/tiny_ultra_hot.h" - __thread TinyUltraHot g_ultra_hot; - // Box 6: Free Fast Path (Layer 2 - 2-3 instructions) #include "tiny_free_fast.inc.h" @@ -1826,6 +1809,14 @@ TinySlab* hak_tiny_owner_slab(void* ptr) { // Export wrapper functions for hakmem.c to call // Phase 6-1.7 Optimization: Remove diagnostic overhead, rely on LTO for inlining void* hak_tiny_alloc_fast_wrapper(size_t size) { + // Phase E5: Ultra fast path (8-instruction alloc, bypasses all layers) + // Enable with: HAKMEM_ULTRA_FAST_PATH=1 (compile-time) + #if HAKMEM_ULTRA_FAST_PATH + void* ret = tiny_alloc_fast_ultra(size); + if (ret) return ret; + // Miss → fallback to full fast path + #endif + // Bench-only ultra-short path: bypass diagnostics and pointer tracking // Enable with: HAKMEM_BENCH_FAST_FRONT=1 static int g_bench_fast_front = -1; @@ -1873,6 +1864,12 @@ TinySlab* hak_tiny_owner_slab(void* ptr) { } void hak_tiny_free_fast_wrapper(void* ptr) { + // Phase E5: Ultra fast path (6-8 instruction free) + #if HAKMEM_ULTRA_FAST_PATH + tiny_free_fast_ultra(ptr); + return; + #endif + static _Atomic uint64_t free_call_count = 0; uint64_t call_num = atomic_fetch_add(&free_call_count, 1); if (call_num > 14135 && call_num < 14145) { @@ -2042,19 +2039,6 @@ int tiny_fc_push_bulk(int class_idx, void** arr, int n) { return take; } -// Minimal class5 TLS stats dump (release-safe, one-shot) -// Env: HAKMEM_TINY_CLASS5_STATS_DUMP=1 to enable -static void tiny_class5_stats_dump(void) __attribute__((destructor)); -static void tiny_class5_stats_dump(void) { - const char* e = getenv("HAKMEM_TINY_CLASS5_STATS_DUMP"); - if (!(e && *e && e[0] != '0')) return; - TinyTLSList* tls5 = &g_tls_lists[5]; - fprintf(stderr, "\n=== Class5 TLS (release-min) ===\n"); - fprintf(stderr, "hotpath=%d cap=%u refill_low=%u spill_high=%u count=%u\n", - g_tiny_hotpath_class5, tls5->cap, tls5->refill_low, tls5->spill_high, tls5->count); - fprintf(stderr, "===============================\n"); -} - // ========= Tiny Guard (targeted debug; low overhead when disabled) ========= static int g_tiny_guard_enabled = -1; static int g_tiny_guard_class = 2; @@ -2105,93 +2089,3 @@ void tiny_guard_on_invalid(void* user_ptr, uint8_t hdr) { } -// Phase 13-A: Tiny Heap v2 statistics wrapper (for external linkage) -void tiny_heap_v2_print_stats(void) { - // Implemented in front/tiny_heap_v2.h as static inline - // This wrapper is needed for external linkage from bench programs - extern __thread TinyHeapV2Stats g_tiny_heap_v2_stats[TINY_NUM_CLASSES]; - - static int g_stats_enable = -1; - if (g_stats_enable == -1) { - const char* e = getenv("HAKMEM_TINY_HEAP_V2_STATS"); - g_stats_enable = (e && *e && *e != '0') ? 1 : 0; - } - if (!g_stats_enable) return; - - fprintf(stderr, "\n=== TinyHeapV2 Statistics (en=%d) ===\n", g_stats_enable); - int any_allocs = 0; - for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) { - TinyHeapV2Stats* s = &g_tiny_heap_v2_stats[cls]; - if (s->alloc_calls == 0) continue; - - double hit_rate = (s->alloc_calls > 0) ? 
(100.0 * s->mag_hits / s->alloc_calls) : 0.0; - double avg_refill = (s->refill_calls > 0) ? ((double)s->refill_blocks / s->refill_calls) : 0.0; - - fprintf(stderr, "[C%d] alloc=%lu mag_hits=%lu (%.1f%%) refill=%lu avg_blocks=%.1f oom=%lu\n", - cls, s->alloc_calls, s->mag_hits, hit_rate, - s->refill_calls, avg_refill, s->backend_oom); - any_allocs = 1; - } - if (!any_allocs) fprintf(stderr, "(No HeapV2 allocs recorded)\n"); - fprintf(stderr, "==============================\n\n"); -} - -// Phase 14 + Phase 14-B: UltraHot statistics (C2-C5) -void ultra_hot_print_stats(void) { - extern __thread TinyUltraHot g_ultra_hot; - - static int g_stats_enable = -1; - if (g_stats_enable == -1) { - const char* e = getenv("HAKMEM_TINY_ULTRA_HOT_STATS"); - g_stats_enable = (e && *e && *e != '0') ? 1 : 0; - } - if (!g_stats_enable) return; - - fprintf(stderr, "\n=== TinyUltraHot Statistics (Phase 14 + 14-B) ===\n"); - - // C1 (16B) stats - Phase 14 - uint64_t c1_total = g_ultra_hot.c1_alloc_calls; - if (c1_total > 0) { - double c1_hit_rate = 100.0 * g_ultra_hot.c1_hits / c1_total; - fprintf(stderr, "[C2-16B] alloc=%lu hits=%lu (%.1f%%) misses=%lu\n", - c1_total, g_ultra_hot.c1_hits, c1_hit_rate, g_ultra_hot.c1_misses); - fprintf(stderr, " free=%lu free_hits=%lu\n", - g_ultra_hot.c1_free_calls, g_ultra_hot.c1_free_hits); - } - - // C2 (32B) stats - Phase 14 - uint64_t c2_total = g_ultra_hot.c2_alloc_calls; - if (c2_total > 0) { - double c2_hit_rate = 100.0 * g_ultra_hot.c2_hits / c2_total; - fprintf(stderr, "[C3-32B] alloc=%lu hits=%lu (%.1f%%) misses=%lu\n", - c2_total, g_ultra_hot.c2_hits, c2_hit_rate, g_ultra_hot.c2_misses); - fprintf(stderr, " free=%lu free_hits=%lu\n", - g_ultra_hot.c2_free_calls, g_ultra_hot.c2_free_hits); - } - - // C4 (64B) stats - Phase 14-B NEW - uint64_t c4_total = g_ultra_hot.c4_alloc_calls; - if (c4_total > 0) { - double c4_hit_rate = 100.0 * g_ultra_hot.c4_hits / c4_total; - fprintf(stderr, "[C4-64B] alloc=%lu hits=%lu (%.1f%%) misses=%lu (NEW Phase 14-B)\n", - c4_total, g_ultra_hot.c4_hits, c4_hit_rate, g_ultra_hot.c4_misses); - fprintf(stderr, " free=%lu free_hits=%lu\n", - g_ultra_hot.c4_free_calls, g_ultra_hot.c4_free_hits); - } - - // C5 (128B) stats - Phase 14-B NEW - uint64_t c5_total = g_ultra_hot.c5_alloc_calls; - if (c5_total > 0) { - double c5_hit_rate = 100.0 * g_ultra_hot.c5_hits / c5_total; - fprintf(stderr, "[C5-128B] alloc=%lu hits=%lu (%.1f%%) misses=%lu (NEW Phase 14-B)\n", - c5_total, g_ultra_hot.c5_hits, c5_hit_rate, g_ultra_hot.c5_misses); - fprintf(stderr, " free=%lu free_hits=%lu\n", - g_ultra_hot.c5_free_calls, g_ultra_hot.c5_free_hits); - } - - if (c1_total == 0 && c2_total == 0 && c4_total == 0 && c5_total == 0) { - fprintf(stderr, "(No UltraHot allocs recorded)\n"); - } - fprintf(stderr, "==================================================\n\n"); -} - diff --git a/core/hakmem_tiny_lifecycle.inc b/core/hakmem_tiny_lifecycle.inc index 1ead2346..b43be675 100644 --- a/core/hakmem_tiny_lifecycle.inc +++ b/core/hakmem_tiny_lifecycle.inc @@ -11,6 +11,7 @@ // // Cold/maintenance path - not performance critical. #include "tiny_tls_guard.h" +#include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary // Phase 12: Helper to derive a representative class index for a SuperSlab // from per-slab metadata (all slabs are empty when used in trim). 
@@ -18,7 +19,7 @@ static inline int superslab_any_class_idx(SuperSlab* ss) { if (!ss) return -1; int cap = ss_slabs_capacity(ss); for (int s = 0; s < cap; s++) { - uint8_t cls = ss->slabs[s].class_idx; + uint8_t cls = ss_slab_meta_class_idx_get(ss, s); if (cls < TINY_NUM_CLASSES) return (int)cls; } return -1; diff --git a/core/tiny_free_fast_v2.inc.h b/core/tiny_free_fast_v2.inc.h index fbfc2fc1..27671008 100644 --- a/core/tiny_free_fast_v2.inc.h +++ b/core/tiny_free_fast_v2.inc.h @@ -23,12 +23,10 @@ #include "box/tls_sll_box.h" // Box TLS-SLL API #include "box/tls_sll_drain_box.h" // Box TLS-SLL Drain (Option B) #include "hakmem_tiny_integrity.h" // PRIORITY 1-4: Corruption detection -#include "front/tiny_heap_v2.h" // Phase 13-B: TinyHeapV2 magazine supply -#include "front/tiny_ultra_hot.h" // Phase 14: TinyUltraHot C1/C2 ultra-fast path -#include "front/tiny_ring_cache.h" // Phase 21-1: Ring cache (C2/C3 array-based TLS cache) -#include "front/tiny_unified_cache.h" // Phase 23: Unified frontend cache (tcache-style, all classes) +// Ring Cache and Unified Cache removed (A/B test: OFF is faster) #include "hakmem_super_registry.h" // For hak_super_lookup (cross-thread check) #include "superslab/superslab_inline.h" // For slab_index_for (cross-thread check) +#include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary #include "box/free_remote_box.h" // For tiny_free_remote_box (cross-thread routing) // Phase 7: Header-based ultra-fast free @@ -127,7 +125,9 @@ static inline int hak_tiny_free_fast_v2(void* ptr) { assert(0 && "class_idx from header out of bounds"); return 0; } +#if !HAKMEM_BUILD_RELEASE atomic_fetch_add(&g_integrity_check_class_bounds, 1); +#endif // 2. Check TLS freelist capacity (defense in depth - ALWAYS ENABLED) // CRITICAL: Enable in both debug and release to prevent corruption accumulation @@ -148,45 +148,6 @@ static inline int hak_tiny_free_fast_v2(void* ptr) { // → 正史(TLS SLL)の在庫を正しく保つ // → UltraHot refill は alloc 側で TLS SLL から借りる - // Phase 23: Unified Frontend Cache (all classes) - tcache-style single-layer cache - // ENV-gated: HAKMEM_TINY_UNIFIED_CACHE=1 (default: OFF) - // Target: +50-100% (20.3M → 30-40M ops/s) by flattening 4-5 layer cascade - // Design: Single unified array cache (2-3 cache misses vs current 8-10) - if (__builtin_expect(unified_cache_enabled(), 0)) { - if (unified_cache_push(class_idx, base)) { - // Unified cache push success - done! - return 1; - } - // Unified cache full while enabled → fall back to existing TLS helper directly. - return tiny_alloc_fast_push(class_idx, base); - } - - // Phase 21-1: Ring Cache (C2/C3 only) - Array-based TLS cache - // ENV-gated: HAKMEM_TINY_HOT_RING_ENABLE=1 (default: ON after Phase 21-1-D) - // Target: +15-20% (54.4M → 62-65M ops/s) by eliminating pointer chasing - // Design: Ring (L0) → SLL (L1) → SuperSlab (L2) cascade hierarchy - if (class_idx == 2 || class_idx == 3) { - if (ring_cache_push(class_idx, base)) { - // Ring push success - done! 
- return 1; - } - // Ring full - fall through to existing path (TLS SLL/HeapV2) - } - - // Phase 13-B: TinyHeapV2 magazine supply (C0-C3 only) - // Two supply modes (controlled by HAKMEM_TINY_HEAP_V2_LEFTOVER_MODE): - // Mode 0 (default): L0 gets blocks first ("stealing" design) - // Mode 1: L1 primary owner, L0 gets leftovers (ChatGPT recommended design) - if (class_idx <= 3 && tiny_heap_v2_enabled() && !tiny_heap_v2_leftover_mode()) { - // Mode 0: Try to supply to magazine first (L0 cache, faster than TLS SLL) - // Falls back to TLS SLL if magazine is full - if (tiny_heap_v2_try_push(class_idx, base)) { - // Successfully supplied to magazine - return 1; - } - // Magazine full → fall through to TLS SLL - } - // LARSON FIX (2025-11-16): Cross-thread free detection - ENV GATED // Problem: Larson MT crash - TLS SLL poison (0xbada55...) from cross-thread free // Root cause: Block allocated by Thread A, freed by Thread B → pushed to B's TLS SLL @@ -209,7 +170,7 @@ static inline int hak_tiny_free_fast_v2(void* ptr) { int slab_idx = slab_index_for(ss, base); if (__builtin_expect(slab_idx >= 0, 1)) { uint32_t self_tid = tiny_self_u32_local(); - uint8_t owner_tid_low = ss->slabs[slab_idx].owner_tid_low; + uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx); // Check if this is a cross-thread free (lower 8 bits mismatch) if (__builtin_expect((owner_tid_low & 0xFF) != (self_tid & 0xFF), 0)) { @@ -236,19 +197,6 @@ static inline int hak_tiny_free_fast_v2(void* ptr) { return 0; } - // Phase 13-B: Leftover mode - L0 gets leftovers from L1 - // Mode 1: L1 (TLS SLL) is primary owner, L0 (magazine) gets leftovers - // Only refill L0 if it's empty (don't reduce L1 capacity) - if (class_idx <= 3 && tiny_heap_v2_enabled() && tiny_heap_v2_leftover_mode()) { - TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx]; - if (mag->top == 0) { // Only refill if magazine is empty - void* leftover; - if (tls_sll_pop(class_idx, &leftover)) { - mag->items[mag->top++] = leftover; - } - } - } - // Option B: Periodic TLS SLL Drain (restore slab accounting consistency) // Purpose: Every N frees (default: 1024), drain TLS SLL → slab freelist // Impact: Enables empty detection → SuperSlabs freed → LRU cache functional