#include "hakmem_shared_pool_internal.h" #include "hakmem_debug_master.h" // Phase 4b: Master debug control #include "hakmem_stats_master.h" // Phase 4d: Master stats control #include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary #include "box/ss_hot_cold_box.h" // Phase 12-1.1: EMPTY slab marking #include "box/pagefault_telemetry_box.h" // Box PageFaultTelemetry (PF_BUCKET_SS_META) #include "box/tls_sll_drain_box.h" // Box TLS SLL Drain (tiny_tls_sll_drain) #include "box/tls_slab_reuse_guard_box.h" // Box TLS Slab Reuse Guard (P0.3) #include "hakmem_policy.h" // FrozenPolicy (learning layer) #include #include #include #include #include // For mmap/munmap (used in shared_pool_ensure_capacity_unlocked) // ============================================================================ // P0 Lock Contention Instrumentation (Debug build only; counters defined always) // ============================================================================ _Atomic uint64_t g_lock_acquire_count = 0; // Total lock acquisitions _Atomic uint64_t g_lock_release_count = 0; // Total lock releases _Atomic uint64_t g_lock_acquire_slab_count = 0; // Locks from acquire_slab path _Atomic uint64_t g_lock_release_slab_count = 0; // Locks from release_slab path #if !HAKMEM_BUILD_RELEASE int g_lock_stats_enabled = -1; // -1=uninitialized, 0=off, 1=on // Initialize lock stats from environment variable // Phase 4b: Now uses hak_debug_check() for master debug control support void lock_stats_init(void) { if (__builtin_expect(g_lock_stats_enabled == -1, 0)) { g_lock_stats_enabled = hak_debug_check("HAKMEM_SHARED_POOL_LOCK_STATS"); } } // Report lock statistics at shutdown static void __attribute__((destructor)) lock_stats_report(void) { if (g_lock_stats_enabled != 1) { return; } uint64_t acquires = atomic_load(&g_lock_acquire_count); uint64_t releases = atomic_load(&g_lock_release_count); uint64_t acquire_path = atomic_load(&g_lock_acquire_slab_count); uint64_t release_path = atomic_load(&g_lock_release_slab_count); fprintf(stderr, "\n=== SHARED POOL LOCK STATISTICS ===\n"); fprintf(stderr, "Total lock ops: %lu (acquire) + %lu (release) = %lu\n", acquires, releases, acquires + releases); fprintf(stderr, "Balance: %ld (should be 0)\n", (int64_t)acquires - (int64_t)releases); fprintf(stderr, "\n--- Breakdown by Code Path ---\n"); fprintf(stderr, "acquire_slab(): %lu (%.1f%%)\n", acquire_path, 100.0 * acquire_path / (acquires ? acquires : 1)); fprintf(stderr, "release_slab(): %lu (%.1f%%)\n", release_path, 100.0 * release_path / (acquires ? acquires : 1)); fprintf(stderr, "===================================\n"); fflush(stderr); } #else // Release build: No-op stubs int g_lock_stats_enabled = 0; #endif // ============================================================================ // SP Acquire Stage Statistics (Stage1/2/3 breakdown) // ============================================================================ _Atomic uint64_t g_sp_stage1_hits[TINY_NUM_CLASSES_SS]; _Atomic uint64_t g_sp_stage2_hits[TINY_NUM_CLASSES_SS]; _Atomic uint64_t g_sp_stage3_hits[TINY_NUM_CLASSES_SS]; // Data collection gate (0=off, 1=on). 
// ============================================================================
// SP Acquire Stage Statistics (Stage1/2/3 breakdown)
// ============================================================================
_Atomic uint64_t g_sp_stage1_hits[TINY_NUM_CLASSES_SS];
_Atomic uint64_t g_sp_stage2_hits[TINY_NUM_CLASSES_SS];
_Atomic uint64_t g_sp_stage3_hits[TINY_NUM_CLASSES_SS];

// Data collection gate (0=off, 1=on). Also enabled from the learning layer.
int g_sp_stage_stats_enabled = 0;

#if !HAKMEM_BUILD_RELEASE
// Logging gate for the destructor (ENV: HAKMEM_SHARED_POOL_STAGE_STATS)
static int g_sp_stage_stats_log_enabled = -1;  // -1=uninitialized, 0=off, 1=on

void sp_stage_stats_init(void) {
    // Phase 4d: Now uses hak_stats_check() for unified stats control
    if (__builtin_expect(g_sp_stage_stats_log_enabled == -1, 0)) {
        g_sp_stage_stats_log_enabled = hak_stats_check("HAKMEM_SHARED_POOL_STAGE_STATS", "pool");
        if (g_sp_stage_stats_log_enabled == 1) {
            // If logging is enabled, measurement must be enabled as well.
            g_sp_stage_stats_enabled = 1;
        }
    }
}

static void __attribute__((destructor)) sp_stage_stats_report(void) {
    if (g_sp_stage_stats_log_enabled != 1) {
        return;
    }
    fprintf(stderr, "\n=== SHARED POOL STAGE STATISTICS ===\n");
    fprintf(stderr, "Per-class acquire_slab() stage hits (Stage1=EMPTY, Stage2=UNUSED, Stage3=new SS)\n");
    for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
        uint64_t s1 = atomic_load(&g_sp_stage1_hits[cls]);
        uint64_t s2 = atomic_load(&g_sp_stage2_hits[cls]);
        uint64_t s3 = atomic_load(&g_sp_stage3_hits[cls]);
        uint64_t total = s1 + s2 + s3;
        if (total == 0) continue;  // Skip unused classes
        double p1 = 100.0 * (double)s1 / (double)total;
        double p2 = 100.0 * (double)s2 / (double)total;
        double p3 = 100.0 * (double)s3 / (double)total;
        fprintf(stderr, "Class %d: total=%llu S1=%llu (%.1f%%) S2=%llu (%.1f%%) S3=%llu (%.1f%%)\n",
                cls, (unsigned long long)total,
                (unsigned long long)s1, p1,
                (unsigned long long)s2, p2,
                (unsigned long long)s3, p3);
    }
    fprintf(stderr, "====================================\n");
    fflush(stderr);
}
#else
// Release build: No-op stubs
void sp_stage_stats_init(void) {}
#endif

// Snapshot Tiny-related backend metrics for learner / observability.
void shared_pool_tiny_metrics_snapshot(uint64_t stage1[TINY_NUM_CLASSES_SS],
                                       uint64_t stage2[TINY_NUM_CLASSES_SS],
                                       uint64_t stage3[TINY_NUM_CLASSES_SS],
                                       uint32_t active_slots[TINY_NUM_CLASSES_SS]) {
    // Only initialize the env-based logging configuration up front.
    sp_stage_stats_init();
    // When called from the learning layer, always enable measurement itself
    // (logging remains controlled by the environment variable).
    g_sp_stage_stats_enabled = 1;
    for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
        if (stage1) {
            stage1[cls] = atomic_load_explicit(&g_sp_stage1_hits[cls], memory_order_relaxed);
        }
        if (stage2) {
            stage2[cls] = atomic_load_explicit(&g_sp_stage2_hits[cls], memory_order_relaxed);
        }
        if (stage3) {
            stage3[cls] = atomic_load_explicit(&g_sp_stage3_hits[cls], memory_order_relaxed);
        }
        if (active_slots) {
            active_slots[cls] = g_shared_pool.class_active_slots[cls];
        }
    }
}

// Helper: return per-class active slot limit from FrozenPolicy.tiny_cap[]
// Semantics:
//   - tiny_cap[class] == 0 → no limit (unbounded)
//   - otherwise: soft cap on ACTIVE slots managed by shared pool for this class.
uint32_t sp_class_active_limit(int class_idx) {
    const FrozenPolicy* pol = hkm_policy_get();
    if (!pol) {
        return 0;  // no limit
    }
    if (class_idx < 0 || class_idx >= 8) {
        return 0;
    }
    return (uint32_t)pol->tiny_cap[class_idx];
}
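
/*
 * Usage sketch (hypothetical learner-side caller, not part of this file):
 *
 *   uint64_t s1[TINY_NUM_CLASSES_SS], s2[TINY_NUM_CLASSES_SS], s3[TINY_NUM_CLASSES_SS];
 *   uint32_t active[TINY_NUM_CLASSES_SS];
 *   shared_pool_tiny_metrics_snapshot(s1, s2, s3, active);
 *   for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
 *       uint32_t cap = sp_class_active_limit(cls);   // 0 means "no limit"
 *       if (cap != 0 && active[cls] > cap) {
 *           // Class exceeds its soft cap; a policy layer might favor
 *           // Stage1/Stage2 reuse over creating new SuperSlabs here.
 *       }
 *   }
 */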
// ============================================================================
// P0-4: Lock-Free Free Slot List - Node Pool
// ============================================================================

// Pre-allocated node pools (one per class, to avoid malloc/free)
FreeSlotNode g_free_node_pool[TINY_NUM_CLASSES_SS][MAX_FREE_NODES_PER_CLASS];
_Atomic uint32_t g_node_alloc_index[TINY_NUM_CLASSES_SS] = {0};

// Recycle list for FreeSlotNode (per class, lock-free LIFO).
// node_alloc() first tries to reuse a node from this list and only carves a
// fresh node out of the pool when the recycle list is empty.
static _Atomic(FreeSlotNode*) g_node_free_head[TINY_NUM_CLASSES_SS] = {
    [0 ... TINY_NUM_CLASSES_SS - 1] = ATOMIC_VAR_INIT(NULL)
};

// Allocate a node from pool (lock-free fast path, may fall back to legacy path)
static inline FreeSlotNode* node_alloc(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return NULL;
    }
    // First, try to pop from the recycle list (nodes returned by pop_lockfree).
    FreeSlotNode* free_head = atomic_load_explicit(
        &g_node_free_head[class_idx], memory_order_acquire);
    while (free_head != NULL) {
        FreeSlotNode* next = free_head->next;
        if (atomic_compare_exchange_weak_explicit(
                &g_node_free_head[class_idx], &free_head, next,
                memory_order_acq_rel, memory_order_acquire)) {
            return free_head;  // Recycled node
        }
        // CAS failed: free_head is updated; retry with the new head.
    }
    uint32_t idx = atomic_fetch_add(&g_node_alloc_index[class_idx], 1);
    if (idx >= MAX_FREE_NODES_PER_CLASS) {
        // Pool exhausted - should be rare.
        return NULL;
    }
    return &g_free_node_pool[class_idx][idx];
}

// ============================================================================
// Phase 12-2: SharedSuperSlabPool skeleton implementation
// Goal:
//   - Centralize SuperSlab allocation/registration
//   - Provide acquire_slab/release_slab APIs for later refill/free integration
//   - Keep logic simple & conservative; correctness and observability first.
//
// Notes:
//   - Concurrency: protected by g_shared_pool.alloc_lock for now.
//   - class_hints is best-effort: read lock-free, written under lock.
//   - LRU hooks left as no-op placeholders.
// ============================================================================
SharedSuperSlabPool g_shared_pool = {
    .slabs = NULL,
    .capacity = 0,
    .total_count = 0,
    .active_count = 0,
    .alloc_lock = PTHREAD_MUTEX_INITIALIZER,
    .class_hints = { NULL },
    .lru_head = NULL,
    .lru_tail = NULL,
    .lru_count = 0,
    // P0-4: Lock-free free slot lists (zero-initialized atomic pointers)
    .free_slots_lockfree = {{.head = ATOMIC_VAR_INIT(NULL)}},
    // Legacy: mutex-protected free lists
    .free_slots = {{.entries = {{0}}, .count = 0}},
    // Phase 12: SP-SLOT fields (ss_metadata is fixed-size array, auto-zeroed)
    .ss_meta_count = 0
};

void shared_pool_ensure_capacity_unlocked(uint32_t min_capacity) {
    if (g_shared_pool.capacity >= min_capacity) {
        return;
    }
    uint32_t new_cap = g_shared_pool.capacity ? g_shared_pool.capacity : 16;
    while (new_cap < min_capacity) {
        new_cap *= 2;
    }

    // CRITICAL FIX: Use system mmap() directly to avoid recursion!
    size_t new_size = new_cap * sizeof(SuperSlab*);
    SuperSlab** new_slabs = (SuperSlab**)mmap(NULL, new_size,
                                              PROT_READ | PROT_WRITE,
                                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (new_slabs == MAP_FAILED) {
        // Allocation failure: keep old state; caller must handle NULL later.
        return;
    }

    // Copy old data if it exists
    if (g_shared_pool.slabs != NULL) {
        memcpy(new_slabs, g_shared_pool.slabs,
               g_shared_pool.capacity * sizeof(SuperSlab*));
        // Free old mapping (also use system munmap, not free!)
        size_t old_size = g_shared_pool.capacity * sizeof(SuperSlab*);
        munmap(g_shared_pool.slabs, old_size);
    }

    // Zero new entries to keep scanning logic simple.
    memset(new_slabs + g_shared_pool.capacity, 0,
           (new_cap - g_shared_pool.capacity) * sizeof(SuperSlab*));

    g_shared_pool.slabs = new_slabs;
    g_shared_pool.capacity = new_cap;
}
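
/*
 * Growth sketch: starting from an empty pool the registry doubles from 16
 * entries until it covers min_capacity (e.g. a request for 150 grows
 * 16 -> 32 -> 64 -> 128 -> 256). Callers are expected to hold alloc_lock, as
 * shared_pool_init() below does; "needed" is a hypothetical local used only
 * for illustration:
 *
 *   pthread_mutex_lock(&g_shared_pool.alloc_lock);
 *   shared_pool_ensure_capacity_unlocked(needed);
 *   if (g_shared_pool.capacity < needed) {
 *       // mmap failed; treat this registration attempt as out of memory.
 *   }
 *   pthread_mutex_unlock(&g_shared_pool.alloc_lock);
 */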
void shared_pool_init(void) {
    // Idempotent init; safe to call from multiple early paths.
    // pthread_mutex_t with static initializer is already valid.
    pthread_mutex_lock(&g_shared_pool.alloc_lock);
    if (g_shared_pool.capacity == 0 && g_shared_pool.slabs == NULL) {
        shared_pool_ensure_capacity_unlocked(16);
    }
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
}

// ============================================================================
// Phase 12: SP-SLOT Box - Modular Helper Functions
// ============================================================================

// ---------- Layer 1: Slot Operations (Low-level) ----------

// Find first unused slot in SharedSSMeta
// P0-5: Uses atomic load for state check
// Returns: slot_idx on success, -1 if no unused slots
static int sp_slot_find_unused(SharedSSMeta* meta) {
    if (!meta) return -1;
    for (int i = 0; i < meta->total_slots; i++) {
        SlotState state = atomic_load_explicit(&meta->slots[i].state, memory_order_acquire);
        if (state == SLOT_UNUSED) {
            return i;
        }
    }
    return -1;
}

// Mark slot as ACTIVE (UNUSED→ACTIVE or EMPTY→ACTIVE)
// P0-5: Uses atomic store for state transition (caller must hold mutex!)
// Returns: 0 on success, -1 on error
int sp_slot_mark_active(SharedSSMeta* meta, int slot_idx, int class_idx) {
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;

    SharedSlot* slot = &meta->slots[slot_idx];

    // Load state atomically
    SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire);

    // Transition: UNUSED→ACTIVE or EMPTY→ACTIVE
    if (state == SLOT_UNUSED || state == SLOT_EMPTY) {
        atomic_store_explicit(&slot->state, SLOT_ACTIVE, memory_order_release);
        slot->class_idx = (uint8_t)class_idx;
        slot->slab_idx = (uint8_t)slot_idx;
        meta->active_slots++;
        return 0;
    }
    return -1;  // Already ACTIVE or invalid state
}

// Mark slot as EMPTY (ACTIVE→EMPTY)
// P0-5: Uses atomic store for state transition (caller must hold mutex!)
// Returns: 0 on success, -1 on error
int sp_slot_mark_empty(SharedSSMeta* meta, int slot_idx) {
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;

    SharedSlot* slot = &meta->slots[slot_idx];

    // Load state atomically
    SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire);

    if (state == SLOT_ACTIVE) {
        atomic_store_explicit(&slot->state, SLOT_EMPTY, memory_order_release);
        if (meta->active_slots > 0) {
            meta->active_slots--;
        }
        return 0;
    }
    return -1;  // Not ACTIVE
}
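
/*
 * Slot lifecycle sketch (mutex-held callers), using the Layer 1 helpers above:
 *
 *   UNUSED --sp_slot_mark_active()--> ACTIVE --sp_slot_mark_empty()--> EMPTY
 *   EMPTY  --sp_slot_mark_active()--> ACTIVE   (reuse path)
 *
 * A minimal pairing, assuming the caller already holds alloc_lock and has a
 * SuperSlab* ss and a class_idx in hand (sp_meta_find_or_create() is defined
 * further below):
 *
 *   SharedSSMeta* meta = sp_meta_find_or_create(ss);
 *   int idx = (meta != NULL) ? sp_slot_find_unused(meta) : -1;
 *   if (idx >= 0 && sp_slot_mark_active(meta, idx, class_idx) == 0) {
 *       // Slot idx of this SuperSlab is now owned by class class_idx.
 *   }
 *
 * Illustrative only; the real acquire path also goes through the lock-free
 * claim and free-list helpers defined below.
 */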
// Sync SP-SLOT view from an existing SuperSlab.
// This is needed when a legacy-allocated SuperSlab reaches the shared-pool
// release path for the first time (slot states are still SLOT_UNUSED).
void sp_meta_sync_slots_from_ss(SharedSSMeta* meta, SuperSlab* ss) {
    if (!meta || !ss) return;

    int cap = ss_slabs_capacity(ss);
    if (cap > MAX_SLOTS_PER_SS) {
        cap = MAX_SLOTS_PER_SS;
    }
    meta->total_slots = (uint8_t)cap;
    meta->active_slots = 0;

    for (int i = 0; i < cap; i++) {
        SlotState state = SLOT_UNUSED;
        uint32_t bit = (1u << i);
        if (ss->slab_bitmap & bit) {
            state = SLOT_ACTIVE;
            meta->active_slots++;
        } else {
            TinySlabMeta* smeta = &ss->slabs[i];
            uint16_t used = atomic_load_explicit(&smeta->used, memory_order_relaxed);
            if (smeta->capacity > 0 && used == 0) {
                state = SLOT_EMPTY;
            }
        }
        uint8_t cls = ss->class_map[i];
        if (cls == 255) {
            cls = ss->slabs[i].class_idx;
        }
        meta->slots[i].class_idx = cls;
        meta->slots[i].slab_idx = (uint8_t)i;
        atomic_store_explicit(&meta->slots[i].state, state, memory_order_release);
    }
}

// ---------- Layer 2: Metadata Management (Mid-level) ----------

// Ensure ss_metadata array has capacity for at least min_count entries
// Caller must hold alloc_lock
// Returns: 0 on success, -1 if capacity exceeded
// RACE FIX: No realloc! Fixed-size array prevents race with lock-free Stage 2
static int sp_meta_ensure_capacity(uint32_t min_count) {
    if (min_count > MAX_SS_METADATA_ENTRIES) {
#if !HAKMEM_BUILD_RELEASE
        static int warn_once = 0;
        if (warn_once == 0) {
            fprintf(stderr, "[SP_META_CAPACITY_ERROR] Exceeded MAX_SS_METADATA_ENTRIES=%d\n",
                    MAX_SS_METADATA_ENTRIES);
            warn_once = 1;
        }
#endif
        return -1;
    }
    return 0;
}

// Find SharedSSMeta for given SuperSlab, or create if not exists
// Caller must hold alloc_lock
// Returns: SharedSSMeta* on success, NULL on error
SharedSSMeta* sp_meta_find_or_create(SuperSlab* ss) {
    if (!ss) return NULL;

    // RACE FIX: Load count atomically for consistency (even under mutex)
    uint32_t count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_relaxed);

    // Search existing metadata
    for (uint32_t i = 0; i < count; i++) {
        // RACE FIX: Load pointer atomically for consistency
        SuperSlab* meta_ss = atomic_load_explicit(&g_shared_pool.ss_metadata[i].ss,
                                                  memory_order_relaxed);
        if (meta_ss == ss) {
            return &g_shared_pool.ss_metadata[i];
        }
    }

    // Create new metadata entry
    if (sp_meta_ensure_capacity(count + 1) != 0) {
        return NULL;
    }

    // RACE FIX: Read current count atomically (even under mutex for consistency)
    uint32_t current_count = atomic_load_explicit(&g_shared_pool.ss_meta_count,
                                                  memory_order_relaxed);
    SharedSSMeta* meta = &g_shared_pool.ss_metadata[current_count];

    // RACE FIX: Store SuperSlab pointer atomically (visible to lock-free Stage 2)
    atomic_store_explicit(&meta->ss, ss, memory_order_relaxed);
    meta->total_slots = (uint8_t)ss_slabs_capacity(ss);
    meta->active_slots = 0;

    // Initialize all slots as UNUSED
    // P0-5: Use atomic store for state initialization
    for (int i = 0; i < meta->total_slots; i++) {
        atomic_store_explicit(&meta->slots[i].state, SLOT_UNUSED, memory_order_relaxed);
        meta->slots[i].class_idx = 0;
        meta->slots[i].slab_idx = (uint8_t)i;
    }

    // RACE FIX: Atomic increment with release semantics.
    // This ensures all writes to metadata[current_count] (the initialization above)
    // are visible before the count increment is visible to lock-free Stage 2 readers.
    atomic_fetch_add_explicit(&g_shared_pool.ss_meta_count, 1, memory_order_release);

    return meta;
}
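
/*
 * Reader-side sketch of the publication protocol used above (illustrative):
 * the writer fills ss_metadata[current_count] first and only then bumps
 * ss_meta_count with memory_order_release, so a lock-free reader pairs that
 * with an acquire load of the count before touching entries:
 *
 *   uint32_t n = atomic_load_explicit(&g_shared_pool.ss_meta_count,
 *                                     memory_order_acquire);
 *   for (uint32_t i = 0; i < n; i++) {
 *       SuperSlab* ss = atomic_load_explicit(&g_shared_pool.ss_metadata[i].ss,
 *                                            memory_order_relaxed);
 *       // Entries with index < n are fully initialized at this point.
 *       (void)ss;
 *   }
 */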
// Find UNUSED slot and claim it (UNUSED → ACTIVE) using lock-free CAS
// Returns: slot_idx on success, -1 if no UNUSED slots
int sp_slot_claim_lockfree(SharedSSMeta* meta, int class_idx) {
    if (!meta) return -1;

    // Optimization: quick check if any unused slots exist?
    // For now, just iterate. Metadata size is small (max 32 slots).
    for (int i = 0; i < meta->total_slots; i++) {
        SharedSlot* slot = &meta->slots[i];
        SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire);
        if (state == SLOT_UNUSED) {
            // Attempt CAS: UNUSED → ACTIVE
            if (atomic_compare_exchange_strong_explicit(
                    &slot->state, &state, SLOT_ACTIVE,
                    memory_order_acq_rel, memory_order_acquire)) {
                return i;  // Success!
            }
            // CAS failed: someone else took it or state changed
        }
    }
    return -1;
}

// ---------- Layer 3: Free List Management ----------

// Push empty slot to per-class free list
// Caller must hold alloc_lock
// Returns: 0 on success, -1 if list is full
int sp_freelist_push_lockfree(int class_idx, SharedSSMeta* meta, int slot_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;

    FreeSlotNode* node = node_alloc(class_idx);
    if (!node) {
        // Pool exhausted
        return -1;
    }
    node->meta = meta;
    node->slot_idx = slot_idx;

    // Lock-free push to stack (LIFO)
    FreeSlotNode* old_head = atomic_load_explicit(
        &g_shared_pool.free_slots_lockfree[class_idx].head, memory_order_relaxed);
    do {
        node->next = old_head;
    } while (!atomic_compare_exchange_weak_explicit(
        &g_shared_pool.free_slots_lockfree[class_idx].head,
        &old_head, node,
        memory_order_release, memory_order_relaxed));

    return 0;
}

// Pop empty slot from per-class free list
// Lock-free
// Returns: 1 on success, 0 if empty
int sp_freelist_pop_lockfree(int class_idx, SharedSSMeta** meta_out, int* slot_idx_out) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return 0;

    FreeSlotNode* head = atomic_load_explicit(
        &g_shared_pool.free_slots_lockfree[class_idx].head, memory_order_acquire);
    while (head) {
        FreeSlotNode* next = head->next;
        if (atomic_compare_exchange_weak_explicit(
                &g_shared_pool.free_slots_lockfree[class_idx].head,
                &head, next,
                memory_order_acquire, memory_order_acquire)) {
            // Success!
            *meta_out = head->meta;
            *slot_idx_out = head->slot_idx;

            // Recycle node (push to free_head list)
            FreeSlotNode* free_head = atomic_load_explicit(&g_node_free_head[class_idx],
                                                           memory_order_relaxed);
            do {
                head->next = free_head;
            } while (!atomic_compare_exchange_weak_explicit(
                &g_node_free_head[class_idx], &free_head, head,
                memory_order_release, memory_order_relaxed));

            return 1;
        }
        // CAS failed: head updated, retry
    }
    return 0;  // Empty list
}
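
/*
 * Pairing sketch (illustrative; the real call sites live in the acquire/release
 * paths outside this file): the release side pushes a slot it has just marked
 * EMPTY, and the acquire side pops it for reuse.
 *
 *   // Producer (release path), typically under alloc_lock:
 *   if (sp_slot_mark_empty(meta, slot_idx) == 0) {
 *       (void)sp_freelist_push_lockfree(class_idx, meta, slot_idx);  // best-effort
 *   }
 *
 *   // Consumer (acquire path), lock-free:
 *   SharedSSMeta* m = NULL;
 *   int idx = -1;
 *   if (sp_freelist_pop_lockfree(class_idx, &m, &idx)) {
 *       // Re-activate the slot (e.g. via sp_slot_mark_active()) before reuse.
 *   }
 */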
// Allocator helper for SuperSlab (Phase 9-2 Task 1)
// NOTE: class_idx MUST be a valid tiny class (0-7). Passing an out-of-range
// value previously went through superslab_allocate(8), which overflowed
// g_ss_ace[] and could corrupt neighboring globals, leading to missing
// registry entries and TLS SLL header corruption.
SuperSlab* sp_internal_allocate_superslab(int class_idx) {
    // Debug: log only the first few internal allocations.
    do {
        static _Atomic uint32_t g_sp_alloc_log = 0;
        uint32_t shot = atomic_fetch_add_explicit(&g_sp_alloc_log, 1, memory_order_relaxed);
        if (shot < 4) {
            fprintf(stderr, "[SP_INTERNAL_ALLOC] class_idx=%d\n", class_idx);
            fflush(stderr);
        }
    } while (0);

    // Clamp to valid range to avoid out-of-bounds access inside superslab_allocate().
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        class_idx = TINY_NUM_CLASSES_SS - 1;
    }

    // Use legacy backend to allocate a SuperSlab (malloc-based)
    extern SuperSlab* superslab_allocate(uint8_t size_class);
    SuperSlab* ss = superslab_allocate((uint8_t)class_idx);
    if (!ss) {
        return NULL;
    }

    // Initialize basic fields if not already done by superslab_allocate()
    ss->active_slabs = 0;
    ss->slab_bitmap = 0;
    return ss;
}

// ============================================================================
// Public API (High-level)
// ============================================================================

SuperSlab* shared_pool_acquire_superslab(void) {
    // Phase 12: Legacy wrapper; effectively a direct allocation bypass that
    // hands out a fresh SuperSlab without going through slot bookkeeping.
    return sp_internal_allocate_superslab(0);
}

void sp_fix_geometry_if_needed(SuperSlab* ss, int slab_idx, int class_idx) {
    // Phase 9-1: For now, we assume geometry is compatible or set by the caller.
    // This hook exists for future use when we support dynamic geometry resizing.
    (void)ss;
    (void)slab_idx;
    (void)class_idx;
}