#include "hakmem_shared_pool.h" #include "hakmem_tiny_superslab.h" #include "hakmem_tiny_superslab_constants.h" #include "hakmem_debug_master.h" // Phase 4b: Master debug control #include "hakmem_stats_master.h" // Phase 4d: Master stats control #include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary #include "box/ss_hot_cold_box.h" // Phase 12-1.1: EMPTY slab marking #include "box/pagefault_telemetry_box.h" // Box PageFaultTelemetry (PF_BUCKET_SS_META) #include "box/tls_sll_drain_box.h" // Box TLS SLL Drain (tiny_tls_sll_drain) #include "box/tls_slab_reuse_guard_box.h" // Box TLS Slab Reuse Guard (P0.3) #include "hakmem_policy.h" // FrozenPolicy (learning layer) #include #include #include #include #include // For mmap/munmap (used in shared_pool_ensure_capacity_unlocked) // ============================================================================ // P0 Lock Contention Instrumentation (Debug build only; counters defined always) // ============================================================================ static _Atomic uint64_t g_lock_acquire_count = 0; // Total lock acquisitions static _Atomic uint64_t g_lock_release_count = 0; // Total lock releases static _Atomic uint64_t g_lock_acquire_slab_count = 0; // Locks from acquire_slab path static _Atomic uint64_t g_lock_release_slab_count = 0; // Locks from release_slab path static int g_lock_stats_enabled = -1; // -1=uninitialized, 0=off, 1=on #if !HAKMEM_BUILD_RELEASE // Initialize lock stats from environment variable // Phase 4b: Now uses hak_debug_check() for master debug control support static inline void lock_stats_init(void) { if (__builtin_expect(g_lock_stats_enabled == -1, 0)) { g_lock_stats_enabled = hak_debug_check("HAKMEM_SHARED_POOL_LOCK_STATS"); } } // Report lock statistics at shutdown static void __attribute__((destructor)) lock_stats_report(void) { if (g_lock_stats_enabled != 1) { return; } uint64_t acquires = atomic_load(&g_lock_acquire_count); uint64_t releases = atomic_load(&g_lock_release_count); uint64_t acquire_path = atomic_load(&g_lock_acquire_slab_count); uint64_t release_path = atomic_load(&g_lock_release_slab_count); fprintf(stderr, "\n=== SHARED POOL LOCK STATISTICS ===\n"); fprintf(stderr, "Total lock ops: %lu (acquire) + %lu (release) = %lu\n", acquires, releases, acquires + releases); fprintf(stderr, "Balance: %ld (should be 0)\n", (int64_t)acquires - (int64_t)releases); fprintf(stderr, "\n--- Breakdown by Code Path ---\n"); fprintf(stderr, "acquire_slab(): %lu (%.1f%%)\n", acquire_path, 100.0 * acquire_path / (acquires ? acquires : 1)); fprintf(stderr, "release_slab(): %lu (%.1f%%)\n", release_path, 100.0 * release_path / (acquires ? acquires : 1)); fprintf(stderr, "===================================\n"); fflush(stderr); } #else // Release build: No-op stubs static inline void lock_stats_init(void) { if (__builtin_expect(g_lock_stats_enabled == -1, 0)) { g_lock_stats_enabled = 0; } } #endif // ============================================================================ // SP Acquire Stage Statistics (Stage1/2/3 breakdown) // ============================================================================ static _Atomic uint64_t g_sp_stage1_hits[TINY_NUM_CLASSES_SS]; static _Atomic uint64_t g_sp_stage2_hits[TINY_NUM_CLASSES_SS]; static _Atomic uint64_t g_sp_stage3_hits[TINY_NUM_CLASSES_SS]; // Data collection gate (0=off, 1=on). 
// Data collection gate (0=off, 1=on). Also enabled from the learning layer.
static int g_sp_stage_stats_enabled = 0;

#if !HAKMEM_BUILD_RELEASE
// Logging gate for the destructor (ENV: HAKMEM_SHARED_POOL_STAGE_STATS)
static int g_sp_stage_stats_log_enabled = -1;  // -1=uninitialized, 0=off, 1=on

static inline void sp_stage_stats_init(void) {
    // Phase 4d: Now uses hak_stats_check() for unified stats control
    if (__builtin_expect(g_sp_stage_stats_log_enabled == -1, 0)) {
        g_sp_stage_stats_log_enabled = hak_stats_check("HAKMEM_SHARED_POOL_STAGE_STATS", "pool");
        if (g_sp_stage_stats_log_enabled == 1) {
            // If logging is enabled, always enable data collection as well.
            g_sp_stage_stats_enabled = 1;
        }
    }
}

static void __attribute__((destructor)) sp_stage_stats_report(void) {
    if (g_sp_stage_stats_log_enabled != 1) {
        return;
    }
    fprintf(stderr, "\n=== SHARED POOL STAGE STATISTICS ===\n");
    fprintf(stderr, "Per-class acquire_slab() stage hits (Stage1=EMPTY, Stage2=UNUSED, Stage3=new SS)\n");
    for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
        uint64_t s1 = atomic_load(&g_sp_stage1_hits[cls]);
        uint64_t s2 = atomic_load(&g_sp_stage2_hits[cls]);
        uint64_t s3 = atomic_load(&g_sp_stage3_hits[cls]);
        uint64_t total = s1 + s2 + s3;
        if (total == 0) continue;  // Skip unused classes
        double p1 = 100.0 * (double)s1 / (double)total;
        double p2 = 100.0 * (double)s2 / (double)total;
        double p3 = 100.0 * (double)s3 / (double)total;
        fprintf(stderr, "Class %d: total=%llu S1=%llu (%.1f%%) S2=%llu (%.1f%%) S3=%llu (%.1f%%)\n",
                cls, (unsigned long long)total,
                (unsigned long long)s1, p1,
                (unsigned long long)s2, p2,
                (unsigned long long)s3, p3);
    }
    fprintf(stderr, "====================================\n");
    fflush(stderr);
}
#else
// Release build: No-op stubs
static inline void sp_stage_stats_init(void) {}
#endif

// Snapshot Tiny-related backend metrics for learner / observability.
void shared_pool_tiny_metrics_snapshot(uint64_t stage1[TINY_NUM_CLASSES_SS],
                                       uint64_t stage2[TINY_NUM_CLASSES_SS],
                                       uint64_t stage3[TINY_NUM_CLASSES_SS],
                                       uint32_t active_slots[TINY_NUM_CLASSES_SS]) {
    // Only the env-based logging configuration needs to be initialized up front.
    sp_stage_stats_init();
    // When called from the learning layer, data collection itself is always
    // enabled (logging remains controlled by the env var).
    g_sp_stage_stats_enabled = 1;
    for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
        if (stage1) { stage1[cls] = atomic_load_explicit(&g_sp_stage1_hits[cls], memory_order_relaxed); }
        if (stage2) { stage2[cls] = atomic_load_explicit(&g_sp_stage2_hits[cls], memory_order_relaxed); }
        if (stage3) { stage3[cls] = atomic_load_explicit(&g_sp_stage3_hits[cls], memory_order_relaxed); }
        if (active_slots) { active_slots[cls] = g_shared_pool.class_active_slots[cls]; }
    }
}

// Helper: return per-class active slot limit from FrozenPolicy.tiny_cap[]
// Semantics:
//   - tiny_cap[class] == 0 → no limit (unbounded)
//   - otherwise: soft cap on ACTIVE slots managed by shared pool for this class.
static inline uint32_t sp_class_active_limit(int class_idx) {
    const FrozenPolicy* pol = hkm_policy_get();
    if (!pol) {
        return 0;  // no limit
    }
    if (class_idx < 0 || class_idx >= 8) {
        return 0;
    }
    return (uint32_t)pol->tiny_cap[class_idx];
}
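// Illustrative sketch (not called anywhere in this file): how a learner-side
// consumer could turn the snapshot above into a per-class "Stage 3 pressure"
// signal. The function name sp_example_stage3_ratio is hypothetical.
static inline __attribute__((unused)) double sp_example_stage3_ratio(int class_idx) {
    uint64_t s1[TINY_NUM_CLASSES_SS], s2[TINY_NUM_CLASSES_SS], s3[TINY_NUM_CLASSES_SS];
    uint32_t active[TINY_NUM_CLASSES_SS];
    shared_pool_tiny_metrics_snapshot(s1, s2, s3, active);
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return 0.0;
    uint64_t total = s1[class_idx] + s2[class_idx] + s3[class_idx];
    // A ratio near 1.0 means most acquires end in Stage 3 (fresh SuperSlabs),
    // i.e. EMPTY/UNUSED reuse is not keeping up for this class.
    return total ? ((double)s3[class_idx] / (double)total) : 0.0;
}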
// ============================================================================
// P0-4: Lock-Free Free Slot List - Node Pool
// ============================================================================
// Pre-allocated node pools (one per class, to avoid malloc/free)
FreeSlotNode g_free_node_pool[TINY_NUM_CLASSES_SS][MAX_FREE_NODES_PER_CLASS];
_Atomic uint32_t g_node_alloc_index[TINY_NUM_CLASSES_SS] = {0};

// Recycle list for FreeSlotNode (per class, lock-free LIFO).
// node_alloc() first tries to reuse a node from this list and only carves a
// new node out of the pool when the list is exhausted.
static _Atomic(FreeSlotNode*) g_node_free_head[TINY_NUM_CLASSES_SS] = {
    [0 ... TINY_NUM_CLASSES_SS-1] = ATOMIC_VAR_INIT(NULL)
};

// Allocate a node from pool (lock-free fast path, may fall back to legacy path)
static inline FreeSlotNode* node_alloc(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return NULL;
    }
    // First, try to pop from recycle list (nodes returned by pop_lockfree).
    FreeSlotNode* free_head = atomic_load_explicit(
        &g_node_free_head[class_idx], memory_order_acquire);
    while (free_head != NULL) {
        FreeSlotNode* next = free_head->next;
        if (atomic_compare_exchange_weak_explicit(
                &g_node_free_head[class_idx], &free_head, next,
                memory_order_acq_rel, memory_order_acquire)) {
            return free_head;  // Recycled node
        }
        // CAS failed: free_head is updated; retry with new head.
    }
    uint32_t idx = atomic_fetch_add(&g_node_alloc_index[class_idx], 1);
    if (idx >= MAX_FREE_NODES_PER_CLASS) {
        // Pool exhausted - should be rare. Caller must fall back to legacy
        // mutex-protected free list to preserve correctness.
#if !HAKMEM_BUILD_RELEASE
        static _Atomic int warn_once = 0;
        if (atomic_exchange(&warn_once, 1) == 0) {
            fprintf(stderr, "[P0-4 WARN] Node pool exhausted for class %d\n", class_idx);
        }
#endif
        return NULL;
    }
    return &g_free_node_pool[class_idx][idx];
}

// ============================================================================
// Phase 12-2: SharedSuperSlabPool skeleton implementation
// Goal:
//   - Centralize SuperSlab allocation/registration
//   - Provide acquire_slab/release_slab APIs for later refill/free integration
//   - Keep logic simple & conservative; correctness and observability first.
//
// Notes:
//   - Concurrency: protected by g_shared_pool.alloc_lock for now.
//   - class_hints is best-effort: read lock-free, written under lock.
//   - LRU hooks left as no-op placeholders.
SharedSuperSlabPool g_shared_pool = {
    .slabs = NULL,
    .capacity = 0,
    .total_count = 0,
    .active_count = 0,
    .alloc_lock = PTHREAD_MUTEX_INITIALIZER,
    .class_hints = { NULL },
    .lru_head = NULL,
    .lru_tail = NULL,
    .lru_count = 0,
    // P0-4: Lock-free free slot lists (zero-initialized atomic pointers)
    .free_slots_lockfree = {{.head = ATOMIC_VAR_INIT(NULL)}},
    // Legacy: mutex-protected free lists
    .free_slots = {{.entries = {{0}}, .count = 0}},
    // Phase 12: SP-SLOT fields (ss_metadata is fixed-size array, auto-zeroed)
    .ss_meta_count = 0
};

static void shared_pool_ensure_capacity_unlocked(uint32_t min_capacity) {
    if (g_shared_pool.capacity >= min_capacity) {
        return;
    }
    uint32_t new_cap = g_shared_pool.capacity ? g_shared_pool.capacity : 16;
    while (new_cap < min_capacity) {
        new_cap *= 2;
    }
    // CRITICAL FIX: Use system mmap() directly to avoid recursion!
    // Problem: realloc() goes through HAKMEM allocator → hak_alloc_at(128)
    //          → needs Shared Pool init → calls realloc() → INFINITE RECURSION!
    // Solution: Allocate Shared Pool metadata using system mmap, not HAKMEM allocator
    size_t new_size = new_cap * sizeof(SuperSlab*);
    SuperSlab** new_slabs = (SuperSlab**)mmap(NULL, new_size, PROT_READ | PROT_WRITE,
                                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (new_slabs == MAP_FAILED) {
        // Allocation failure: keep old state; caller must handle NULL later.
        return;
    }
    // Copy old data if exists
    if (g_shared_pool.slabs != NULL) {
        memcpy(new_slabs, g_shared_pool.slabs,
               g_shared_pool.capacity * sizeof(SuperSlab*));
        // Free old mapping (also use system munmap, not free!)
size_t old_size = g_shared_pool.capacity * sizeof(SuperSlab*); munmap(g_shared_pool.slabs, old_size); } // Zero new entries to keep scanning logic simple. memset(new_slabs + g_shared_pool.capacity, 0, (new_cap - g_shared_pool.capacity) * sizeof(SuperSlab*)); g_shared_pool.slabs = new_slabs; g_shared_pool.capacity = new_cap; } void shared_pool_init(void) { // Idempotent init; safe to call from multiple early paths. // pthread_mutex_t with static initializer is already valid. pthread_mutex_lock(&g_shared_pool.alloc_lock); if (g_shared_pool.capacity == 0 && g_shared_pool.slabs == NULL) { shared_pool_ensure_capacity_unlocked(16); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); } // ============================================================================ // Phase 12: SP-SLOT Box - Modular Helper Functions // ============================================================================ // ---------- Layer 1: Slot Operations (Low-level) ---------- // Find first unused slot in SharedSSMeta // P0-5: Uses atomic load for state check // Returns: slot_idx on success, -1 if no unused slots static int sp_slot_find_unused(SharedSSMeta* meta) { if (!meta) return -1; for (int i = 0; i < meta->total_slots; i++) { SlotState state = atomic_load_explicit(&meta->slots[i].state, memory_order_acquire); if (state == SLOT_UNUSED) { return i; } } return -1; } // Mark slot as ACTIVE (UNUSED→ACTIVE or EMPTY→ACTIVE) // P0-5: Uses atomic store for state transition (caller must hold mutex!) // Returns: 0 on success, -1 on error static int sp_slot_mark_active(SharedSSMeta* meta, int slot_idx, int class_idx) { if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1; if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1; SharedSlot* slot = &meta->slots[slot_idx]; // Load state atomically SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire); // Transition: UNUSED→ACTIVE or EMPTY→ACTIVE if (state == SLOT_UNUSED || state == SLOT_EMPTY) { atomic_store_explicit(&slot->state, SLOT_ACTIVE, memory_order_release); slot->class_idx = (uint8_t)class_idx; slot->slab_idx = (uint8_t)slot_idx; meta->active_slots++; return 0; } return -1; // Already ACTIVE or invalid state } // Mark slot as EMPTY (ACTIVE→EMPTY) // P0-5: Uses atomic store for state transition (caller must hold mutex!) // Returns: 0 on success, -1 on error static int sp_slot_mark_empty(SharedSSMeta* meta, int slot_idx) { if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1; SharedSlot* slot = &meta->slots[slot_idx]; // Load state atomically SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire); if (state == SLOT_ACTIVE) { atomic_store_explicit(&slot->state, SLOT_EMPTY, memory_order_release); if (meta->active_slots > 0) { meta->active_slots--; } return 0; } return -1; // Not ACTIVE } // Sync SP-SLOT view from an existing SuperSlab. // This is needed when a legacy-allocated SuperSlab reaches the shared-pool // release path for the first time (slot states are still SLOT_UNUSED). 
static void sp_meta_sync_slots_from_ss(SharedSSMeta* meta, SuperSlab* ss) { if (!meta || !ss) return; int cap = ss_slabs_capacity(ss); if (cap > MAX_SLOTS_PER_SS) { cap = MAX_SLOTS_PER_SS; } meta->total_slots = (uint8_t)cap; meta->active_slots = 0; for (int i = 0; i < cap; i++) { SlotState state = SLOT_UNUSED; uint32_t bit = (1u << i); if (ss->slab_bitmap & bit) { state = SLOT_ACTIVE; meta->active_slots++; } else { TinySlabMeta* smeta = &ss->slabs[i]; uint16_t used = atomic_load_explicit(&smeta->used, memory_order_relaxed); if (smeta->capacity > 0 && used == 0) { state = SLOT_EMPTY; } } uint8_t cls = ss->class_map[i]; if (cls == 255) { cls = ss->slabs[i].class_idx; } meta->slots[i].class_idx = cls; meta->slots[i].slab_idx = (uint8_t)i; atomic_store_explicit(&meta->slots[i].state, state, memory_order_release); } } // ---------- Layer 2: Metadata Management (Mid-level) ---------- // Ensure ss_metadata array has capacity for at least min_count entries // Caller must hold alloc_lock // Returns: 0 on success, -1 if capacity exceeded // RACE FIX: No realloc! Fixed-size array prevents race with lock-free Stage 2 static int sp_meta_ensure_capacity(uint32_t min_count) { if (min_count > MAX_SS_METADATA_ENTRIES) { #if !HAKMEM_BUILD_RELEASE static int warn_once = 0; if (warn_once == 0) { fprintf(stderr, "[SP_META_CAPACITY_ERROR] Exceeded MAX_SS_METADATA_ENTRIES=%d\n", MAX_SS_METADATA_ENTRIES); warn_once = 1; } #endif return -1; } return 0; } // Find SharedSSMeta for given SuperSlab, or create if not exists // Caller must hold alloc_lock // Returns: SharedSSMeta* on success, NULL on error static SharedSSMeta* sp_meta_find_or_create(SuperSlab* ss) { if (!ss) return NULL; // RACE FIX: Load count atomically for consistency (even under mutex) uint32_t count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_relaxed); // Search existing metadata for (uint32_t i = 0; i < count; i++) { // RACE FIX: Load pointer atomically for consistency SuperSlab* meta_ss = atomic_load_explicit(&g_shared_pool.ss_metadata[i].ss, memory_order_relaxed); if (meta_ss == ss) { return &g_shared_pool.ss_metadata[i]; } } // Create new metadata entry if (sp_meta_ensure_capacity(count + 1) != 0) { return NULL; } // RACE FIX: Read current count atomically (even under mutex for consistency) uint32_t current_count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_relaxed); SharedSSMeta* meta = &g_shared_pool.ss_metadata[current_count]; // RACE FIX: Store SuperSlab pointer atomically (visible to lock-free Stage 2) atomic_store_explicit(&meta->ss, ss, memory_order_relaxed); meta->total_slots = (uint8_t)ss_slabs_capacity(ss); meta->active_slots = 0; // Initialize all slots as UNUSED // P0-5: Use atomic store for state initialization for (int i = 0; i < meta->total_slots; i++) { atomic_store_explicit(&meta->slots[i].state, SLOT_UNUSED, memory_order_relaxed); meta->slots[i].class_idx = 0; meta->slots[i].slab_idx = (uint8_t)i; } // RACE FIX: Atomic increment with release semantics // This ensures all writes to metadata[current_count] (lines 268-278) are visible // before the count increment is visible to lock-free Stage 2 readers atomic_fetch_add_explicit(&g_shared_pool.ss_meta_count, 1, memory_order_release); return meta; } // ============================================================================ // Phase 12-1.x: Acquire Helper Boxes (Stage 0.5/1/2/3) // ============================================================================ // Debug / stats helper (Stage hits) static inline void 
sp_stage_stats_dump_if_enabled(void) {
#if !HAKMEM_BUILD_RELEASE
    static int dump_en = -1;
    if (__builtin_expect(dump_en == -1, 0)) {
        const char* e = getenv("HAKMEM_SHARED_POOL_STAGE_STATS");
        dump_en = (e && *e && *e != '0') ? 1 : 0;
    }
    if (!dump_en) return;
    // Print totals across all classes (to see the scan/hit distribution).
    uint64_t s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    for (int c = 0; c < TINY_NUM_CLASSES_SS; c++) {
        s0 += atomic_load_explicit(&g_sp_stage0_hits[c], memory_order_relaxed);
        s1 += atomic_load_explicit(&g_sp_stage1_hits[c], memory_order_relaxed);
        s2 += atomic_load_explicit(&g_sp_stage2_hits[c], memory_order_relaxed);
        s3 += atomic_load_explicit(&g_sp_stage3_hits[c], memory_order_relaxed);
    }
    fprintf(stderr, "[SP_STAGE_STATS] total: stage0.5=%lu stage1=%lu stage2=%lu stage3=%lu\n",
            (unsigned long)s0, (unsigned long)s1, (unsigned long)s2, (unsigned long)s3);
#else
    (void)g_sp_stage0_hits; (void)g_sp_stage1_hits; (void)g_sp_stage2_hits; (void)g_sp_stage3_hits;
#endif
}

// Stage 0.5: EMPTY slab direct scan (registry-based EMPTY reuse)
static inline int sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out,
                                             int* slab_idx_out, int dbg_acquire) {
    static int empty_reuse_enabled = -1;
    if (__builtin_expect(empty_reuse_enabled == -1, 0)) {
        const char* e = getenv("HAKMEM_SS_EMPTY_REUSE");
        empty_reuse_enabled = (e && *e && *e == '0') ? 0 : 1;  // default ON
    }
    if (!empty_reuse_enabled) {
        return -1;
    }
    extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
    extern int g_super_reg_class_size[TINY_NUM_CLASSES];
    int reg_size = (class_idx < TINY_NUM_CLASSES) ? g_super_reg_class_size[class_idx] : 0;
    static int scan_limit = -1;
    if (__builtin_expect(scan_limit == -1, 0)) {
        const char* e = getenv("HAKMEM_SS_EMPTY_SCAN_LIMIT");
        scan_limit = (e && *e) ? atoi(e) : 32;  // default: scan first 32 SuperSlabs (Phase 9-2 tuning)
    }
    if (scan_limit > reg_size) scan_limit = reg_size;
    // Stage 0.5 hit counter for visualization
    static _Atomic uint64_t stage05_hits = 0;
    static _Atomic uint64_t stage05_attempts = 0;
    atomic_fetch_add_explicit(&stage05_attempts, 1, memory_order_relaxed);
    for (int i = 0; i < scan_limit; i++) {
        SuperSlab* ss = g_super_reg_by_class[class_idx][i];
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
        if (ss->empty_count == 0) continue;  // No EMPTY slabs in this SS
        uint32_t mask = ss->empty_mask;
        while (mask) {
            int empty_idx = __builtin_ctz(mask);
            mask &= (mask - 1);  // clear lowest bit
            TinySlabMeta* meta = &ss->slabs[empty_idx];
            if (meta->capacity > 0 && meta->used == 0) {
                tiny_tls_slab_reuse_guard(ss);
                ss_clear_slab_empty(ss, empty_idx);
                meta->class_idx = (uint8_t)class_idx;
                ss->class_map[empty_idx] = (uint8_t)class_idx;
#if !HAKMEM_BUILD_RELEASE
                if (dbg_acquire == 1) {
                    fprintf(stderr,
                            "[SP_ACQUIRE_STAGE0.5_EMPTY] class=%d reusing EMPTY slab (ss=%p slab=%d empty_count=%u)\n",
                            class_idx, (void*)ss, empty_idx, ss->empty_count);
                }
#else
                (void)dbg_acquire;
#endif
                *ss_out = ss;
                *slab_idx_out = empty_idx;
                sp_stage_stats_init();
                if (g_sp_stage_stats_enabled) {
                    atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
                }
                atomic_fetch_add_explicit(&stage05_hits, 1, memory_order_relaxed);
                // Stage 0.5 hit rate visualization (every 100 hits)
                uint64_t hits = atomic_load_explicit(&stage05_hits, memory_order_relaxed);
                if (hits % 100 == 1) {
                    uint64_t attempts = atomic_load_explicit(&stage05_attempts, memory_order_relaxed);
                    fprintf(stderr, "[STAGE0.5_STATS] hits=%lu attempts=%lu rate=%.1f%% (scan_limit=%d)\n",
                            hits, attempts, (double)hits * 100.0 / attempts, scan_limit);
                }
                return 0;
            }
        }
    }
    return -1;
}
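// Tuning sketch for the Stage 0.5 scan above (values shown are illustrative;
// the defaults are the ones hard-coded in the function):
//
//   HAKMEM_SS_EMPTY_REUSE=0        ./app   # disable Stage 0.5 entirely
//   HAKMEM_SS_EMPTY_SCAN_LIMIT=64  ./app   # scan up to 64 registered SuperSlabs per class
//
// A larger scan limit can raise the EMPTY-reuse hit rate (fewer Stage 3 mmaps)
// at the cost of a longer linear scan on the acquire path.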
// ---------- Layer 3: Free List Management ----------

// Push empty slot to per-class free list
// Caller must hold alloc_lock
// Returns: 0 on success, -1 if list is full
static int sp_freelist_push(int class_idx, SharedSSMeta* meta, int slot_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
    FreeSlotList* list = &g_shared_pool.free_slots[class_idx];
    if (list->count >= MAX_FREE_SLOTS_PER_CLASS) {
        return -1;  // List full
    }
    list->entries[list->count].meta = meta;
    list->entries[list->count].slot_idx = (uint8_t)slot_idx;
    list->count++;
    return 0;
}

// Pop empty slot from per-class free list
// Caller must hold alloc_lock
// Returns: 1 if popped (out params filled), 0 if list empty
static int sp_freelist_pop(int class_idx, SharedSSMeta** out_meta, int* out_slot_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return 0;
    if (!out_meta || !out_slot_idx) return 0;
    FreeSlotList* list = &g_shared_pool.free_slots[class_idx];
    if (list->count == 0) {
        return 0;  // List empty
    }
    // Pop from end (LIFO for cache locality)
    list->count--;
    *out_meta = list->entries[list->count].meta;
    *out_slot_idx = list->entries[list->count].slot_idx;
    return 1;
}

// ============================================================================
// P0-5: Lock-Free Slot Claiming (Stage 2 Optimization)
// ============================================================================
// Try to claim an UNUSED slot via lock-free CAS
// Returns: slot_idx on success, -1 if no UNUSED slots available
// LOCK-FREE: Can be called from any thread without mutex
static int sp_slot_claim_lockfree(SharedSSMeta* meta, int class_idx) {
    if (!meta) return -1;
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
    // Scan all slots for UNUSED state
    for (int i = 0; i < meta->total_slots; i++) {
        SlotState expected = SLOT_UNUSED;
        // Try to claim this slot atomically (UNUSED → ACTIVE)
        if (atomic_compare_exchange_strong_explicit(
                &meta->slots[i].state,
                &expected,
                SLOT_ACTIVE,
                memory_order_acq_rel,  // Success: acquire+release semantics
                memory_order_relaxed   // Failure: just retry next slot
            )) {
            // Successfully claimed! Update non-atomic fields
            // (Safe because we now own this slot)
            meta->slots[i].class_idx = (uint8_t)class_idx;
            meta->slots[i].slab_idx = (uint8_t)i;
            // Increment active_slots counter atomically
            // (Multiple threads may claim slots concurrently)
            atomic_fetch_add_explicit(
                (_Atomic uint8_t*)&meta->active_slots,
                1,
                memory_order_relaxed
            );
            return i;  // Return claimed slot index
        }
        // CAS failed (slot was not UNUSED) - continue to next slot
    }
    return -1;  // No UNUSED slots available
}

// ============================================================================
// P0-4: Lock-Free Free Slot List Operations
// ============================================================================
// Push empty slot to lock-free per-class free list (LIFO)
// LOCK-FREE: Can be called from any thread without mutex
// Returns: 0 on success, -1 on failure (node pool exhausted)
static int sp_freelist_push_lockfree(int class_idx, SharedSSMeta* meta, int slot_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
    // Allocate node from pool
    FreeSlotNode* node = node_alloc(class_idx);
    if (!node) {
        // Fallback: push into legacy per-class free list
        // ASSUME: Caller already holds alloc_lock (e.g., shared_pool_release_slab:772)
        // Do NOT lock again to avoid deadlock on non-recursive mutex!
(void)sp_freelist_push(class_idx, meta, slot_idx); return 0; } // Fill node data node->meta = meta; node->slot_idx = (uint8_t)slot_idx; // Lock-free LIFO push using CAS loop LockFreeFreeList* list = &g_shared_pool.free_slots_lockfree[class_idx]; FreeSlotNode* old_head = atomic_load_explicit(&list->head, memory_order_relaxed); do { node->next = old_head; } while (!atomic_compare_exchange_weak_explicit( &list->head, &old_head, node, memory_order_release, // Success: publish node to other threads memory_order_relaxed // Failure: retry with updated old_head )); return 0; // Success } // Pop empty slot from lock-free per-class free list (LIFO) // LOCK-FREE: Can be called from any thread without mutex // Returns: 1 if popped (out params filled), 0 if list empty static int sp_freelist_pop_lockfree(int class_idx, SharedSSMeta** out_meta, int* out_slot_idx) { if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return 0; if (!out_meta || !out_slot_idx) return 0; LockFreeFreeList* list = &g_shared_pool.free_slots_lockfree[class_idx]; FreeSlotNode* old_head = atomic_load_explicit(&list->head, memory_order_acquire); // Lock-free LIFO pop using CAS loop do { if (old_head == NULL) { return 0; // List empty } } while (!atomic_compare_exchange_weak_explicit( &list->head, &old_head, old_head->next, memory_order_acquire, // Success: acquire node data memory_order_acquire // Failure: retry with updated old_head )); // Extract data from popped node *out_meta = old_head->meta; *out_slot_idx = old_head->slot_idx; // Recycle node back into per-class free list so that long-running workloads // do not permanently consume new nodes on every EMPTY event. FreeSlotNode* free_head = atomic_load_explicit( &g_node_free_head[class_idx], memory_order_acquire); do { old_head->next = free_head; } while (!atomic_compare_exchange_weak_explicit( &g_node_free_head[class_idx], &free_head, old_head, memory_order_release, memory_order_acquire)); return 1; // Success } // Internal helper: Allocates a new SuperSlab from the OS and performs basic initialization. // Does NOT interact with g_shared_pool.slabs[] or g_shared_pool.total_count directly. // Caller is responsible for adding the SuperSlab to g_shared_pool's arrays and metadata. static SuperSlab* sp_internal_allocate_superslab(void) { // Use size_class 0 as a neutral hint; Phase 12 per-slab class_idx is authoritative. extern SuperSlab* superslab_allocate(uint8_t size_class); SuperSlab* ss = superslab_allocate(0); if (!ss) { return NULL; } // PageFaultTelemetry: mark all backing pages for this Superslab (approximate) size_t ss_bytes = (size_t)1 << ss->lg_size; for (size_t off = 0; off < ss_bytes; off += 4096) { pagefault_telemetry_touch(PF_BUCKET_SS_META, (char*)ss + off); } // superslab_allocate() already: // - zeroes slab metadata / remote queues, // - sets magic/lg_size/etc, // - registers in global registry. // For shared-pool semantics we normalize all slab class_idx to UNASSIGNED. int max_slabs = ss_slabs_capacity(ss); for (int i = 0; i < max_slabs; i++) { ss_slab_meta_class_idx_set(ss, i, 255); // UNASSIGNED // P1.1: Initialize class_map to UNASSIGNED as well ss->class_map[i] = 255; } return ss; } SuperSlab* shared_pool_acquire_superslab(void) { shared_pool_init(); pthread_mutex_lock(&g_shared_pool.alloc_lock); // For now, always allocate a fresh SuperSlab and register it. // More advanced reuse/GC comes later. 
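// Step sketch (explanatory only, per the comments in this function): the slow
// OS-level allocation below runs with the pool lock dropped, because
// superslab_allocate() touches the global registry and holding alloc_lock
// across it risks deadlock:
//
//   lock -> unlock -> sp_internal_allocate_superslab() -> lock -> publish -> unlock
//
// Since the lock is released in the middle, total_count/capacity are
// re-checked after the lock is re-acquired and before the new SuperSlab is
// appended to g_shared_pool.slabs[].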
// Release lock to avoid deadlock with registry during superslab_allocate pthread_mutex_unlock(&g_shared_pool.alloc_lock); SuperSlab* ss = sp_internal_allocate_superslab(); // Call lock-free internal helper pthread_mutex_lock(&g_shared_pool.alloc_lock); if (!ss) { pthread_mutex_unlock(&g_shared_pool.alloc_lock); return NULL; } // Add newly allocated SuperSlab to the shared pool's internal array if (g_shared_pool.total_count >= g_shared_pool.capacity) { shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1); if (g_shared_pool.total_count >= g_shared_pool.capacity) { // Pool table expansion failed; leave ss alive (registry-owned), // but do not treat it as part of shared_pool. pthread_mutex_unlock(&g_shared_pool.alloc_lock); return NULL; } } g_shared_pool.slabs[g_shared_pool.total_count] = ss; g_shared_pool.total_count++; // Not counted as active until at least one slab is assigned. pthread_mutex_unlock(&g_shared_pool.alloc_lock); return ss; } // ---------- Layer 4: Public API (High-level) ---------- // Ensure slab geometry matches current class stride (handles upgrades like C7 1024->2048). static inline void sp_fix_geometry_if_needed(SuperSlab* ss, int slab_idx, int class_idx) { if (!ss || slab_idx < 0 || class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { return; } TinySlabMeta* meta = &ss->slabs[slab_idx]; size_t stride = g_tiny_class_sizes[class_idx]; size_t usable = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE; uint16_t expect_cap = (uint16_t)(usable / stride); // Reinitialize if capacity is off or class_idx mismatches. if (meta->class_idx != (uint8_t)class_idx || meta->capacity != expect_cap) { #if !HAKMEM_BUILD_RELEASE extern __thread int g_hakmem_lock_depth; g_hakmem_lock_depth++; fprintf(stderr, "[SP_FIX_GEOMETRY] ss=%p slab=%d cls=%d: old_cls=%u old_cap=%u -> new_cls=%d new_cap=%u (stride=%zu)\n", (void*)ss, slab_idx, class_idx, meta->class_idx, meta->capacity, class_idx, expect_cap, stride); g_hakmem_lock_depth--; #endif superslab_init_slab(ss, slab_idx, stride, 0 /*owner_tid*/); meta->class_idx = (uint8_t)class_idx; // P1.1: Update class_map after geometry fix ss->class_map[slab_idx] = (uint8_t)class_idx; } } int shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out) { // Phase 12: SP-SLOT Box - 3-Stage Acquire Logic // // Stage 1: Reuse EMPTY slots from per-class free list (EMPTY→ACTIVE) // Stage 2: Find UNUSED slots in existing SuperSlabs // Stage 3: Get new SuperSlab (LRU pop or mmap) // // Invariants: // - On success: *ss_out != NULL, 0 <= *slab_idx_out < total_slots // - The chosen slab has meta->class_idx == class_idx if (!ss_out || !slab_idx_out) { return -1; } if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { return -1; } shared_pool_init(); // Debug logging / stage stats #if !HAKMEM_BUILD_RELEASE static int dbg_acquire = -1; if (__builtin_expect(dbg_acquire == -1, 0)) { const char* e = getenv("HAKMEM_SS_ACQUIRE_DEBUG"); dbg_acquire = (e && *e && *e != '0') ? 1 : 0; } #else static const int dbg_acquire = 0; #endif sp_stage_stats_init(); stage1_retry_after_tension_drain: // ========== Stage 0.5 (Phase 12-1.1): EMPTY slab direct scan ========== // Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to // avoid Stage 3 (mmap) when freed slabs are available. if (sp_acquire_from_empty_scan(class_idx, ss_out, slab_idx_out, dbg_acquire) == 0) { return 0; } // ========== Stage 1 (Lock-Free): Try to reuse EMPTY slots ========== // P0-4: Lock-free pop from per-class free list (no mutex needed!) 
// Best case: Same class freed a slot, reuse immediately (cache-hot) SharedSSMeta* reuse_meta = NULL; int reuse_slot_idx = -1; if (sp_freelist_pop_lockfree(class_idx, &reuse_meta, &reuse_slot_idx)) { // Found EMPTY slot from lock-free list! // Now acquire mutex ONLY for slot activation and metadata update // P0 instrumentation: count lock acquisitions lock_stats_init(); if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_acquire_count, 1); atomic_fetch_add(&g_lock_acquire_slab_count, 1); } pthread_mutex_lock(&g_shared_pool.alloc_lock); // P0.3: Guard against TLS SLL orphaned pointers before reusing slab // RACE FIX: Load SuperSlab pointer atomically BEFORE guard (consistency) SuperSlab* ss_guard = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed); if (ss_guard) { tiny_tls_slab_reuse_guard(ss_guard); } // Activate slot under mutex (slot state transition requires protection) if (sp_slot_mark_active(reuse_meta, reuse_slot_idx, class_idx) == 0) { // RACE FIX: Load SuperSlab pointer atomically (consistency) SuperSlab* ss = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed); // RACE FIX: Check if SuperSlab was freed (NULL pointer) // This can happen if Thread A freed the SuperSlab after pushing slot to freelist, // but Thread B popped the stale slot before the freelist was cleared. if (!ss) { // SuperSlab freed - skip and fall through to Stage 2/3 if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); goto stage2_fallback; } #if !HAKMEM_BUILD_RELEASE if (dbg_acquire == 1) { fprintf(stderr, "[SP_ACQUIRE_STAGE1_LOCKFREE] class=%d reusing EMPTY slot (ss=%p slab=%d)\n", class_idx, (void*)ss, reuse_slot_idx); } #endif // Update SuperSlab metadata ss->slab_bitmap |= (1u << reuse_slot_idx); ss_slab_meta_class_idx_set(ss, reuse_slot_idx, (uint8_t)class_idx); if (ss->active_slabs == 0) { // Was empty, now active again ss->active_slabs = 1; g_shared_pool.active_count++; } // Track per-class active slots (approximate, under alloc_lock) if (class_idx < TINY_NUM_CLASSES_SS) { g_shared_pool.class_active_slots[class_idx]++; } // Update hint g_shared_pool.class_hints[class_idx] = ss; *ss_out = ss; *slab_idx_out = reuse_slot_idx; if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); if (g_sp_stage_stats_enabled) { atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1); } return 0; // ✅ Stage 1 (lock-free) success } // Slot activation failed (race condition?) - release lock and fall through if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); } stage2_fallback: // ========== Stage 2 (Lock-Free): Try to claim UNUSED slots ========== // P0-5: Lock-free atomic CAS claiming (no mutex needed for slot state transition!) // RACE FIX: Read ss_meta_count atomically (now properly declared as _Atomic) // No cast needed! 
memory_order_acquire synchronizes with release in sp_meta_find_or_create uint32_t meta_count = atomic_load_explicit( &g_shared_pool.ss_meta_count, memory_order_acquire ); for (uint32_t i = 0; i < meta_count; i++) { SharedSSMeta* meta = &g_shared_pool.ss_metadata[i]; // Try lock-free claiming (UNUSED → ACTIVE via CAS) int claimed_idx = sp_slot_claim_lockfree(meta, class_idx); if (claimed_idx >= 0) { // RACE FIX: Load SuperSlab pointer atomically (critical for lock-free Stage 2) // Use memory_order_acquire to synchronize with release in sp_meta_find_or_create SuperSlab* ss = atomic_load_explicit(&meta->ss, memory_order_acquire); if (!ss) { // SuperSlab was freed between claiming and loading - skip this entry continue; } #if !HAKMEM_BUILD_RELEASE if (dbg_acquire == 1) { fprintf(stderr, "[SP_ACQUIRE_STAGE2_LOCKFREE] class=%d claimed UNUSED slot (ss=%p slab=%d)\n", class_idx, (void*)ss, claimed_idx); } #endif // P0 instrumentation: count lock acquisitions lock_stats_init(); if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_acquire_count, 1); atomic_fetch_add(&g_lock_acquire_slab_count, 1); } pthread_mutex_lock(&g_shared_pool.alloc_lock); // Update SuperSlab metadata under mutex ss->slab_bitmap |= (1u << claimed_idx); ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx); if (ss->active_slabs == 0) { ss->active_slabs = 1; g_shared_pool.active_count++; } if (class_idx < TINY_NUM_CLASSES_SS) { g_shared_pool.class_active_slots[class_idx]++; } // Update hint g_shared_pool.class_hints[class_idx] = ss; *ss_out = ss; *slab_idx_out = claimed_idx; sp_fix_geometry_if_needed(ss, claimed_idx, class_idx); if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); if (g_sp_stage_stats_enabled) { atomic_fetch_add(&g_sp_stage2_hits[class_idx], 1); } return 0; // ✅ Stage 2 (lock-free) success } // Claim failed (no UNUSED slots in this meta) - continue to next SuperSlab } // ========== Tension-Based Drain: Try to create EMPTY slots before Stage 3 ========== // If TLS SLL has accumulated blocks, drain them to enable EMPTY slot detection // This can avoid allocating new SuperSlabs by reusing EMPTY slots in Stage 1 // ENV: HAKMEM_TINY_TENSION_DRAIN_ENABLE=0 to disable (default=1) // ENV: HAKMEM_TINY_TENSION_DRAIN_THRESHOLD=N to set threshold (default=1024) { static int tension_drain_enabled = -1; static uint32_t tension_threshold = 1024; if (tension_drain_enabled < 0) { const char* env = getenv("HAKMEM_TINY_TENSION_DRAIN_ENABLE"); tension_drain_enabled = (env == NULL || atoi(env) != 0) ? 1 : 0; const char* thresh_env = getenv("HAKMEM_TINY_TENSION_DRAIN_THRESHOLD"); if (thresh_env) { tension_threshold = (uint32_t)atoi(thresh_env); if (tension_threshold < 64) tension_threshold = 64; if (tension_threshold > 65536) tension_threshold = 65536; } } if (tension_drain_enabled) { extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES]; extern uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size); uint32_t sll_count = (class_idx < TINY_NUM_CLASSES) ? 
g_tls_sll[class_idx].count : 0; if (sll_count >= tension_threshold) { // Drain all blocks to maximize EMPTY slot creation uint32_t drained = tiny_tls_sll_drain(class_idx, 0); // 0 = drain all if (drained > 0) { // Retry Stage 1 (EMPTY reuse) after drain // Some slabs might have become EMPTY (meta->used == 0) goto stage1_retry_after_tension_drain; } } } } // ========== Stage 3: Mutex-protected fallback (new SuperSlab allocation) ========== // All existing SuperSlabs have no UNUSED slots → need new SuperSlab // P0 instrumentation: count lock acquisitions lock_stats_init(); if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_acquire_count, 1); atomic_fetch_add(&g_lock_acquire_slab_count, 1); } pthread_mutex_lock(&g_shared_pool.alloc_lock); // ========== Stage 3: Get new SuperSlab ========== // Try LRU cache first, then mmap SuperSlab* new_ss = NULL; // Stage 3a: Try LRU cache extern SuperSlab* hak_ss_lru_pop(uint8_t size_class); new_ss = hak_ss_lru_pop((uint8_t)class_idx); int from_lru = (new_ss != NULL); // Stage 3b: If LRU miss, allocate new SuperSlab if (!new_ss) { // Release the alloc_lock to avoid deadlock with registry during superslab_allocate if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); SuperSlab* allocated_ss = sp_internal_allocate_superslab(); // Re-acquire the alloc_lock if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_acquire_count, 1); atomic_fetch_add(&g_lock_acquire_slab_count, 1); // This is part of acquisition path } pthread_mutex_lock(&g_shared_pool.alloc_lock); if (!allocated_ss) { // Allocation failed; return now. if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); return -1; // Out of memory } new_ss = allocated_ss; // Add newly allocated SuperSlab to the shared pool's internal array if (g_shared_pool.total_count >= g_shared_pool.capacity) { shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1); if (g_shared_pool.total_count >= g_shared_pool.capacity) { // Pool table expansion failed; leave ss alive (registry-owned), // but do not treat it as part of shared_pool. // This is a critical error, return early. if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); return -1; } } g_shared_pool.slabs[g_shared_pool.total_count] = new_ss; g_shared_pool.total_count++; } #if !HAKMEM_BUILD_RELEASE if (dbg_acquire == 1 && new_ss) { fprintf(stderr, "[SP_ACQUIRE_STAGE3] class=%d new SuperSlab (ss=%p from_lru=%d)\n", class_idx, (void*)new_ss, from_lru); } #endif if (!new_ss) { if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); return -1; // ❌ Out of memory } // Before creating a new SuperSlab, consult learning-layer soft cap. // If current active slots for this class already exceed the policy cap, // fail early so caller can fall back to legacy backend. 
uint32_t limit = sp_class_active_limit(class_idx); if (limit > 0) { uint32_t cur = g_shared_pool.class_active_slots[class_idx]; if (cur >= limit) { if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); return -1; // Soft cap reached for this class } } // Create metadata for this new SuperSlab SharedSSMeta* new_meta = sp_meta_find_or_create(new_ss); if (!new_meta) { if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); return -1; // ❌ Metadata allocation failed } // Assign first slot to this class int first_slot = 0; if (sp_slot_mark_active(new_meta, first_slot, class_idx) != 0) { if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); return -1; // ❌ Should not happen } // Update SuperSlab metadata new_ss->slab_bitmap |= (1u << first_slot); ss_slab_meta_class_idx_set(new_ss, first_slot, (uint8_t)class_idx); new_ss->active_slabs = 1; g_shared_pool.active_count++; if (class_idx < TINY_NUM_CLASSES_SS) { g_shared_pool.class_active_slots[class_idx]++; } // Update hint g_shared_pool.class_hints[class_idx] = new_ss; *ss_out = new_ss; *slab_idx_out = first_slot; sp_fix_geometry_if_needed(new_ss, first_slot, class_idx); if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); if (g_sp_stage_stats_enabled) { atomic_fetch_add(&g_sp_stage3_hits[class_idx], 1); } return 0; // ✅ Stage 3 success } void shared_pool_release_slab(SuperSlab* ss, int slab_idx) { // Phase 12: SP-SLOT Box - Slot-based Release // // Flow: // 1. Validate inputs and check meta->used == 0 // 2. Find SharedSSMeta for this SuperSlab // 3. Mark slot ACTIVE → EMPTY // 4. Push to per-class free list (enables same-class reuse) // 5. If all slots EMPTY → superslab_free() → LRU cache if (!ss) { return; } if (slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) { return; } // Debug logging #if !HAKMEM_BUILD_RELEASE static int dbg = -1; if (__builtin_expect(dbg == -1, 0)) { const char* e = getenv("HAKMEM_SS_FREE_DEBUG"); dbg = (e && *e && *e != '0') ? 
1 : 0; } #else static const int dbg = 0; #endif // P0 instrumentation: count lock acquisitions lock_stats_init(); if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_acquire_count, 1); atomic_fetch_add(&g_lock_release_slab_count, 1); } pthread_mutex_lock(&g_shared_pool.alloc_lock); TinySlabMeta* slab_meta = &ss->slabs[slab_idx]; if (slab_meta->used != 0) { // Not actually empty; nothing to do if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); return; } uint8_t class_idx = slab_meta->class_idx; #if !HAKMEM_BUILD_RELEASE if (dbg == 1) { fprintf(stderr, "[SP_SLOT_RELEASE] ss=%p slab_idx=%d class=%d used=0 (marking EMPTY)\n", (void*)ss, slab_idx, class_idx); } #endif // Find SharedSSMeta for this SuperSlab SharedSSMeta* sp_meta = NULL; uint32_t count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_relaxed); for (uint32_t i = 0; i < count; i++) { // RACE FIX: Load pointer atomically SuperSlab* meta_ss = atomic_load_explicit(&g_shared_pool.ss_metadata[i].ss, memory_order_relaxed); if (meta_ss == ss) { sp_meta = &g_shared_pool.ss_metadata[i]; break; } } if (!sp_meta) { // SuperSlab not in SP-SLOT system yet - create metadata sp_meta = sp_meta_find_or_create(ss); if (!sp_meta) { pthread_mutex_unlock(&g_shared_pool.alloc_lock); return; // Failed to create metadata } } // Mark slot as EMPTY (ACTIVE → EMPTY) uint32_t slab_bit = (1u << slab_idx); SlotState slot_state = atomic_load_explicit( &sp_meta->slots[slab_idx].state, memory_order_acquire); if (slot_state != SLOT_ACTIVE && (ss->slab_bitmap & slab_bit)) { // Legacy path import: rebuild slot states from SuperSlab bitmap/class_map sp_meta_sync_slots_from_ss(sp_meta, ss); slot_state = atomic_load_explicit( &sp_meta->slots[slab_idx].state, memory_order_acquire); } if (slot_state != SLOT_ACTIVE || sp_slot_mark_empty(sp_meta, slab_idx) != 0) { if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); return; // Slot wasn't ACTIVE } // Update SuperSlab metadata uint32_t bit = (1u << slab_idx); if (ss->slab_bitmap & bit) { ss->slab_bitmap &= ~bit; slab_meta->class_idx = 255; // UNASSIGNED // P1.1: Mark class_map as UNASSIGNED when releasing slab ss->class_map[slab_idx] = 255; if (ss->active_slabs > 0) { ss->active_slabs--; if (ss->active_slabs == 0 && g_shared_pool.active_count > 0) { g_shared_pool.active_count--; } } if (class_idx < TINY_NUM_CLASSES_SS && g_shared_pool.class_active_slots[class_idx] > 0) { g_shared_pool.class_active_slots[class_idx]--; } } // P0-4: Push to lock-free per-class free list (enables reuse by same class) // Note: push BEFORE releasing mutex (slot state already updated under lock) if (class_idx < TINY_NUM_CLASSES_SS) { sp_freelist_push_lockfree(class_idx, sp_meta, slab_idx); #if !HAKMEM_BUILD_RELEASE if (dbg == 1) { fprintf(stderr, "[SP_SLOT_FREELIST_LOCKFREE] class=%d pushed slot (ss=%p slab=%d) active_slots=%u/%u\n", class_idx, (void*)ss, slab_idx, sp_meta->active_slots, sp_meta->total_slots); } #endif } // Check if SuperSlab is now completely empty (all slots EMPTY or UNUSED) if (sp_meta->active_slots == 0) { #if !HAKMEM_BUILD_RELEASE if (dbg == 1) { fprintf(stderr, "[SP_SLOT_COMPLETELY_EMPTY] ss=%p active_slots=0 (calling superslab_free)\n", (void*)ss); } #endif if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } // RACE FIX: Set meta->ss to NULL BEFORE unlocking mutex // This prevents Stage 2 from accessing freed SuperSlab 
atomic_store_explicit(&sp_meta->ss, NULL, memory_order_release); pthread_mutex_unlock(&g_shared_pool.alloc_lock); // Remove from legacy backend list (if present) to prevent dangling pointers extern void remove_superslab_from_legacy_head(SuperSlab* ss); remove_superslab_from_legacy_head(ss); // Free SuperSlab: // 1. Try LRU cache (hak_ss_lru_push) - lazy deallocation // 2. Or munmap if LRU is full - eager deallocation extern void superslab_free(SuperSlab* ss); superslab_free(ss); return; } if (g_lock_stats_enabled == 1) { atomic_fetch_add(&g_lock_release_count, 1); } pthread_mutex_unlock(&g_shared_pool.alloc_lock); }
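// ============================================================================
// Illustrative usage sketch (hypothetical caller; not part of this file's
// API): how a tiny-class refill path is expected to drive the acquire/release
// pair above. sp_example_refill_one_slab and the "carve blocks" step are
// placeholders, not existing HAKMEM functions.
// ============================================================================
static inline __attribute__((unused)) int sp_example_refill_one_slab(int class_idx) {
    SuperSlab* ss = NULL;
    int slab_idx = -1;
    // Stage 0.5/1/2 reuse an existing slab when possible; Stage 3 may mmap a
    // new SuperSlab or fail if the per-class soft cap is reached.
    if (shared_pool_acquire_slab(class_idx, &ss, &slab_idx) != 0) {
        return -1;  // Out of memory or soft cap reached: fall back to legacy backend
    }
    // ... carve blocks of class class_idx out of slab `slab_idx` of `ss` here ...
    // Much later, once every block of that slab has been freed again
    // (meta->used == 0), the owner hands the slab back so any class can reuse it:
    shared_pool_release_slab(ss, slab_idx);
    return 0;
}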