hakmem/core/hakmem_shared_pool.c
Moe Charm (CI) e769dec283 Refactor: Clean up SuperSlab shared pool code
- Removed unused/disabled L0 cache implementation from core/hakmem_shared_pool.c.
- Deleted stale backup file core/hakmem_tiny_superslab.c.bak.
- Removed untracked and obsolete shared_pool source files.
2025-11-30 15:27:53 +09:00

#include "hakmem_shared_pool.h"
#include "hakmem_tiny_superslab.h"
#include "hakmem_tiny_superslab_constants.h"
#include "hakmem_debug_master.h" // Phase 4b: Master debug control
#include "hakmem_stats_master.h" // Phase 4d: Master stats control
#include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary
#include "box/ss_hot_cold_box.h" // Phase 12-1.1: EMPTY slab marking
#include "box/pagefault_telemetry_box.h" // Box PageFaultTelemetry (PF_BUCKET_SS_META)
#include "box/tls_sll_drain_box.h" // Box TLS SLL Drain (tiny_tls_sll_drain)
#include "box/tls_slab_reuse_guard_box.h" // Box TLS Slab Reuse Guard (P0.3)
#include "hakmem_policy.h" // FrozenPolicy (learning layer)
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/mman.h> // For mmap/munmap (used in shared_pool_ensure_capacity_unlocked)
// ============================================================================
// P0 Lock Contention Instrumentation (Debug build only; counters defined always)
// ============================================================================
static _Atomic uint64_t g_lock_acquire_count = 0; // Total lock acquisitions
static _Atomic uint64_t g_lock_release_count = 0; // Total lock releases
static _Atomic uint64_t g_lock_acquire_slab_count = 0; // Locks from acquire_slab path
static _Atomic uint64_t g_lock_release_slab_count = 0; // Locks from release_slab path
static int g_lock_stats_enabled = -1; // -1=uninitialized, 0=off, 1=on
#if !HAKMEM_BUILD_RELEASE
// Initialize lock stats from environment variable
// Phase 4b: Now uses hak_debug_check() for master debug control support
static inline void lock_stats_init(void) {
if (__builtin_expect(g_lock_stats_enabled == -1, 0)) {
g_lock_stats_enabled = hak_debug_check("HAKMEM_SHARED_POOL_LOCK_STATS");
}
}
// Report lock statistics at shutdown
static void __attribute__((destructor)) lock_stats_report(void) {
if (g_lock_stats_enabled != 1) {
return;
}
uint64_t acquires = atomic_load(&g_lock_acquire_count);
uint64_t releases = atomic_load(&g_lock_release_count);
uint64_t acquire_path = atomic_load(&g_lock_acquire_slab_count);
uint64_t release_path = atomic_load(&g_lock_release_slab_count);
fprintf(stderr, "\n=== SHARED POOL LOCK STATISTICS ===\n");
fprintf(stderr, "Total lock ops: %lu (acquire) + %lu (release) = %lu\n",
acquires, releases, acquires + releases);
fprintf(stderr, "Balance: %ld (should be 0)\n",
(int64_t)acquires - (int64_t)releases);
fprintf(stderr, "\n--- Breakdown by Code Path ---\n");
fprintf(stderr, "acquire_slab(): %lu (%.1f%%)\n",
acquire_path, 100.0 * acquire_path / (acquires ? acquires : 1));
fprintf(stderr, "release_slab(): %lu (%.1f%%)\n",
release_path, 100.0 * release_path / (acquires ? acquires : 1));
fprintf(stderr, "===================================\n");
fflush(stderr);
}
#else
// Release build: No-op stubs
static inline void lock_stats_init(void) {
if (__builtin_expect(g_lock_stats_enabled == -1, 0)) {
g_lock_stats_enabled = 0;
}
}
#endif
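// Usage sketch (debug builds only; assumes hak_debug_check() honors a simple
// "set the variable to 1" style env toggle, which is an assumption about that
// helper, not verified here):
//
//   $ HAKMEM_SHARED_POOL_LOCK_STATS=1 ./your_app        # hypothetical binary
//
// At process exit, lock_stats_report() prints the acquire/release totals and
// the per-path breakdown above to stderr.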
// ============================================================================
// SP Acquire Stage Statistics (Stage1/2/3 breakdown)
// ============================================================================
static _Atomic uint64_t g_sp_stage1_hits[TINY_NUM_CLASSES_SS];
static _Atomic uint64_t g_sp_stage2_hits[TINY_NUM_CLASSES_SS];
static _Atomic uint64_t g_sp_stage3_hits[TINY_NUM_CLASSES_SS];
// Data collection gate (0=off, 1=on). Also enabled from the learning layer.
static int g_sp_stage_stats_enabled = 0;
#if !HAKMEM_BUILD_RELEASE
// Logging gate for the destructor report. ENV: HAKMEM_SHARED_POOL_STAGE_STATS
static int g_sp_stage_stats_log_enabled = -1; // -1=uninitialized, 0=off, 1=on
static inline void sp_stage_stats_init(void) {
// Phase 4d: Now uses hak_stats_check() for unified stats control
if (__builtin_expect(g_sp_stage_stats_log_enabled == -1, 0)) {
g_sp_stage_stats_log_enabled = hak_stats_check("HAKMEM_SHARED_POOL_STAGE_STATS", "pool");
if (g_sp_stage_stats_log_enabled == 1) {
// If logging is enabled, force measurement on as well.
g_sp_stage_stats_enabled = 1;
}
}
}
static void __attribute__((destructor)) sp_stage_stats_report(void) {
if (g_sp_stage_stats_log_enabled != 1) {
return;
}
fprintf(stderr, "\n=== SHARED POOL STAGE STATISTICS ===\n");
fprintf(stderr, "Per-class acquire_slab() stage hits (Stage1=EMPTY, Stage2=UNUSED, Stage3=new SS)\n");
for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
uint64_t s1 = atomic_load(&g_sp_stage1_hits[cls]);
uint64_t s2 = atomic_load(&g_sp_stage2_hits[cls]);
uint64_t s3 = atomic_load(&g_sp_stage3_hits[cls]);
uint64_t total = s1 + s2 + s3;
if (total == 0) continue; // Skip unused classes
double p1 = 100.0 * (double)s1 / (double)total;
double p2 = 100.0 * (double)s2 / (double)total;
double p3 = 100.0 * (double)s3 / (double)total;
fprintf(stderr,
"Class %d: total=%llu S1=%llu (%.1f%%) S2=%llu (%.1f%%) S3=%llu (%.1f%%)\n",
cls,
(unsigned long long)total,
(unsigned long long)s1, p1,
(unsigned long long)s2, p2,
(unsigned long long)s3, p3);
}
fprintf(stderr, "====================================\n");
fflush(stderr);
}
#else
// Release build: No-op stubs
static inline void sp_stage_stats_init(void) {}
#endif
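// Usage sketch (debug builds; assumes hak_stats_check() enables on a non-zero
// env value):
//
//   $ HAKMEM_SHARED_POOL_STAGE_STATS=1 ./your_app       # hypothetical binary
//
// Reading the report: a large Stage3 share means acquire_slab() keeps falling
// through to new-SuperSlab allocation (LRU pop or mmap) instead of reusing
// EMPTY/UNUSED slots.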
// Snapshot Tiny-related backend metrics for learner / observability.
void
shared_pool_tiny_metrics_snapshot(uint64_t stage1[TINY_NUM_CLASSES_SS],
uint64_t stage2[TINY_NUM_CLASSES_SS],
uint64_t stage3[TINY_NUM_CLASSES_SS],
uint32_t active_slots[TINY_NUM_CLASSES_SS])
{
// Make sure the env-based logging configuration is initialized first.
sp_stage_stats_init();
// When called from the learning layer, measurement itself is always enabled (logging remains env-controlled).
g_sp_stage_stats_enabled = 1;
for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
if (stage1) {
stage1[cls] = atomic_load_explicit(&g_sp_stage1_hits[cls],
memory_order_relaxed);
}
if (stage2) {
stage2[cls] = atomic_load_explicit(&g_sp_stage2_hits[cls],
memory_order_relaxed);
}
if (stage3) {
stage3[cls] = atomic_load_explicit(&g_sp_stage3_hits[cls],
memory_order_relaxed);
}
if (active_slots) {
active_slots[cls] = g_shared_pool.class_active_slots[cls];
}
}
}
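// Caller sketch (hypothetical learner-side code, not part of this file):
//
//   uint64_t s1[TINY_NUM_CLASSES_SS], s2[TINY_NUM_CLASSES_SS];
//   uint64_t s3[TINY_NUM_CLASSES_SS];
//   uint32_t active[TINY_NUM_CLASSES_SS];
//   shared_pool_tiny_metrics_snapshot(s1, s2, s3, active);
//
// Any of the output arrays may be passed as NULL when that metric is not
// needed; the stage counters are cumulative since process start.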
// Helper: return per-class active slot limit from FrozenPolicy.tiny_cap[]
// Semantics:
// - tiny_cap[class] == 0 → no limit (unbounded)
// - otherwise: soft cap on ACTIVE slots managed by shared pool for this class.
static inline uint32_t sp_class_active_limit(int class_idx) {
const FrozenPolicy* pol = hkm_policy_get();
if (!pol) {
return 0; // no limit
}
if (class_idx < 0 || class_idx >= 8) {
return 0;
}
return (uint32_t)pol->tiny_cap[class_idx];
}
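// Example: with pol->tiny_cap[3] == 64, acquire_slab() for class 3 stops
// handing out slots from brand-new SuperSlabs once 64 slots are already
// active for that class (the cap is only enforced on the new-SuperSlab path;
// see Stage 3 below). tiny_cap[3] == 0 removes the cap entirely.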
// ============================================================================
// P0-4: Lock-Free Free Slot List - Node Pool
// ============================================================================
// Pre-allocated node pools (one per class, to avoid malloc/free)
FreeSlotNode g_free_node_pool[TINY_NUM_CLASSES_SS][MAX_FREE_NODES_PER_CLASS];
_Atomic uint32_t g_node_alloc_index[TINY_NUM_CLASSES_SS] = {0};
// Recycle list for FreeSlotNode (per class, lock-free LIFO).
// node_alloc() tries this list first for reuse and only carves a fresh node from the pool when the list is exhausted.
static _Atomic(FreeSlotNode*) g_node_free_head[TINY_NUM_CLASSES_SS] = {
[0 ... TINY_NUM_CLASSES_SS-1] = ATOMIC_VAR_INIT(NULL)
};
// Allocate a node from pool (lock-free fast path, may fall back to legacy path)
static inline FreeSlotNode* node_alloc(int class_idx) {
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
return NULL;
}
// First, try to pop from recycle list (nodes returned by pop_lockfree).
FreeSlotNode* free_head = atomic_load_explicit(
&g_node_free_head[class_idx],
memory_order_acquire);
while (free_head != NULL) {
FreeSlotNode* next = free_head->next;
if (atomic_compare_exchange_weak_explicit(
&g_node_free_head[class_idx],
&free_head,
next,
memory_order_acq_rel,
memory_order_acquire)) {
return free_head; // Recycled node
}
// CAS failed: free_head is updated; retry with new head.
}
uint32_t idx = atomic_fetch_add(&g_node_alloc_index[class_idx], 1);
if (idx >= MAX_FREE_NODES_PER_CLASS) {
// Pool exhausted - should be rare. Caller must fall back to legacy
// mutex-protected free list to preserve correctness.
#if !HAKMEM_BUILD_RELEASE
static _Atomic int warn_once = 0;
if (atomic_exchange(&warn_once, 1) == 0) {
fprintf(stderr, "[P0-4 WARN] Node pool exhausted for class %d\n", class_idx);
}
#endif
return NULL;
}
return &g_free_node_pool[class_idx][idx];
}
// ============================================================================
// Phase 12-2: SharedSuperSlabPool skeleton implementation
// Goal:
// - Centralize SuperSlab allocation/registration
// - Provide acquire_slab/release_slab APIs for later refill/free integration
// - Keep logic simple & conservative; correctness and observability first.
//
// Notes:
// - Concurrency: protected by g_shared_pool.alloc_lock for now.
// - class_hints is best-effort: read lock-free, written under lock.
// - LRU hooks left as no-op placeholders.
SharedSuperSlabPool g_shared_pool = {
.slabs = NULL,
.capacity = 0,
.total_count = 0,
.active_count = 0,
.alloc_lock = PTHREAD_MUTEX_INITIALIZER,
.class_hints = { NULL },
.lru_head = NULL,
.lru_tail = NULL,
.lru_count = 0,
// P0-4: Lock-free free slot lists (zero-initialized atomic pointers)
.free_slots_lockfree = {{.head = ATOMIC_VAR_INIT(NULL)}},
// Legacy: mutex-protected free lists
.free_slots = {{.entries = {{0}}, .count = 0}},
// Phase 12: SP-SLOT fields (ss_metadata is fixed-size array, auto-zeroed)
.ss_meta_count = 0
};
static void
shared_pool_ensure_capacity_unlocked(uint32_t min_capacity)
{
if (g_shared_pool.capacity >= min_capacity) {
return;
}
uint32_t new_cap = g_shared_pool.capacity ? g_shared_pool.capacity : 16;
while (new_cap < min_capacity) {
new_cap *= 2;
}
// CRITICAL FIX: Use system mmap() directly to avoid recursion!
// Problem: realloc() goes through HAKMEM allocator → hak_alloc_at(128)
// → needs Shared Pool init → calls realloc() → INFINITE RECURSION!
// Solution: Allocate Shared Pool metadata using system mmap, not HAKMEM allocator
size_t new_size = new_cap * sizeof(SuperSlab*);
SuperSlab** new_slabs = (SuperSlab**)mmap(NULL, new_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (new_slabs == MAP_FAILED) {
// Allocation failure: keep old state; caller must handle NULL later.
return;
}
// Copy old data if exists
if (g_shared_pool.slabs != NULL) {
memcpy(new_slabs, g_shared_pool.slabs,
g_shared_pool.capacity * sizeof(SuperSlab*));
// Free old mapping (also use system munmap, not free!)
size_t old_size = g_shared_pool.capacity * sizeof(SuperSlab*);
munmap(g_shared_pool.slabs, old_size);
}
// Zero new entries to keep scanning logic simple.
memset(new_slabs + g_shared_pool.capacity, 0,
(new_cap - g_shared_pool.capacity) * sizeof(SuperSlab*));
g_shared_pool.slabs = new_slabs;
g_shared_pool.capacity = new_cap;
}
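// Growth example: starting from capacity 16, a request for min_capacity 100
// doubles 16 -> 32 -> 64 -> 128, allocates a fresh 128-entry pointer table
// (128 * sizeof(SuperSlab*) bytes) with mmap, and copies the old entries over.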
void
shared_pool_init(void)
{
// Idempotent init; safe to call from multiple early paths.
// pthread_mutex_t with static initializer is already valid.
pthread_mutex_lock(&g_shared_pool.alloc_lock);
if (g_shared_pool.capacity == 0 && g_shared_pool.slabs == NULL) {
shared_pool_ensure_capacity_unlocked(16);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
}
// ============================================================================
// Phase 12: SP-SLOT Box - Modular Helper Functions
// ============================================================================
// ---------- Layer 1: Slot Operations (Low-level) ----------
// Find first unused slot in SharedSSMeta
// P0-5: Uses atomic load for state check
// Returns: slot_idx on success, -1 if no unused slots
static int sp_slot_find_unused(SharedSSMeta* meta) {
if (!meta) return -1;
for (int i = 0; i < meta->total_slots; i++) {
SlotState state = atomic_load_explicit(&meta->slots[i].state, memory_order_acquire);
if (state == SLOT_UNUSED) {
return i;
}
}
return -1;
}
// Mark slot as ACTIVE (UNUSED→ACTIVE or EMPTY→ACTIVE)
// P0-5: Uses atomic store for state transition (caller must hold mutex!)
// Returns: 0 on success, -1 on error
static int sp_slot_mark_active(SharedSSMeta* meta, int slot_idx, int class_idx) {
if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
SharedSlot* slot = &meta->slots[slot_idx];
// Load state atomically
SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire);
// Transition: UNUSED→ACTIVE or EMPTY→ACTIVE
if (state == SLOT_UNUSED || state == SLOT_EMPTY) {
atomic_store_explicit(&slot->state, SLOT_ACTIVE, memory_order_release);
slot->class_idx = (uint8_t)class_idx;
slot->slab_idx = (uint8_t)slot_idx;
meta->active_slots++;
return 0;
}
return -1; // Already ACTIVE or invalid state
}
// Mark slot as EMPTY (ACTIVE→EMPTY)
// P0-5: Uses atomic store for state transition (caller must hold mutex!)
// Returns: 0 on success, -1 on error
static int sp_slot_mark_empty(SharedSSMeta* meta, int slot_idx) {
if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
SharedSlot* slot = &meta->slots[slot_idx];
// Load state atomically
SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire);
if (state == SLOT_ACTIVE) {
atomic_store_explicit(&slot->state, SLOT_EMPTY, memory_order_release);
if (meta->active_slots > 0) {
meta->active_slots--;
}
return 0;
}
return -1; // Not ACTIVE
}
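// Slot lifecycle summary (derived from the helpers above and the lock-free
// claim further below):
//
//   SLOT_UNUSED --sp_slot_mark_active / sp_slot_claim_lockfree--> SLOT_ACTIVE
//   SLOT_ACTIVE --sp_slot_mark_empty-----------------------------> SLOT_EMPTY
//   SLOT_EMPTY  --sp_slot_mark_active----------------------------> SLOT_ACTIVE (reuse)
//
// All transitions run under alloc_lock except the UNUSED->ACTIVE CAS in
// sp_slot_claim_lockfree().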
// Sync SP-SLOT view from an existing SuperSlab.
// This is needed when a legacy-allocated SuperSlab reaches the shared-pool
// release path for the first time (slot states are still SLOT_UNUSED).
static void sp_meta_sync_slots_from_ss(SharedSSMeta* meta, SuperSlab* ss) {
if (!meta || !ss) return;
int cap = ss_slabs_capacity(ss);
if (cap > MAX_SLOTS_PER_SS) {
cap = MAX_SLOTS_PER_SS;
}
meta->total_slots = (uint8_t)cap;
meta->active_slots = 0;
for (int i = 0; i < cap; i++) {
SlotState state = SLOT_UNUSED;
uint32_t bit = (1u << i);
if (ss->slab_bitmap & bit) {
state = SLOT_ACTIVE;
meta->active_slots++;
} else {
TinySlabMeta* smeta = &ss->slabs[i];
uint16_t used = atomic_load_explicit(&smeta->used, memory_order_relaxed);
if (smeta->capacity > 0 && used == 0) {
state = SLOT_EMPTY;
}
}
uint8_t cls = ss->class_map[i];
if (cls == 255) {
cls = ss->slabs[i].class_idx;
}
meta->slots[i].class_idx = cls;
meta->slots[i].slab_idx = (uint8_t)i;
atomic_store_explicit(&meta->slots[i].state, state, memory_order_release);
}
}
// ---------- Layer 2: Metadata Management (Mid-level) ----------
// Ensure ss_metadata array has capacity for at least min_count entries
// Caller must hold alloc_lock
// Returns: 0 on success, -1 if capacity exceeded
// RACE FIX: No realloc! Fixed-size array prevents race with lock-free Stage 2
static int sp_meta_ensure_capacity(uint32_t min_count) {
if (min_count > MAX_SS_METADATA_ENTRIES) {
#if !HAKMEM_BUILD_RELEASE
static int warn_once = 0;
if (warn_once == 0) {
fprintf(stderr, "[SP_META_CAPACITY_ERROR] Exceeded MAX_SS_METADATA_ENTRIES=%d\n",
MAX_SS_METADATA_ENTRIES);
warn_once = 1;
}
#endif
return -1;
}
return 0;
}
// Find SharedSSMeta for given SuperSlab, or create if not exists
// Caller must hold alloc_lock
// Returns: SharedSSMeta* on success, NULL on error
static SharedSSMeta* sp_meta_find_or_create(SuperSlab* ss) {
if (!ss) return NULL;
// RACE FIX: Load count atomically for consistency (even under mutex)
uint32_t count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_relaxed);
// Search existing metadata
for (uint32_t i = 0; i < count; i++) {
// RACE FIX: Load pointer atomically for consistency
SuperSlab* meta_ss = atomic_load_explicit(&g_shared_pool.ss_metadata[i].ss, memory_order_relaxed);
if (meta_ss == ss) {
return &g_shared_pool.ss_metadata[i];
}
}
// Create new metadata entry
if (sp_meta_ensure_capacity(count + 1) != 0) {
return NULL;
}
// RACE FIX: Read current count atomically (even under mutex for consistency)
uint32_t current_count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_relaxed);
SharedSSMeta* meta = &g_shared_pool.ss_metadata[current_count];
// RACE FIX: Store SuperSlab pointer atomically (visible to lock-free Stage 2)
atomic_store_explicit(&meta->ss, ss, memory_order_relaxed);
meta->total_slots = (uint8_t)ss_slabs_capacity(ss);
meta->active_slots = 0;
// Initialize all slots as UNUSED
// P0-5: Use atomic store for state initialization
for (int i = 0; i < meta->total_slots; i++) {
atomic_store_explicit(&meta->slots[i].state, SLOT_UNUSED, memory_order_relaxed);
meta->slots[i].class_idx = 0;
meta->slots[i].slab_idx = (uint8_t)i;
}
// RACE FIX: Atomic increment with release semantics
// This ensures all writes to metadata[current_count] above are visible
// before the count increment is visible to lock-free Stage 2 readers
atomic_fetch_add_explicit(&g_shared_pool.ss_meta_count, 1, memory_order_release);
return meta;
}
// ============================================================================
// Phase 12-1.x: Acquire Helper Boxes (Stage 0.5/1/2/3)
// ============================================================================
// Debug / stats helper (Stage hits)
static inline void sp_stage_stats_dump_if_enabled(void) {
#if !HAKMEM_BUILD_RELEASE
static int dump_en = -1;
if (__builtin_expect(dump_en == -1, 0)) {
const char* e = getenv("HAKMEM_SHARED_POOL_STAGE_STATS");
dump_en = (e && *e && *e != '0') ? 1 : 0;
}
if (!dump_en) return;
// Print totals across all classes (to see the scan/hit distribution).
uint64_t s0 = 0, s1 = 0, s2 = 0, s3 = 0;
for (int c = 0; c < TINY_NUM_CLASSES_SS; c++) {
s0 += atomic_load_explicit(&g_sp_stage0_hits[c], memory_order_relaxed);
s1 += atomic_load_explicit(&g_sp_stage1_hits[c], memory_order_relaxed);
s2 += atomic_load_explicit(&g_sp_stage2_hits[c], memory_order_relaxed);
s3 += atomic_load_explicit(&g_sp_stage3_hits[c], memory_order_relaxed);
}
fprintf(stderr, "[SP_STAGE_STATS] total: stage0.5=%lu stage1=%lu stage2=%lu stage3=%lu\n",
(unsigned long)s0, (unsigned long)s1, (unsigned long)s2, (unsigned long)s3);
#else
(void)g_sp_stage1_hits; (void)g_sp_stage2_hits; (void)g_sp_stage3_hits;
#endif
}
// Stage 0.5: EMPTY slab direct scan (registry-based EMPTY reuse)
static inline int
sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out, int dbg_acquire)
{
static int empty_reuse_enabled = -1;
if (__builtin_expect(empty_reuse_enabled == -1, 0)) {
const char* e = getenv("HAKMEM_SS_EMPTY_REUSE");
empty_reuse_enabled = (e && *e && *e == '0') ? 0 : 1; // default ON
}
if (!empty_reuse_enabled) {
return -1;
}
extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
extern int g_super_reg_class_size[TINY_NUM_CLASSES];
int reg_size = (class_idx < TINY_NUM_CLASSES) ? g_super_reg_class_size[class_idx] : 0;
static int scan_limit = -1;
if (__builtin_expect(scan_limit == -1, 0)) {
const char* e = getenv("HAKMEM_SS_EMPTY_SCAN_LIMIT");
scan_limit = (e && *e) ? atoi(e) : 32; // default: scan first 32 SuperSlabs (Phase 9-2 tuning)
}
if (scan_limit > reg_size) scan_limit = reg_size;
// Stage 0.5 hit counter for visualization
static _Atomic uint64_t stage05_hits = 0;
static _Atomic uint64_t stage05_attempts = 0;
atomic_fetch_add_explicit(&stage05_attempts, 1, memory_order_relaxed);
for (int i = 0; i < scan_limit; i++) {
SuperSlab* ss = g_super_reg_by_class[class_idx][i];
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
if (ss->empty_count == 0) continue; // No EMPTY slabs in this SS
uint32_t mask = ss->empty_mask;
while (mask) {
int empty_idx = __builtin_ctz(mask);
mask &= (mask - 1); // clear lowest bit
TinySlabMeta* meta = &ss->slabs[empty_idx];
if (meta->capacity > 0 && meta->used == 0) {
tiny_tls_slab_reuse_guard(ss);
ss_clear_slab_empty(ss, empty_idx);
meta->class_idx = (uint8_t)class_idx;
ss->class_map[empty_idx] = (uint8_t)class_idx;
#if !HAKMEM_BUILD_RELEASE
if (dbg_acquire == 1) {
fprintf(stderr,
"[SP_ACQUIRE_STAGE0.5_EMPTY] class=%d reusing EMPTY slab (ss=%p slab=%d empty_count=%u)\n",
class_idx, (void*)ss, empty_idx, ss->empty_count);
}
#else
(void)dbg_acquire;
#endif
*ss_out = ss;
*slab_idx_out = empty_idx;
sp_stage_stats_init();
if (g_sp_stage_stats_enabled) {
atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
}
atomic_fetch_add_explicit(&stage05_hits, 1, memory_order_relaxed);
// Stage 0.5 hit rate visualization (every 100 hits)
uint64_t hits = atomic_load_explicit(&stage05_hits, memory_order_relaxed);
if (hits % 100 == 1) {
uint64_t attempts = atomic_load_explicit(&stage05_attempts, memory_order_relaxed);
fprintf(stderr, "[STAGE0.5_STATS] hits=%lu attempts=%lu rate=%.1f%% (scan_limit=%d)\n",
hits, attempts, (double)hits * 100.0 / attempts, scan_limit);
}
return 0;
}
}
}
return -1;
}
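// Tuning sketch: Stage 0.5 behavior is env-controlled (defaults are in the
// code above). For example, to scan deeper into the per-class registry:
//
//   $ HAKMEM_SS_EMPTY_SCAN_LIMIT=64 ./your_app          # hypothetical binary
//
// HAKMEM_SS_EMPTY_REUSE=0 disables Stage 0.5 entirely (default is ON).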
// ---------- Layer 3: Free List Management ----------
// Push empty slot to per-class free list
// Caller must hold alloc_lock
// Returns: 0 on success, -1 if list is full
static int sp_freelist_push(int class_idx, SharedSSMeta* meta, int slot_idx) {
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
FreeSlotList* list = &g_shared_pool.free_slots[class_idx];
if (list->count >= MAX_FREE_SLOTS_PER_CLASS) {
return -1; // List full
}
list->entries[list->count].meta = meta;
list->entries[list->count].slot_idx = (uint8_t)slot_idx;
list->count++;
return 0;
}
// Pop empty slot from per-class free list
// Caller must hold alloc_lock
// Returns: 1 if popped (out params filled), 0 if list empty
static int sp_freelist_pop(int class_idx, SharedSSMeta** out_meta, int* out_slot_idx) {
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return 0;
if (!out_meta || !out_slot_idx) return 0;
FreeSlotList* list = &g_shared_pool.free_slots[class_idx];
if (list->count == 0) {
return 0; // List empty
}
// Pop from end (LIFO for cache locality)
list->count--;
*out_meta = list->entries[list->count].meta;
*out_slot_idx = list->entries[list->count].slot_idx;
return 1;
}
// ============================================================================
// P0-5: Lock-Free Slot Claiming (Stage 2 Optimization)
// ============================================================================
// Try to claim an UNUSED slot via lock-free CAS
// Returns: slot_idx on success, -1 if no UNUSED slots available
// LOCK-FREE: Can be called from any thread without mutex
static int sp_slot_claim_lockfree(SharedSSMeta* meta, int class_idx) {
if (!meta) return -1;
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
// Scan all slots for UNUSED state
for (int i = 0; i < meta->total_slots; i++) {
SlotState expected = SLOT_UNUSED;
// Try to claim this slot atomically (UNUSED → ACTIVE)
if (atomic_compare_exchange_strong_explicit(
&meta->slots[i].state,
&expected,
SLOT_ACTIVE,
memory_order_acq_rel, // Success: acquire+release semantics
memory_order_relaxed // Failure: just retry next slot
)) {
// Successfully claimed! Update non-atomic fields
// (Safe because we now own this slot)
meta->slots[i].class_idx = (uint8_t)class_idx;
meta->slots[i].slab_idx = (uint8_t)i;
// Increment active_slots counter atomically
// (Multiple threads may claim slots concurrently)
atomic_fetch_add_explicit(
(_Atomic uint8_t*)&meta->active_slots, 1,
memory_order_relaxed
);
return i; // Return claimed slot index
}
// CAS failed (slot was not UNUSED) - continue to next slot
}
return -1; // No UNUSED slots available
}
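// Note: compare_exchange_strong is used above (rather than weak) presumably
// because each slot is attempted only once per scan; a spurious weak-CAS
// failure would needlessly skip a reusable UNUSED slot.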
// ============================================================================
// P0-4: Lock-Free Free Slot List Operations
// ============================================================================
// Push empty slot to lock-free per-class free list (LIFO)
// LOCK-FREE: Can be called from any thread without mutex
// Returns: 0 on success, -1 on failure (node pool exhausted)
static int sp_freelist_push_lockfree(int class_idx, SharedSSMeta* meta, int slot_idx) {
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
// Allocate node from pool
FreeSlotNode* node = node_alloc(class_idx);
if (!node) {
// Fallback: push into legacy per-class free list
// ASSUME: Caller already holds alloc_lock (e.g., the shared_pool_release_slab path)
// Do NOT lock again to avoid deadlock on non-recursive mutex!
(void)sp_freelist_push(class_idx, meta, slot_idx);
return 0;
}
// Fill node data
node->meta = meta;
node->slot_idx = (uint8_t)slot_idx;
// Lock-free LIFO push using CAS loop
LockFreeFreeList* list = &g_shared_pool.free_slots_lockfree[class_idx];
FreeSlotNode* old_head = atomic_load_explicit(&list->head, memory_order_relaxed);
do {
node->next = old_head;
} while (!atomic_compare_exchange_weak_explicit(
&list->head, &old_head, node,
memory_order_release, // Success: publish node to other threads
memory_order_relaxed // Failure: retry with updated old_head
));
return 0; // Success
}
// Pop empty slot from lock-free per-class free list (LIFO)
// LOCK-FREE: Can be called from any thread without mutex
// Returns: 1 if popped (out params filled), 0 if list empty
static int sp_freelist_pop_lockfree(int class_idx, SharedSSMeta** out_meta, int* out_slot_idx) {
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return 0;
if (!out_meta || !out_slot_idx) return 0;
LockFreeFreeList* list = &g_shared_pool.free_slots_lockfree[class_idx];
FreeSlotNode* old_head = atomic_load_explicit(&list->head, memory_order_acquire);
// Lock-free LIFO pop using CAS loop
do {
if (old_head == NULL) {
return 0; // List empty
}
} while (!atomic_compare_exchange_weak_explicit(
&list->head, &old_head, old_head->next,
memory_order_acquire, // Success: acquire node data
memory_order_acquire // Failure: retry with updated old_head
));
// Extract data from popped node
*out_meta = old_head->meta;
*out_slot_idx = old_head->slot_idx;
// Recycle node back into per-class free list so that long-running workloads
// do not permanently consume new nodes on every EMPTY event.
FreeSlotNode* free_head = atomic_load_explicit(
&g_node_free_head[class_idx],
memory_order_acquire);
do {
old_head->next = free_head;
} while (!atomic_compare_exchange_weak_explicit(
&g_node_free_head[class_idx],
&free_head,
old_head,
memory_order_release,
memory_order_acquire));
return 1; // Success
}
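// Memory-safety note for the pop above: FreeSlotNode objects live in the
// static g_free_node_pool and are recycled through g_node_free_head instead
// of being freed, so reading old_head->next after losing a CAS race always
// touches valid (pooled) memory.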
// Internal helper: Allocates a new SuperSlab from the OS and performs basic initialization.
// Does NOT interact with g_shared_pool.slabs[] or g_shared_pool.total_count directly.
// Caller is responsible for adding the SuperSlab to g_shared_pool's arrays and metadata.
static SuperSlab*
sp_internal_allocate_superslab(void)
{
// Use size_class 0 as a neutral hint; Phase 12 per-slab class_idx is authoritative.
extern SuperSlab* superslab_allocate(uint8_t size_class);
SuperSlab* ss = superslab_allocate(0);
if (!ss) {
return NULL;
}
// PageFaultTelemetry: mark all backing pages for this Superslab (approximate)
size_t ss_bytes = (size_t)1 << ss->lg_size;
for (size_t off = 0; off < ss_bytes; off += 4096) {
pagefault_telemetry_touch(PF_BUCKET_SS_META, (char*)ss + off);
}
// superslab_allocate() already:
// - zeroes slab metadata / remote queues,
// - sets magic/lg_size/etc,
// - registers in global registry.
// For shared-pool semantics we normalize all slab class_idx to UNASSIGNED.
int max_slabs = ss_slabs_capacity(ss);
for (int i = 0; i < max_slabs; i++) {
ss_slab_meta_class_idx_set(ss, i, 255); // UNASSIGNED
// P1.1: Initialize class_map to UNASSIGNED as well
ss->class_map[i] = 255;
}
return ss;
}
SuperSlab*
shared_pool_acquire_superslab(void)
{
shared_pool_init();
pthread_mutex_lock(&g_shared_pool.alloc_lock);
// For now, always allocate a fresh SuperSlab and register it.
// More advanced reuse/GC comes later.
// Release lock to avoid deadlock with registry during superslab_allocate
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
SuperSlab* ss = sp_internal_allocate_superslab(); // Call lock-free internal helper
pthread_mutex_lock(&g_shared_pool.alloc_lock);
if (!ss) {
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return NULL;
}
// Add newly allocated SuperSlab to the shared pool's internal array
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
// Pool table expansion failed; leave ss alive (registry-owned),
// but do not treat it as part of shared_pool.
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return NULL;
}
}
g_shared_pool.slabs[g_shared_pool.total_count] = ss;
g_shared_pool.total_count++;
// Not counted as active until at least one slab is assigned.
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return ss;
}
// ---------- Layer 4: Public API (High-level) ----------
// Ensure slab geometry matches current class stride (handles upgrades like C7 1024->2048).
static inline void sp_fix_geometry_if_needed(SuperSlab* ss, int slab_idx, int class_idx)
{
if (!ss || slab_idx < 0 || class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
return;
}
TinySlabMeta* meta = &ss->slabs[slab_idx];
size_t stride = g_tiny_class_sizes[class_idx];
size_t usable = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE;
uint16_t expect_cap = (uint16_t)(usable / stride);
// Reinitialize if capacity is off or class_idx mismatches.
if (meta->class_idx != (uint8_t)class_idx || meta->capacity != expect_cap) {
#if !HAKMEM_BUILD_RELEASE
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[SP_FIX_GEOMETRY] ss=%p slab=%d cls=%d: old_cls=%u old_cap=%u -> new_cls=%d new_cap=%u (stride=%zu)\n",
(void*)ss, slab_idx, class_idx,
meta->class_idx, meta->capacity,
class_idx, expect_cap, stride);
g_hakmem_lock_depth--;
#endif
superslab_init_slab(ss, slab_idx, stride, 0 /*owner_tid*/);
meta->class_idx = (uint8_t)class_idx;
// P1.1: Update class_map after geometry fix
ss->class_map[slab_idx] = (uint8_t)class_idx;
}
}
int
shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out)
{
// Phase 12: SP-SLOT Box - 3-Stage Acquire Logic
//
// Stage 1: Reuse EMPTY slots from per-class free list (EMPTY→ACTIVE)
// Stage 2: Find UNUSED slots in existing SuperSlabs
// Stage 3: Get new SuperSlab (LRU pop or mmap)
//
// Invariants:
// - On success: *ss_out != NULL, 0 <= *slab_idx_out < total_slots
// - The chosen slab has meta->class_idx == class_idx
if (!ss_out || !slab_idx_out) {
return -1;
}
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
return -1;
}
shared_pool_init();
// Debug logging / stage stats
#if !HAKMEM_BUILD_RELEASE
static int dbg_acquire = -1;
if (__builtin_expect(dbg_acquire == -1, 0)) {
const char* e = getenv("HAKMEM_SS_ACQUIRE_DEBUG");
dbg_acquire = (e && *e && *e != '0') ? 1 : 0;
}
#else
static const int dbg_acquire = 0;
#endif
sp_stage_stats_init();
stage1_retry_after_tension_drain:
// ========== Stage 0.5 (Phase 12-1.1): EMPTY slab direct scan ==========
// Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
// avoid Stage 3 (mmap) when freed slabs are available.
if (sp_acquire_from_empty_scan(class_idx, ss_out, slab_idx_out, dbg_acquire) == 0) {
return 0;
}
// ========== Stage 1 (Lock-Free): Try to reuse EMPTY slots ==========
// P0-4: Lock-free pop from per-class free list (no mutex needed!)
// Best case: Same class freed a slot, reuse immediately (cache-hot)
SharedSSMeta* reuse_meta = NULL;
int reuse_slot_idx = -1;
if (sp_freelist_pop_lockfree(class_idx, &reuse_meta, &reuse_slot_idx)) {
// Found EMPTY slot from lock-free list!
// Now acquire mutex ONLY for slot activation and metadata update
// P0 instrumentation: count lock acquisitions
lock_stats_init();
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_acquire_count, 1);
atomic_fetch_add(&g_lock_acquire_slab_count, 1);
}
pthread_mutex_lock(&g_shared_pool.alloc_lock);
// P0.3: Guard against TLS SLL orphaned pointers before reusing slab
// RACE FIX: Load SuperSlab pointer atomically BEFORE guard (consistency)
SuperSlab* ss_guard = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed);
if (ss_guard) {
tiny_tls_slab_reuse_guard(ss_guard);
}
// Activate slot under mutex (slot state transition requires protection)
if (sp_slot_mark_active(reuse_meta, reuse_slot_idx, class_idx) == 0) {
// RACE FIX: Load SuperSlab pointer atomically (consistency)
SuperSlab* ss = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed);
// RACE FIX: Check if SuperSlab was freed (NULL pointer)
// This can happen if Thread A freed the SuperSlab after pushing slot to freelist,
// but Thread B popped the stale slot before the freelist was cleared.
if (!ss) {
// SuperSlab freed - skip and fall through to Stage 2/3
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
goto stage2_fallback;
}
#if !HAKMEM_BUILD_RELEASE
if (dbg_acquire == 1) {
fprintf(stderr, "[SP_ACQUIRE_STAGE1_LOCKFREE] class=%d reusing EMPTY slot (ss=%p slab=%d)\n",
class_idx, (void*)ss, reuse_slot_idx);
}
#endif
// Update SuperSlab metadata
ss->slab_bitmap |= (1u << reuse_slot_idx);
ss_slab_meta_class_idx_set(ss, reuse_slot_idx, (uint8_t)class_idx);
if (ss->active_slabs == 0) {
// Was empty, now active again
ss->active_slabs = 1;
g_shared_pool.active_count++;
}
// Track per-class active slots (approximate, under alloc_lock)
if (class_idx < TINY_NUM_CLASSES_SS) {
g_shared_pool.class_active_slots[class_idx]++;
}
// Update hint
g_shared_pool.class_hints[class_idx] = ss;
*ss_out = ss;
*slab_idx_out = reuse_slot_idx;
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
if (g_sp_stage_stats_enabled) {
atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
}
return 0; // ✅ Stage 1 (lock-free) success
}
// Slot activation failed (race condition?) - release lock and fall through
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
}
stage2_fallback:
// ========== Stage 2 (Lock-Free): Try to claim UNUSED slots ==========
// P0-5: Lock-free atomic CAS claiming (no mutex needed for slot state transition!)
// RACE FIX: Read ss_meta_count atomically (now properly declared as _Atomic)
// No cast needed! memory_order_acquire synchronizes with release in sp_meta_find_or_create
uint32_t meta_count = atomic_load_explicit(
&g_shared_pool.ss_meta_count,
memory_order_acquire
);
for (uint32_t i = 0; i < meta_count; i++) {
SharedSSMeta* meta = &g_shared_pool.ss_metadata[i];
// Try lock-free claiming (UNUSED → ACTIVE via CAS)
int claimed_idx = sp_slot_claim_lockfree(meta, class_idx);
if (claimed_idx >= 0) {
// RACE FIX: Load SuperSlab pointer atomically (critical for lock-free Stage 2)
// Use memory_order_acquire to synchronize with release in sp_meta_find_or_create
SuperSlab* ss = atomic_load_explicit(&meta->ss, memory_order_acquire);
if (!ss) {
// SuperSlab was freed between claiming and loading - skip this entry
continue;
}
#if !HAKMEM_BUILD_RELEASE
if (dbg_acquire == 1) {
fprintf(stderr, "[SP_ACQUIRE_STAGE2_LOCKFREE] class=%d claimed UNUSED slot (ss=%p slab=%d)\n",
class_idx, (void*)ss, claimed_idx);
}
#endif
// P0 instrumentation: count lock acquisitions
lock_stats_init();
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_acquire_count, 1);
atomic_fetch_add(&g_lock_acquire_slab_count, 1);
}
pthread_mutex_lock(&g_shared_pool.alloc_lock);
// Update SuperSlab metadata under mutex
ss->slab_bitmap |= (1u << claimed_idx);
ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);
if (ss->active_slabs == 0) {
ss->active_slabs = 1;
g_shared_pool.active_count++;
}
if (class_idx < TINY_NUM_CLASSES_SS) {
g_shared_pool.class_active_slots[class_idx]++;
}
// Update hint
g_shared_pool.class_hints[class_idx] = ss;
*ss_out = ss;
*slab_idx_out = claimed_idx;
sp_fix_geometry_if_needed(ss, claimed_idx, class_idx);
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
if (g_sp_stage_stats_enabled) {
atomic_fetch_add(&g_sp_stage2_hits[class_idx], 1);
}
return 0; // ✅ Stage 2 (lock-free) success
}
// Claim failed (no UNUSED slots in this meta) - continue to next SuperSlab
}
// ========== Tension-Based Drain: Try to create EMPTY slots before Stage 3 ==========
// If TLS SLL has accumulated blocks, drain them to enable EMPTY slot detection
// This can avoid allocating new SuperSlabs by reusing EMPTY slots in Stage 1
// ENV: HAKMEM_TINY_TENSION_DRAIN_ENABLE=0 to disable (default=1)
// ENV: HAKMEM_TINY_TENSION_DRAIN_THRESHOLD=N to set threshold (default=1024)
{
static int tension_drain_enabled = -1;
static uint32_t tension_threshold = 1024;
if (tension_drain_enabled < 0) {
const char* env = getenv("HAKMEM_TINY_TENSION_DRAIN_ENABLE");
tension_drain_enabled = (env == NULL || atoi(env) != 0) ? 1 : 0;
const char* thresh_env = getenv("HAKMEM_TINY_TENSION_DRAIN_THRESHOLD");
if (thresh_env) {
tension_threshold = (uint32_t)atoi(thresh_env);
if (tension_threshold < 64) tension_threshold = 64;
if (tension_threshold > 65536) tension_threshold = 65536;
}
}
if (tension_drain_enabled) {
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
extern uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size);
uint32_t sll_count = (class_idx < TINY_NUM_CLASSES) ? g_tls_sll[class_idx].count : 0;
if (sll_count >= tension_threshold) {
// Drain all blocks to maximize EMPTY slot creation
uint32_t drained = tiny_tls_sll_drain(class_idx, 0); // 0 = drain all
if (drained > 0) {
// Retry Stage 1 (EMPTY reuse) after drain
// Some slabs might have become EMPTY (meta->used == 0)
goto stage1_retry_after_tension_drain;
}
}
}
}
// ========== Stage 3: Mutex-protected fallback (new SuperSlab allocation) ==========
// All existing SuperSlabs have no UNUSED slots → need new SuperSlab
// P0 instrumentation: count lock acquisitions
lock_stats_init();
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_acquire_count, 1);
atomic_fetch_add(&g_lock_acquire_slab_count, 1);
}
pthread_mutex_lock(&g_shared_pool.alloc_lock);
// ========== Stage 3: Get new SuperSlab ==========
// Try LRU cache first, then mmap
SuperSlab* new_ss = NULL;
// Stage 3a: Try LRU cache
extern SuperSlab* hak_ss_lru_pop(uint8_t size_class);
new_ss = hak_ss_lru_pop((uint8_t)class_idx);
int from_lru = (new_ss != NULL);
// Stage 3b: If LRU miss, allocate new SuperSlab
if (!new_ss) {
// Release the alloc_lock to avoid deadlock with registry during superslab_allocate
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
SuperSlab* allocated_ss = sp_internal_allocate_superslab();
// Re-acquire the alloc_lock
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_acquire_count, 1);
atomic_fetch_add(&g_lock_acquire_slab_count, 1); // This is part of acquisition path
}
pthread_mutex_lock(&g_shared_pool.alloc_lock);
if (!allocated_ss) {
// Allocation failed; return now.
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1; // Out of memory
}
new_ss = allocated_ss;
// Add newly allocated SuperSlab to the shared pool's internal array
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
// Pool table expansion failed; leave ss alive (registry-owned),
// but do not treat it as part of shared_pool.
// This is a critical error, return early.
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1;
}
}
g_shared_pool.slabs[g_shared_pool.total_count] = new_ss;
g_shared_pool.total_count++;
}
#if !HAKMEM_BUILD_RELEASE
if (dbg_acquire == 1 && new_ss) {
fprintf(stderr, "[SP_ACQUIRE_STAGE3] class=%d new SuperSlab (ss=%p from_lru=%d)\n",
class_idx, (void*)new_ss, from_lru);
}
#endif
if (!new_ss) {
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1; // ❌ Out of memory
}
// Consult the learning-layer soft cap before handing this new SuperSlab's
// first slot to the class. If the class already holds at least the policy cap
// of active slots, fail so the caller can fall back to the legacy backend.
uint32_t limit = sp_class_active_limit(class_idx);
if (limit > 0) {
uint32_t cur = g_shared_pool.class_active_slots[class_idx];
if (cur >= limit) {
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1; // Soft cap reached for this class
}
}
// Create metadata for this new SuperSlab
SharedSSMeta* new_meta = sp_meta_find_or_create(new_ss);
if (!new_meta) {
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1; // ❌ Metadata allocation failed
}
// Assign first slot to this class
int first_slot = 0;
if (sp_slot_mark_active(new_meta, first_slot, class_idx) != 0) {
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1; // ❌ Should not happen
}
// Update SuperSlab metadata
new_ss->slab_bitmap |= (1u << first_slot);
ss_slab_meta_class_idx_set(new_ss, first_slot, (uint8_t)class_idx);
new_ss->active_slabs = 1;
g_shared_pool.active_count++;
if (class_idx < TINY_NUM_CLASSES_SS) {
g_shared_pool.class_active_slots[class_idx]++;
}
// Update hint
g_shared_pool.class_hints[class_idx] = new_ss;
*ss_out = new_ss;
*slab_idx_out = first_slot;
sp_fix_geometry_if_needed(new_ss, first_slot, class_idx);
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
if (g_sp_stage_stats_enabled) {
atomic_fetch_add(&g_sp_stage3_hits[class_idx], 1);
}
return 0; // ✅ Stage 3 success
}
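// Caller sketch (hypothetical refill-path usage; error handling elided):
//
//   SuperSlab* ss = NULL;
//   int slab_idx = -1;
//   if (shared_pool_acquire_slab(class_idx, &ss, &slab_idx) == 0) {
//       // ss->slabs[slab_idx] is ACTIVE for class_idx; geometry has been
//       // fixed up if the class stride changed (sp_fix_geometry_if_needed).
//   } else {
//       // -1: soft cap reached or out of memory; fall back to the legacy
//       // tiny backend.
//   }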
void
shared_pool_release_slab(SuperSlab* ss, int slab_idx)
{
// Phase 12: SP-SLOT Box - Slot-based Release
//
// Flow:
// 1. Validate inputs and check meta->used == 0
// 2. Find SharedSSMeta for this SuperSlab
// 3. Mark slot ACTIVE → EMPTY
// 4. Push to per-class free list (enables same-class reuse)
// 5. If all slots EMPTY → superslab_free() → LRU cache
if (!ss) {
return;
}
if (slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) {
return;
}
// Debug logging
#if !HAKMEM_BUILD_RELEASE
static int dbg = -1;
if (__builtin_expect(dbg == -1, 0)) {
const char* e = getenv("HAKMEM_SS_FREE_DEBUG");
dbg = (e && *e && *e != '0') ? 1 : 0;
}
#else
static const int dbg = 0;
#endif
// P0 instrumentation: count lock acquisitions
lock_stats_init();
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_acquire_count, 1);
atomic_fetch_add(&g_lock_release_slab_count, 1);
}
pthread_mutex_lock(&g_shared_pool.alloc_lock);
TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
if (slab_meta->used != 0) {
// Not actually empty; nothing to do
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return;
}
uint8_t class_idx = slab_meta->class_idx;
#if !HAKMEM_BUILD_RELEASE
if (dbg == 1) {
fprintf(stderr, "[SP_SLOT_RELEASE] ss=%p slab_idx=%d class=%d used=0 (marking EMPTY)\n",
(void*)ss, slab_idx, class_idx);
}
#endif
// Find SharedSSMeta for this SuperSlab
SharedSSMeta* sp_meta = NULL;
uint32_t count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_relaxed);
for (uint32_t i = 0; i < count; i++) {
// RACE FIX: Load pointer atomically
SuperSlab* meta_ss = atomic_load_explicit(&g_shared_pool.ss_metadata[i].ss, memory_order_relaxed);
if (meta_ss == ss) {
sp_meta = &g_shared_pool.ss_metadata[i];
break;
}
}
if (!sp_meta) {
// SuperSlab not in SP-SLOT system yet - create metadata
sp_meta = sp_meta_find_or_create(ss);
if (!sp_meta) {
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return; // Failed to create metadata
}
}
// Mark slot as EMPTY (ACTIVE → EMPTY)
uint32_t slab_bit = (1u << slab_idx);
SlotState slot_state = atomic_load_explicit(
&sp_meta->slots[slab_idx].state,
memory_order_acquire);
if (slot_state != SLOT_ACTIVE && (ss->slab_bitmap & slab_bit)) {
// Legacy path import: rebuild slot states from SuperSlab bitmap/class_map
sp_meta_sync_slots_from_ss(sp_meta, ss);
slot_state = atomic_load_explicit(
&sp_meta->slots[slab_idx].state,
memory_order_acquire);
}
if (slot_state != SLOT_ACTIVE || sp_slot_mark_empty(sp_meta, slab_idx) != 0) {
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return; // Slot wasn't ACTIVE
}
// Update SuperSlab metadata
uint32_t bit = (1u << slab_idx);
if (ss->slab_bitmap & bit) {
ss->slab_bitmap &= ~bit;
slab_meta->class_idx = 255; // UNASSIGNED
// P1.1: Mark class_map as UNASSIGNED when releasing slab
ss->class_map[slab_idx] = 255;
if (ss->active_slabs > 0) {
ss->active_slabs--;
if (ss->active_slabs == 0 && g_shared_pool.active_count > 0) {
g_shared_pool.active_count--;
}
}
if (class_idx < TINY_NUM_CLASSES_SS &&
g_shared_pool.class_active_slots[class_idx] > 0) {
g_shared_pool.class_active_slots[class_idx]--;
}
}
// P0-4: Push to lock-free per-class free list (enables reuse by same class)
// Note: push BEFORE releasing mutex (slot state already updated under lock)
if (class_idx < TINY_NUM_CLASSES_SS) {
sp_freelist_push_lockfree(class_idx, sp_meta, slab_idx);
#if !HAKMEM_BUILD_RELEASE
if (dbg == 1) {
fprintf(stderr, "[SP_SLOT_FREELIST_LOCKFREE] class=%d pushed slot (ss=%p slab=%d) active_slots=%u/%u\n",
class_idx, (void*)ss, slab_idx,
sp_meta->active_slots, sp_meta->total_slots);
}
#endif
}
// Check if SuperSlab is now completely empty (all slots EMPTY or UNUSED)
if (sp_meta->active_slots == 0) {
#if !HAKMEM_BUILD_RELEASE
if (dbg == 1) {
fprintf(stderr, "[SP_SLOT_COMPLETELY_EMPTY] ss=%p active_slots=0 (calling superslab_free)\n",
(void*)ss);
}
#endif
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
// RACE FIX: Set meta->ss to NULL BEFORE unlocking mutex
// This prevents Stage 2 from accessing freed SuperSlab
atomic_store_explicit(&sp_meta->ss, NULL, memory_order_release);
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
// Remove from legacy backend list (if present) to prevent dangling pointers
extern void remove_superslab_from_legacy_head(SuperSlab* ss);
remove_superslab_from_legacy_head(ss);
// Free SuperSlab:
// 1. Try LRU cache (hak_ss_lru_push) - lazy deallocation
// 2. Or munmap if LRU is full - eager deallocation
extern void superslab_free(SuperSlab* ss);
superslab_free(ss);
return;
}
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
}
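// Typical pairing (sketch): the tiny free path detects a slab whose used
// count dropped to 0 and calls shared_pool_release_slab(ss, slab_idx); the
// slot becomes EMPTY and is reused by the same class via the Stage 1
// lock-free free list, or the whole SuperSlab is retired to the LRU cache /
// munmap once every slot is EMPTY or UNUSED.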