Files
hakmem/core/hakmem_shared_pool.c
Moe Charm (CI) ec453d67f2 Mid-Large Phase 12 Complete + P0-5 Lock-Free Stage 2
**Phase 12 Round 1 Complete**
- 0.24M → 2.39M ops/s (8T, **+896%**)
- SEGFAULT → Zero crashes (**100% → 0%**)
- futex: 209 → 10 calls (**-95%**)

**P0-5: Lock-Free Stage 2 (Slot Claiming)**
- Atomic SlotState: `_Atomic SlotState state`
- sp_slot_claim_lockfree(): CAS-based UNUSED→ACTIVE transition (see the sketch below)
- acquire_slab() Stage 2: Lock-free claiming (mutex only for metadata)
- Result: 2.34M → 2.39M ops/s (+2.5% @ 8T)
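
A minimal, self-contained sketch of the UNUSED→ACTIVE CAS transition summarized above (illustrative only: `DemoSlot` and `demo_claim` are hypothetical stand-ins; the real types and `sp_slot_claim_lockfree()` are in `core/hakmem_shared_pool.*` below):

```c
#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative slot states; the real enum lives in hakmem_shared_pool.h. */
typedef enum { SLOT_UNUSED = 0, SLOT_ACTIVE = 1, SLOT_EMPTY = 2 } SlotState;

typedef struct {
    _Atomic SlotState state;   /* P0-5: the state field is the only atomic */
    unsigned char class_idx;   /* owned by the claiming thread after the CAS */
} DemoSlot;

/* Claim the slot only if it is currently UNUSED.
 * Returns true when this thread won the CAS and now owns the slot. */
static bool demo_claim(DemoSlot* slot, unsigned char class_idx) {
    SlotState expected = SLOT_UNUSED;
    if (atomic_compare_exchange_strong_explicit(
            &slot->state, &expected, SLOT_ACTIVE,
            memory_order_acq_rel,    /* success: publish + synchronize */
            memory_order_relaxed)) { /* failure: another thread owns it */
        slot->class_idx = class_idx; /* safe: we own the slot now */
        return true;
    }
    return false;
}
```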

**Implementation**:
- core/hakmem_shared_pool.h: Atomic SlotState definition
- core/hakmem_shared_pool.c:
  - sp_slot_claim_lockfree() (+40 lines)
  - Atomic helpers: sp_slot_find_unused/mark_active/mark_empty
  - Stage 2 lock-free integration
- Verified via debug logs: STAGE2_LOCKFREE claiming works (see the usage sketch below)
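
A hedged usage sketch of how the STAGE2_LOCKFREE log can be triggered through the public API (hypothetical driver, not part of the commit; class indices 0 and 1 are arbitrary examples and assume `TINY_NUM_CLASSES_SS >= 2`):

```c
#include <stdio.h>
#include <stdlib.h>
#include "hakmem_shared_pool.h"

int main(void) {
    /* Both flags are latched on first use, so set them before any acquire. */
    setenv("HAKMEM_SS_ACQUIRE_DEBUG", "1", 1);
    setenv("HAKMEM_SHARED_POOL_LOCK_STATS", "1", 1);

    SuperSlab* ss0 = NULL; int idx0 = -1;
    SuperSlab* ss1 = NULL; int idx1 = -1;

    /* First acquire: no SuperSlab exists yet, so Stage 3 allocates one. */
    if (shared_pool_acquire_slab(0, &ss0, &idx0) != 0) return 1;

    /* Second acquire with a different class: its free list is empty, so an
     * UNUSED slot in the existing SuperSlab is claimed lock-free and
     * [SP_ACQUIRE_STAGE2_LOCKFREE] should appear on stderr. */
    if (shared_pool_acquire_slab(1, &ss1, &idx1) != 0) return 1;

    printf("ss0=%p/%d ss1=%p/%d\n", (void*)ss0, idx0, (void*)ss1, idx1);
    return 0;
}
```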

**Reports**:
- MID_LARGE_P0_PHASE_REPORT.md: P0-0 to P0-4 comprehensive summary
- MID_LARGE_FINAL_AB_REPORT.md: Complete Phase 12 A/B comparison (17KB)
  - Performance evolution table
  - Lock contention analysis
  - Lessons learned
  - File inventory

**Tiny Baseline Measurement** 📊
- System malloc: 82.9M ops/s (256B)
- HAKMEM:        8.88M ops/s (256B)
- **Gap: 9.3x slower** (target for next phase; see the measurement sketch below)
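
For context, a minimal sketch of the kind of fixed-size malloc/free loop such ops/s figures come from (hypothetical harness; not the benchmark behind the numbers above):

```c
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define THREADS 8
#define ITERS   1000000UL
#define SIZE    256

static void* worker(void* arg) {
    (void)arg;
    for (unsigned long i = 0; i < ITERS; i++) {
        void* p = malloc(SIZE);   /* interposed by the allocator under test */
        if (p) ((volatile char*)p)[0] = 1;
        free(p);
    }
    return NULL;
}

int main(void) {
    pthread_t th[THREADS];
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < THREADS; i++) pthread_create(&th[i], NULL, worker, NULL);
    for (int i = 0; i < THREADS; i++) pthread_join(th[i], NULL);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    /* One op = one malloc+free pair. */
    printf("%.2fM ops/s\n", (double)THREADS * ITERS / sec / 1e6);
    return 0;
}
```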

**Next**: Tiny allocator optimization (drain interval, front cache, perf profile)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 16:51:53 +09:00

#include "hakmem_shared_pool.h"
#include "hakmem_tiny_superslab.h"
#include "hakmem_tiny_superslab_constants.h"
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>
#include <stdio.h>
// ============================================================================
// P0 Lock Contention Instrumentation
// ============================================================================
static _Atomic uint64_t g_lock_acquire_count = 0; // Total lock acquisitions
static _Atomic uint64_t g_lock_release_count = 0; // Total lock releases
static _Atomic uint64_t g_lock_acquire_slab_count = 0; // Locks from acquire_slab path
static _Atomic uint64_t g_lock_release_slab_count = 0; // Locks from release_slab path
static int g_lock_stats_enabled = -1; // -1=uninitialized, 0=off, 1=on
// Initialize lock stats from environment variable
static inline void lock_stats_init(void) {
    if (__builtin_expect(g_lock_stats_enabled == -1, 0)) {
        const char* env = getenv("HAKMEM_SHARED_POOL_LOCK_STATS");
        g_lock_stats_enabled = (env && *env && *env != '0') ? 1 : 0;
    }
}
// Report lock statistics at shutdown
static void __attribute__((destructor)) lock_stats_report(void) {
    if (g_lock_stats_enabled != 1) {
        return;
    }
    uint64_t acquires = atomic_load(&g_lock_acquire_count);
    uint64_t releases = atomic_load(&g_lock_release_count);
    uint64_t acquire_path = atomic_load(&g_lock_acquire_slab_count);
    uint64_t release_path = atomic_load(&g_lock_release_slab_count);
    fprintf(stderr, "\n=== SHARED POOL LOCK STATISTICS ===\n");
    fprintf(stderr, "Total lock ops: %llu (acquire) + %llu (release) = %llu\n",
            (unsigned long long)acquires, (unsigned long long)releases,
            (unsigned long long)(acquires + releases));
    fprintf(stderr, "Balance: %lld (should be 0)\n",
            (long long)acquires - (long long)releases);
    fprintf(stderr, "\n--- Breakdown by Code Path ---\n");
    fprintf(stderr, "acquire_slab(): %llu (%.1f%%)\n",
            (unsigned long long)acquire_path,
            100.0 * acquire_path / (acquires ? acquires : 1));
    fprintf(stderr, "release_slab(): %llu (%.1f%%)\n",
            (unsigned long long)release_path,
            100.0 * release_path / (acquires ? acquires : 1));
    fprintf(stderr, "===================================\n");
}
// ============================================================================
// P0-4: Lock-Free Free Slot List - Node Pool
// ============================================================================
// Pre-allocated node pools (one per class, to avoid malloc/free)
FreeSlotNode g_free_node_pool[TINY_NUM_CLASSES_SS][MAX_FREE_NODES_PER_CLASS];
_Atomic uint32_t g_node_alloc_index[TINY_NUM_CLASSES_SS] = {0};
// Allocate a node from pool (lock-free, never fails until pool exhausted)
static inline FreeSlotNode* node_alloc(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return NULL;
    }
    uint32_t idx = atomic_fetch_add(&g_node_alloc_index[class_idx], 1);
    if (idx >= MAX_FREE_NODES_PER_CLASS) {
        // Pool exhausted - should not happen in practice
        static _Atomic int warn_once = 0;
        if (atomic_exchange(&warn_once, 1) == 0) {
            fprintf(stderr, "[P0-4 WARN] Node pool exhausted for class %d\n", class_idx);
        }
        return NULL;
    }
    return &g_free_node_pool[class_idx][idx];
}
// ============================================================================
// Phase 12-2: SharedSuperSlabPool skeleton implementation
// Goal:
// - Centralize SuperSlab allocation/registration
// - Provide acquire_slab/release_slab APIs for later refill/free integration
// - Keep logic simple & conservative; correctness and observability first.
//
// Notes:
// - Concurrency: protected by g_shared_pool.alloc_lock for now.
// - class_hints is best-effort: read lock-free, written under lock.
// - LRU hooks left as no-op placeholders.
SharedSuperSlabPool g_shared_pool = {
    .slabs = NULL,
    .capacity = 0,
    .total_count = 0,
    .active_count = 0,
    .alloc_lock = PTHREAD_MUTEX_INITIALIZER,
    .class_hints = { NULL },
    .lru_head = NULL,
    .lru_tail = NULL,
    .lru_count = 0,
    // P0-4: Lock-free free slot lists (zero-initialized atomic pointers)
    .free_slots_lockfree = {{.head = ATOMIC_VAR_INIT(NULL)}},
    // Legacy: mutex-protected free lists
    .free_slots = {{.entries = {{0}}, .count = 0}},
    // Phase 12: SP-SLOT fields
    .ss_metadata = NULL,
    .ss_meta_capacity = 0,
    .ss_meta_count = 0
};
static void
shared_pool_ensure_capacity_unlocked(uint32_t min_capacity)
{
    if (g_shared_pool.capacity >= min_capacity) {
        return;
    }
    uint32_t new_cap = g_shared_pool.capacity ? g_shared_pool.capacity : 16;
    while (new_cap < min_capacity) {
        new_cap *= 2;
    }
    SuperSlab** new_slabs = (SuperSlab**)realloc(g_shared_pool.slabs,
                                                 new_cap * sizeof(SuperSlab*));
    if (!new_slabs) {
        // Allocation failure: keep old state; caller must handle NULL later.
        return;
    }
    // Zero new entries to keep scanning logic simple.
    memset(new_slabs + g_shared_pool.capacity, 0,
           (new_cap - g_shared_pool.capacity) * sizeof(SuperSlab*));
    g_shared_pool.slabs = new_slabs;
    g_shared_pool.capacity = new_cap;
}
void
shared_pool_init(void)
{
    // Idempotent init; safe to call from multiple early paths.
    // pthread_mutex_t with static initializer is already valid.
    pthread_mutex_lock(&g_shared_pool.alloc_lock);
    if (g_shared_pool.capacity == 0 && g_shared_pool.slabs == NULL) {
        shared_pool_ensure_capacity_unlocked(16);
    }
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
}
// ============================================================================
// Phase 12: SP-SLOT Box - Modular Helper Functions
// ============================================================================
// ---------- Layer 1: Slot Operations (Low-level) ----------
// Find first unused slot in SharedSSMeta
// P0-5: Uses atomic load for state check
// Returns: slot_idx on success, -1 if no unused slots
static int sp_slot_find_unused(SharedSSMeta* meta) {
    if (!meta) return -1;
    for (int i = 0; i < meta->total_slots; i++) {
        SlotState state = atomic_load_explicit(&meta->slots[i].state, memory_order_acquire);
        if (state == SLOT_UNUSED) {
            return i;
        }
    }
    return -1;
}
// Mark slot as ACTIVE (UNUSED→ACTIVE or EMPTY→ACTIVE)
// P0-5: Uses atomic store for state transition (caller must hold mutex!)
// Returns: 0 on success, -1 on error
static int sp_slot_mark_active(SharedSSMeta* meta, int slot_idx, int class_idx) {
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
    SharedSlot* slot = &meta->slots[slot_idx];
    // Load state atomically
    SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire);
    // Transition: UNUSED→ACTIVE or EMPTY→ACTIVE
    if (state == SLOT_UNUSED || state == SLOT_EMPTY) {
        atomic_store_explicit(&slot->state, SLOT_ACTIVE, memory_order_release);
        slot->class_idx = (uint8_t)class_idx;
        slot->slab_idx = (uint8_t)slot_idx;
        meta->active_slots++;
        return 0;
    }
    return -1; // Already ACTIVE or invalid state
}
// Mark slot as EMPTY (ACTIVE→EMPTY)
// P0-5: Uses atomic store for state transition (caller must hold mutex!)
// Returns: 0 on success, -1 on error
static int sp_slot_mark_empty(SharedSSMeta* meta, int slot_idx) {
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
    SharedSlot* slot = &meta->slots[slot_idx];
    // Load state atomically
    SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire);
    if (state == SLOT_ACTIVE) {
        atomic_store_explicit(&slot->state, SLOT_EMPTY, memory_order_release);
        if (meta->active_slots > 0) {
            meta->active_slots--;
        }
        return 0;
    }
    return -1; // Not ACTIVE
}
// ---------- Layer 2: Metadata Management (Mid-level) ----------
// Ensure ss_metadata array has capacity for at least min_count entries
// Caller must hold alloc_lock
// Returns: 0 on success, -1 on allocation failure
static int sp_meta_ensure_capacity(uint32_t min_count) {
    if (g_shared_pool.ss_meta_capacity >= min_count) {
        return 0;
    }
    uint32_t new_cap = g_shared_pool.ss_meta_capacity ? g_shared_pool.ss_meta_capacity : 16;
    while (new_cap < min_count) {
        new_cap *= 2;
    }
    SharedSSMeta* new_meta = (SharedSSMeta*)realloc(
        g_shared_pool.ss_metadata,
        new_cap * sizeof(SharedSSMeta)
    );
    if (!new_meta) {
        return -1;
    }
    // Zero new entries
    memset(new_meta + g_shared_pool.ss_meta_capacity, 0,
           (new_cap - g_shared_pool.ss_meta_capacity) * sizeof(SharedSSMeta));
    g_shared_pool.ss_metadata = new_meta;
    g_shared_pool.ss_meta_capacity = new_cap;
    return 0;
}
// Find SharedSSMeta for given SuperSlab, or create if not exists
// Caller must hold alloc_lock
// Returns: SharedSSMeta* on success, NULL on error
static SharedSSMeta* sp_meta_find_or_create(SuperSlab* ss) {
    if (!ss) return NULL;
    // Search existing metadata
    for (uint32_t i = 0; i < g_shared_pool.ss_meta_count; i++) {
        if (g_shared_pool.ss_metadata[i].ss == ss) {
            return &g_shared_pool.ss_metadata[i];
        }
    }
    // Create new metadata entry
    if (sp_meta_ensure_capacity(g_shared_pool.ss_meta_count + 1) != 0) {
        return NULL;
    }
    SharedSSMeta* meta = &g_shared_pool.ss_metadata[g_shared_pool.ss_meta_count];
    meta->ss = ss;
    meta->total_slots = (uint8_t)ss_slabs_capacity(ss);
    meta->active_slots = 0;
    // Initialize all slots as UNUSED
    // P0-5: Use atomic store for state initialization
    for (int i = 0; i < meta->total_slots; i++) {
        atomic_store_explicit(&meta->slots[i].state, SLOT_UNUSED, memory_order_relaxed);
        meta->slots[i].class_idx = 0;
        meta->slots[i].slab_idx = (uint8_t)i;
    }
    g_shared_pool.ss_meta_count++;
    return meta;
}
// ---------- Layer 3: Free List Management ----------
// Push empty slot to per-class free list
// Caller must hold alloc_lock
// Returns: 0 on success, -1 if list is full
static int sp_freelist_push(int class_idx, SharedSSMeta* meta, int slot_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
    FreeSlotList* list = &g_shared_pool.free_slots[class_idx];
    if (list->count >= MAX_FREE_SLOTS_PER_CLASS) {
        return -1; // List full
    }
    list->entries[list->count].meta = meta;
    list->entries[list->count].slot_idx = (uint8_t)slot_idx;
    list->count++;
    return 0;
}
// Pop empty slot from per-class free list
// Caller must hold alloc_lock
// Returns: 1 if popped (out params filled), 0 if list empty
static int sp_freelist_pop(int class_idx, SharedSSMeta** out_meta, int* out_slot_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return 0;
    if (!out_meta || !out_slot_idx) return 0;
    FreeSlotList* list = &g_shared_pool.free_slots[class_idx];
    if (list->count == 0) {
        return 0; // List empty
    }
    // Pop from end (LIFO for cache locality)
    list->count--;
    *out_meta = list->entries[list->count].meta;
    *out_slot_idx = list->entries[list->count].slot_idx;
    return 1;
}
// ============================================================================
// P0-5: Lock-Free Slot Claiming (Stage 2 Optimization)
// ============================================================================
// Try to claim an UNUSED slot via lock-free CAS
// Returns: slot_idx on success, -1 if no UNUSED slots available
// LOCK-FREE: Can be called from any thread without mutex
static int sp_slot_claim_lockfree(SharedSSMeta* meta, int class_idx) {
    if (!meta) return -1;
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
    // Scan all slots for UNUSED state
    for (int i = 0; i < meta->total_slots; i++) {
        SlotState expected = SLOT_UNUSED;
        // Try to claim this slot atomically (UNUSED → ACTIVE)
        if (atomic_compare_exchange_strong_explicit(
                &meta->slots[i].state,
                &expected,
                SLOT_ACTIVE,
                memory_order_acq_rel,  // Success: acquire+release semantics
                memory_order_relaxed   // Failure: just retry next slot
            )) {
            // Successfully claimed! Update non-atomic fields
            // (Safe because we now own this slot)
            meta->slots[i].class_idx = (uint8_t)class_idx;
            meta->slots[i].slab_idx = (uint8_t)i;
            // Increment active_slots counter atomically
            // (Multiple threads may claim slots concurrently)
            atomic_fetch_add_explicit(
                (_Atomic uint8_t*)&meta->active_slots, 1,
                memory_order_relaxed
            );
            return i; // Return claimed slot index
        }
        // CAS failed (slot was not UNUSED) - continue to next slot
    }
    return -1; // No UNUSED slots available
}
// ============================================================================
// P0-4: Lock-Free Free Slot List Operations
// ============================================================================
// Push empty slot to lock-free per-class free list (LIFO)
// LOCK-FREE: Can be called from any thread without mutex
// Returns: 0 on success, -1 on failure (node pool exhausted)
static int sp_freelist_push_lockfree(int class_idx, SharedSSMeta* meta, int slot_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
    // Allocate node from pool
    FreeSlotNode* node = node_alloc(class_idx);
    if (!node) {
        return -1; // Pool exhausted
    }
    // Fill node data
    node->meta = meta;
    node->slot_idx = (uint8_t)slot_idx;
    // Lock-free LIFO push using CAS loop
    LockFreeFreeList* list = &g_shared_pool.free_slots_lockfree[class_idx];
    FreeSlotNode* old_head = atomic_load_explicit(&list->head, memory_order_relaxed);
    do {
        node->next = old_head;
    } while (!atomic_compare_exchange_weak_explicit(
        &list->head, &old_head, node,
        memory_order_release,  // Success: publish node to other threads
        memory_order_relaxed   // Failure: retry with updated old_head
    ));
    return 0; // Success
}
// Pop empty slot from lock-free per-class free list (LIFO)
// LOCK-FREE: Can be called from any thread without mutex
// Returns: 1 if popped (out params filled), 0 if list empty
static int sp_freelist_pop_lockfree(int class_idx, SharedSSMeta** out_meta, int* out_slot_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return 0;
    if (!out_meta || !out_slot_idx) return 0;
    LockFreeFreeList* list = &g_shared_pool.free_slots_lockfree[class_idx];
    FreeSlotNode* old_head = atomic_load_explicit(&list->head, memory_order_acquire);
    // Lock-free LIFO pop using CAS loop
    do {
        if (old_head == NULL) {
            return 0; // List empty
        }
    } while (!atomic_compare_exchange_weak_explicit(
        &list->head, &old_head, old_head->next,
        memory_order_acquire,  // Success: acquire node data
        memory_order_acquire   // Failure: retry with updated old_head
    ));
    // Extract data from popped node
    *out_meta = old_head->meta;
    *out_slot_idx = old_head->slot_idx;
    // NOTE: We do NOT free the node back to pool (no node recycling yet)
    // This is acceptable because MAX_FREE_NODES_PER_CLASS (512) is generous
    // and workloads typically don't push/pop the same slot repeatedly
    return 1; // Success
}
/*
 * Internal: allocate and register a new SuperSlab for the shared pool.
 *
 * Phase 12 NOTE:
 *   - We MUST use the real superslab_allocate() path so that:
 *     - backing memory is a full SuperSlab region (12MB),
 *     - header/layout are initialized correctly,
 *     - registry integration stays consistent.
 *   - shared_pool is responsible only for:
 *     - tracking pointers,
 *     - marking per-slab class_idx as UNASSIGNED initially.
 *     It does NOT bypass registry/LRU.
 *
 * Caller must hold alloc_lock.
 */
static SuperSlab*
shared_pool_allocate_superslab_unlocked(void)
{
    // Use size_class 0 as a neutral hint; Phase 12 per-slab class_idx is authoritative.
    extern SuperSlab* superslab_allocate(uint8_t size_class);
    SuperSlab* ss = superslab_allocate(0);
    if (!ss) {
        return NULL;
    }
    // superslab_allocate() already:
    //   - zeroes slab metadata / remote queues,
    //   - sets magic/lg_size/etc,
    //   - registers in global registry.
    // For shared-pool semantics we normalize all slab class_idx to UNASSIGNED.
    int max_slabs = ss_slabs_capacity(ss);
    for (int i = 0; i < max_slabs; i++) {
        ss->slabs[i].class_idx = 255; // UNASSIGNED
    }
    if (g_shared_pool.total_count >= g_shared_pool.capacity) {
        shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
        if (g_shared_pool.total_count >= g_shared_pool.capacity) {
            // Pool table expansion failed; leave ss alive (registry-owned),
            // but do not treat it as part of shared_pool.
            return NULL;
        }
    }
    g_shared_pool.slabs[g_shared_pool.total_count] = ss;
    g_shared_pool.total_count++;
    // Not counted as active until at least one slab is assigned.
    return ss;
}
SuperSlab*
shared_pool_acquire_superslab(void)
{
    // Phase 12 note:
    // If the shared backend is disabled at the Box API level, this function should
    // not be called at all. An earlier debug guard returned an error here
    // unconditionally to isolate shared_pool while the legacy path was validated;
    // that guard has been removed now that the SEGV is fixed and the shared path
    // is operational.
    shared_pool_init();
    pthread_mutex_lock(&g_shared_pool.alloc_lock);
    // For now, always allocate a fresh SuperSlab and register it.
    // More advanced reuse/GC comes later.
    SuperSlab* ss = shared_pool_allocate_superslab_unlocked();
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    return ss;
}
// ---------- Layer 4: Public API (High-level) ----------
int
shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out)
{
    // Phase 12: SP-SLOT Box - 3-Stage Acquire Logic
    //
    // Stage 1: Reuse EMPTY slots from per-class free list (EMPTY→ACTIVE)
    // Stage 2: Find UNUSED slots in existing SuperSlabs
    // Stage 3: Get new SuperSlab (LRU pop or mmap)
    //
    // Invariants:
    //   - On success: *ss_out != NULL, 0 <= *slab_idx_out < total_slots
    //   - The chosen slab has meta->class_idx == class_idx
    if (!ss_out || !slab_idx_out) {
        return -1;
    }
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return -1;
    }
    shared_pool_init();
    // Debug logging
    static int dbg_acquire = -1;
    if (__builtin_expect(dbg_acquire == -1, 0)) {
        const char* e = getenv("HAKMEM_SS_ACQUIRE_DEBUG");
        dbg_acquire = (e && *e && *e != '0') ? 1 : 0;
    }
    // ========== Stage 1 (Lock-Free): Try to reuse EMPTY slots ==========
    // P0-4: Lock-free pop from per-class free list (no mutex needed!)
    // Best case: Same class freed a slot, reuse immediately (cache-hot)
    SharedSSMeta* reuse_meta = NULL;
    int reuse_slot_idx = -1;
    if (sp_freelist_pop_lockfree(class_idx, &reuse_meta, &reuse_slot_idx)) {
        // Found EMPTY slot from lock-free list!
        // Now acquire mutex ONLY for slot activation and metadata update
        // P0 instrumentation: count lock acquisitions
        lock_stats_init();
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_acquire_count, 1);
            atomic_fetch_add(&g_lock_acquire_slab_count, 1);
        }
        pthread_mutex_lock(&g_shared_pool.alloc_lock);
        // Activate slot under mutex (slot state transition requires protection)
        if (sp_slot_mark_active(reuse_meta, reuse_slot_idx, class_idx) == 0) {
            SuperSlab* ss = reuse_meta->ss;
            if (dbg_acquire == 1) {
                fprintf(stderr, "[SP_ACQUIRE_STAGE1_LOCKFREE] class=%d reusing EMPTY slot (ss=%p slab=%d)\n",
                        class_idx, (void*)ss, reuse_slot_idx);
            }
            // Update SuperSlab metadata
            ss->slab_bitmap |= (1u << reuse_slot_idx);
            ss->slabs[reuse_slot_idx].class_idx = (uint8_t)class_idx;
            if (ss->active_slabs == 0) {
                // Was empty, now active again
                ss->active_slabs = 1;
                g_shared_pool.active_count++;
            }
            // Update hint
            g_shared_pool.class_hints[class_idx] = ss;
            *ss_out = ss;
            *slab_idx_out = reuse_slot_idx;
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            return 0; // ✅ Stage 1 (lock-free) success
        }
        // Slot activation failed (race condition?) - release lock and fall through
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    }
    // ========== Stage 2 (Lock-Free): Try to claim UNUSED slots ==========
    // P0-5: Lock-free atomic CAS claiming (no mutex needed for slot state transition!)
    // Read ss_meta_count atomically (safe: only grows, never shrinks)
    uint32_t meta_count = atomic_load_explicit(
        (_Atomic uint32_t*)&g_shared_pool.ss_meta_count,
        memory_order_acquire
    );
    for (uint32_t i = 0; i < meta_count; i++) {
        SharedSSMeta* meta = &g_shared_pool.ss_metadata[i];
        // Try lock-free claiming (UNUSED → ACTIVE via CAS)
        int claimed_idx = sp_slot_claim_lockfree(meta, class_idx);
        if (claimed_idx >= 0) {
            // Successfully claimed slot! Now acquire mutex ONLY for metadata update
            SuperSlab* ss = meta->ss;
            if (dbg_acquire == 1) {
                fprintf(stderr, "[SP_ACQUIRE_STAGE2_LOCKFREE] class=%d claimed UNUSED slot (ss=%p slab=%d)\n",
                        class_idx, (void*)ss, claimed_idx);
            }
            // P0 instrumentation: count lock acquisitions
            lock_stats_init();
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_acquire_count, 1);
                atomic_fetch_add(&g_lock_acquire_slab_count, 1);
            }
            pthread_mutex_lock(&g_shared_pool.alloc_lock);
            // Update SuperSlab metadata under mutex
            ss->slab_bitmap |= (1u << claimed_idx);
            ss->slabs[claimed_idx].class_idx = (uint8_t)class_idx;
            if (ss->active_slabs == 0) {
                ss->active_slabs = 1;
                g_shared_pool.active_count++;
            }
            // Update hint
            g_shared_pool.class_hints[class_idx] = ss;
            *ss_out = ss;
            *slab_idx_out = claimed_idx;
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            return 0; // ✅ Stage 2 (lock-free) success
        }
        // Claim failed (no UNUSED slots in this meta) - continue to next SuperSlab
    }
    // ========== Stage 3: Mutex-protected fallback (new SuperSlab allocation) ==========
    // All existing SuperSlabs have no UNUSED slots → need new SuperSlab
    // P0 instrumentation: count lock acquisitions
    lock_stats_init();
    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_acquire_count, 1);
        atomic_fetch_add(&g_lock_acquire_slab_count, 1);
    }
    pthread_mutex_lock(&g_shared_pool.alloc_lock);
    // ========== Stage 3: Get new SuperSlab ==========
    // Try LRU cache first, then mmap
    SuperSlab* new_ss = NULL;
    // Stage 3a: Try LRU cache
    extern SuperSlab* hak_ss_lru_pop(uint8_t size_class);
    new_ss = hak_ss_lru_pop((uint8_t)class_idx);
    int from_lru = (new_ss != NULL);
    // Stage 3b: If LRU miss, allocate new SuperSlab
    if (!new_ss) {
        new_ss = shared_pool_allocate_superslab_unlocked();
    }
    if (dbg_acquire == 1 && new_ss) {
        fprintf(stderr, "[SP_ACQUIRE_STAGE3] class=%d new SuperSlab (ss=%p from_lru=%d)\n",
                class_idx, (void*)new_ss, from_lru);
    }
    if (!new_ss) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1; // ❌ Out of memory
    }
    // Create metadata for this new SuperSlab
    SharedSSMeta* new_meta = sp_meta_find_or_create(new_ss);
    if (!new_meta) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1; // ❌ Metadata allocation failed
    }
    // Assign first slot to this class
    int first_slot = 0;
    if (sp_slot_mark_active(new_meta, first_slot, class_idx) != 0) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1; // ❌ Should not happen
    }
    // Update SuperSlab metadata
    new_ss->slab_bitmap |= (1u << first_slot);
    new_ss->slabs[first_slot].class_idx = (uint8_t)class_idx;
    new_ss->active_slabs = 1;
    g_shared_pool.active_count++;
    // Update hint
    g_shared_pool.class_hints[class_idx] = new_ss;
    *ss_out = new_ss;
    *slab_idx_out = first_slot;
    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_release_count, 1);
    }
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    return 0; // ✅ Stage 3 success
}
void
shared_pool_release_slab(SuperSlab* ss, int slab_idx)
{
    // Phase 12: SP-SLOT Box - Slot-based Release
    //
    // Flow:
    //   1. Validate inputs and check meta->used == 0
    //   2. Find SharedSSMeta for this SuperSlab
    //   3. Mark slot ACTIVE → EMPTY
    //   4. Push to per-class free list (enables same-class reuse)
    //   5. If all slots EMPTY → superslab_free() → LRU cache
    if (!ss) {
        return;
    }
    if (slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) {
        return;
    }
    // Debug logging
    static int dbg = -1;
    if (__builtin_expect(dbg == -1, 0)) {
        const char* e = getenv("HAKMEM_SS_FREE_DEBUG");
        dbg = (e && *e && *e != '0') ? 1 : 0;
    }
    // P0 instrumentation: count lock acquisitions
    lock_stats_init();
    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_acquire_count, 1);
        atomic_fetch_add(&g_lock_release_slab_count, 1);
    }
    pthread_mutex_lock(&g_shared_pool.alloc_lock);
    TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
    if (slab_meta->used != 0) {
        // Not actually empty; nothing to do
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return;
    }
    uint8_t class_idx = slab_meta->class_idx;
    if (dbg == 1) {
        fprintf(stderr, "[SP_SLOT_RELEASE] ss=%p slab_idx=%d class=%d used=0 (marking EMPTY)\n",
                (void*)ss, slab_idx, class_idx);
    }
    // Find SharedSSMeta for this SuperSlab
    SharedSSMeta* sp_meta = NULL;
    for (uint32_t i = 0; i < g_shared_pool.ss_meta_count; i++) {
        if (g_shared_pool.ss_metadata[i].ss == ss) {
            sp_meta = &g_shared_pool.ss_metadata[i];
            break;
        }
    }
    if (!sp_meta) {
        // SuperSlab not in SP-SLOT system yet - create metadata
        sp_meta = sp_meta_find_or_create(ss);
        if (!sp_meta) {
            // Keep the lock-stats balance consistent on this early-out path.
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            return; // Failed to create metadata
        }
    }
    // Mark slot as EMPTY (ACTIVE → EMPTY)
    if (sp_slot_mark_empty(sp_meta, slab_idx) != 0) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return; // Slot wasn't ACTIVE
    }
    // Update SuperSlab metadata
    uint32_t bit = (1u << slab_idx);
    if (ss->slab_bitmap & bit) {
        ss->slab_bitmap &= ~bit;
        slab_meta->class_idx = 255; // UNASSIGNED
        if (ss->active_slabs > 0) {
            ss->active_slabs--;
            if (ss->active_slabs == 0 && g_shared_pool.active_count > 0) {
                g_shared_pool.active_count--;
            }
        }
    }
    // P0-4: Push to lock-free per-class free list (enables reuse by same class)
    // Note: push BEFORE releasing mutex (slot state already updated under lock)
    if (class_idx < TINY_NUM_CLASSES_SS) {
        sp_freelist_push_lockfree(class_idx, sp_meta, slab_idx);
        if (dbg == 1) {
            fprintf(stderr, "[SP_SLOT_FREELIST_LOCKFREE] class=%d pushed slot (ss=%p slab=%d) active_slots=%u/%u\n",
                    class_idx, (void*)ss, slab_idx,
                    sp_meta->active_slots, sp_meta->total_slots);
        }
    }
    // Check if SuperSlab is now completely empty (all slots EMPTY or UNUSED)
    if (sp_meta->active_slots == 0) {
        if (dbg == 1) {
            fprintf(stderr, "[SP_SLOT_COMPLETELY_EMPTY] ss=%p active_slots=0 (calling superslab_free)\n",
                    (void*)ss);
        }
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        // Free SuperSlab:
        //   1. Try LRU cache (hak_ss_lru_push) - lazy deallocation
        //   2. Or munmap if LRU is full - eager deallocation
        extern void superslab_free(SuperSlab* ss);
        superslab_free(ss);
        return;
    }
    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_release_count, 1);
    }
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
}