Feat(phase9): Make shared_pool SuperSlab acquisition deadlock-free (Task 1)

Refactored SuperSlab allocation within shared pool to prevent deadlocks.
shared_pool_allocate_superslab_unlocked() is replaced by sp_internal_allocate_superslab(),
which is lock-agnostic: it only allocates and initializes the SuperSlab and never touches
g_shared_pool's slab table. alloc_lock is temporarily released before calling it and
re-acquired afterwards, both in shared_pool_acquire_superslab() and in the Stage 3
fallback path. This eliminates the deadlock potential between the shared pool and
registry locks. The OOMs previously observed were caused by the shared pool's soft
limits, not by a code bug.
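The core pattern (allocate outside the pool lock, then re-validate under the lock before
publishing) is sketched below in isolation. This is a minimal, self-contained illustration
with hypothetical names (Slab, pool_allocate_slab, pool_acquire_slab); the real code uses
SuperSlab, sp_internal_allocate_superslab(), g_shared_pool.alloc_lock, and
shared_pool_ensure_capacity_unlocked(), as shown in the diff below.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for SuperSlab and the shared pool table. */
typedef struct { int id; } Slab;

static struct {
    pthread_mutex_t lock;
    Slab *slabs[16];
    int count;
    int capacity;
} pool = { PTHREAD_MUTEX_INITIALIZER, { 0 }, 0, 16 };

/* Lock-agnostic helper: touches no pool state, so it may call into other
 * subsystems (e.g. a registry) without any lock-ordering constraints. */
static Slab *pool_allocate_slab(void) {
    return calloc(1, sizeof(Slab));
}

/* Acquisition path: the pool lock is never held across the allocation call. */
static Slab *pool_acquire_slab(void) {
    Slab *s = pool_allocate_slab();   /* expensive call, done unlocked */
    if (!s) return NULL;

    /* Re-acquire the lock and re-validate the table: another thread may
     * have filled it while we were unlocked. */
    pthread_mutex_lock(&pool.lock);
    if (pool.count >= pool.capacity) {
        pthread_mutex_unlock(&pool.lock);
        free(s);   /* the real code instead leaves the SuperSlab registry-owned */
        return NULL;
    }
    pool.slabs[pool.count++] = s;
    pthread_mutex_unlock(&pool.lock);
    return s;
}

int main(void) {
    Slab *s = pool_acquire_slab();
    printf("acquired %p, pool count = %d\n", (void *)s, pool.count);
    return 0;
}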
Moe Charm (CI)
2025-11-30 15:14:34 +09:00
parent 0558a9391d
commit e3b0fdce57


@@ -544,7 +544,7 @@ static inline void sp_stage_stats_dump_if_enabled(void) {
fprintf(stderr, "[SP_STAGE_STATS] total: stage0.5=%lu stage1=%lu stage2=%lu stage3=%lu\n",
(unsigned long)s0, (unsigned long)s1, (unsigned long)s2, (unsigned long)s3);
#else
(void)g_sp_stage0_hits; (void)g_sp_stage1_hits; (void)g_sp_stage2_hits; (void)g_sp_stage3_hits;
(void)g_sp_stage1_hits; (void)g_sp_stage2_hits; (void)g_sp_stage3_hits;
#endif
}
@@ -795,23 +795,11 @@ static int sp_freelist_pop_lockfree(int class_idx, SharedSSMeta** out_meta, int*
return 1; // Success
}
/*
* Internal: allocate and register a new SuperSlab for the shared pool.
*
* Phase 12 NOTE:
* - We MUST use the real superslab_allocate() path so that:
* - backing memory is a full SuperSlab region (12MB),
* - header/layout are initialized correctly,
* - registry integration stays consistent.
* - shared_pool is responsible only for:
* - tracking pointers,
* - marking per-slab class_idx as UNASSIGNED initially.
* It does NOT bypass registry/LRU.
*
* Caller must hold alloc_lock.
*/
// Internal helper: Allocates a new SuperSlab from the OS and performs basic initialization.
// Does NOT interact with g_shared_pool.slabs[] or g_shared_pool.total_count directly.
// Caller is responsible for adding the SuperSlab to g_shared_pool's arrays and metadata.
static SuperSlab*
shared_pool_allocate_superslab_unlocked(void)
sp_internal_allocate_superslab(void)
{
// Use size_class 0 as a neutral hint; Phase 12 per-slab class_idx is authoritative.
extern SuperSlab* superslab_allocate(uint8_t size_class);
@@ -838,41 +826,42 @@ shared_pool_allocate_superslab_unlocked(void)
// P1.1: Initialize class_map to UNASSIGNED as well
ss->class_map[i] = 255;
}
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
// Pool table expansion failed; leave ss alive (registry-owned),
// but do not treat it as part of shared_pool.
return NULL;
}
}
g_shared_pool.slabs[g_shared_pool.total_count] = ss;
g_shared_pool.total_count++;
// Not counted as active until at least one slab is assigned.
return ss;
}
SuperSlab*
shared_pool_acquire_superslab(void)
{
// Phase 12 debug safety:
// If shared backend is disabled at Box API level, this function SHOULD NOT be called.
// But since bench currently SEGVs here even with legacy forced, treat this as a hard guard:
// we early-return error instead of touching potentially-bad state.
//
// This isolates shared_pool from the current crash so we can validate legacy path first.
// FIXED: Remove the return -1; that was preventing operation
shared_pool_init();
pthread_mutex_lock(&g_shared_pool.alloc_lock);
// For now, always allocate a fresh SuperSlab and register it.
// More advanced reuse/GC comes later.
SuperSlab* ss = shared_pool_allocate_superslab_unlocked();
// Release lock to avoid deadlock with registry during superslab_allocate
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
SuperSlab* ss = sp_internal_allocate_superslab(); // Call lock-free internal helper
pthread_mutex_lock(&g_shared_pool.alloc_lock);
if (!ss) {
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return NULL;
}
// Add newly allocated SuperSlab to the shared pool's internal array
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
// Pool table expansion failed; leave ss alive (registry-owned),
// but do not treat it as part of shared_pool.
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return NULL;
}
}
g_shared_pool.slabs[g_shared_pool.total_count] = ss;
g_shared_pool.total_count++;
// Not counted as active until at least one slab is assigned.
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return ss;
}
@@ -1221,7 +1210,48 @@ stage2_fallback:
// Stage 3b: If LRU miss, allocate new SuperSlab
if (!new_ss) {
new_ss = shared_pool_allocate_superslab_unlocked();
// Release the alloc_lock to avoid deadlock with registry during superslab_allocate
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
SuperSlab* allocated_ss = sp_internal_allocate_superslab();
// Re-acquire the alloc_lock
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_acquire_count, 1);
atomic_fetch_add(&g_lock_acquire_slab_count, 1); // This is part of the acquisition path
}
pthread_mutex_lock(&g_shared_pool.alloc_lock);
if (!allocated_ss) {
// Allocation failed; return now.
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1; // Out of memory
}
new_ss = allocated_ss;
// Add newly allocated SuperSlab to the shared pool's internal array
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
// Pool table expansion failed; leave ss alive (registry-owned),
// but do not treat it as part of shared_pool.
// This is a critical error, return early.
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1;
}
}
g_shared_pool.slabs[g_shared_pool.total_count] = new_ss;
g_shared_pool.total_count++;
}
#if !HAKMEM_BUILD_RELEASE