Feat(phase9): Make shared_pool SuperSlab acquisition deadlock-free (Task 1)

Refactored SuperSlab allocation within shared pool to prevent deadlocks.
shared_pool_allocate_superslab_unlocked() is replaced by sp_internal_allocate_superslab(),
which is lock-agnostic: it only allocates and initializes the SuperSlab and never touches
g_shared_pool's slab table. alloc_lock is temporarily released before calling it and
re-acquired afterwards, both in shared_pool_acquire_superslab() and in the Stage 3
fallback path. This eliminates the deadlock potential between the shared pool and
registry locks. The OOMs previously observed were caused by the shared pool's soft
limits, not by a code bug.
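The core pattern (allocate outside the pool lock, then re-validate under the lock before
publishing) is sketched below in isolation. This is a minimal, self-contained illustration
with hypothetical names (Slab, pool_allocate_slab, pool_acquire_slab); the real code uses
SuperSlab, sp_internal_allocate_superslab(), g_shared_pool.alloc_lock, and
shared_pool_ensure_capacity_unlocked(), as shown in the diff below.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for SuperSlab and the shared pool table. */
typedef struct { int id; } Slab;

static struct {
    pthread_mutex_t lock;
    Slab *slabs[16];
    int count;
    int capacity;
} pool = { PTHREAD_MUTEX_INITIALIZER, { 0 }, 0, 16 };

/* Lock-agnostic helper: touches no pool state, so it may call into other
 * subsystems (e.g. a registry) without any lock-ordering constraints. */
static Slab *pool_allocate_slab(void) {
    return calloc(1, sizeof(Slab));
}

/* Acquisition path: the pool lock is never held across the allocation call. */
static Slab *pool_acquire_slab(void) {
    Slab *s = pool_allocate_slab();   /* expensive call, done unlocked */
    if (!s) return NULL;

    /* Re-acquire the lock and re-validate the table: another thread may
     * have filled it while we were unlocked. */
    pthread_mutex_lock(&pool.lock);
    if (pool.count >= pool.capacity) {
        pthread_mutex_unlock(&pool.lock);
        free(s);   /* the real code instead leaves the SuperSlab registry-owned */
        return NULL;
    }
    pool.slabs[pool.count++] = s;
    pthread_mutex_unlock(&pool.lock);
    return s;
}

int main(void) {
    Slab *s = pool_acquire_slab();
    printf("acquired %p, pool count = %d\n", (void *)s, pool.count);
    return 0;
}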
Moe Charm (CI)
2025-11-30 15:14:34 +09:00
parent 0558a9391d
commit e3b0fdce57


@@ -544,7 +544,7 @@ static inline void sp_stage_stats_dump_if_enabled(void) {
fprintf(stderr, "[SP_STAGE_STATS] total: stage0.5=%lu stage1=%lu stage2=%lu stage3=%lu\n",
(unsigned long)s0, (unsigned long)s1, (unsigned long)s2, (unsigned long)s3);
#else
(void)g_sp_stage0_hits; (void)g_sp_stage1_hits; (void)g_sp_stage2_hits; (void)g_sp_stage3_hits;
(void)g_sp_stage1_hits; (void)g_sp_stage2_hits; (void)g_sp_stage3_hits;
#endif
}
@@ -795,23 +795,11 @@ static int sp_freelist_pop_lockfree(int class_idx, SharedSSMeta** out_meta, int*
return 1; // Success
}
/*
* Internal: allocate and register a new SuperSlab for the shared pool.
*
* Phase 12 NOTE:
* - We MUST use the real superslab_allocate() path so that:
* - backing memory is a full SuperSlab region (12MB),
* - header/layout are initialized correctly,
* - registry integration stays consistent.
* - shared_pool is responsible only for:
* - tracking pointers,
* - marking per-slab class_idx as UNASSIGNED initially.
* It does NOT bypass registry/LRU.
*
* Caller must hold alloc_lock.
*/
// Internal helper: Allocates a new SuperSlab from the OS and performs basic initialization.
// Does NOT interact with g_shared_pool.slabs[] or g_shared_pool.total_count directly.
// Caller is responsible for adding the SuperSlab to g_shared_pool's arrays and metadata.
static SuperSlab*
shared_pool_allocate_superslab_unlocked(void)
sp_internal_allocate_superslab(void)
{
// Use size_class 0 as a neutral hint; Phase 12 per-slab class_idx is authoritative.
extern SuperSlab* superslab_allocate(uint8_t size_class);
@@ -838,41 +826,42 @@ shared_pool_allocate_superslab_unlocked(void)
// P1.1: Initialize class_map to UNASSIGNED as well
ss->class_map[i] = 255;
}
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
// Pool table expansion failed; leave ss alive (registry-owned),
// but do not treat it as part of shared_pool.
return NULL;
}
}
g_shared_pool.slabs[g_shared_pool.total_count] = ss;
g_shared_pool.total_count++;
// Not counted as active until at least one slab is assigned.
return ss;
}
SuperSlab*
shared_pool_acquire_superslab(void)
{
// Phase 12 debug safety:
// If shared backend is disabled at Box API level, this function SHOULD NOT be called.
// But since bench currently SEGVs here even with legacy forced, treat this as a hard guard:
// we early-return error instead of touching potentially-bad state.
//
// This isolates shared_pool from the current crash so we can validate legacy path first.
// FIXED: Remove the return -1; that was preventing operation
shared_pool_init();
pthread_mutex_lock(&g_shared_pool.alloc_lock);
// For now, always allocate a fresh SuperSlab and register it.
// More advanced reuse/GC comes later.
SuperSlab* ss = shared_pool_allocate_superslab_unlocked();
// Release lock to avoid deadlock with registry during superslab_allocate
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
SuperSlab* ss = sp_internal_allocate_superslab(); // Call lock-free internal helper
pthread_mutex_lock(&g_shared_pool.alloc_lock);
if (!ss) {
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return NULL;
}
// Add newly allocated SuperSlab to the shared pool's internal array
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
// Pool table expansion failed; leave ss alive (registry-owned),
// but do not treat it as part of shared_pool.
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return NULL;
}
}
g_shared_pool.slabs[g_shared_pool.total_count] = ss;
g_shared_pool.total_count++;
// Not counted as active until at least one slab is assigned.
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return ss;
}
@@ -1221,7 +1210,48 @@ stage2_fallback:
// Stage 3b: If LRU miss, allocate new SuperSlab
if (!new_ss) {
new_ss = shared_pool_allocate_superslab_unlocked();
// Release the alloc_lock to avoid deadlock with registry during superslab_allocate
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
SuperSlab* allocated_ss = sp_internal_allocate_superslab();
// Re-acquire the alloc_lock
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_acquire_count, 1);
atomic_fetch_add(&g_lock_acquire_slab_count, 1); // This is part of the acquisition path
}
pthread_mutex_lock(&g_shared_pool.alloc_lock);
if (!allocated_ss) {
// Allocation failed; return now.
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1; // Out of memory
}
new_ss = allocated_ss;
// Add newly allocated SuperSlab to the shared pool's internal array
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
if (g_shared_pool.total_count >= g_shared_pool.capacity) {
// Pool table expansion failed; leave ss alive (registry-owned),
// but do not treat it as part of shared_pool.
// This is a critical error, return early.
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1;
}
}
g_shared_pool.slabs[g_shared_pool.total_count] = new_ss;
g_shared_pool.total_count++;
}
#if !HAKMEM_BUILD_RELEASE