Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed the Soft Cap limit in the Shared Pool; memory pressure is now handled by LRU eviction and EMPTY-slab recycling.
- Implemented EMPTY slab recycling, with a batched meta->used decrement in the remote drain path.
- Changed tiny_free_local_box to return an is_empty flag (1 only on the 1->0 transition) so callers can recycle safely; see the sketch below.
- Fixed a double-allocation race in the release path by removing the SuperSlab from the legacy list early and re-checking its used count.
- Achieved 50.3M ops/s in the WS8192 benchmark (+200% vs. baseline).
Moe Charm (CI)
2025-12-01 13:47:23 +09:00
parent 3a040a545a
commit 0bc33dc4f5
7 changed files with 92 additions and 102 deletions
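
The core of the change is the ownership rule for the used counter: the thread whose atomic_fetch_sub observes the counter going from 1 to 0 (or, in the batched remote-drain case, from drained_count to 0) owns that transition and is the only one allowed to recycle the slab. A minimal standalone sketch of the pattern, using illustrative names rather than the allocator's real types:

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative stand-in for TinySlabMeta; only the used counter matters here. */
typedef struct {
    _Atomic uint16_t used;
} slab_counter_t;

/* Free one block. Returns 1 only for the caller that drove used from 1 to 0,
 * so exactly one thread wins the right to recycle the slab. */
static int slab_free_one(slab_counter_t* m) {
    uint16_t prev = atomic_fetch_sub_explicit(&m->used, 1, memory_order_release);
    return prev == 1;
}

/* Batched variant mirroring the remote drain: subtract the whole batch at
 * once and report EMPTY only if the batch accounted for every live block. */
static int slab_free_batch(slab_counter_t* m, uint16_t drained) {
    if (drained == 0) return 0;
    uint16_t prev = atomic_fetch_sub_explicit(&m->used, drained, memory_order_release);
    return prev == drained;
}

Re-reading used after a plain decrement (the old "if (meta->used == 0)" check) is racy: another thread can allocate or free between the decrement and the load, so two threads can both observe 0 and race to release the same slab.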

View File

@@ -19,11 +19,11 @@ void tiny_failfast_log(const char* stage,
void* ptr,
void* prev);
void tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void* ptr, uint32_t my_tid) {
int tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void* ptr, uint32_t my_tid) {
extern _Atomic uint64_t g_free_local_box_calls;
atomic_fetch_add_explicit(&g_free_local_box_calls, 1, memory_order_relaxed);
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) return;
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return 0;
if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) return 0;
(void)my_tid;
// ✅ Phase E1-CORRECT: ALL classes have headers, calculate BASE pointer once
@@ -177,11 +177,16 @@ void tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void*
// Track local free (debug helpers may be no-op)
tiny_remote_track_on_local_free(ss, slab_idx, ptr, "local_free", my_tid);
meta->used--;
// BUGFIX Phase 9-2: Use atomic_fetch_sub to detect 1->0 transition reliably
// meta->used--; // old
uint16_t prev_used = atomic_fetch_sub_explicit(&meta->used, 1, memory_order_release);
int is_empty = (prev_used == 1); // Transitioned from 1 to 0
ss_active_dec_one(ss);
// Phase 12-1.1: EMPTY slab detection (immediate reuse optimization)
if (meta->used == 0) {
if (is_empty) {
// Slab became EMPTY → mark for highest-priority reuse
ss_mark_slab_empty(ss, slab_idx);
@@ -206,4 +211,6 @@ void tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void*
uint8_t cls0 = (meta && meta->class_idx < TINY_NUM_CLASSES) ? meta->class_idx : 0;
tiny_free_publish_first_free((int)cls0, ss, slab_idx);
}
return is_empty;
}

View File

@@ -4,5 +4,6 @@
#include "hakmem_tiny_superslab.h"
// Perform same-thread freelist push. On first-free (prev==NULL), publishes via Ready/Mailbox.
void tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void* ptr, uint32_t my_tid);
// Returns: 1 if slab transitioned to EMPTY (used=0), 0 otherwise.
int tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void* ptr, uint32_t my_tid);

View File

@@ -204,7 +204,8 @@ static inline uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size) {
// Call tiny_free_local_box() to:
// 1. Push block to slab freelist
// 2. Decrement meta->used (THIS IS THE KEY!)
tiny_free_local_box(ss, slab_idx, meta, user_ptr, my_tid);
// Phase 9-2 FIX: Capture 'is_empty' return value to detect ownership of 1->0 transition
int is_empty = tiny_free_local_box(ss, slab_idx, meta, user_ptr, my_tid);
#if !HAKMEM_BUILD_RELEASE
// Trace drain operation (debug only)
@@ -220,15 +221,9 @@ static inline uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size) {
drained++;
// Phase 9-2: Track touched slab for later EMPTY check
// We track (ss, slab_idx) pairs to check after loop completes
int already_tracked = 0;
for (int t = 0; t < num_touched; t++) {
if (touched[t].ss == ss && touched[t].slab_idx == slab_idx) {
already_tracked = 1;
break;
}
}
if (!already_tracked && num_touched < MAX_TOUCHED_SLABS) {
// CRITICAL FIX: Only recycle if WE caused the transition to EMPTY (is_empty == 1)
// This prevents multiple threads from racing to release the same slab
if (is_empty && num_touched < MAX_TOUCHED_SLABS) {
touched[num_touched].ss = ss;
touched[num_touched].slab_idx = slab_idx;
num_touched++;
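
This hunk stops before the post-loop pass that the tracking comments refer to. A sketch of how the touched array is presumably consumed once the drain loop finishes, assuming the pairs end up in shared_pool_release_slab (or the SLAB_TRY_RECYCLE macro used in the remote-drain file below) and reusing the field names from the hunk:

/* Sketch only, not the actual post-loop code. Entries are recorded solely
 * for slabs whose 1->0 transition this thread owns, so each recorded slab
 * has exactly one thread responsible for releasing it. */
for (int t = 0; t < num_touched; t++) {
    shared_pool_release_slab(touched[t].ss, touched[t].slab_idx);
}

With entries recorded only on owned transitions, the old dedup scan is no longer needed, and MAX_TOUCHED_SLABS still bounds the array either way.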

View File

@@ -418,19 +418,8 @@ stage2_fallback:
}
// Before creating a new SuperSlab, consult learning-layer soft cap.
// If current active slots for this class already exceed the policy cap,
// fail early so caller can fall back to legacy backend.
uint32_t limit = sp_class_active_limit(class_idx);
if (limit > 0) {
uint32_t cur = g_shared_pool.class_active_slots[class_idx];
if (cur >= limit) {
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1; // Soft cap reached for this class
}
}
// Phase 9-2: Soft Cap removed to allow Shared Pool to fully replace Legacy Backend.
// We now rely on LRU eviction and EMPTY recycling to manage memory pressure.
// Create metadata for this new SuperSlab
SharedSSMeta* new_meta = sp_meta_find_or_create(new_ss);

View File

@@ -26,6 +26,23 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx)
return;
}
// Phase 9-2 FIX: Promote Legacy SuperSlabs to Shared Pool on first recycle
// If we are recycling a slot from a Legacy SS, we must remove it from the
// Legacy list (g_superslab_heads) to prevent Legacy Backend from allocating
// from it simultaneously (Double Allocation Race).
// This effectively transfers ownership to Shared Pool.
extern void remove_superslab_from_legacy_head(SuperSlab* ss);
remove_superslab_from_legacy_head(ss);
// BUGFIX: Re-check used count after removal. Legacy Backend might have
// allocated from this slab while we were waiting for the lock in remove().
TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
if (atomic_load_explicit(&slab_meta->used, memory_order_acquire) != 0) {
// Legacy Backend stole this slab. It's now an orphan (removed from list).
// We abort recycling. It will be recycled when Legacy frees it later.
return;
}
// Debug logging
#if !HAKMEM_BUILD_RELEASE
static int dbg = -1;
@@ -46,9 +63,9 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx)
pthread_mutex_lock(&g_shared_pool.alloc_lock);
TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
// TinySlabMeta* slab_meta = &ss->slabs[slab_idx]; // Already declared above
if (slab_meta->used != 0) {
// Not actually empty; nothing to do
// Not actually empty (double check under lock)
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
@@ -160,15 +177,28 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx)
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
// Remove from legacy backend list (if present) to prevent dangling pointers
extern void remove_superslab_from_legacy_head(SuperSlab* ss);
remove_superslab_from_legacy_head(ss);
// Remove from legacy backend list (moved to top of function)
// extern void remove_superslab_from_legacy_head(SuperSlab* ss);
// remove_superslab_from_legacy_head(ss);
// Free SuperSlab:
// 1. Try LRU cache (hak_ss_lru_push) - lazy deallocation
// 2. Or munmap if LRU is full - eager deallocation
// BUGFIX: Double check total_active_blocks. Legacy Backend might have
// allocated from ANOTHER slab in this SS just before we removed it.
// If so, we must NOT free the SS.
if (atomic_load(&ss->total_active_blocks) == 0) {
extern void superslab_free(SuperSlab* ss);
superslab_free(ss);
} else {
#if !HAKMEM_BUILD_RELEASE
if (dbg == 1) {
fprintf(stderr, "[SP_SLOT_RELEASE] SKIP free ss=%p: total_active_blocks=%u > 0\n",
(void*)ss, atomic_load(&ss->total_active_blocks));
}
#endif
}
return;
}
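
Pulled together, the new release path follows a remove-then-recheck discipline. A condensed sketch of the resulting control flow (bookkeeping, logging, and lock-stat counters elided; identifiers as they appear in the diff), not the full function:

/* Condensed sketch of the Phase 9-2 release ordering. Each check is repeated
 * after the step that could have invalidated it. */
static void release_slab_sketch(SuperSlab* ss, int slab_idx) {
    /* 1. Take the SuperSlab out of the legacy list FIRST, so the legacy
     *    backend can no longer allocate from it concurrently. */
    remove_superslab_from_legacy_head(ss);

    /* 2. Re-check: the legacy backend may have allocated from this slab
     *    while we waited for the lock inside remove(). If so, abort; the
     *    slab comes back through this path on a later free. */
    TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
    if (atomic_load_explicit(&slab_meta->used, memory_order_acquire) != 0) return;

    pthread_mutex_lock(&g_shared_pool.alloc_lock);
    /* ... double-check used under the lock, mark the slot EMPTY, update
     *     shared-pool bookkeeping ... */
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);

    /* 3. Free the whole SuperSlab only if no other slab in it became active
     *    in the meantime. */
    if (atomic_load(&ss->total_active_blocks) == 0) superslab_free(ss);
}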

View File

@@ -28,6 +28,9 @@ void* hak_tiny_alloc_superslab_backend_legacy(int class_idx)
g_superslab_heads[class_idx] = head;
}
// LOCK expansion_lock to protect list traversal (vs remove_superslab_from_legacy_head)
pthread_mutex_lock(&head->expansion_lock);
SuperSlab* chunk = head->current_chunk ? head->current_chunk : head->first_chunk;
while (chunk) {
@@ -62,12 +65,19 @@ void* hak_tiny_alloc_superslab_backend_legacy(int class_idx)
meta->used++;
atomic_fetch_add_explicit(&chunk->total_active_blocks, 1, memory_order_relaxed);
// UNLOCK before return
pthread_mutex_unlock(&head->expansion_lock);
HAK_RET_ALLOC_BLOCK_TRACED(class_idx, base, ALLOC_PATH_BACKEND);
}
}
chunk = chunk->next_chunk;
}
// UNLOCK before expansion (which takes lock internally)
pthread_mutex_unlock(&head->expansion_lock);
if (expand_superslab_head(head) < 0) {
return NULL;
}
@@ -212,74 +222,23 @@ void* hak_tiny_alloc_superslab_backend_shared(int class_idx)
* Box API entry:
* - Single front-door for tiny-side Superslab allocations.
*
* Phase 9-2 Root Fix: Shared Pool backend unified mode (default ON)
* Phase 9-2 Final: Shared Pool ONLY (Legacy Backend Removed)
* Policy:
* - HAKMEM_TINY_SS_SHARED=2 (default) → Shared Pool backend ONLY (no legacy fallback)
* - HAKMEM_TINY_SS_SHARED=1 → Shared Pool backend with legacy fallback (testing mode)
* - HAKMEM_TINY_SS_SHARED=0 → Legacy backend only (compatibility mode)
*
* Root Cause: Legacy backend (g_superslab_heads) has TLS_SLL_DUP issue
* Solution: Disable legacy backend by default, keep as "reversible box" via env var
* - HAKMEM_TINY_SS_SHARED is now ignored (or used only for logging).
* - Always uses Shared Pool backend.
* - Legacy backend (g_superslab_heads) is no longer used for allocation.
*/
void* hak_tiny_alloc_superslab_box(int class_idx)
{
static int g_ss_shared_mode = -1;
static _Atomic uint32_t g_ss_backend_log = 0;
if (__builtin_expect(g_ss_shared_mode == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_SS_SHARED");
if (!e || !*e) {
g_ss_shared_mode = 2; // Phase 9-2 Root Fix: Shared Pool ONLY (no legacy fallback)
} else {
int v = atoi(e);
g_ss_shared_mode = v; // 0=legacy only, 1=shared+fallback, 2=shared only
}
#if !HAKMEM_BUILD_RELEASE
const char* mode_str = (g_ss_shared_mode == 2) ? "shared_only" :
(g_ss_shared_mode == 1) ? "shared+fallback" : "legacy_only";
fprintf(stderr, "[SS_BACKEND] Mode: %s (HAKMEM_TINY_SS_SHARED=%d)\n", mode_str, g_ss_shared_mode);
#endif
}
// Mode 2: Shared Pool ONLY (default, no legacy fallback)
if (g_ss_shared_mode == 2) {
// Always use Shared Pool (Mode 2 equivalent)
void* p = hak_tiny_alloc_superslab_backend_shared(class_idx);
if (p != NULL) {
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (p == NULL) {
static _Atomic uint32_t g_ss_oom_log = 0;
uint32_t n = atomic_fetch_add_explicit(&g_ss_oom_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared_only cls=%d ptr=%p\n", class_idx, p);
fprintf(stderr, "[SS_BACKEND] shared_fail→NULL (OOM) cls=%d\n", class_idx);
}
}
return p;
}
// Phase 9-2: NO fallback to legacy - return NULL on failure
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared_fail→NULL (no legacy) cls=%d\n", class_idx);
}
return NULL;
}
// Mode 1: Shared Pool with legacy fallback (testing mode)
if (g_ss_shared_mode == 1) {
void* p = hak_tiny_alloc_superslab_backend_shared(class_idx);
if (p != NULL) {
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared cls=%d ptr=%p\n", class_idx, p);
}
return p;
}
// Fallback to legacy
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared_fail→legacy cls=%d\n", class_idx);
}
return hak_tiny_alloc_superslab_backend_legacy(class_idx);
}
// Mode 0: Legacy backend only (compatibility mode)
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] legacy cls=%d\n", class_idx);
}
return hak_tiny_alloc_superslab_backend_legacy(class_idx);
}

View File

@@ -27,6 +27,8 @@ void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMe
// and splice in front of current freelist preserving relative order.
void* prev = meta->freelist;
int cls = (int)meta->class_idx;
uint16_t drained_count = 0; // Phase 9-2: Batched used decrement
HAK_CHECK_CLASS_IDX(cls, "_ss_remote_drain_to_freelist_unsafe");
if (__builtin_expect(cls < 0 || cls >= TINY_NUM_CLASSES, 0)) {
static _Atomic int g_remote_drain_cls_oob = 0;
@@ -104,14 +106,21 @@ void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMe
tiny_next_write(cls, (void*)cur, prev);
prev = (void*)cur;
cur = next;
drained_count++;
}
meta->freelist = prev;
// Reset remote count after full drain
atomic_store_explicit(&ss->remote_counts[slab_idx], 0, memory_order_release);
// Phase 9-2: Try to recycle slab if EMPTY after remote drain
// This fixes the bug where EMPTY slabs accumulate and never get returned to freelist
// Phase 9-2: Batched decrement of used count (Atomic)
// Remote frees don't decrement used until they land in freelist.
if (drained_count > 0) {
uint16_t old_used = atomic_fetch_sub_explicit(&meta->used, drained_count, memory_order_release);
// If used became 0 (old_used == drained_count), try to recycle
if (old_used == drained_count) {
SLAB_TRY_RECYCLE(ss, slab_idx, meta);
}
}
// Update freelist/nonempty visibility bits
uint32_t bit = (1u << slab_idx);