Mid-Large Phase 12 Complete + P0-5 Lock-Free Stage 2

**Phase 12 第1ラウンド完了** ✅ - 0.24M → 2.39M ops/s (8T, **+896%**) - SEGFAULT → Zero crashes (**100% → 0%**) - futex: 209 → 10 calls (**-95%**) **P0-5: Lock-Free Stage 2 (Slot Claiming)** - Atomic SlotState: `_Atomic SlotState state` - sp_slot_claim_lockfree(): CAS-based UNUSED→ACTIVE transition - acquire_slab() Stage 2: Lock-free claiming (mutex only for metadata) - Result: 2.34M → 2.39M ops/s (+2.5% @ 8T) **Implementation**: - core/hakmem_shared_pool.h: Atomic SlotState definition - core/hakmem_shared_pool.c: - sp_slot_claim_lockfree() (+40 lines) - Atomic helpers: sp_slot_find_unused/mark_active/mark_empty - Stage 2 lock-free integration - Verified via debug logs: STAGE2_LOCKFREE claiming works **Reports**: - MID_LARGE_P0_PHASE_REPORT.md: P0-0 to P0-4 comprehensive summary - MID_LARGE_FINAL_AB_REPORT.md: Complete Phase 12 A/B comparison (17KB) - Performance evolution table - Lock contention analysis - Lessons learned - File inventory **Tiny Baseline Measurement** 📊 - System malloc: 82.9M ops/s (256B) - HAKMEM: 8.88M ops/s (256B) - **Gap: 9.3x slower** (target for next phase) **Next**: Tiny allocator optimization (drain interval, front cache, perf profile) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 16:51:53 +09:00
parent 29fefa2018
commit ec453d67f2
4 changed files with 1489 additions and 62 deletions
--- a/core/hakmem_shared_pool.c
+++ b/core/hakmem_shared_pool.c
@ -48,6 +48,34 @@ static void __attribute__((destructor)) lock_stats_report(void) {
    fprintf(stderr, "===================================\n");
 }

+// ============================================================================
+// P0-4: Lock-Free Free Slot List - Node Pool
+// ============================================================================
+
+// Pre-allocated node pools (one per class, to avoid malloc/free)
+FreeSlotNode g_free_node_pool[TINY_NUM_CLASSES_SS][MAX_FREE_NODES_PER_CLASS];
+_Atomic uint32_t g_node_alloc_index[TINY_NUM_CLASSES_SS] = {0};
+
+// Allocate a node from pool (lock-free, never fails until pool exhausted)
+static inline FreeSlotNode* node_alloc(int class_idx) {
+    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
+        return NULL;
+    }
+
+    uint32_t idx = atomic_fetch_add(&g_node_alloc_index[class_idx], 1);
+    if (idx >= MAX_FREE_NODES_PER_CLASS) {
+        // Pool exhausted - should not happen in practice
+        static _Atomic int warn_once = 0;
+        if (atomic_exchange(&warn_once, 1) == 0) {
+            fprintf(stderr, "[P0-4 WARN] Node pool exhausted for class %d\n", class_idx);
+        }
+        return NULL;
+    }
+
+    return &g_free_node_pool[class_idx][idx];
+}
+
+// ============================================================================
 // Phase 12-2: SharedSuperSlabPool skeleton implementation
 // Goal:
 //   - Centralize SuperSlab allocation/registration
@ -69,8 +97,11 @@ SharedSuperSlabPool g_shared_pool = {
    .lru_head     = NULL,
    .lru_tail     = NULL,
    .lru_count    = 0,
+    // P0-4: Lock-free free slot lists (zero-initialized atomic pointers)
+    .free_slots_lockfree = {{.head = ATOMIC_VAR_INIT(NULL)}},
+    // Legacy: mutex-protected free lists
+    .free_slots   = {{.entries = {{0}}, .count = 0}},
    // Phase 12: SP-SLOT fields
-    .free_slots   = {{.entries = {{0}}, .count = 0}},  // Zero-init all class free lists
    .ss_metadata  = NULL,
    .ss_meta_capacity = 0,
    .ss_meta_count = 0
@ -122,12 +153,14 @@ shared_pool_init(void)
 // ---------- Layer 1: Slot Operations (Low-level) ----------

 // Find first unused slot in SharedSSMeta
+// P0-5: Uses atomic load for state check
 // Returns: slot_idx on success, -1 if no unused slots
 static int sp_slot_find_unused(SharedSSMeta* meta) {
    if (!meta) return -1;

    for (int i = 0; i < meta->total_slots; i++) {
-        if (meta->slots[i].state == SLOT_UNUSED) {
+        SlotState state = atomic_load_explicit(&meta->slots[i].state, memory_order_acquire);
+        if (state == SLOT_UNUSED) {
            return i;
        }
    }
@ -135,6 +168,7 @@ static int sp_slot_find_unused(SharedSSMeta* meta) {
 }

 // Mark slot as ACTIVE (UNUSED→ACTIVE or EMPTY→ACTIVE)
+// P0-5: Uses atomic store for state transition (caller must hold mutex!)
 // Returns: 0 on success, -1 on error
 static int sp_slot_mark_active(SharedSSMeta* meta, int slot_idx, int class_idx) {
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
@ -142,9 +176,12 @@ static int sp_slot_mark_active(SharedSSMeta* meta, int slot_idx, int class_idx)

    SharedSlot* slot = &meta->slots[slot_idx];

+    // Load state atomically
+    SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire);
+
    // Transition: UNUSED→ACTIVE or EMPTY→ACTIVE
-    if (slot->state == SLOT_UNUSED || slot->state == SLOT_EMPTY) {
-        slot->state = SLOT_ACTIVE;
+    if (state == SLOT_UNUSED || state == SLOT_EMPTY) {
+        atomic_store_explicit(&slot->state, SLOT_ACTIVE, memory_order_release);
        slot->class_idx = (uint8_t)class_idx;
        slot->slab_idx = (uint8_t)slot_idx;
        meta->active_slots++;
@ -155,14 +192,18 @@ static int sp_slot_mark_active(SharedSSMeta* meta, int slot_idx, int class_idx)
 }

 // Mark slot as EMPTY (ACTIVE→EMPTY)
+// P0-5: Uses atomic store for state transition (caller must hold mutex!)
 // Returns: 0 on success, -1 on error
 static int sp_slot_mark_empty(SharedSSMeta* meta, int slot_idx) {
    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;

    SharedSlot* slot = &meta->slots[slot_idx];

-    if (slot->state == SLOT_ACTIVE) {
-        slot->state = SLOT_EMPTY;
+    // Load state atomically
+    SlotState state = atomic_load_explicit(&slot->state, memory_order_acquire);
+
+    if (state == SLOT_ACTIVE) {
+        atomic_store_explicit(&slot->state, SLOT_EMPTY, memory_order_release);
        if (meta->active_slots > 0) {
            meta->active_slots--;
        }
@ -228,8 +269,9 @@ static SharedSSMeta* sp_meta_find_or_create(SuperSlab* ss) {
    meta->active_slots = 0;

    // Initialize all slots as UNUSED
+    // P0-5: Use atomic store for state initialization
    for (int i = 0; i < meta->total_slots; i++) {
-        meta->slots[i].state = SLOT_UNUSED;
+        atomic_store_explicit(&meta->slots[i].state, SLOT_UNUSED, memory_order_relaxed);
        meta->slots[i].class_idx = 0;
        meta->slots[i].slab_idx = (uint8_t)i;
    }
@ -279,6 +321,118 @@ static int sp_freelist_pop(int class_idx, SharedSSMeta** out_meta, int* out_slot
    return 1;
 }

+// ============================================================================
+// P0-5: Lock-Free Slot Claiming (Stage 2 Optimization)
+// ============================================================================
+
+// Try to claim an UNUSED slot via lock-free CAS
+// Returns: slot_idx on success, -1 if no UNUSED slots available
+// LOCK-FREE: Can be called from any thread without mutex
+static int sp_slot_claim_lockfree(SharedSSMeta* meta, int class_idx) {
+    if (!meta) return -1;
+    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
+
+    // Scan all slots for UNUSED state
+    for (int i = 0; i < meta->total_slots; i++) {
+        SlotState expected = SLOT_UNUSED;
+
+        // Try to claim this slot atomically (UNUSED → ACTIVE)
+        if (atomic_compare_exchange_strong_explicit(
+            &meta->slots[i].state,
+            &expected,
+            SLOT_ACTIVE,
+            memory_order_acq_rel,  // Success: acquire+release semantics
+            memory_order_relaxed   // Failure: just retry next slot
+        )) {
+            // Successfully claimed! Update non-atomic fields
+            // (Safe because we now own this slot)
+            meta->slots[i].class_idx = (uint8_t)class_idx;
+            meta->slots[i].slab_idx = (uint8_t)i;
+
+            // Increment active_slots counter atomically
+            // (Multiple threads may claim slots concurrently)
+            atomic_fetch_add_explicit(
+                (_Atomic uint8_t*)&meta->active_slots, 1,
+                memory_order_relaxed
+            );
+
+            return i;  // Return claimed slot index
+        }
+
+        // CAS failed (slot was not UNUSED) - continue to next slot
+    }
+
+    return -1;  // No UNUSED slots available
+}
+
+// ============================================================================
+// P0-4: Lock-Free Free Slot List Operations
+// ============================================================================
+
+// Push empty slot to lock-free per-class free list (LIFO)
+// LOCK-FREE: Can be called from any thread without mutex
+// Returns: 0 on success, -1 on failure (node pool exhausted)
+static int sp_freelist_push_lockfree(int class_idx, SharedSSMeta* meta, int slot_idx) {
+    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return -1;
+    if (!meta || slot_idx < 0 || slot_idx >= meta->total_slots) return -1;
+
+    // Allocate node from pool
+    FreeSlotNode* node = node_alloc(class_idx);
+    if (!node) {
+        return -1;  // Pool exhausted
+    }
+
+    // Fill node data
+    node->meta = meta;
+    node->slot_idx = (uint8_t)slot_idx;
+
+    // Lock-free LIFO push using CAS loop
+    LockFreeFreeList* list = &g_shared_pool.free_slots_lockfree[class_idx];
+    FreeSlotNode* old_head = atomic_load_explicit(&list->head, memory_order_relaxed);
+
+    do {
+        node->next = old_head;
+    } while (!atomic_compare_exchange_weak_explicit(
+        &list->head, &old_head, node,
+        memory_order_release,   // Success: publish node to other threads
+        memory_order_relaxed    // Failure: retry with updated old_head
+    ));
+
+    return 0;  // Success
+}
+
+// Pop empty slot from lock-free per-class free list (LIFO)
+// LOCK-FREE: Can be called from any thread without mutex
+// Returns: 1 if popped (out params filled), 0 if list empty
+static int sp_freelist_pop_lockfree(int class_idx, SharedSSMeta** out_meta, int* out_slot_idx) {
+    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) return 0;
+    if (!out_meta || !out_slot_idx) return 0;
+
+    LockFreeFreeList* list = &g_shared_pool.free_slots_lockfree[class_idx];
+    FreeSlotNode* old_head = atomic_load_explicit(&list->head, memory_order_acquire);
+
+    // Lock-free LIFO pop using CAS loop
+    do {
+        if (old_head == NULL) {
+            return 0;  // List empty
+        }
+    } while (!atomic_compare_exchange_weak_explicit(
+        &list->head, &old_head, old_head->next,
+        memory_order_acquire,   // Success: acquire node data
+        memory_order_acquire    // Failure: retry with updated old_head
+    ));
+
+    // Extract data from popped node
+    *out_meta = old_head->meta;
+    *out_slot_idx = old_head->slot_idx;
+
+    // NOTE: We do NOT free the node back to pool (no node recycling yet)
+    // This is acceptable because MAX_FREE_NODES_PER_CLASS (512) is generous
+    // and workloads typically don't push/pop the same slot repeatedly
+
+    return 1;  // Success
+}
+
 /*
 * Internal: allocate and register a new SuperSlab for the shared pool.
 *
@ -383,27 +537,31 @@ shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out)
        dbg_acquire = (e && *e && *e != '0') ? 1 : 0;
    }

-    // P0 instrumentation: count lock acquisitions
-    lock_stats_init();
-    if (g_lock_stats_enabled == 1) {
-        atomic_fetch_add(&g_lock_acquire_count, 1);
-        atomic_fetch_add(&g_lock_acquire_slab_count, 1);
-    }
-
-    pthread_mutex_lock(&g_shared_pool.alloc_lock);
-
-    // ========== Stage 1: Reuse EMPTY slots from free list ==========
+    // ========== Stage 1 (Lock-Free): Try to reuse EMPTY slots ==========
+    // P0-4: Lock-free pop from per-class free list (no mutex needed!)
    // Best case: Same class freed a slot, reuse immediately (cache-hot)
    SharedSSMeta* reuse_meta = NULL;
    int reuse_slot_idx = -1;

-    if (sp_freelist_pop(class_idx, &reuse_meta, &reuse_slot_idx)) {
-        // Found EMPTY slot for this class - reactivate it
+    if (sp_freelist_pop_lockfree(class_idx, &reuse_meta, &reuse_slot_idx)) {
+        // Found EMPTY slot from lock-free list!
+        // Now acquire mutex ONLY for slot activation and metadata update
+
+        // P0 instrumentation: count lock acquisitions
+        lock_stats_init();
+        if (g_lock_stats_enabled == 1) {
+            atomic_fetch_add(&g_lock_acquire_count, 1);
+            atomic_fetch_add(&g_lock_acquire_slab_count, 1);
+        }
+
+        pthread_mutex_lock(&g_shared_pool.alloc_lock);
+
+        // Activate slot under mutex (slot state transition requires protection)
        if (sp_slot_mark_active(reuse_meta, reuse_slot_idx, class_idx) == 0) {
            SuperSlab* ss = reuse_meta->ss;

            if (dbg_acquire == 1) {
-                fprintf(stderr, "[SP_ACQUIRE_STAGE1] class=%d reusing EMPTY slot (ss=%p slab=%d)\n",
+                fprintf(stderr, "[SP_ACQUIRE_STAGE1_LOCKFREE] class=%d reusing EMPTY slot (ss=%p slab=%d)\n",
                        class_idx, (void*)ss, reuse_slot_idx);
            }

@ -427,50 +585,83 @@ shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out)
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
-            return 0;  // ✅ Stage 1 success
+            return 0;  // ✅ Stage 1 (lock-free) success
        }
+
+        // Slot activation failed (race condition?) - release lock and fall through
+        if (g_lock_stats_enabled == 1) {
+            atomic_fetch_add(&g_lock_release_count, 1);
+        }
+        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    }

-    // ========== Stage 2: Find UNUSED slots in existing SuperSlabs ==========
-    // Scan all SuperSlabs for UNUSED slots
-    for (uint32_t i = 0; i < g_shared_pool.ss_meta_count; i++) {
+    // ========== Stage 2 (Lock-Free): Try to claim UNUSED slots ==========
+    // P0-5: Lock-free atomic CAS claiming (no mutex needed for slot state transition!)
+    // Read ss_meta_count atomically (safe: only grows, never shrinks)
+    uint32_t meta_count = atomic_load_explicit(
+        (_Atomic uint32_t*)&g_shared_pool.ss_meta_count,
+        memory_order_acquire
+    );
+
+    for (uint32_t i = 0; i < meta_count; i++) {
        SharedSSMeta* meta = &g_shared_pool.ss_metadata[i];

-        int unused_idx = sp_slot_find_unused(meta);
-        if (unused_idx >= 0) {
-            // Found UNUSED slot - activate it
-            if (sp_slot_mark_active(meta, unused_idx, class_idx) == 0) {
-                SuperSlab* ss = meta->ss;
+        // Try lock-free claiming (UNUSED → ACTIVE via CAS)
+        int claimed_idx = sp_slot_claim_lockfree(meta, class_idx);
+        if (claimed_idx >= 0) {
+            // Successfully claimed slot! Now acquire mutex ONLY for metadata update
+            SuperSlab* ss = meta->ss;

-                if (dbg_acquire == 1) {
-                    fprintf(stderr, "[SP_ACQUIRE_STAGE2] class=%d using UNUSED slot (ss=%p slab=%d)\n",
-                            class_idx, (void*)ss, unused_idx);
-                }
-
-                // Update SuperSlab metadata
-                ss->slab_bitmap |= (1u << unused_idx);
-                ss->slabs[unused_idx].class_idx = (uint8_t)class_idx;
-
-                if (ss->active_slabs == 0) {
-                    ss->active_slabs = 1;
-                    g_shared_pool.active_count++;
-                }
-
-                // Update hint
-                g_shared_pool.class_hints[class_idx] = ss;
-
-                *ss_out = ss;
-                *slab_idx_out = unused_idx;
-
-                if (g_lock_stats_enabled == 1) {
-                    atomic_fetch_add(&g_lock_release_count, 1);
-                }
-                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
-                return 0;  // ✅ Stage 2 success
+            if (dbg_acquire == 1) {
+                fprintf(stderr, "[SP_ACQUIRE_STAGE2_LOCKFREE] class=%d claimed UNUSED slot (ss=%p slab=%d)\n",
+                        class_idx, (void*)ss, claimed_idx);
            }
+
+            // P0 instrumentation: count lock acquisitions
+            lock_stats_init();
+            if (g_lock_stats_enabled == 1) {
+                atomic_fetch_add(&g_lock_acquire_count, 1);
+                atomic_fetch_add(&g_lock_acquire_slab_count, 1);
+            }
+
+            pthread_mutex_lock(&g_shared_pool.alloc_lock);
+
+            // Update SuperSlab metadata under mutex
+            ss->slab_bitmap |= (1u << claimed_idx);
+            ss->slabs[claimed_idx].class_idx = (uint8_t)class_idx;
+
+            if (ss->active_slabs == 0) {
+                ss->active_slabs = 1;
+                g_shared_pool.active_count++;
+            }
+
+            // Update hint
+            g_shared_pool.class_hints[class_idx] = ss;
+
+            *ss_out = ss;
+            *slab_idx_out = claimed_idx;
+
+            if (g_lock_stats_enabled == 1) {
+                atomic_fetch_add(&g_lock_release_count, 1);
+            }
+            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
+            return 0;  // ✅ Stage 2 (lock-free) success
        }
+
+        // Claim failed (no UNUSED slots in this meta) - continue to next SuperSlab
    }

+    // ========== Stage 3: Mutex-protected fallback (new SuperSlab allocation) ==========
+    // All existing SuperSlabs have no UNUSED slots → need new SuperSlab
+    // P0 instrumentation: count lock acquisitions
+    lock_stats_init();
+    if (g_lock_stats_enabled == 1) {
+        atomic_fetch_add(&g_lock_acquire_count, 1);
+        atomic_fetch_add(&g_lock_acquire_slab_count, 1);
+    }
+
+    pthread_mutex_lock(&g_shared_pool.alloc_lock);
+
    // ========== Stage 3: Get new SuperSlab ==========
    // Try LRU cache first, then mmap
    SuperSlab* new_ss = NULL;
@ -631,13 +822,14 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx)
        }
    }

-    // Push to per-class free list (enables reuse by same class)
+    // P0-4: Push to lock-free per-class free list (enables reuse by same class)
+    // Note: push BEFORE releasing mutex (slot state already updated under lock)
    if (class_idx < TINY_NUM_CLASSES_SS) {
-        sp_freelist_push(class_idx, sp_meta, slab_idx);
+        sp_freelist_push_lockfree(class_idx, sp_meta, slab_idx);

        if (dbg == 1) {
-            fprintf(stderr, "[SP_SLOT_FREELIST] class=%d pushed slot (ss=%p slab=%d) count=%u active_slots=%u/%u\n",
-                    class_idx, (void*)ss, slab_idx, g_shared_pool.free_slots[class_idx].count,
+            fprintf(stderr, "[SP_SLOT_FREELIST_LOCKFREE] class=%d pushed slot (ss=%p slab=%d) active_slots=%u/%u\n",
+                    class_idx, (void*)ss, slab_idx,
                    sp_meta->active_slots, sp_meta->total_slots);
        }
    }