#include "hakmem_shared_pool_internal.h"
#include "hakmem_debug_master.h"
#include "hakmem_stats_master.h"
#include "box/ss_slab_meta_box.h"
#include "box/ss_hot_cold_box.h"
#include "box/pagefault_telemetry_box.h"
#include "box/tls_sll_drain_box.h"
#include "box/tls_slab_reuse_guard_box.h"
#include "hakmem_policy.h"
#include "hakmem_env_cache.h" // Priority-2: ENV cache

#include <stdlib.h>
#include <stdio.h>
#include <stdatomic.h>
// Stage 0.5: EMPTY slab direct scan (registry-based EMPTY reuse)
// Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
// avoid Stage 3 (mmap) when freed slabs are available.
static inline int
sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out, int dbg_acquire)
{
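    // Contract (derived from the code below): returns 0 and fills *ss_out /
    // *slab_idx_out when an EMPTY slab is reclaimed; returns -1 when EMPTY
    // reuse is disabled via ENV or no EMPTY slab is found in the scan window.
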
    // Priority-2: Use cached ENV
    int empty_reuse_enabled = HAK_ENV_SS_EMPTY_REUSE();
    if (!empty_reuse_enabled) {
        return -1;
    }

    extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
    extern int g_super_reg_class_size[TINY_NUM_CLASSES];

    int reg_size = (class_idx < TINY_NUM_CLASSES) ? g_super_reg_class_size[class_idx] : 0;
    // Priority-2: Use cached ENV
    int scan_limit = HAK_ENV_SS_EMPTY_SCAN_LIMIT();
    if (scan_limit > reg_size) scan_limit = reg_size;

    // Stage 0.5 hit counter for visualization
    static _Atomic uint64_t stage05_hits = 0;
    static _Atomic uint64_t stage05_attempts = 0;
    atomic_fetch_add_explicit(&stage05_attempts, 1, memory_order_relaxed);

    for (int i = 0; i < scan_limit; i++) {
        SuperSlab* ss = g_super_reg_by_class[class_idx][i];
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
        if (ss->empty_count == 0) continue; // No EMPTY slabs in this SS

        uint32_t mask = ss->empty_mask;
        while (mask) {
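            // Bit-scan note: __builtin_ctz(mask) returns the index of the lowest
            // set bit and mask &= (mask - 1) clears it, so each EMPTY slab index
            // is visited exactly once (e.g. mask 0b0101 yields indices 0, then 2).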
            int empty_idx = __builtin_ctz(mask);
            mask &= (mask - 1); // clear lowest bit

            TinySlabMeta* meta = &ss->slabs[empty_idx];
            if (meta->capacity > 0 && meta->used == 0) {
                tiny_tls_slab_reuse_guard(ss);
                ss_clear_slab_empty(ss, empty_idx);

                meta->class_idx = (uint8_t)class_idx;
                ss->class_map[empty_idx] = (uint8_t)class_idx;

#if !HAKMEM_BUILD_RELEASE
                if (dbg_acquire == 1) {
                    fprintf(stderr,
                            "[SP_ACQUIRE_STAGE0.5_EMPTY] class=%d reusing EMPTY slab (ss=%p slab=%d empty_count=%u)\n",
                            class_idx, (void*)ss, empty_idx, ss->empty_count);
                }
#else
                (void)dbg_acquire;
#endif

                *ss_out = ss;
                *slab_idx_out = empty_idx;
                sp_stage_stats_init();
                if (g_sp_stage_stats_enabled) {
                    atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
                }
                atomic_fetch_add_explicit(&stage05_hits, 1, memory_order_relaxed);

                // Stage 0.5 hit rate visualization (every 100 hits)
                uint64_t hits = atomic_load_explicit(&stage05_hits, memory_order_relaxed);
                if (hits % 100 == 1) {
                    uint64_t attempts = atomic_load_explicit(&stage05_attempts, memory_order_relaxed);
                    fprintf(stderr, "[STAGE0.5_STATS] hits=%llu attempts=%llu rate=%.1f%% (scan_limit=%d)\n",
                            (unsigned long long)hits, (unsigned long long)attempts,
                            (double)hits * 100.0 / (double)attempts, scan_limit);
                }
                return 0;
            }
        }
    }
    return -1;
}

int
shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out)
{
    // Phase 12: SP-SLOT Box - 3-Stage Acquire Logic
    //
    // Stage 1: Reuse EMPTY slots from per-class free list (EMPTY→ACTIVE)
    // Stage 2: Find UNUSED slots in existing SuperSlabs
    // Stage 3: Get new SuperSlab (LRU pop or mmap)
    //
    // Invariants:
    // - On success: *ss_out != NULL, 0 <= *slab_idx_out < total_slots
    // - The chosen slab has meta->class_idx == class_idx
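    //
    // Usage sketch (illustrative caller, not from this file):
    //
    //     SuperSlab* ss = NULL;
    //     int slab_idx = -1;
    //     if (shared_pool_acquire_slab(class_idx, &ss, &slab_idx) == 0) {
    //         /* ss/slab_idx now name an ACTIVE slab assigned to class_idx */
    //     }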

    if (!ss_out || !slab_idx_out) {
        return -1;
    }
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return -1;
    }

    shared_pool_init();

    // Debug logging / stage stats
#if !HAKMEM_BUILD_RELEASE
    // Priority-2: Use cached ENV
    int dbg_acquire = HAK_ENV_SS_ACQUIRE_DEBUG();
#else
    static const int dbg_acquire = 0;
#endif
    sp_stage_stats_init();

stage1_retry_after_tension_drain:
    // ========== Stage 0.5 (Phase 12-1.1): EMPTY slab direct scan ==========
    // Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
    // avoid Stage 3 (mmap) when freed slabs are available.
    if (sp_acquire_from_empty_scan(class_idx, ss_out, slab_idx_out, dbg_acquire) == 0) {
        return 0;
    }

    // ========== Stage 1 (Lock-Free): Try to reuse EMPTY slots ==========
    // P0-4: Lock-free pop from per-class free list (no mutex needed!)
    // Best case: Same class freed a slot, reuse immediately (cache-hot)
    SharedSSMeta* reuse_meta = NULL;
    int reuse_slot_idx = -1;

    if (sp_freelist_pop_lockfree(class_idx, &reuse_meta, &reuse_slot_idx)) {
        // Found EMPTY slot from lock-free list!
        // Now acquire mutex ONLY for slot activation and metadata update

        // P0 instrumentation: count lock acquisitions
        lock_stats_init();
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_acquire_count, 1);
            atomic_fetch_add(&g_lock_acquire_slab_count, 1);
        }
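
        // Note: every pthread_mutex_lock() on alloc_lock below is paired with a
        // g_lock_release_count increment just before the matching unlock, so the
        // acquire/release counters stay balanced when lock stats are enabled.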

        pthread_mutex_lock(&g_shared_pool.alloc_lock);

        // P0.3: Guard against TLS SLL orphaned pointers before reusing slab
        // RACE FIX: Load SuperSlab pointer atomically BEFORE guard (consistency)
        SuperSlab* ss_guard = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed);
        if (ss_guard) {
            tiny_tls_slab_reuse_guard(ss_guard);
        }

        // Activate slot under mutex (slot state transition requires protection)
        if (sp_slot_mark_active(reuse_meta, reuse_slot_idx, class_idx) == 0) {
            // RACE FIX: Load SuperSlab pointer atomically (consistency)
            SuperSlab* ss = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed);

            // RACE FIX: Check whether the SuperSlab was freed (NULL pointer).
            // This can happen if Thread A freed the SuperSlab after pushing the slot
            // to the freelist, but Thread B popped the stale slot before the freelist
            // was cleared.
            if (!ss) {
                // SuperSlab freed - skip and fall through to Stage 2/3
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                goto stage2_fallback;
            }

#if !HAKMEM_BUILD_RELEASE
            if (dbg_acquire == 1) {
                fprintf(stderr, "[SP_ACQUIRE_STAGE1_LOCKFREE] class=%d reusing EMPTY slot (ss=%p slab=%d)\n",
                        class_idx, (void*)ss, reuse_slot_idx);
            }
#endif

            // Update SuperSlab metadata
            ss->slab_bitmap |= (1u << reuse_slot_idx);
            ss_slab_meta_class_idx_set(ss, reuse_slot_idx, (uint8_t)class_idx);

            if (ss->active_slabs == 0) {
                // Was empty, now active again
                ss->active_slabs = 1;
                g_shared_pool.active_count++;
            }
            // Track per-class active slots (approximate, under alloc_lock)
            if (class_idx < TINY_NUM_CLASSES_SS) {
                g_shared_pool.class_active_slots[class_idx]++;
            }

            // Update hint
            g_shared_pool.class_hints[class_idx] = ss;

            *ss_out = ss;
            *slab_idx_out = reuse_slot_idx;

            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            if (g_sp_stage_stats_enabled) {
                atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
            }
            return 0; // ✅ Stage 1 (lock-free) success
        }

        // Slot activation failed (race condition?) - release lock and fall through
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    }

stage2_fallback:
    // ========== Stage 2 (Lock-Free): Try to claim UNUSED slots ==========
    // P0 Optimization: Try class hint FIRST for fast path (same class locality)
    // This reduces metadata scan from 100% to ~10% when hints are effective
    {
        SuperSlab* hint_ss = g_shared_pool.class_hints[class_idx];
        if (__builtin_expect(hint_ss != NULL, 1)) {
            // P0 Optimization: O(1) lookup via cached pointer (avoids metadata scan)
            SharedSSMeta* hint_meta = hint_ss->shared_meta;
            if (__builtin_expect(hint_meta != NULL, 1)) {
                // Try lock-free claiming on hint SuperSlab first
                int claimed_idx = sp_slot_claim_lockfree(hint_meta, class_idx);
                if (__builtin_expect(claimed_idx >= 0, 1)) {
                    // Fast path success! No need to scan all metadata
                    SuperSlab* ss = atomic_load_explicit(&hint_meta->ss, memory_order_acquire);
                    if (__builtin_expect(ss != NULL, 1)) {
#if !HAKMEM_BUILD_RELEASE
                        if (dbg_acquire == 1) {
                            fprintf(stderr, "[SP_ACQUIRE_STAGE2_HINT] class=%d claimed UNUSED slot from hint (ss=%p slab=%d)\n",
                                    class_idx, (void*)ss, claimed_idx);
                        }
#endif

                        // P0 instrumentation: count lock acquisitions
                        lock_stats_init();
                        if (g_lock_stats_enabled == 1) {
                            atomic_fetch_add(&g_lock_acquire_count, 1);
                            atomic_fetch_add(&g_lock_acquire_slab_count, 1);
                        }

                        pthread_mutex_lock(&g_shared_pool.alloc_lock);

                        // Update SuperSlab metadata under mutex
                        ss->slab_bitmap |= (1u << claimed_idx);
                        ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);

                        if (ss->active_slabs == 0) {
                            ss->active_slabs = 1;
                            g_shared_pool.active_count++;
                        }
                        if (class_idx < TINY_NUM_CLASSES_SS) {
                            g_shared_pool.class_active_slots[class_idx]++;
                        }

                        // Hint is still good, no need to update
                        *ss_out = ss;
                        *slab_idx_out = claimed_idx;
                        sp_fix_geometry_if_needed(ss, claimed_idx, class_idx);

                        if (g_lock_stats_enabled == 1) {
                            atomic_fetch_add(&g_lock_release_count, 1);
                        }
                        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                        if (g_sp_stage_stats_enabled) {
                            atomic_fetch_add(&g_sp_stage2_hits[class_idx], 1);
                        }
                        return 0; // ✅ Stage 2 (hint fast path) success
                    }
                }
            }
        }
    }
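
    // If the hint path above did not yield a slot, fall back to scanning every
    // registered SuperSlab's metadata (still lock-free claiming, see below).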

    // P0-5: Lock-free atomic CAS claiming (no mutex needed for slot state transition!)
    // RACE FIX: Read ss_meta_count atomically (now properly declared as _Atomic)
    // No cast needed! memory_order_acquire synchronizes with release in sp_meta_find_or_create
    uint32_t meta_count = atomic_load_explicit(
        &g_shared_pool.ss_meta_count,
        memory_order_acquire
    );

    for (uint32_t i = 0; i < meta_count; i++) {
        SharedSSMeta* meta = &g_shared_pool.ss_metadata[i];

        // Try lock-free claiming (UNUSED → ACTIVE via CAS)
        int claimed_idx = sp_slot_claim_lockfree(meta, class_idx);
        if (claimed_idx >= 0) {
            // RACE FIX: Load SuperSlab pointer atomically (critical for lock-free Stage 2)
            // Use memory_order_acquire to synchronize with release in sp_meta_find_or_create
            SuperSlab* ss = atomic_load_explicit(&meta->ss, memory_order_acquire);
            if (!ss) {
                // SuperSlab was freed between claiming and loading - skip this entry
                continue;
            }

#if !HAKMEM_BUILD_RELEASE
            if (dbg_acquire == 1) {
                fprintf(stderr, "[SP_ACQUIRE_STAGE2_LOCKFREE] class=%d claimed UNUSED slot (ss=%p slab=%d)\n",
                        class_idx, (void*)ss, claimed_idx);
            }
#endif

            // P0 instrumentation: count lock acquisitions
            lock_stats_init();
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_acquire_count, 1);
                atomic_fetch_add(&g_lock_acquire_slab_count, 1);
            }

            pthread_mutex_lock(&g_shared_pool.alloc_lock);

            // Update SuperSlab metadata under mutex
            ss->slab_bitmap |= (1u << claimed_idx);
            ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);

            if (ss->active_slabs == 0) {
                ss->active_slabs = 1;
                g_shared_pool.active_count++;
            }
            if (class_idx < TINY_NUM_CLASSES_SS) {
                g_shared_pool.class_active_slots[class_idx]++;
            }

            // Update hint
            g_shared_pool.class_hints[class_idx] = ss;

            *ss_out = ss;
            *slab_idx_out = claimed_idx;
            sp_fix_geometry_if_needed(ss, claimed_idx, class_idx);

            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            if (g_sp_stage_stats_enabled) {
                atomic_fetch_add(&g_sp_stage2_hits[class_idx], 1);
            }
            return 0; // ✅ Stage 2 (lock-free) success
        }

        // Claim failed (no UNUSED slots in this meta) - continue to next SuperSlab
    }

    // ========== Tension-Based Drain: Try to create EMPTY slots before Stage 3 ==========
    // If TLS SLL has accumulated blocks, drain them to enable EMPTY slot detection.
    // This can avoid allocating new SuperSlabs by reusing EMPTY slots in Stage 1.
    // ENV: HAKMEM_TINY_TENSION_DRAIN_ENABLE=0 to disable (default=1)
    // ENV: HAKMEM_TINY_TENSION_DRAIN_THRESHOLD=N to set threshold (default=1024)
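    //
    // Example (assumed shell invocation, for illustration only):
    //     HAKMEM_TINY_TENSION_DRAIN_THRESHOLD=256 ./app
    // lowers the drain trigger so EMPTY slots are reclaimed sooner; the exact
    // launch command depends on how the allocator is loaded.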
    {
        // Priority-2: Use cached ENV
        int tension_drain_enabled = HAK_ENV_TINY_TENSION_DRAIN_ENABLE();
        uint32_t tension_threshold = (uint32_t)HAK_ENV_TINY_TENSION_DRAIN_THRESHOLD();

        if (tension_drain_enabled) {
            extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
            extern uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size);

            uint32_t sll_count = (class_idx < TINY_NUM_CLASSES) ? g_tls_sll[class_idx].count : 0;

            if (sll_count >= tension_threshold) {
                // Drain all blocks to maximize EMPTY slot creation
                uint32_t drained = tiny_tls_sll_drain(class_idx, 0); // 0 = drain all

                if (drained > 0) {
                    // Retry Stage 1 (EMPTY reuse) after drain
                    // Some slabs might have become EMPTY (meta->used == 0)
                    goto stage1_retry_after_tension_drain;
                }
            }
        }
    }

    // ========== Stage 3: Mutex-protected fallback (new SuperSlab allocation) ==========
    // All existing SuperSlabs have no UNUSED slots → need new SuperSlab
    // P0 instrumentation: count lock acquisitions
    lock_stats_init();
    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_acquire_count, 1);
        atomic_fetch_add(&g_lock_acquire_slab_count, 1);
    }

    pthread_mutex_lock(&g_shared_pool.alloc_lock);

    // ========== Stage 3: Get new SuperSlab ==========
    // Try LRU cache first, then mmap
    SuperSlab* new_ss = NULL;

    // Stage 3a: Try LRU cache
    extern SuperSlab* hak_ss_lru_pop(uint8_t size_class);
    new_ss = hak_ss_lru_pop((uint8_t)class_idx);

    int from_lru = (new_ss != NULL);
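    // from_lru records whether the SuperSlab came from the LRU cache; it is only
    // used in the Stage 3 debug log below. A NULL result from hak_ss_lru_pop()
    // simply falls through to the Stage 3b allocation path.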

    // Stage 3b: If LRU miss, allocate new SuperSlab
    if (!new_ss) {
        // Release the alloc_lock to avoid deadlock with registry during superslab_allocate
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);

        SuperSlab* allocated_ss = sp_internal_allocate_superslab(class_idx);

        // Re-acquire the alloc_lock
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_acquire_count, 1);
            atomic_fetch_add(&g_lock_acquire_slab_count, 1); // This is part of the acquisition path
        }
        pthread_mutex_lock(&g_shared_pool.alloc_lock);

        if (!allocated_ss) {
            // Allocation failed; return now.
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            return -1; // Out of memory
        }

        new_ss = allocated_ss;

        // Add newly allocated SuperSlab to the shared pool's internal array
        if (g_shared_pool.total_count >= g_shared_pool.capacity) {
            shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
            if (g_shared_pool.total_count >= g_shared_pool.capacity) {
                // Pool table expansion failed; leave ss alive (registry-owned),
                // but do not treat it as part of shared_pool.
                // This is a critical error, return early.
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                return -1;
            }
        }
        g_shared_pool.slabs[g_shared_pool.total_count] = new_ss;
        g_shared_pool.total_count++;
    }

#if !HAKMEM_BUILD_RELEASE
    if (dbg_acquire == 1 && new_ss) {
        fprintf(stderr, "[SP_ACQUIRE_STAGE3] class=%d new SuperSlab (ss=%p from_lru=%d)\n",
                class_idx, (void*)new_ss, from_lru);
    }
#endif

    if (!new_ss) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1; // ❌ Out of memory
    }

    // Note: a learning-layer soft cap used to be consulted before creating a new SuperSlab.
    // Phase 9-2: Soft Cap removed to allow Shared Pool to fully replace Legacy Backend.
    // We now rely on LRU eviction and EMPTY recycling to manage memory pressure.

    // Create metadata for this new SuperSlab
    SharedSSMeta* new_meta = sp_meta_find_or_create(new_ss);
    if (!new_meta) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1; // ❌ Metadata allocation failed
    }

    // Assign first slot to this class
    int first_slot = 0;
    if (sp_slot_mark_active(new_meta, first_slot, class_idx) != 0) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1; // ❌ Should not happen
    }

    // Update SuperSlab metadata
    new_ss->slab_bitmap |= (1u << first_slot);
    ss_slab_meta_class_idx_set(new_ss, first_slot, (uint8_t)class_idx);
    new_ss->active_slabs = 1;
    g_shared_pool.active_count++;
    if (class_idx < TINY_NUM_CLASSES_SS) {
        g_shared_pool.class_active_slots[class_idx]++;
    }

    // Update hint
    g_shared_pool.class_hints[class_idx] = new_ss;

    *ss_out = new_ss;
    *slab_idx_out = first_slot;
    sp_fix_geometry_if_needed(new_ss, first_slot, class_idx);

    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_release_count, 1);
    }
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    if (g_sp_stage_stats_enabled) {
        atomic_fetch_add(&g_sp_stage3_hits[class_idx], 1);
    }
    return 0; // ✅ Stage 3 success
}