#include "hakmem_shared_pool_internal.h" #include "hakmem_debug_master.h" #include "hakmem_stats_master.h" #include "box/ss_slab_meta_box.h" #include "box/ss_hot_cold_box.h" #include "box/pagefault_telemetry_box.h" #include "box/tls_sll_drain_box.h" #include "box/tls_slab_reuse_guard_box.h" #include "box/ss_tier_box.h" // P-Tier: Tier filtering support #include "hakmem_policy.h" #include "hakmem_env_cache.h" // Priority-2: ENV cache #include "front/tiny_warm_pool.h" // Warm Pool: Prefill during registry scans #include "box/ss_slab_reset_box.h" // Box: Reset slab metadata on reuse (C7 guard) #include "box/tiny_class_stats_box.h" // OBSERVE: per-class shared lock stats #include "box/ss_stats_box.h" // OBSERVE: Superslab/slab event counters #include "box/ss_budget_box.h" // Budget guard for Superslab growth (larson_guard) #include "box/super_reg_box.h" // Logical limit for registry scan #include "box/shared_pool_box.h" // Logical cap for shared pool slots (bench profile) #include #include #include #include // Stage3(LRU) 由来の Superslab をトレースするための簡易マジック _Atomic uintptr_t g_c7_stage3_magic_ss = 0; static inline void sp_lock_with_stats(int class_idx) { pthread_mutex_lock(&g_shared_pool.alloc_lock); tiny_class_stats_on_shared_lock(class_idx); } static inline void c7_log_meta_state(const char* tag, SuperSlab* ss, int slab_idx) { if (!ss) return; #if HAKMEM_BUILD_RELEASE static _Atomic uint32_t rel_c7_meta_logs = 0; uint32_t n = atomic_fetch_add_explicit(&rel_c7_meta_logs, 1, memory_order_relaxed); if (n < 8) { TinySlabMeta* m = &ss->slabs[slab_idx]; fprintf(stderr, "[REL_C7_%s] ss=%p slab=%d cls=%u used=%u cap=%u carved=%u freelist=%p\n", tag, (void*)ss, slab_idx, (unsigned)m->class_idx, (unsigned)m->used, (unsigned)m->capacity, (unsigned)m->carved, m->freelist); } #else static _Atomic uint32_t dbg_c7_meta_logs = 0; uint32_t n = atomic_fetch_add_explicit(&dbg_c7_meta_logs, 1, memory_order_relaxed); if (n < 8) { TinySlabMeta* m = &ss->slabs[slab_idx]; fprintf(stderr, "[DBG_C7_%s] ss=%p slab=%d cls=%u used=%u cap=%u carved=%u freelist=%p\n", tag, (void*)ss, slab_idx, (unsigned)m->class_idx, (unsigned)m->used, (unsigned)m->capacity, (unsigned)m->carved, m->freelist); } #endif } static inline int c7_meta_is_pristine(TinySlabMeta* m) { return m && m->used == 0 && m->carved == 0 && m->freelist == NULL; } static inline void c7_log_skip_nonempty_acquire(SuperSlab* ss, int slab_idx, TinySlabMeta* m, const char* tag) { if (!(ss && m)) return; #if HAKMEM_BUILD_RELEASE static _Atomic uint32_t rel_c7_skip_logs = 0; uint32_t n = atomic_fetch_add_explicit(&rel_c7_skip_logs, 1, memory_order_relaxed); if (n < 4) { fprintf(stderr, "[REL_C7_%s] ss=%p slab=%d cls=%u used=%u cap=%u carved=%u freelist=%p\n", tag, (void*)ss, slab_idx, (unsigned)m->class_idx, (unsigned)m->used, (unsigned)m->capacity, (unsigned)m->carved, m->freelist); } #else static _Atomic uint32_t dbg_c7_skip_logs = 0; uint32_t n = atomic_fetch_add_explicit(&dbg_c7_skip_logs, 1, memory_order_relaxed); if (n < 4) { fprintf(stderr, "[DBG_C7_%s] ss=%p slab=%d cls=%u used=%u cap=%u carved=%u freelist=%p\n", tag, (void*)ss, slab_idx, (unsigned)m->class_idx, (unsigned)m->used, (unsigned)m->capacity, (unsigned)m->carved, m->freelist); } #endif } static inline int c7_reset_and_log_if_needed(SuperSlab* ss, int slab_idx, int class_idx) { if (class_idx != 7) { return 0; } TinySlabMeta* m = &ss->slabs[slab_idx]; c7_log_meta_state("ACQUIRE_META", ss, slab_idx); if (m->class_idx != 255 && m->class_idx != (uint8_t)class_idx) { #if HAKMEM_BUILD_RELEASE static _Atomic 
uint32_t rel_c7_class_mismatch_logs = 0; uint32_t n = atomic_fetch_add_explicit(&rel_c7_class_mismatch_logs, 1, memory_order_relaxed); if (n < 4) { fprintf(stderr, "[REL_C7_CLASS_MISMATCH] ss=%p slab=%d want=%d have=%u used=%u cap=%u carved=%u\n", (void*)ss, slab_idx, class_idx, (unsigned)m->class_idx, (unsigned)m->used, (unsigned)m->capacity, (unsigned)m->carved); } #else static _Atomic uint32_t dbg_c7_class_mismatch_logs = 0; uint32_t n = atomic_fetch_add_explicit(&dbg_c7_class_mismatch_logs, 1, memory_order_relaxed); if (n < 4) { fprintf(stderr, "[DBG_C7_CLASS_MISMATCH] ss=%p slab=%d want=%d have=%u used=%u cap=%u carved=%u freelist=%p\n", (void*)ss, slab_idx, class_idx, (unsigned)m->class_idx, (unsigned)m->used, (unsigned)m->capacity, (unsigned)m->carved, m->freelist); } #endif return -1; } if (!c7_meta_is_pristine(m)) { c7_log_skip_nonempty_acquire(ss, slab_idx, m, "SKIP_NONEMPTY_ACQUIRE"); return -1; } ss_slab_reset_meta_for_tiny(ss, slab_idx, class_idx); c7_log_meta_state("ACQUIRE", ss, slab_idx); return 0; } static inline void sp_reset_superslab_all_slabs(SuperSlab* ss, int class_idx, int from_lru) { if (!ss) { return; } int cap = ss_slabs_capacity(ss); ss->slab_bitmap = 0; ss->nonempty_mask = 0; ss->freelist_mask = 0; ss->empty_mask = 0; ss->empty_count = 0; ss->active_slabs = 0; ss->hot_count = 0; ss->cold_count = 0; for (int s = 0; s < cap; s++) { ss_slab_reset_meta_for_tiny(ss, s, class_idx); } ss_stats_on_ss_scan(class_idx, 0, 1); static _Atomic uint32_t rel_stage3_reset_logs = 0; uint32_t n = atomic_fetch_add_explicit(&rel_stage3_reset_logs, 1, memory_order_relaxed); if (n < 4) { fprintf(stderr, "[REL_STAGE3_RESET] class=%d ss=%p from_lru=%d cap=%d\n", class_idx, (void*)ss, from_lru, cap); } } // ============================================================================ // Performance Measurement: Shared Pool Lock Contention (ENV-gated) // ============================================================================ // Global atomic counters for lock contention measurement // ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF) _Atomic uint64_t g_sp_stage2_lock_acquired_global = 0; _Atomic uint64_t g_sp_stage3_lock_acquired_global = 0; _Atomic uint64_t g_sp_alloc_lock_contention_global = 0; // Per-class lock acquisition statistics(Tiny クラス別の lock 負荷観測用) _Atomic uint64_t g_sp_stage2_lock_acquired_by_class[TINY_NUM_CLASSES_SS] = {0}; _Atomic uint64_t g_sp_stage3_lock_acquired_by_class[TINY_NUM_CLASSES_SS] = {0}; // Check if measurement is enabled (cached) static inline int sp_measure_enabled(void) { static int g_measure = -1; if (__builtin_expect(g_measure == -1, 0)) { const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE"); g_measure = (e && *e && *e != '0') ? 1 : 0; if (g_measure == 1) { // Measurement が ON のときは per-class stage stats も有効化する // (Stage1/2/3 ヒット数は g_sp_stage*_hits に集計される) extern int g_sp_stage_stats_enabled; g_sp_stage_stats_enabled = 1; } } return g_measure; } // Print statistics function void shared_pool_print_measurements(void); // Stage 0.5: EMPTY slab direct scan(registry ベースの EMPTY 再利用) // Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to // avoid Stage 3 (mmap) when freed slabs are available. 
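
// Note on scope (descriptive of the code below, as currently written): these
// contention counters are only incremented on the Stage 2 / Stage 3 paths of
// shared_pool_acquire_slab(). Stage 1 also takes alloc_lock, but those
// acquisitions show up only in the generic g_lock_acquire_count /
// tiny_class_stats_on_shared_lock() instrumentation. A typical way to collect
// the numbers (illustrative command; the binary name is a placeholder):
//
//     HAKMEM_MEASURE_UNIFIED_CACHE=1 ./your_bench 2> contention.log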
// Stage 0.5: EMPTY slab direct scan (registry-based EMPTY reuse)
// Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
// avoid Stage 3 (mmap) when freed slabs are available.
//
// WARM POOL OPTIMIZATION:
// - During the registry scan, prefill warm pool with HOT SuperSlabs
// - This eliminates future registry scans for cache misses
// - Expected gain: +40-50% by reducing O(N) scan overhead
static inline int sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out,
                                             int* slab_idx_out, int dbg_acquire) {
    // Priority-2: Use cached ENV
    int empty_reuse_enabled = HAK_ENV_SS_EMPTY_REUSE();
    if (!empty_reuse_enabled) {
        return -1;
    }
    extern int g_super_reg_class_size[TINY_NUM_CLASSES];
    int reg_size = (class_idx < TINY_NUM_CLASSES) ? g_super_reg_class_size[class_idx] : 0;
    int reg_cap = super_reg_effective_per_class();
    if (reg_cap > 0 && reg_size > reg_cap) {
        reg_size = reg_cap;
    }
    // Priority-2: Use cached ENV
    int scan_limit = HAK_ENV_SS_EMPTY_SCAN_LIMIT();
    if (scan_limit > reg_size) scan_limit = reg_size;

    // Stage 0.5 hit counter for visualization
    static _Atomic uint64_t stage05_hits = 0;
    static _Atomic uint64_t stage05_attempts = 0;
    atomic_fetch_add_explicit(&stage05_attempts, 1, memory_order_relaxed);

    // Initialize warm pool on first use (per-thread, one-time)
    tiny_warm_pool_init_once();

    // Track SuperSlabs scanned during this acquire call for warm pool prefill
    SuperSlab* primary_result = NULL;
    int primary_slab_idx = -1;

    // Cache warm pool cap once per acquire call (SSOT: same as unified_cache_refill()).
    const int warm_cap = warm_pool_max_per_class();

    for (int i = 0; i < scan_limit; i++) {
        SuperSlab* ss = super_reg_by_class_at(class_idx, i);
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
        // P-Tier: Skip DRAINING tier SuperSlabs
        if (!ss_tier_is_hot(ss)) continue;
        if (ss->empty_count == 0) continue;  // No EMPTY slabs in this SS

        // WARM POOL PREFILL: Add HOT SuperSlabs to warm pool (if not already primary result)
        // This is low-cost during registry scan and avoids future expensive scans.
        if (ss != primary_result && tiny_warm_pool_count(class_idx) < warm_cap) {
            tiny_warm_pool_push(class_idx, ss);
            // Track prefilled SuperSlabs for metrics
            g_warm_pool_stats[class_idx].prefilled++;
        }

        uint32_t mask = ss->empty_mask;
        while (mask) {
            int empty_idx = __builtin_ctz(mask);
            mask &= (mask - 1);  // clear lowest bit
            TinySlabMeta* meta = &ss->slabs[empty_idx];
            if (meta->capacity > 0 && meta->used == 0) {
                tiny_tls_slab_reuse_guard(ss);
                ss_clear_slab_empty(ss, empty_idx);
                meta->class_idx = (uint8_t)class_idx;
                ss->class_map[empty_idx] = (uint8_t)class_idx;
#if !HAKMEM_BUILD_RELEASE
                if (dbg_acquire == 1) {
                    fprintf(stderr,
                            "[SP_ACQUIRE_STAGE0.5_EMPTY] class=%d reusing EMPTY slab "
                            "(ss=%p slab=%d empty_count=%u warm_pool_size=%d)\n",
                            class_idx, (void*)ss, empty_idx, ss->empty_count,
                            tiny_warm_pool_count(class_idx));
                }
#else
                (void)dbg_acquire;
#endif
                // Store primary result but continue scanning to prefill warm pool
                if (primary_result == NULL) {
                    primary_result = ss;
                    primary_slab_idx = empty_idx;
                    *ss_out = ss;
                    *slab_idx_out = empty_idx;
                    sp_stage_stats_init();
                    if (g_sp_stage_stats_enabled) {
                        atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
                    }
                    atomic_fetch_add_explicit(&stage05_hits, 1, memory_order_relaxed);
                }
            }
        }
    }

    if (primary_result != NULL) {
        // Stage 0.5 hit rate visualization (every 100 hits)
        uint64_t hits = atomic_load_explicit(&stage05_hits, memory_order_relaxed);
        if (hits % 100 == 1) {
            uint64_t attempts = atomic_load_explicit(&stage05_attempts, memory_order_relaxed);
            fprintf(stderr,
                    "[STAGE0.5_STATS] hits=%lu attempts=%lu rate=%.1f%% (scan_limit=%d warm_pool=%d)\n",
                    hits, attempts, (double)hits * 100.0 / attempts, scan_limit,
                    tiny_warm_pool_count(class_idx));
        }
        if (c7_reset_and_log_if_needed(primary_result, primary_slab_idx, class_idx) == 0) {
            return 0;
        }
        primary_result = NULL;
        *ss_out = NULL;
        *slab_idx_out = -1;
    }
    return -1;
}

int shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out) {
    // Phase 12: SP-SLOT Box - 3-Stage Acquire Logic
    //
    // Stage 1: Reuse EMPTY slots from per-class free list (EMPTY→ACTIVE)
    // Stage 2: Find UNUSED slots in existing SuperSlabs
    // Stage 3: Get new SuperSlab (LRU pop or mmap)
    //
    // Invariants:
    // - On success: *ss_out != NULL, 0 <= *slab_idx_out < total_slots
    // - The chosen slab has meta->class_idx == class_idx
    if (!ss_out || !slab_idx_out) {
        return -1;
    }
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return -1;
    }
    shared_pool_init();

    // Debug logging / stage stats
#if !HAKMEM_BUILD_RELEASE
    // Priority-2: Use cached ENV
    int dbg_acquire = HAK_ENV_SS_ACQUIRE_DEBUG();
#else
    static const int dbg_acquire = 0;
#endif
    sp_stage_stats_init();

stage1_retry_after_tension_drain:
    // ========== Stage 0.5 (Phase 12-1.1): EMPTY slab direct scan ==========
    // Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
    // avoid Stage 3 (mmap) when freed slabs are available.
    if (sp_acquire_from_empty_scan(class_idx, ss_out, slab_idx_out, dbg_acquire) == 0) {
        return 0;
    }

    // ========== Stage 1 (Lock-Free): Try to reuse EMPTY slots ==========
    // P0-4: Lock-free pop from per-class free list (no mutex needed!)
    // Best case: Same class freed a slot, reuse immediately (cache-hot)
    SharedSSMeta* reuse_meta = NULL;
    int reuse_slot_idx = -1;
    if (sp_freelist_pop_lockfree(class_idx, &reuse_meta, &reuse_slot_idx)) {
        // Found EMPTY slot from lock-free list!
        // Now acquire mutex ONLY for slot activation and metadata update
        // P0 instrumentation: count lock acquisitions
        lock_stats_init();
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_acquire_count, 1);
            atomic_fetch_add(&g_lock_acquire_slab_count, 1);
        }
        sp_lock_with_stats(class_idx);

        // P0.3: Guard against TLS SLL orphaned pointers before reusing slab
        // RACE FIX: Load SuperSlab pointer atomically BEFORE guard (consistency)
        SuperSlab* ss_guard = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed);
        if (ss_guard) {
            tiny_tls_slab_reuse_guard(ss_guard);
            if (class_idx == 7) {
                TinySlabMeta* meta = &ss_guard->slabs[reuse_slot_idx];
                int meta_ok = (meta->used == 0) && (meta->carved == 0) && (meta->freelist == NULL);
                if (!meta_ok) {
                    c7_log_skip_nonempty_acquire(ss_guard, reuse_slot_idx, meta, "SKIP_NONEMPTY_ACQUIRE");
                    // Put the slot back and release the lock before falling through to Stage 2
                    sp_freelist_push_lockfree(class_idx, reuse_meta, reuse_slot_idx);
                    if (g_lock_stats_enabled == 1) {
                        atomic_fetch_add(&g_lock_release_count, 1);
                    }
                    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                    goto stage2_fallback;
                }
            }
            // P-Tier: Skip DRAINING tier SuperSlabs
            if (!ss_tier_is_hot(ss_guard)) {
                // DRAINING SuperSlab - skip this slot and fall through to Stage 2
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                goto stage2_fallback;
            }
        }

        // Activate slot under mutex (slot state transition requires protection)
        if (sp_slot_mark_active(reuse_meta, reuse_slot_idx, class_idx) == 0) {
            // RACE FIX: Load SuperSlab pointer atomically (consistency)
            SuperSlab* ss = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed);
            // RACE FIX: Check if SuperSlab was freed (NULL pointer)
            // This can happen if Thread A freed the SuperSlab after pushing slot to freelist,
            // but Thread B popped the stale slot before the freelist was cleared.
            if (!ss) {
                // SuperSlab freed - skip and fall through to Stage 2/3
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                goto stage2_fallback;
            }
#if !HAKMEM_BUILD_RELEASE
            if (dbg_acquire == 1) {
                fprintf(stderr, "[SP_ACQUIRE_STAGE1_LOCKFREE] class=%d reusing EMPTY slot (ss=%p slab=%d)\n",
                        class_idx, (void*)ss, reuse_slot_idx);
            }
#endif
            // Update SuperSlab metadata
            ss->slab_bitmap |= (1u << reuse_slot_idx);
            ss_slab_meta_class_idx_set(ss, reuse_slot_idx, (uint8_t)class_idx);
            if (ss->active_slabs == 0) {
                // Was empty, now active again
                ss->active_slabs = 1;
                g_shared_pool.active_count++;
            }
            // Track per-class active slots (approximate, under alloc_lock)
            if (class_idx < TINY_NUM_CLASSES_SS) {
                g_shared_pool.class_active_slots[class_idx]++;
            }
            // Update hint
            g_shared_pool.class_hints[class_idx] = ss;

            *ss_out = ss;
            *slab_idx_out = reuse_slot_idx;
            if (class_idx == 7) {
                TinySlabMeta* meta_check = &ss->slabs[reuse_slot_idx];
                if (!((meta_check->used == 0) && (meta_check->carved == 0) && (meta_check->freelist == NULL))) {
                    sp_freelist_push_lockfree(class_idx, reuse_meta, reuse_slot_idx);
                    if (g_lock_stats_enabled == 1) {
                        atomic_fetch_add(&g_lock_release_count, 1);
                    }
                    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                    goto stage2_fallback;
                }
            }
            if (c7_reset_and_log_if_needed(ss, reuse_slot_idx, class_idx) != 0) {
                *ss_out = NULL;
                *slab_idx_out = -1;
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                goto stage2_fallback;
            }
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            if (g_sp_stage_stats_enabled) {
                atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
            }
            return 0;  // ✅ Stage 1 (lock-free) success
        }

        // Slot activation failed (race condition?) - release lock and fall through
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    }

stage2_fallback:
    // ========== Stage 2 (Lock-Free): Try to claim UNUSED slots ==========
    // P0 Optimization: Try class hint FIRST for fast path (same class locality)
    // This reduces metadata scan from 100% to ~10% when hints are effective
    {
        SuperSlab* hint_ss = g_shared_pool.class_hints[class_idx];
        if (__builtin_expect(hint_ss != NULL, 1)) {
            // P-Tier: Skip DRAINING tier SuperSlabs
            if (!ss_tier_is_hot(hint_ss)) {
                // Clear stale hint pointing to DRAINING SuperSlab
                g_shared_pool.class_hints[class_idx] = NULL;
                goto stage2_scan;
            }
            // P0 Optimization: O(1) lookup via cached pointer (avoids metadata scan)
            SharedSSMeta* hint_meta = hint_ss->shared_meta;
            if (__builtin_expect(hint_meta != NULL, 1)) {
                // Try lock-free claiming on hint SuperSlab first
                int claimed_idx = sp_slot_claim_lockfree(hint_meta, class_idx);
                if (__builtin_expect(claimed_idx >= 0, 1)) {
                    // Fast path success! No need to scan all metadata
                    SuperSlab* ss = atomic_load_explicit(&hint_meta->ss, memory_order_acquire);
                    if (__builtin_expect(ss != NULL, 1)) {
#if !HAKMEM_BUILD_RELEASE
                        if (dbg_acquire == 1) {
                            fprintf(stderr,
                                    "[SP_ACQUIRE_STAGE2_HINT] class=%d claimed UNUSED slot from hint (ss=%p slab=%d)\n",
                                    class_idx, (void*)ss, claimed_idx);
                        }
#endif
                        // P0 instrumentation: count lock acquisitions
                        lock_stats_init();
                        if (g_lock_stats_enabled == 1) {
                            atomic_fetch_add(&g_lock_acquire_count, 1);
                            atomic_fetch_add(&g_lock_acquire_slab_count, 1);
                        }
                        sp_lock_with_stats(class_idx);
                        // Performance measurement: count Stage 2 lock acquisitions
                        if (__builtin_expect(sp_measure_enabled(), 0)) {
                            atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global, 1, memory_order_relaxed);
                            atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed);
                            atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_by_class[class_idx], 1,
                                                      memory_order_relaxed);
                        }
                        if (class_idx == 7) {
                            TinySlabMeta* meta = &ss->slabs[claimed_idx];
                            int meta_ok = (meta->used == 0) && (meta->carved == 0) && (meta->freelist == NULL);
                            if (!meta_ok) {
                                c7_log_skip_nonempty_acquire(ss, claimed_idx, meta, "SKIP_NONEMPTY_ACQUIRE");
                                sp_slot_mark_empty(hint_meta, claimed_idx);
                                if (g_lock_stats_enabled == 1) {
                                    atomic_fetch_add(&g_lock_release_count, 1);
                                }
                                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                                goto stage2_scan;
                            }
                        }
                        // Update SuperSlab metadata under mutex
                        ss->slab_bitmap |= (1u << claimed_idx);
                        ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);
                        if (ss->active_slabs == 0) {
                            ss->active_slabs = 1;
                            g_shared_pool.active_count++;
                        }
                        if (class_idx < TINY_NUM_CLASSES_SS) {
                            g_shared_pool.class_active_slots[class_idx]++;
                        }
                        // Hint is still good, no need to update
                        *ss_out = ss;
                        *slab_idx_out = claimed_idx;
                        if (class_idx == 7) {
                            TinySlabMeta* meta_check = &ss->slabs[claimed_idx];
                            if (!((meta_check->used == 0) && (meta_check->carved == 0) &&
                                  (meta_check->freelist == NULL))) {
                                sp_slot_mark_empty(hint_meta, claimed_idx);
                                *ss_out = NULL;
                                *slab_idx_out = -1;
                                if (g_lock_stats_enabled == 1) {
                                    atomic_fetch_add(&g_lock_release_count, 1);
                                }
                                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                                goto stage2_scan;
                            }
                        }
                        if (c7_reset_and_log_if_needed(ss, claimed_idx, class_idx) != 0) {
                            *ss_out = NULL;
                            *slab_idx_out = -1;
                            if (g_lock_stats_enabled == 1) {
                                atomic_fetch_add(&g_lock_release_count, 1);
                            }
                            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                            goto stage2_scan;
                        }
                        sp_fix_geometry_if_needed(ss, claimed_idx, class_idx);
                        if (g_lock_stats_enabled == 1) {
                            atomic_fetch_add(&g_lock_release_count, 1);
                        }
                        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                        if (g_sp_stage_stats_enabled) {
                            atomic_fetch_add(&g_sp_stage2_hits[class_idx], 1);
                        }
                        return 0;  // ✅ Stage 2 (hint fast path) success
                    }
                }
            }
        }
    }

stage2_scan:
    // P0-5: Lock-free atomic CAS claiming (no mutex needed for slot state transition!)
    // RACE FIX: Read ss_meta_count atomically (now properly declared as _Atomic)
    // No cast needed! memory_order_acquire synchronizes with release in sp_meta_find_or_create
    uint32_t meta_count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_acquire);
    for (uint32_t i = 0; i < meta_count; i++) {
        SharedSSMeta* meta = &g_shared_pool.ss_metadata[i];
        // RACE FIX: Load SuperSlab pointer atomically BEFORE claiming
        // Use memory_order_acquire to synchronize with release in sp_meta_find_or_create
        SuperSlab* ss_preflight = atomic_load_explicit(&meta->ss, memory_order_acquire);
        if (!ss_preflight) {
            // SuperSlab was freed - skip this entry
            continue;
        }
        // P-Tier: Skip DRAINING tier SuperSlabs
        if (!ss_tier_is_hot(ss_preflight)) {
            continue;
        }
        // Try lock-free claiming (UNUSED → ACTIVE via CAS)
        int claimed_idx = sp_slot_claim_lockfree(meta, class_idx);
        if (claimed_idx >= 0) {
            // RACE FIX: Load SuperSlab pointer atomically again after claiming
            // Use memory_order_acquire to synchronize with release in sp_meta_find_or_create
            SuperSlab* ss = atomic_load_explicit(&meta->ss, memory_order_acquire);
            if (!ss) {
                // SuperSlab was freed between claiming and loading - skip this entry
                continue;
            }
#if !HAKMEM_BUILD_RELEASE
            if (dbg_acquire == 1) {
                fprintf(stderr, "[SP_ACQUIRE_STAGE2_LOCKFREE] class=%d claimed UNUSED slot (ss=%p slab=%d)\n",
                        class_idx, (void*)ss, claimed_idx);
            }
#endif
            // P0 instrumentation: count lock acquisitions
            lock_stats_init();
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_acquire_count, 1);
                atomic_fetch_add(&g_lock_acquire_slab_count, 1);
            }
            sp_lock_with_stats(class_idx);
            // Performance measurement: count Stage 2 scan lock acquisitions
            if (__builtin_expect(sp_measure_enabled(), 0)) {
                atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global, 1, memory_order_relaxed);
                atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed);
                atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_by_class[class_idx], 1,
                                          memory_order_relaxed);
            }
            if (class_idx == 7) {
                TinySlabMeta* meta_slab = &ss->slabs[claimed_idx];
                int meta_ok = (meta_slab->used == 0) && (meta_slab->carved == 0) && (meta_slab->freelist == NULL);
                if (!meta_ok) {
                    c7_log_skip_nonempty_acquire(ss, claimed_idx, meta_slab, "SKIP_NONEMPTY_ACQUIRE");
                    sp_slot_mark_empty(meta, claimed_idx);
                    if (g_lock_stats_enabled == 1) {
                        atomic_fetch_add(&g_lock_release_count, 1);
                    }
                    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                    continue;
                }
            }
            // Update SuperSlab metadata under mutex
            ss->slab_bitmap |= (1u << claimed_idx);
            ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);
            if (ss->active_slabs == 0) {
                ss->active_slabs = 1;
                g_shared_pool.active_count++;
            }
            if (class_idx < TINY_NUM_CLASSES_SS) {
                g_shared_pool.class_active_slots[class_idx]++;
            }
            // Update hint
            g_shared_pool.class_hints[class_idx] = ss;

            *ss_out = ss;
            *slab_idx_out = claimed_idx;
            if (class_idx == 7) {
                TinySlabMeta* meta_check = &ss->slabs[claimed_idx];
                if (!((meta_check->used == 0) && (meta_check->carved == 0) && (meta_check->freelist == NULL))) {
                    sp_slot_mark_empty(meta, claimed_idx);
                    *ss_out = NULL;
                    *slab_idx_out = -1;
                    if (g_lock_stats_enabled == 1) {
                        atomic_fetch_add(&g_lock_release_count, 1);
                    }
                    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                    continue;
                }
            }
            if (c7_reset_and_log_if_needed(ss, claimed_idx, class_idx) != 0) {
                *ss_out = NULL;
                *slab_idx_out = -1;
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                continue;
            }
            sp_fix_geometry_if_needed(ss, claimed_idx, class_idx);
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            if (g_sp_stage_stats_enabled) {
                atomic_fetch_add(&g_sp_stage2_hits[class_idx], 1);
            }
            return 0;  // ✅ Stage 2 (lock-free) success
        }
        // Claim failed (no UNUSED slots in this meta) - continue to next SuperSlab
    }

    // ========== Tension-Based Drain: Try to create EMPTY slots before Stage 3 ==========
    // If TLS SLL has accumulated blocks, drain them to enable EMPTY slot detection
    // This can avoid allocating new SuperSlabs by reusing EMPTY slots in Stage 1
    // ENV: HAKMEM_TINY_TENSION_DRAIN_ENABLE=0 to disable (default=1)
    // ENV: HAKMEM_TINY_TENSION_DRAIN_THRESHOLD=N to set threshold (default=1024)
    {
        // Priority-2: Use cached ENV
        int tension_drain_enabled = HAK_ENV_TINY_TENSION_DRAIN_ENABLE();
        uint32_t tension_threshold = (uint32_t)HAK_ENV_TINY_TENSION_DRAIN_THRESHOLD();
        if (tension_drain_enabled) {
            extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
            extern uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size);
            uint32_t sll_count = (class_idx < TINY_NUM_CLASSES) ? g_tls_sll[class_idx].count : 0;
            if (sll_count >= tension_threshold) {
                // Drain all blocks to maximize EMPTY slot creation
                uint32_t drained = tiny_tls_sll_drain(class_idx, 0);  // 0 = drain all
                if (drained > 0) {
                    // Retry Stage 1 (EMPTY reuse) after drain
                    // Some slabs might have become EMPTY (meta->used == 0)
                    goto stage1_retry_after_tension_drain;
                }
            }
        }
    }

    // ========== Stage 3: Mutex-protected fallback (new SuperSlab allocation) ==========
    // All existing SuperSlabs have no UNUSED slots → need new SuperSlab
    // P0 instrumentation: count lock acquisitions
    lock_stats_init();
    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_acquire_count, 1);
        atomic_fetch_add(&g_lock_acquire_slab_count, 1);
    }
    pthread_mutex_lock(&g_shared_pool.alloc_lock);
    // Performance measurement: count Stage 3 lock acquisitions
    if (__builtin_expect(sp_measure_enabled(), 0)) {
        atomic_fetch_add_explicit(&g_sp_stage3_lock_acquired_global, 1, memory_order_relaxed);
        atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed);
        atomic_fetch_add_explicit(&g_sp_stage3_lock_acquired_by_class[class_idx], 1, memory_order_relaxed);
    }

    // In the bench profile, apply a light logical cap on the Shared Pool.
    uint32_t total_limit = shared_pool_effective_total_slots();
    if (total_limit > 0 && g_shared_pool.total_count >= total_limit) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1;
    }
    uint32_t class_limit = shared_pool_effective_class_slots(class_idx);
    if (class_limit > 0 && class_idx < TINY_NUM_CLASSES_SS &&
        (uint32_t)g_shared_pool.class_active_slots[class_idx] >= class_limit) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1;
    }

    // ========== Stage 3: Get new SuperSlab ==========
    // Try LRU cache first, then mmap
    SuperSlab* new_ss = NULL;

    // Stage 3a: Try LRU cache
    extern SuperSlab* hak_ss_lru_pop(uint8_t size_class);
    int from_lru = 0;
    if (class_idx != 7) {
        new_ss = hak_ss_lru_pop((uint8_t)class_idx);
        from_lru = (new_ss != NULL);
    } else {
        // C7: Stage 3 LRU reuse is disabled for now (to isolate whether reuse is the contamination source).
        atomic_store_explicit(&g_c7_stage3_magic_ss, 0, memory_order_relaxed);
    }

    // Stage 3b: If LRU miss, allocate new SuperSlab
    if (!new_ss) {
        if (!ss_budget_on_alloc(class_idx)) {
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            return -1;
        }
        // Release the alloc_lock to avoid deadlock with registry during superslab_allocate
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);

        SuperSlab* allocated_ss = sp_internal_allocate_superslab(class_idx);

        // Re-acquire the alloc_lock
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_acquire_count, 1);
            atomic_fetch_add(&g_lock_acquire_slab_count, 1);  // This is part of acquisition path
        }
        sp_lock_with_stats(class_idx);

        if (!allocated_ss) {
            // Allocation failed; return now.
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            return -1;  // Out of memory
        }
        new_ss = allocated_ss;
        if (class_idx == 7) {
            // C7 SuperSlabs via Stage 3 are always freshly allocated (the magic is treated as reset too).
            atomic_store_explicit(&g_c7_stage3_magic_ss, 0, memory_order_relaxed);
        }

        // Add newly allocated SuperSlab to the shared pool's internal array
        if (g_shared_pool.total_count >= g_shared_pool.capacity) {
            shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
            if (g_shared_pool.total_count >= g_shared_pool.capacity) {
                // Pool table expansion failed; leave ss alive (registry-owned),
                // but do not treat it as part of shared_pool.
                // This is a critical error, return early.
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                return -1;
            }
        }
        g_shared_pool.slabs[g_shared_pool.total_count] = new_ss;
        g_shared_pool.total_count++;
    }

    // Before returning from Stage 3, SuperSlabs reused from the LRU must be reset to all-empty slabs.
    // Reset every slab whenever from_lru is set, even for classes other than C7.
    if (new_ss && (from_lru || class_idx == 7)) {
        sp_reset_superslab_all_slabs(new_ss, class_idx, from_lru);
    }

#if !HAKMEM_BUILD_RELEASE
    if (dbg_acquire == 1 && new_ss) {
        fprintf(stderr, "[SP_ACQUIRE_STAGE3] class=%d new SuperSlab (ss=%p from_lru=%d)\n",
                class_idx, (void*)new_ss, from_lru);
    }
#endif

    if (!new_ss) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1;  // ❌ Out of memory
    }

    // Before creating a new SuperSlab, consult learning-layer soft cap.
    // Phase 9-2: Soft Cap removed to allow Shared Pool to fully replace Legacy Backend.
    // We now rely on LRU eviction and EMPTY recycling to manage memory pressure.
    // Create metadata for this new SuperSlab
    SharedSSMeta* new_meta = sp_meta_find_or_create(new_ss);
    if (!new_meta) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1;  // ❌ Metadata allocation failed
    }

    // Assign first slot to this class
    int first_slot = 0;
    if (sp_slot_mark_active(new_meta, first_slot, class_idx) != 0) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1;  // ❌ Should not happen
    }

    // Update SuperSlab metadata
    new_ss->slab_bitmap |= (1u << first_slot);
    ss_slab_meta_class_idx_set(new_ss, first_slot, (uint8_t)class_idx);
    new_ss->active_slabs = 1;
    g_shared_pool.active_count++;
    if (class_idx < TINY_NUM_CLASSES_SS) {
        g_shared_pool.class_active_slots[class_idx]++;
    }

    // Update hint
    g_shared_pool.class_hints[class_idx] = new_ss;

    *ss_out = new_ss;
    *slab_idx_out = first_slot;
    if (c7_reset_and_log_if_needed(new_ss, first_slot, class_idx) != 0) {
        *ss_out = NULL;
        *slab_idx_out = -1;
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1;
    }
    sp_fix_geometry_if_needed(new_ss, first_slot, class_idx);

    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_release_count, 1);
    }
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    if (g_sp_stage_stats_enabled) {
        atomic_fetch_add(&g_sp_stage3_hits[class_idx], 1);
    }
    return 0;  // ✅ Stage 3 success
}

// ============================================================================
// Performance Measurement: Print Statistics
// ============================================================================
void shared_pool_print_measurements(void) {
    if (!sp_measure_enabled()) {
        return;  // Measurement disabled
    }

    uint64_t stage2 = atomic_load_explicit(&g_sp_stage2_lock_acquired_global, memory_order_relaxed);
    uint64_t stage3 = atomic_load_explicit(&g_sp_stage3_lock_acquired_global, memory_order_relaxed);
    uint64_t total_locks = atomic_load_explicit(&g_sp_alloc_lock_contention_global, memory_order_relaxed);

    if (total_locks == 0) {
        fprintf(stderr, "\n========================================\n");
        fprintf(stderr, "Shared Pool Contention Statistics\n");
        fprintf(stderr, "========================================\n");
        fprintf(stderr, "No lock acquisitions recorded\n");
        fprintf(stderr, "========================================\n\n");
        return;
    }

    double stage2_pct = (100.0 * stage2) / total_locks;
    double stage3_pct = (100.0 * stage3) / total_locks;

    fprintf(stderr, "\n========================================\n");
    fprintf(stderr, "Shared Pool Contention Statistics\n");
    fprintf(stderr, "========================================\n");
    fprintf(stderr, "Stage 2 Locks: %llu (%.1f%%)\n", (unsigned long long)stage2, stage2_pct);
    fprintf(stderr, "Stage 3 Locks: %llu (%.1f%%)\n", (unsigned long long)stage3, stage3_pct);
    fprintf(stderr, "Total Contention: %llu lock acquisitions\n", (unsigned long long)total_locks);

    // Per-class breakdown (Tiny classes 0-7; C5-C7 are of particular interest)
    fprintf(stderr, "\nPer-class Shared Pool Locks (Stage2/Stage3):\n");
    for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
        uint64_t s2c = atomic_load_explicit(&g_sp_stage2_lock_acquired_by_class[cls], memory_order_relaxed);
        uint64_t s3c = atomic_load_explicit(&g_sp_stage3_lock_acquired_by_class[cls], memory_order_relaxed);
        uint64_t tc = s2c + s3c;
        if (tc == 0) {
            continue;  // Skip classes with no lock acquisitions
        }
        fprintf(stderr, "  C%d: Stage2=%llu Stage3=%llu Total=%llu\n",
                cls, (unsigned long long)s2c, (unsigned long long)s3c, (unsigned long long)tc);
    }
    fprintf(stderr, "========================================\n\n");
}
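
// ============================================================================
// Usage sketch (illustrative; nothing in this file wires this up): a host
// application or benchmark that wants the report above can enable the ENV gate
// before the first allocation and register the printer to run at process exit.
// setenv() and atexit() are standard POSIX/C calls; only the placement is an
// assumption, the ENV name and reporting function are the ones defined above.
//
//     setenv("HAKMEM_MEASURE_UNIFIED_CACHE", "1", 1);
//     atexit(shared_pool_print_measurements);
// ============================================================================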