// hakmem/core/hakmem_shared_pool_release.c

#include "hakmem_shared_pool_internal.h"
#include "hakmem_debug_master.h"
#include "box/ss_slab_meta_box.h"
#include "box/ss_hot_cold_box.h"
#include "box/ss_tier_box.h"              // P-Tier: Utilization-aware tiering
#include "hakmem_env_cache.h"  // Priority-2: ENV cache
#include "superslab/superslab_inline.h"  // superslab_ref_get guard for TLS pins
#include "box/ss_release_guard_box.h"    // Box: SuperSlab Release Guard
#include "box/ss_slab_reset_box.h"       // Box: Reset slab metadata on reuse path
#include "box/ss_stats_box.h"            // Observability: Superslab/slab counters
#include "box/ss_budget_box.h"           // Budget guard (global/class caps)

#include <stdlib.h>
#include <stdio.h>
#include <stdatomic.h>

static inline void c7_release_log_once(SuperSlab* ss, int slab_idx) {
#if HAKMEM_BUILD_RELEASE
    static _Atomic uint32_t rel_c7_release_logs = 0;
    uint32_t n = atomic_fetch_add_explicit(&rel_c7_release_logs, 1, memory_order_relaxed);
    if (n < 8) {
        TinySlabMeta* meta = &ss->slabs[slab_idx];
        fprintf(stderr,
                "[REL_C7_RELEASE] ss=%p slab=%d used=%u cap=%u carved=%u\n",
                (void*)ss,
                slab_idx,
                (unsigned)meta->used,
                (unsigned)meta->capacity,
                (unsigned)meta->carved);
    }
#else
    static _Atomic uint32_t dbg_c7_release_logs = 0;
    uint32_t n = atomic_fetch_add_explicit(&dbg_c7_release_logs, 1, memory_order_relaxed);
    if (n < 8) {
        TinySlabMeta* meta = &ss->slabs[slab_idx];
        fprintf(stderr,
                "[DBG_C7_RELEASE] ss=%p slab=%d used=%u cap=%u carved=%u\n",
                (void*)ss,
                slab_idx,
                (unsigned)meta->used,
                (unsigned)meta->capacity,
                (unsigned)meta->carved);
    }
#endif
}

void
shared_pool_release_slab(SuperSlab* ss, int slab_idx)
{
    // Phase 12: SP-SLOT Box - Slot-based Release
    //
    // Flow:
    //   1. Validate inputs and check meta->used == 0
    //   2. Find SharedSSMeta for this SuperSlab
    //   3. Mark slot ACTIVE → EMPTY
    //   4. Push to per-class free list (enables same-class reuse)
    //   5. If all slots EMPTY → superslab_free() → LRU cache

    if (!ss) {
        return;
    }
    if (slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) {
        return;
    }

    // Phase 9-2 FIX: Promote Legacy SuperSlabs to Shared Pool on first recycle
    // If we are recycling a slot from a Legacy SS, we must remove it from the
    // Legacy list (g_superslab_heads) to prevent Legacy Backend from allocating
    // from it simultaneously (Double Allocation Race).
    // This effectively transfers ownership to Shared Pool.
    extern void remove_superslab_from_legacy_head(SuperSlab* ss);
    remove_superslab_from_legacy_head(ss);

    // BUGFIX: Re-check used count after removal. Legacy Backend might have
    // allocated from this slab while we were waiting for the lock in remove().
    TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
    if (atomic_load_explicit(&slab_meta->used, memory_order_acquire) != 0) {
        // Legacy Backend stole this slab. It's now an orphan (removed from list).
        // We abort recycling. It will be recycled when Legacy frees it later.
        return;
    }

    // Debug logging
#if !HAKMEM_BUILD_RELEASE
    // Priority-2: Use cached ENV
    int dbg = HAK_ENV_SS_FREE_DEBUG();
#else
    static const int dbg = 0;
#endif

    // P0 instrumentation: count lock acquisitions
    lock_stats_init();
    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_stats_enabled, 1);
        atomic_fetch_add(&g_lock_release_slab_count, 1);
    }

    pthread_mutex_lock(&g_shared_pool.alloc_lock);

    // TinySlabMeta* slab_meta = &ss->slabs[slab_idx]; // Already declared above
    if (slab_meta->used != 0) {
        // Not actually empty (double check under lock)
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return;
    }

    uint8_t class_idx = slab_meta->class_idx;
    if (class_idx == 7) {
        c7_release_log_once(ss, slab_idx);
    }

    // Guard: if SuperSlab is pinned (TLS/remote references), defer release to avoid
    // class_map=255 while pointers are still in-flight.
    uint32_t ss_refs_guard = superslab_ref_get(ss);
    if (ss_refs_guard != 0) {
#if !HAKMEM_BUILD_RELEASE
        if (dbg == 1) {
            fprintf(stderr,
                    "[SP_SLOT_RELEASE_SKIP_PINNED] ss=%p slab_idx=%d class=%d refcount=%u\n",
                    (void*)ss, slab_idx, class_idx, (unsigned)ss_refs_guard);
        }
#endif
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return;
    }

    #if !HAKMEM_BUILD_RELEASE
    if (dbg == 1) {
        fprintf(stderr, "[SP_SLOT_RELEASE] ss=%p slab_idx=%d class=%d used=0 (marking EMPTY)\n",
                (void*)ss, slab_idx, class_idx);
    }
    #endif

    if (class_idx == 7) {
        ss_slab_reset_meta_for_tiny(ss, slab_idx, class_idx);
#if HAKMEM_BUILD_RELEASE
        static _Atomic uint32_t rel_c7_reset_logs = 0;
        uint32_t rn = atomic_fetch_add_explicit(&rel_c7_reset_logs, 1, memory_order_relaxed);
        if (rn < 4) {
            TinySlabMeta* m = &ss->slabs[slab_idx];
            fprintf(stderr,
                    "[REL_C7_RELEASE_RESET] ss=%p slab=%d used=%u cap=%u carved=%u freelist=%p\n",
                    (void*)ss,
                    slab_idx,
                    (unsigned)m->used,
                    (unsigned)m->capacity,
                    (unsigned)m->carved,
                    m->freelist);
        }
#else
        static _Atomic uint32_t dbg_c7_reset_logs = 0;
        uint32_t rn = atomic_fetch_add_explicit(&dbg_c7_reset_logs, 1, memory_order_relaxed);
        if (rn < 4) {
            TinySlabMeta* m = &ss->slabs[slab_idx];
            fprintf(stderr,
                    "[DBG_C7_RELEASE_RESET] ss=%p slab=%d used=%u cap=%u carved=%u freelist=%p\n",
                    (void*)ss,
                    slab_idx,
                    (unsigned)m->used,
                    (unsigned)m->capacity,
                    (unsigned)m->carved,
                    m->freelist);
        }
#endif
    }

    // Find SharedSSMeta for this SuperSlab
    SharedSSMeta* sp_meta = NULL;
    uint32_t count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_relaxed);
    for (uint32_t i = 0; i < count; i++) {
        // RACE FIX: Load pointer atomically
        SuperSlab* meta_ss = atomic_load_explicit(&g_shared_pool.ss_metadata[i].ss, memory_order_relaxed);
        if (meta_ss == ss) {
            sp_meta = &g_shared_pool.ss_metadata[i];
            break;
        }
    }

    if (!sp_meta) {
        // SuperSlab not in SP-SLOT system yet - create metadata
        sp_meta = sp_meta_find_or_create(ss);
        if (!sp_meta) {
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            return;  // Failed to create metadata
        }
    }

    // Mark slot as EMPTY (ACTIVE → EMPTY)
    uint32_t slab_bit = (1u << slab_idx);
    SlotState slot_state = atomic_load_explicit(
        &sp_meta->slots[slab_idx].state,
        memory_order_acquire);
    if (slot_state != SLOT_ACTIVE && (ss->slab_bitmap & slab_bit)) {
        // Legacy path import: rebuild slot states from SuperSlab bitmap/class_map
        sp_meta_sync_slots_from_ss(sp_meta, ss);
        slot_state = atomic_load_explicit(
            &sp_meta->slots[slab_idx].state,
            memory_order_acquire);
    }

    if (slot_state != SLOT_ACTIVE || sp_slot_mark_empty(sp_meta, slab_idx) != 0) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return;  // Slot wasn't ACTIVE
    }

    // Update SuperSlab metadata
    uint32_t bit = (1u << slab_idx);
    if (ss->slab_bitmap & bit) {
        ss->slab_bitmap &= ~bit;
        slab_meta->class_idx = 255;  // UNASSIGNED
        // P1.1: Mark class_map as UNASSIGNED when releasing slab
        ss->class_map[slab_idx] = 255;
        // Reset slab metadata to a pristine state for all classes (C0–C7)
        ss_slab_reset_meta_for_tiny(ss, slab_idx, -1);

        if (ss->active_slabs > 0) {
            ss->active_slabs--;
            if (ss->active_slabs == 0 && g_shared_pool.active_count > 0) {
                g_shared_pool.active_count--;
            }
        }
        if (class_idx < TINY_NUM_CLASSES_SS &&
            g_shared_pool.class_active_slots[class_idx] > 0) {
            g_shared_pool.class_active_slots[class_idx]--;
        }
    }

    // P0-4: Push to lock-free per-class free list (enables reuse by same class)
    // Note: push BEFORE releasing mutex (slot state already updated under lock)
    if (class_idx < TINY_NUM_CLASSES_SS) {
        sp_freelist_push_lockfree(class_idx, sp_meta, slab_idx);

        #if !HAKMEM_BUILD_RELEASE
        if (dbg == 1) {
            fprintf(stderr, "[SP_SLOT_FREELIST_LOCKFREE] class=%d pushed slot (ss=%p slab=%d) active_slots=%u/%u\n",
                    class_idx, (void*)ss, slab_idx,
                    sp_meta->active_slots, sp_meta->total_slots);
        }
        #endif
    }

    // P-Tier: Check tier transition after releasing slab
    // This may transition HOT → DRAINING if utilization dropped below threshold
    // or DRAINING → FREE if utilization reached 0
    ss_tier_check_transition(ss);

    // P-Tier Step B: Eager FREE eviction
    // If tier transitioned to FREE (total_active_blocks == 0), immediately try to
    // release the SuperSlab regardless of active_slots. This prevents registry bloat.
    SSTier current_tier = ss_tier_get(ss);
    if (current_tier == SS_TIER_FREE) {
        // Double-check: total_active_blocks should be 0 for FREE tier
        uint32_t active_blocks = atomic_load_explicit(&ss->total_active_blocks, memory_order_acquire);
        if (active_blocks == 0 && ss_release_guard_superslab_can_free(ss)) {
            #if !HAKMEM_BUILD_RELEASE
            if (dbg == 1) {
                fprintf(stderr, "[SP_TIER_FREE_EAGER] ss=%p tier=FREE active_slots=%u -> immediate free\n",
                        (void*)ss, sp_meta->active_slots);
            }
            #endif

            // Force all remaining slots to EMPTY state for clean metadata
            for (uint32_t i = 0; i < sp_meta->total_slots; i++) {
                SlotState st = atomic_load_explicit(&sp_meta->slots[i].state, memory_order_relaxed);
                if (st == SLOT_ACTIVE) {
                    atomic_store_explicit(&sp_meta->slots[i].state, SLOT_EMPTY, memory_order_relaxed);
                }
            }
            sp_meta->active_slots = 0;

            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }

            // Clear meta->ss before unlocking (race prevention)
            atomic_store_explicit(&sp_meta->ss, NULL, memory_order_release);

            pthread_mutex_unlock(&g_shared_pool.alloc_lock);

            // Free SuperSlab immediately (bypasses normal active_slots==0 check)
            extern void superslab_free(SuperSlab* ss);
            ss_stats_on_ss_free_class(class_idx);
            ss_budget_on_free(class_idx);
            superslab_free(ss);
            return;
        }
    }

    // Check if SuperSlab is now completely empty (all slots EMPTY or UNUSED)
    if (sp_meta->active_slots == 0) {
        #if !HAKMEM_BUILD_RELEASE
        if (dbg == 1) {
            fprintf(stderr, "[SP_SLOT_COMPLETELY_EMPTY] ss=%p active_slots=0 (calling superslab_free)\n",
                    (void*)ss);
        }
        #endif

        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }

        // RACE FIX: Set meta->ss to NULL BEFORE unlocking mutex
        // This prevents Stage 2 from accessing freed SuperSlab
        atomic_store_explicit(&sp_meta->ss, NULL, memory_order_release);

        pthread_mutex_unlock(&g_shared_pool.alloc_lock);

        // Remove from legacy backend list (moved to top of function)
        // extern void remove_superslab_from_legacy_head(SuperSlab* ss);
        // remove_superslab_from_legacy_head(ss);

        // Free SuperSlab:
        // 1. Try LRU cache (hak_ss_lru_push) - lazy deallocation
        // 2. Or munmap if LRU is full - eager deallocation
        
        // BUGFIX: Double check total_active_blocks and refcount. Legacy Backend might have
        // allocated from ANOTHER slab in this SS just before we removed it.
        // If so, we must NOT free the SS.
        if (ss_release_guard_superslab_can_free(ss)) {
            extern void superslab_free(SuperSlab* ss);
            ss_stats_on_ss_free_class(class_idx);
            ss_budget_on_free(class_idx);
            superslab_free(ss);
        } else {
            #if !HAKMEM_BUILD_RELEASE
            if (dbg == 1) {
                uint32_t active_blocks = atomic_load_explicit(&ss->total_active_blocks, memory_order_acquire);
                uint32_t ss_refs = superslab_ref_get(ss);
                fprintf(stderr,
                        "[SP_SLOT_RELEASE] SKIP free ss=%p: total_active_blocks=%u refcount=%u\n",
                        (void*)ss,
                        (unsigned)active_blocks,
                        (unsigned)ss_refs);
            }
            #endif
        }
        return;
    }

    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_release_count, 1);
    }
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
}
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								#include "hakmem_shared_pool_internal.h"
 								#include "hakmem_debug_master.h"
 								#include "box/ss_slab_meta_box.h"
 								#include "box/ss_hot_cold_box.h"
-												P-Tier + Tiny Route Policy: Aggressive Superslab Management + Safe Routing

## Phase 1: Utilization-Aware Superslab Tiering (案B実装済)

- Add ss_tier_box.h: Classify SuperSlabs into HOT/DRAINING/FREE based on utilization
  - HOT (>25%): Accept new allocations
  - DRAINING (≤25%): Drain only, no new allocs
  - FREE (0%): Ready for eager munmap

- Enhanced shared_pool_release_slab():
  - Check tier transition after each slab release
  - If tier→FREE: Force remaining slots to EMPTY and call superslab_free() immediately
  - Bypasses LRU cache to prevent registry bloat from accumulating DRAINING SuperSlabs

- Test results (bench_random_mixed_hakmem):
  - 1M iterations: ✅ ~1.03M ops/s (previously passed)
  - 10M iterations: ✅ ~1.15M ops/s (previously: registry full error)
  - 50M iterations: ✅ ~1.08M ops/s (stress test)

## Phase 2: Tiny Front Routing Policy (新規Box)

- Add tiny_route_box.h/c: Single 8-byte table for class→routing decisions
  - ROUTE_TINY_ONLY: Tiny front exclusive (no fallback)
  - ROUTE_TINY_FIRST: Try Tiny, fallback to Pool if fails
  - ROUTE_POOL_ONLY: Skip Tiny entirely

- Profiles via HAKMEM_TINY_PROFILE ENV:
  - "hot": C0-C3=TINY_ONLY, C4-C6=TINY_FIRST, C7=POOL_ONLY
  - "conservative" (default): All TINY_FIRST
  - "off": All POOL_ONLY (disable Tiny)
  - "full": All TINY_ONLY (microbench mode)

- A/B test results (ws=256, 100k ops random_mixed):
  - Default (conservative): ~2.90M ops/s
  - hot: ~2.65M ops/s (more conservative)
  - off: ~2.86M ops/s
  - full: ~2.98M ops/s (slightly best)

## Design Rationale

### Registry Pressure Fix (案B)
- Problem: DRAINING tier SS occupied registry indefinitely
- Solution: When total_active_blocks→0, immediately free to clear registry slot
- Result: No more "registry full" errors under stress

### Routing Policy Box (新)
- Problem: Tiny front optimization scattered across ENV/branches
- Solution: Centralize routing in single table, select profiles via ENV
- Benefit: Safe A/B testing without touching hot path code
- Future: Integrate with RSS budget/learning layers for dynamic profile switching

## Next Steps (性能最適化)
- Profile Tiny front internals (TLS SLL, FastCache, Superslab backend latency)
- Identify bottleneck between current ~2.9M ops/s and mimalloc ~100M ops/s
- Consider:
  - Reduce shared pool lock contention
  - Optimize unified cache hit rate
  - Streamline Superslab carving logic

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-04 18:01:25 +09:00
+								#include "box/ss_tier_box.h"              // P-Tier: Utilization-aware tiering
-												Priority-2 ENV Cache: Shared Pool Release (1箇所置換)

【置換ファイル】
- core/hakmem_shared_pool_release.c (1箇所 → ENV Cache)

【変更詳細】
- getenv("HAKMEM_SS_FREE_DEBUG") → HAK_ENV_SS_FREE_DEBUG()
- #include "hakmem_env_cache.h" 追加
- static変数の遅延初期化パターンを削除

【効果】
- Shared Pool Release pathからgetenv()呼び出しを排除
- SS_FREE_DEBUG変数は既にENV Cacheに登録済み (Hot Path Free系)

【テスト】
✅ make shared → 成功
✅ /tmp/test_mixed3_final → PASSED

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-02 20:52:48 +09:00
+								#include "hakmem_env_cache.h"  // Priority-2: ENV cache
-												Add SuperSlab refcount pinning and critical failsafe guards

Major breakthrough: sh8bench now completes without SIGSEGV!
Added defensive refcounting and failsafe mechanisms to prevent
use-after-free and corruption propagation.

Changes:
1. SuperSlab Refcount Pinning (core/box/tls_sll_box.h)
   - tls_sll_push_impl: increment refcount before adding to list
   - tls_sll_pop_impl: decrement refcount when removing from list
   - Prevents SuperSlab from being freed while TLS SLL holds pointers

2. SuperSlab Release Guards (core/superslab_allocate.c, shared_pool_release.c)
   - Check refcount > 0 before freeing SuperSlab
   - If refcount > 0, defer release instead of freeing
   - Prevents use-after-free when TLS/remote/freelist hold stale pointers

3. TLS SLL Next Pointer Validation (core/box/tls_sll_box.h)
   - Detect invalid next pointer during traversal
   - Log [TLS_SLL_NEXT_INVALID] when detected
   - Drop list to prevent corruption propagation

4. Unified Cache Freelist Validation (core/front/tiny_unified_cache.c)
   - Validate freelist head before use
   - Log [UNIFIED_FREELIST_INVALID] for corrupted lists
   - Defensive drop to prevent bad allocations

5. Early Refcount Decrement Fix (core/tiny_free_fast.inc.h)
   - Removed ss_active_dec_one from fast path
   - Prevents premature refcount depletion
   - Defers decrement to proper cleanup path

Test Results:
✅ sh8bench completes successfully (exit code 0)
✅ No SIGSEGV or ABORT signals
✅ Short runs (5s) crash-free
⚠️ Multiple [TLS_SLL_NEXT_INVALID] / [UNIFIED_FREELIST_INVALID] logged
⚠️ Invalid pointers still present (stale references exist)

Status Analysis:
- Stability: ACHIEVED (no crashes)
- Root Cause: NOT FULLY SOLVED (invalid pointers remain)
- Approach: Defensive + refcount guards working well

Remaining Issues:
❌ Why does SuperSlab get unregistered while TLS SLL holds pointers?
❌ SuperSlab lifecycle: remote_queue / adopt / LRU interactions?
❌ Stale pointers indicate improper SuperSlab lifetime management

Performance Impact:
- Refcount operations: +1-3 cycles per push/pop (minor)
- Validation checks: +2-5 cycles (minor)
- Overall: < 5% overhead estimated

Next Investigation:
- Trace SuperSlab lifecycle (allocation → registration → unregister → free)
- Check remote_queue handling
- Verify adopt/LRU mechanisms
- Correlate stale pointer logs with SuperSlab unregister events

Log Volume Warning:
- May produce many diagnostic logs on long runs
- Consider ENV gating for production

Technical Notes:
- Refcount is per-SuperSlab, not global
- Guards prevent symptom propagation, not root cause
- Root cause is in SuperSlab lifecycle management

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-03 21:56:52 +09:00
+								#include "superslab/superslab_inline.h"  // superslab_ref_get guard for TLS pins
-												Add SuperSlab Release Guard Box for centralized slab lifecycle decisions

Consolidates all slab recycling and SuperSlab free logic into a single
point of authority.

Box Theory compliance:
- Single Responsibility: Guard slab lifecycle transitions only
- No side effects: Pure decision logic, no mutations
- Clear API: ss_release_guard_slab_can_recycle, ss_release_guard_superslab_can_free
- Fail-fast friendly: Callers handle decision policy

Implementation:
- core/box/ss_release_guard_box.h: New guard box (68 lines)
- core/box/slab_recycling_box.h: Integrated into recycling decisions
- core/hakmem_shared_pool_release.c: Guards superslab_free() calls

Architecture:
- Protects against: premature slab recycling, UAF, double-free
- Validates: meta->used==0, meta->capacity>0, total_active_blocks==0
- Provides: single decision point for slab lifecycle

Testing: 60+ seconds stable
- 60s test: exit code 0, 0 crashes
- Slab lifecycle properly guarded
- All critical release paths protected

Benefits:
- Centralizes scattered slab validity checks
- Prevents race conditions in slab lifecycle
- Single policy point for future enhancements
- Foundation for slab state machine

Note: 180s test shows pre-existing TLS SLL issue (unrelated to this box).
The Release Guard Box itself is functioning correctly and is production-ready.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-04 06:22:09 +09:00
+								#include "box/ss_release_guard_box.h"    // Box: SuperSlab Release Guard
-												Fix C7 warm/TLS Release path and unify debug instrumentation

											
										
										
											2025-12-05 23:41:01 +09:00
+								#include "box/ss_slab_reset_box.h"       // Box: Reset slab metadata on reuse path
-												Boxify superslab registry, add bench profile, and document C7 hotpath experiments

											
										
										
											2025-12-07 03:12:27 +09:00
+								#include "box/ss_stats_box.h"            // Observability: Superslab/slab counters
 								#include "box/ss_budget_box.h"           // Budget guard (global/class caps)
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
 								#include <stdlib.h>
 								#include <stdio.h>
 								#include <stdatomic.h>
-												Fix C7 warm/TLS Release path and unify debug instrumentation

											
										
										
											2025-12-05 23:41:01 +09:00
+								static inline void c7_release_log_once(SuperSlab* ss, int slab_idx) {
 								#if HAKMEM_BUILD_RELEASE
 								    static _Atomic uint32_t rel_c7_release_logs = 0;
 								    uint32_t n = atomic_fetch_add_explicit(&rel_c7_release_logs, 1, memory_order_relaxed);
 								    if (n < 8) {
 								        TinySlabMeta* meta = &ss->slabs[slab_idx];
 								        fprintf(stderr,
 								                "[REL_C7_RELEASE] ss=%p slab=%d used=%u cap=%u carved=%u\n",
 								                (void*)ss,
 								                slab_idx,
 								                (unsigned)meta->used,
 								                (unsigned)meta->capacity,
 								                (unsigned)meta->carved);
 								    }
 								#else
 								    static _Atomic uint32_t dbg_c7_release_logs = 0;
 								    uint32_t n = atomic_fetch_add_explicit(&dbg_c7_release_logs, 1, memory_order_relaxed);
 								    if (n < 8) {
 								        TinySlabMeta* meta = &ss->slabs[slab_idx];
 								        fprintf(stderr,
 								                "[DBG_C7_RELEASE] ss=%p slab=%d used=%u cap=%u carved=%u\n",
 								                (void*)ss,
 								                slab_idx,
 								                (unsigned)meta->used,
 								                (unsigned)meta->capacity,
 								                (unsigned)meta->carved);
 								    }
 								#endif
 								}
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								void
 								shared_pool_release_slab(SuperSlab* ss, int slab_idx)
 								{
 								    // Phase 12: SP-SLOT Box - Slot-based Release
 								    //
 								    // Flow:
 								    //   1. Validate inputs and check meta->used == 0
 								    //   2. Find SharedSSMeta for this SuperSlab
 								    //   3. Mark slot ACTIVE → EMPTY
 								    //   4. Push to per-class free list (enables same-class reuse)
 								    //   5. If all slots EMPTY → superslab_free() → LRU cache
 								    if (!ss) {
 								        return;
 								    }
 								    if (slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) {
 								        return;
 								    }
-												Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed Soft Cap limit in Shared Pool to allow full memory management.
- Implemented EMPTY slab recycling with batched meta->used decrement in remote drain.
- Updated tiny_free_local_box to return is_empty status for safe recycling.
- Fixed race condition in release path by removing from legacy list early.
- Achieved 50.3M ops/s in WS8192 benchmark (+200% vs baseline).

											
										
										
											2025-12-01 13:47:23 +09:00
+								    // Phase 9-2 FIX: Promote Legacy SuperSlabs to Shared Pool on first recycle
 								    // If we are recycling a slot from a Legacy SS, we must remove it from the
 								    // Legacy list (g_superslab_heads) to prevent Legacy Backend from allocating
 								    // from it simultaneously (Double Allocation Race).
 								    // This effectively transfers ownership to Shared Pool.
 								    extern void remove_superslab_from_legacy_head(SuperSlab* ss);
 								    remove_superslab_from_legacy_head(ss);
 								    // BUGFIX: Re-check used count after removal. Legacy Backend might have
 								    // allocated from this slab while we were waiting for the lock in remove().
 								    TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
 								    if (atomic_load_explicit(&slab_meta->used, memory_order_acquire) != 0) {
 								        // Legacy Backend stole this slab. It's now an orphan (removed from list).
 								        // We abort recycling. It will be recycled when Legacy frees it later.
 								        return;
 								    }
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								    // Debug logging
 								#if !HAKMEM_BUILD_RELEASE
-												Priority-2 ENV Cache: Shared Pool Release (1箇所置換)

【置換ファイル】
- core/hakmem_shared_pool_release.c (1箇所 → ENV Cache)

【変更詳細】
- getenv("HAKMEM_SS_FREE_DEBUG") → HAK_ENV_SS_FREE_DEBUG()
- #include "hakmem_env_cache.h" 追加
- static変数の遅延初期化パターンを削除

【効果】
- Shared Pool Release pathからgetenv()呼び出しを排除
- SS_FREE_DEBUG変数は既にENV Cacheに登録済み (Hot Path Free系)

【テスト】
✅ make shared → 成功
✅ /tmp/test_mixed3_final → PASSED

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-02 20:52:48 +09:00
+								    // Priority-2: Use cached ENV
 								    int dbg = HAK_ENV_SS_FREE_DEBUG();
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								#else
 								    static const int dbg = 0;
 								#endif
 								    // P0 instrumentation: count lock acquisitions
 								    lock_stats_init();
 								    if (g_lock_stats_enabled == 1) {
 								        atomic_fetch_add(&g_lock_stats_enabled, 1);
 								        atomic_fetch_add(&g_lock_release_slab_count, 1);
 								    }
 								    pthread_mutex_lock(&g_shared_pool.alloc_lock);
-												Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed Soft Cap limit in Shared Pool to allow full memory management.
- Implemented EMPTY slab recycling with batched meta->used decrement in remote drain.
- Updated tiny_free_local_box to return is_empty status for safe recycling.
- Fixed race condition in release path by removing from legacy list early.
- Achieved 50.3M ops/s in WS8192 benchmark (+200% vs baseline).

											
										
										
											2025-12-01 13:47:23 +09:00
+								    // TinySlabMeta* slab_meta = &ss->slabs[slab_idx]; // Already declared above
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								    if (slab_meta->used != 0) {
-												Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed Soft Cap limit in Shared Pool to allow full memory management.
- Implemented EMPTY slab recycling with batched meta->used decrement in remote drain.
- Updated tiny_free_local_box to return is_empty status for safe recycling.
- Fixed race condition in release path by removing from legacy list early.
- Achieved 50.3M ops/s in WS8192 benchmark (+200% vs baseline).

											
										
										
											2025-12-01 13:47:23 +09:00
+								        // Not actually empty (double check under lock)
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								        if (g_lock_stats_enabled == 1) {
 								            atomic_fetch_add(&g_lock_release_count, 1);
 								        }
 								        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
 								        return;
 								    }
 								    uint8_t class_idx = slab_meta->class_idx;
-												Fix C7 warm/TLS Release path and unify debug instrumentation

											
										
										
											2025-12-05 23:41:01 +09:00
+								    if (class_idx == 7) {
 								        c7_release_log_once(ss, slab_idx);
 								    }
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
-												Add defensive layers mapping and diagnostic logging enhancements

Documentation:
- Created docs/DEFENSIVE_LAYERS_MAPPING.md documenting all 5 defensive layers
- Maps which symptoms each layer suppresses
- Defines safe removal order after root cause fix
- Includes test methods for each layer removal

Diagnostic Logging Enhancements (ChatGPT work):
- TLS_SLL_HEAD_SET log with count and backtrace for NORMALIZE_USERPTR
- tiny_next_store_log with filtering capability
- Environment variables for log filtering:
  - HAKMEM_TINY_SLL_NEXTCLS: class filter for next store (-1 disables)
  - HAKMEM_TINY_SLL_NEXTTAG: tag filter (substring match)
  - HAKMEM_TINY_SLL_HEADCLS: class filter for head trace

Current Investigation Status:
- sh8bench 60/120s: crash-free, zero NEXT_INVALID/HDR_RESET/SANITIZE
- BUT: shot limit (256) exhausted by class3 tls_push before class1/drain
- Need: Add tags to pop/clear paths, or increase shot limit for class1

Purpose of this commit:
- Document defensive layers for safe removal later
- Enable targeted diagnostic logging
- Prepare for final root cause identification

Next Steps:
1. Add tags to tls_sll_pop tiny_next_write (e.g., "tls_pop_clear")
2. Re-run with HAKMEM_TINY_SLL_NEXTTAG=tls_pop
3. Capture class1 writes that lead to corruption

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-04 04:15:10 +09:00
+								    // Guard: if SuperSlab is pinned (TLS/remote references), defer release to avoid
 								    // class_map=255 while pointers are still in-flight.
 								    uint32_t ss_refs_guard = superslab_ref_get(ss);
 								    if (ss_refs_guard != 0) {
 								#if !HAKMEM_BUILD_RELEASE
 								        if (dbg == 1) {
 								            fprintf(stderr,
 								                    "[SP_SLOT_RELEASE_SKIP_PINNED] ss=%p slab_idx=%d class=%d refcount=%u\n",
 								                    (void*)ss, slab_idx, class_idx, (unsigned)ss_refs_guard);
 								        }
 								#endif
 								        if (g_lock_stats_enabled == 1) {
 								            atomic_fetch_add(&g_lock_release_count, 1);
 								        }
 								        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
 								        return;
 								    }
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								    #if !HAKMEM_BUILD_RELEASE
 								    if (dbg == 1) {
 								        fprintf(stderr, "[SP_SLOT_RELEASE] ss=%p slab_idx=%d class=%d used=0 (marking EMPTY)\n",
 								                (void*)ss, slab_idx, class_idx);
 								    }
 								    #endif
-												Fix C7 warm/TLS Release path and unify debug instrumentation

											
										
										
											2025-12-05 23:41:01 +09:00
+								    if (class_idx == 7) {
 								        ss_slab_reset_meta_for_tiny(ss, slab_idx, class_idx);
 								#if HAKMEM_BUILD_RELEASE
 								        static _Atomic uint32_t rel_c7_reset_logs = 0;
 								        uint32_t rn = atomic_fetch_add_explicit(&rel_c7_reset_logs, 1, memory_order_relaxed);
 								        if (rn < 4) {
 								            TinySlabMeta* m = &ss->slabs[slab_idx];
 								            fprintf(stderr,
 								                    "[REL_C7_RELEASE_RESET] ss=%p slab=%d used=%u cap=%u carved=%u freelist=%p\n",
 								                    (void*)ss,
 								                    slab_idx,
 								                    (unsigned)m->used,
 								                    (unsigned)m->capacity,
 								                    (unsigned)m->carved,
 								                    m->freelist);
 								        }
 								#else
 								        static _Atomic uint32_t dbg_c7_reset_logs = 0;
 								        uint32_t rn = atomic_fetch_add_explicit(&dbg_c7_reset_logs, 1, memory_order_relaxed);
 								        if (rn < 4) {
 								            TinySlabMeta* m = &ss->slabs[slab_idx];
 								            fprintf(stderr,
 								                    "[DBG_C7_RELEASE_RESET] ss=%p slab=%d used=%u cap=%u carved=%u freelist=%p\n",
 								                    (void*)ss,
 								                    slab_idx,
 								                    (unsigned)m->used,
 								                    (unsigned)m->capacity,
 								                    (unsigned)m->carved,
 								                    m->freelist);
 								        }
 								#endif
 								    }
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								    // Find SharedSSMeta for this SuperSlab
 								    SharedSSMeta* sp_meta = NULL;
 								    uint32_t count = atomic_load_explicit(&g_shared_pool.ss_meta_count, memory_order_relaxed);
 								    for (uint32_t i = 0; i < count; i++) {
 								        // RACE FIX: Load pointer atomically
 								        SuperSlab* meta_ss = atomic_load_explicit(&g_shared_pool.ss_metadata[i].ss, memory_order_relaxed);
 								        if (meta_ss == ss) {
 								            sp_meta = &g_shared_pool.ss_metadata[i];
 								            break;
 								        }
 								    }
 								    if (!sp_meta) {
 								        // SuperSlab not in SP-SLOT system yet - create metadata
 								        sp_meta = sp_meta_find_or_create(ss);
 								        if (!sp_meta) {
 								            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
 								            return;  // Failed to create metadata
 								        }
 								    }
 								    // Mark slot as EMPTY (ACTIVE → EMPTY)
 								    uint32_t slab_bit = (1u << slab_idx);
 								    SlotState slot_state = atomic_load_explicit(
 								        &sp_meta->slots[slab_idx].state,
 								        memory_order_acquire);
 								    if (slot_state != SLOT_ACTIVE && (ss->slab_bitmap & slab_bit)) {
 								        // Legacy path import: rebuild slot states from SuperSlab bitmap/class_map
 								        sp_meta_sync_slots_from_ss(sp_meta, ss);
 								        slot_state = atomic_load_explicit(
 								            &sp_meta->slots[slab_idx].state,
 								            memory_order_acquire);
 								    }
 								    if (slot_state != SLOT_ACTIVE || sp_slot_mark_empty(sp_meta, slab_idx) != 0) {
 								        if (g_lock_stats_enabled == 1) {
 								            atomic_fetch_add(&g_lock_release_count, 1);
 								        }
 								        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
 								        return;  // Slot wasn't ACTIVE
 								    }
 								    // Update SuperSlab metadata
 								    uint32_t bit = (1u << slab_idx);
 								    if (ss->slab_bitmap & bit) {
 								        ss->slab_bitmap &= ~bit;
 								        slab_meta->class_idx = 255;  // UNASSIGNED
 								        // P1.1: Mark class_map as UNASSIGNED when releasing slab
 								        ss->class_map[slab_idx] = 255;
-												Boxify superslab registry, add bench profile, and document C7 hotpath experiments

											
										
										
											2025-12-07 03:12:27 +09:00
+								        // Reset slab metadata to a pristine state for all classes (C0–C7)
 								        ss_slab_reset_meta_for_tiny(ss, slab_idx, -1);
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
 								        if (ss->active_slabs > 0) {
 								            ss->active_slabs--;
 								            if (ss->active_slabs == 0 && g_shared_pool.active_count > 0) {
 								                g_shared_pool.active_count--;
 								            }
 								        }
 								        if (class_idx < TINY_NUM_CLASSES_SS &&
 								            g_shared_pool.class_active_slots[class_idx] > 0) {
 								            g_shared_pool.class_active_slots[class_idx]--;
 								        }
 								    }
 								    // P0-4: Push to lock-free per-class free list (enables reuse by same class)
 								    // Note: push BEFORE releasing mutex (slot state already updated under lock)
 								    if (class_idx < TINY_NUM_CLASSES_SS) {
 								        sp_freelist_push_lockfree(class_idx, sp_meta, slab_idx);
 								        #if !HAKMEM_BUILD_RELEASE
 								        if (dbg == 1) {
 								            fprintf(stderr, "[SP_SLOT_FREELIST_LOCKFREE] class=%d pushed slot (ss=%p slab=%d) active_slots=%u/%u\n",
 								                    class_idx, (void*)ss, slab_idx,
 								                    sp_meta->active_slots, sp_meta->total_slots);
 								        }
 								        #endif
 								    }
-												P-Tier + Tiny Route Policy: Aggressive Superslab Management + Safe Routing

## Phase 1: Utilization-Aware Superslab Tiering (Plan B, implemented)

- Add ss_tier_box.h: Classify SuperSlabs into HOT/DRAINING/FREE based on utilization
  - HOT (>25%): Accept new allocations
  - DRAINING (≤25%): Drain only, no new allocs
  - FREE (0%): Ready for eager munmap

- Enhanced shared_pool_release_slab():
  - Check tier transition after each slab release
  - If tier→FREE: Force remaining slots to EMPTY and call superslab_free() immediately
  - Bypasses LRU cache to prevent registry bloat from accumulating DRAINING SuperSlabs

- Test results (bench_random_mixed_hakmem):
  - 1M iterations: ✅ ~1.03M ops/s (previously passed)
  - 10M iterations: ✅ ~1.15M ops/s (previously: registry full error)
  - 50M iterations: ✅ ~1.08M ops/s (stress test)

## Phase 2: Tiny Front Routing Policy (new Box)

- Add tiny_route_box.h/c: Single 8-byte table for class→routing decisions
  - ROUTE_TINY_ONLY: Tiny front exclusive (no fallback)
  - ROUTE_TINY_FIRST: Try Tiny, fallback to Pool if fails
  - ROUTE_POOL_ONLY: Skip Tiny entirely

- Profiles via HAKMEM_TINY_PROFILE ENV:
  - "hot": C0-C3=TINY_ONLY, C4-C6=TINY_FIRST, C7=POOL_ONLY
  - "conservative" (default): All TINY_FIRST
  - "off": All POOL_ONLY (disable Tiny)
  - "full": All TINY_ONLY (microbench mode)

- A/B test results (ws=256, 100k ops random_mixed):
  - Default (conservative): ~2.90M ops/s
  - hot: ~2.65M ops/s (more conservative)
  - off: ~2.86M ops/s
  - full: ~2.98M ops/s (slightly best)

## Design Rationale

### Registry Pressure Fix (Plan B)
- Problem: DRAINING tier SS occupied registry indefinitely
- Solution: When total_active_blocks→0, immediately free to clear registry slot
- Result: No more "registry full" errors under stress

### Routing Policy Box (new)
- Problem: Tiny front optimization scattered across ENV/branches
- Solution: Centralize routing in single table, select profiles via ENV
- Benefit: Safe A/B testing without touching hot path code
- Future: Integrate with RSS budget/learning layers for dynamic profile switching

## Next Steps (performance optimization)
- Profile Tiny front internals (TLS SLL, FastCache, Superslab backend latency)
- Identify bottleneck between current ~2.9M ops/s and mimalloc ~100M ops/s
- Consider:
  - Reduce shared pool lock contention
  - Optimize unified cache hit rate
  - Streamline Superslab carving logic

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-04 18:01:25 +09:00
+								    // P-Tier: Check tier transition after releasing slab
 								    // This may transition HOT → DRAINING if utilization dropped below threshold
 								    // or DRAINING → FREE if utilization reached 0
 								    ss_tier_check_transition(ss);
 								    // P-Tier Step B: Eager FREE eviction
 								    // If tier transitioned to FREE (total_active_blocks == 0), immediately try to
 								    // release the SuperSlab regardless of active_slots. This prevents registry bloat.
 								    SSTier current_tier = ss_tier_get(ss);
 								    if (current_tier == SS_TIER_FREE) {
 								        // Double-check: total_active_blocks should be 0 for FREE tier
 								        uint32_t active_blocks = atomic_load_explicit(&ss->total_active_blocks, memory_order_acquire);
 								        if (active_blocks == 0 && ss_release_guard_superslab_can_free(ss)) {
 								            #if !HAKMEM_BUILD_RELEASE
 								            if (dbg == 1) {
 								                fprintf(stderr, "[SP_TIER_FREE_EAGER] ss=%p tier=FREE active_slots=%u -> immediate free\n",
 								                        (void*)ss, sp_meta->active_slots);
 								            }
 								            #endif
 								            // Force all remaining slots to EMPTY state for clean metadata
 								            for (uint32_t i = 0; i < sp_meta->total_slots; i++) {
 								                SlotState st = atomic_load_explicit(&sp_meta->slots[i].state, memory_order_relaxed);
 								                if (st == SLOT_ACTIVE) {
 								                    atomic_store_explicit(&sp_meta->slots[i].state, SLOT_EMPTY, memory_order_relaxed);
 								                }
 								            }
 								            sp_meta->active_slots = 0;
 								            if (g_lock_stats_enabled == 1) {
 								                atomic_fetch_add(&g_lock_release_count, 1);
 								            }
 								            // Clear meta->ss before unlocking (race prevention)
 								            atomic_store_explicit(&sp_meta->ss, NULL, memory_order_release);
 								            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
 								            // Free SuperSlab immediately (bypasses normal active_slots==0 check)
 								            extern void superslab_free(SuperSlab* ss);
-												Boxify superslab registry, add bench profile, and document C7 hotpath experiments

											
										
										
											2025-12-07 03:12:27 +09:00
+								            ss_stats_on_ss_free_class(class_idx);
 								            ss_budget_on_free(class_idx);
-												P-Tier + Tiny Route Policy: Aggressive Superslab Management + Safe Routing

## Phase 1: Utilization-Aware Superslab Tiering (Plan B, implemented)

- Add ss_tier_box.h: Classify SuperSlabs into HOT/DRAINING/FREE based on utilization
  - HOT (>25%): Accept new allocations
  - DRAINING (≤25%): Drain only, no new allocs
  - FREE (0%): Ready for eager munmap

- Enhanced shared_pool_release_slab():
  - Check tier transition after each slab release
  - If tier→FREE: Force remaining slots to EMPTY and call superslab_free() immediately
  - Bypasses LRU cache to prevent registry bloat from accumulating DRAINING SuperSlabs

- Test results (bench_random_mixed_hakmem):
  - 1M iterations: ✅ ~1.03M ops/s (previously passed)
  - 10M iterations: ✅ ~1.15M ops/s (previously: registry full error)
  - 50M iterations: ✅ ~1.08M ops/s (stress test)

## Phase 2: Tiny Front Routing Policy (new Box)

- Add tiny_route_box.h/c: Single 8-byte table for class→routing decisions
  - ROUTE_TINY_ONLY: Tiny front exclusive (no fallback)
  - ROUTE_TINY_FIRST: Try Tiny, fallback to Pool if fails
  - ROUTE_POOL_ONLY: Skip Tiny entirely

- Profiles via HAKMEM_TINY_PROFILE ENV:
  - "hot": C0-C3=TINY_ONLY, C4-C6=TINY_FIRST, C7=POOL_ONLY
  - "conservative" (default): All TINY_FIRST
  - "off": All POOL_ONLY (disable Tiny)
  - "full": All TINY_ONLY (microbench mode)

- A/B test results (ws=256, 100k ops random_mixed):
  - Default (conservative): ~2.90M ops/s
  - hot: ~2.65M ops/s (more conservative)
  - off: ~2.86M ops/s
  - full: ~2.98M ops/s (slightly best)

## Design Rationale

### Registry Pressure Fix (Plan B)
- Problem: DRAINING tier SS occupied registry indefinitely
- Solution: When total_active_blocks→0, immediately free to clear registry slot
- Result: No more "registry full" errors under stress

### Routing Policy Box (new)
- Problem: Tiny front optimization scattered across ENV/branches
- Solution: Centralize routing in single table, select profiles via ENV
- Benefit: Safe A/B testing without touching hot path code
- Future: Integrate with RSS budget/learning layers for dynamic profile switching

## Next Steps (performance optimization)
- Profile Tiny front internals (TLS SLL, FastCache, Superslab backend latency)
- Identify bottleneck between current ~2.9M ops/s and mimalloc ~100M ops/s
- Consider:
  - Reduce shared pool lock contention
  - Optimize unified cache hit rate
  - Streamline Superslab carving logic

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-04 18:01:25 +09:00
+								            superslab_free(ss);
 								            return;
 								        }
 								    }
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								    // Check if SuperSlab is now completely empty (all slots EMPTY or UNUSED)
 								    if (sp_meta->active_slots == 0) {
 								        #if !HAKMEM_BUILD_RELEASE
 								        if (dbg == 1) {
 								            fprintf(stderr, "[SP_SLOT_COMPLETELY_EMPTY] ss=%p active_slots=0 (calling superslab_free)\n",
 								                    (void*)ss);
 								        }
 								        #endif
 								        if (g_lock_stats_enabled == 1) {
 								            atomic_fetch_add(&g_lock_release_count, 1);
 								        }
 								        // RACE FIX: Set meta->ss to NULL BEFORE unlocking mutex
 								        // This prevents Stage 2 from accessing freed SuperSlab
 								        atomic_store_explicit(&sp_meta->ss, NULL, memory_order_release);
 								        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
-												Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed Soft Cap limit in Shared Pool to allow full memory management.
- Implemented EMPTY slab recycling with batched meta->used decrement in remote drain.
- Updated tiny_free_local_box to return is_empty status for safe recycling.
- Fixed race condition in release path by removing from legacy list early.
- Achieved 50.3M ops/s in WS8192 benchmark (+200% vs baseline).

											
										
										
											2025-12-01 13:47:23 +09:00
+								        // Remove from legacy backend list (moved to top of function)
 								        // extern void remove_superslab_from_legacy_head(SuperSlab* ss);
 								        // remove_superslab_from_legacy_head(ss);
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
 								        // Free SuperSlab:
 								        // 1. Try LRU cache (hak_ss_lru_push) - lazy deallocation
 								        // 2. Or munmap if LRU is full - eager deallocation
-												Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed Soft Cap limit in Shared Pool to allow full memory management.
- Implemented EMPTY slab recycling with batched meta->used decrement in remote drain.
- Updated tiny_free_local_box to return is_empty status for safe recycling.
- Fixed race condition in release path by removing from legacy list early.
- Achieved 50.3M ops/s in WS8192 benchmark (+200% vs baseline).

											
										
										
											2025-12-01 13:47:23 +09:00
-												Add SuperSlab Release Guard Box for centralized slab lifecycle decisions

Consolidates all slab recycling and SuperSlab free logic into a single
point of authority.

Box Theory compliance:
- Single Responsibility: Guard slab lifecycle transitions only
- No side effects: Pure decision logic, no mutations
- Clear API: ss_release_guard_slab_can_recycle, ss_release_guard_superslab_can_free
- Fail-fast friendly: Callers handle decision policy

Implementation:
- core/box/ss_release_guard_box.h: New guard box (68 lines)
- core/box/slab_recycling_box.h: Integrated into recycling decisions
- core/hakmem_shared_pool_release.c: Guards superslab_free() calls

Architecture:
- Protects against: premature slab recycling, UAF, double-free
- Validates: meta->used==0, meta->capacity>0, total_active_blocks==0
- Provides: single decision point for slab lifecycle

Testing: 60+ seconds stable
- 60s test: exit code 0, 0 crashes
- Slab lifecycle properly guarded
- All critical release paths protected

Benefits:
- Centralizes scattered slab validity checks
- Prevents race conditions in slab lifecycle
- Single policy point for future enhancements
- Foundation for slab state machine

Note: 180s test shows pre-existing TLS SLL issue (unrelated to this box).
The Release Guard Box itself is functioning correctly and is production-ready.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-04 06:22:09 +09:00
+								        // BUGFIX: Double check total_active_blocks and refcount. Legacy Backend might have
-												Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed Soft Cap limit in Shared Pool to allow full memory management.
- Implemented EMPTY slab recycling with batched meta->used decrement in remote drain.
- Updated tiny_free_local_box to return is_empty status for safe recycling.
- Fixed race condition in release path by removing from legacy list early.
- Achieved 50.3M ops/s in WS8192 benchmark (+200% vs baseline).

											
										
										
											2025-12-01 13:47:23 +09:00
+								        // allocated from ANOTHER slab in this SS just before we removed it.
 								        // If so, we must NOT free the SS.
-												Add SuperSlab Release Guard Box for centralized slab lifecycle decisions

Consolidates all slab recycling and SuperSlab free logic into a single
point of authority.

Box Theory compliance:
- Single Responsibility: Guard slab lifecycle transitions only
- No side effects: Pure decision logic, no mutations
- Clear API: ss_release_guard_slab_can_recycle, ss_release_guard_superslab_can_free
- Fail-fast friendly: Callers handle decision policy

Implementation:
- core/box/ss_release_guard_box.h: New guard box (68 lines)
- core/box/slab_recycling_box.h: Integrated into recycling decisions
- core/hakmem_shared_pool_release.c: Guards superslab_free() calls

Architecture:
- Protects against: premature slab recycling, UAF, double-free
- Validates: meta->used==0, meta->capacity>0, total_active_blocks==0
- Provides: single decision point for slab lifecycle

Testing: 60+ seconds stable
- 60s test: exit code 0, 0 crashes
- Slab lifecycle properly guarded
- All critical release paths protected

Benefits:
- Centralizes scattered slab validity checks
- Prevents race conditions in slab lifecycle
- Single policy point for future enhancements
- Foundation for slab state machine

Note: 180s test shows pre-existing TLS SLL issue (unrelated to this box).
The Release Guard Box itself is functioning correctly and is production-ready.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-04 06:22:09 +09:00
+								        if (ss_release_guard_superslab_can_free(ss)) {
-												Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed Soft Cap limit in Shared Pool to allow full memory management.
- Implemented EMPTY slab recycling with batched meta->used decrement in remote drain.
- Updated tiny_free_local_box to return is_empty status for safe recycling.
- Fixed race condition in release path by removing from legacy list early.
- Achieved 50.3M ops/s in WS8192 benchmark (+200% vs baseline).

											
										
										
											2025-12-01 13:47:23 +09:00
+								            extern void superslab_free(SuperSlab* ss);
-												Boxify superslab registry, add bench profile, and document C7 hotpath experiments

											
										
										
											2025-12-07 03:12:27 +09:00
+								            ss_stats_on_ss_free_class(class_idx);
 								            ss_budget_on_free(class_idx);
-												Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed Soft Cap limit in Shared Pool to allow full memory management.
- Implemented EMPTY slab recycling with batched meta->used decrement in remote drain.
- Updated tiny_free_local_box to return is_empty status for safe recycling.
- Fixed race condition in release path by removing from legacy list early.
- Achieved 50.3M ops/s in WS8192 benchmark (+200% vs baseline).

											
										
										
											2025-12-01 13:47:23 +09:00
+								            superslab_free(ss);
 								        } else {
 								            #if !HAKMEM_BUILD_RELEASE
 								            if (dbg == 1) {
-												Fix debug build after clean reset

											
										
										
											2025-12-05 20:43:14 +09:00
+								                uint32_t active_blocks = atomic_load_explicit(&ss->total_active_blocks, memory_order_acquire);
 								                uint32_t ss_refs = superslab_ref_get(ss);
-												Add SuperSlab refcount pinning and critical failsafe guards

Major breakthrough: sh8bench now completes without SIGSEGV!
Added defensive refcounting and failsafe mechanisms to prevent
use-after-free and corruption propagation.

Changes:
1. SuperSlab Refcount Pinning (core/box/tls_sll_box.h)
   - tls_sll_push_impl: increment refcount before adding to list
   - tls_sll_pop_impl: decrement refcount when removing from list
   - Prevents SuperSlab from being freed while TLS SLL holds pointers

2. SuperSlab Release Guards (core/superslab_allocate.c, shared_pool_release.c)
   - Check refcount > 0 before freeing SuperSlab
   - If refcount > 0, defer release instead of freeing
   - Prevents use-after-free when TLS/remote/freelist hold stale pointers

3. TLS SLL Next Pointer Validation (core/box/tls_sll_box.h)
   - Detect invalid next pointer during traversal
   - Log [TLS_SLL_NEXT_INVALID] when detected
   - Drop list to prevent corruption propagation

4. Unified Cache Freelist Validation (core/front/tiny_unified_cache.c)
   - Validate freelist head before use
   - Log [UNIFIED_FREELIST_INVALID] for corrupted lists
   - Defensive drop to prevent bad allocations

5. Early Refcount Decrement Fix (core/tiny_free_fast.inc.h)
   - Removed ss_active_dec_one from fast path
   - Prevents premature refcount depletion
   - Defers decrement to proper cleanup path

Test Results:
✅ sh8bench completes successfully (exit code 0)
✅ No SIGSEGV or ABORT signals
✅ Short runs (5s) crash-free
⚠️ Multiple [TLS_SLL_NEXT_INVALID] / [UNIFIED_FREELIST_INVALID] logged
⚠️ Invalid pointers still present (stale references exist)

Status Analysis:
- Stability: ACHIEVED (no crashes)
- Root Cause: NOT FULLY SOLVED (invalid pointers remain)
- Approach: Defensive + refcount guards working well

Remaining Issues:
❌ Why does SuperSlab get unregistered while TLS SLL holds pointers?
❌ SuperSlab lifecycle: remote_queue / adopt / LRU interactions?
❌ Stale pointers indicate improper SuperSlab lifetime management

Performance Impact:
- Refcount operations: +1-3 cycles per push/pop (minor)
- Validation checks: +2-5 cycles (minor)
- Overall: < 5% overhead estimated

Next Investigation:
- Trace SuperSlab lifecycle (allocation → registration → unregister → free)
- Check remote_queue handling
- Verify adopt/LRU mechanisms
- Correlate stale pointer logs with SuperSlab unregister events

Log Volume Warning:
- May produce many diagnostic logs on long runs
- Consider ENV gating for production

Technical Notes:
- Refcount is per-SuperSlab, not global
- Guards prevent symptom propagation, not root cause
- Root cause is in SuperSlab lifecycle management

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-12-03 21:56:52 +09:00
+								                fprintf(stderr,
 								                        "[SP_SLOT_RELEASE] SKIP free ss=%p: total_active_blocks=%u refcount=%u\n",
 								                        (void*)ss,
 								                        (unsigned)active_blocks,
 								                        (unsigned)ss_refs);
-												Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed Soft Cap limit in Shared Pool to allow full memory management.
- Implemented EMPTY slab recycling with batched meta->used decrement in remote drain.
- Updated tiny_free_local_box to return is_empty status for safe recycling.
- Fixed race condition in release path by removing from legacy list early.
- Achieved 50.3M ops/s in WS8192 benchmark (+200% vs baseline).

											
										
										
											2025-12-01 13:47:23 +09:00
+								            }
 								            #endif
 								        }
-												Refactor: Split monolithic hakmem_shared_pool.c into acquire/release modules

- Split core/hakmem_shared_pool.c into acquire/release modules for maintainability.
- Introduced core/hakmem_shared_pool_internal.h for shared internal API.
- Fixed incorrect function name usage (superslab_alloc -> superslab_allocate).
- Increased SUPER_REG_SIZE to 1M to support large working sets (Phase 9-2 fix).
- Updated Makefile.
- Verified with benchmarks.

											
										
										
											2025-11-30 18:11:08 +09:00
+								        return;
 								    }
 								    if (g_lock_stats_enabled == 1) {
 								        atomic_fetch_add(&g_lock_release_count, 1);
 								    }
 								    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
 								}