hakmem/core/box/ss_hot_cold_box.h

// ss_hot_cold_box.h - Phase 3d-C: Hot/Cold Slab Split Box
// Purpose: Cache locality optimization via hot/cold slab separation
// License: MIT
// Date: 2025-11-20

#ifndef SS_HOT_COLD_BOX_H
#define SS_HOT_COLD_BOX_H

#include "../superslab/superslab_types.h"
#include <stdbool.h>
#include <stdlib.h>  // P1.3: for getenv()
#include <stdio.h>   // P2.4: for fprintf() in debug output

// ============================================================================
// Phase 3d-C: Hot/Cold Split Box API
// ============================================================================
//
// Goal: Improve L1D cache hit rate by separating hot (high utilization) and
//       cold (low utilization) slabs within a SuperSlab.
//
// Strategy:
// - Hot slabs (used > 50%): Prioritized for allocation → better cache locality
// - Cold slabs (used ≤ 50%): Used as fallback → delayed deallocation
//
// Expected: +8-12% throughput from improved cache line locality
//
// Box Contract:
// - ss_is_slab_hot(): Returns true if slab should be considered "hot"
// - ss_update_hot_cold_indices(): Rebuilds hot/cold index arrays
// - ss_init_hot_cold(): Initializes hot/cold fields on SuperSlab creation
//
// ============================================================================

// Phase 3d-C: Hot/Cold classification threshold, in percent of slab capacity.
#define HOT_UTILIZATION_THRESHOLD 50  // ss_is_slab_hot() compares utilization against this with '>' (strictly above 50% is hot)

// Phase 12-1.1: EMPTY detection (EMPTY slabs get the highest reuse priority).
// P1.3: the counter that decides emptiness is selected by an ENV gate that is
// read once and cached:
//   HAKMEM_TINY_ACTIVE_TRACK set to a non-empty value not starting with '0'
//       -> EMPTY means meta->active == 0
//   otherwise
//       -> EMPTY means meta->used == 0 (legacy; does not account for TLS SLL)
// Returns: true if the slab is completely EMPTY; always false when
//          capacity == 0 (uninitialized slab).
static inline bool ss_is_slab_empty(const TinySlabMeta* meta) {
    // Uninitialized slabs are never reported as EMPTY.
    if (meta->capacity == 0) {
        return false;
    }

    // Tri-state cache of the ENV gate: -1 = not read yet, 0 = off, 1 = on.
    static int g_use_active = -1;
    if (__builtin_expect(g_use_active == -1, 0)) {
        const char* env = getenv("HAKMEM_TINY_ACTIVE_TRACK");
        int enabled = 0;
        if (env && *env && *env != '0') {
            enabled = 1;
        }
        g_use_active = enabled;
    }

    if (!g_use_active) {
        // Legacy mode: used == 0 (doesn't account for TLS SLL).
        return meta->used == 0;
    }

    // P1.3: active == 0 means all blocks were returned by the user, even if
    // some of them are still sitting in a TLS SLL.
    const uint16_t held_by_user =
        atomic_load_explicit(&meta->active, memory_order_relaxed);
    return held_by_user == 0;
}

// Phase 3d-C: Hot classification.
// Heuristic: a slab whose utilization (used / capacity) is strictly above
// HOT_UTILIZATION_THRESHOLD percent is "hot". Rationale (from the original
// design note): high utilization = frequently accessed = worth keeping cached.
// Returns: true if the slab is hot (should be prioritized); false if it is
//          cold or uninitialized (capacity == 0).
static inline bool ss_is_slab_hot(const TinySlabMeta* meta) {
    const unsigned capacity = meta->capacity;
    if (capacity == 0) {
        return false;  // Uninitialized slab
    }

    const unsigned used = meta->used;

    // Cross-multiply instead of dividing. The previous form
    // (used * 100 / capacity > threshold) truncated the percentage, so a slab
    // at e.g. 101/200 (50.5%) was computed as 50 and classified cold even
    // though it is above the threshold.
    // NOTE(review): assumes used/capacity are at most 16 bits wide (used is
    // loaded into a uint16_t in ss_verify_slab_invariant), so neither product
    // can overflow unsigned int - confirm against superslab_types.h.
    return used * 100u > capacity * (unsigned)HOT_UTILIZATION_THRESHOLD;
}

// Phase 12-1.1: EMPTY mask update helper.
// Marks slab `slab_idx` as EMPTY (highest reuse priority): sets its bit in
// ss->empty_mask and increments ss->empty_count. Does nothing if `ss` is
// NULL, if the index is outside [0, SLABS_PER_SUPERSLAB_MAX), or if the bit
// is already set (so empty_count is bumped at most once per slab).
static inline void ss_mark_slab_empty(SuperSlab* ss, int slab_idx) {
    if (!ss) {
        return;
    }
    if (slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) {
        return;
    }

    const uint32_t slab_bit = 1u << slab_idx;
    if ((ss->empty_mask & slab_bit) != 0) {
        return;  // Already marked EMPTY; keep mask and count in sync.
    }

    ss->empty_mask |= slab_bit;
    ss->empty_count++;
}

// Phase 12-1.1: EMPTY mask clear helper.
// Removes slab `slab_idx` from the EMPTY state (when it is reactivated):
// clears its bit in ss->empty_mask and decrements ss->empty_count. Does
// nothing if `ss` is NULL, if the index is outside
// [0, SLABS_PER_SUPERSLAB_MAX), or if the bit is not currently set.
static inline void ss_clear_slab_empty(SuperSlab* ss, int slab_idx) {
    if (!ss) {
        return;
    }
    if (slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) {
        return;
    }

    const uint32_t slab_bit = 1u << slab_idx;
    if ((ss->empty_mask & slab_bit) == 0) {
        return;  // Not marked EMPTY; nothing to undo.
    }

    ss->empty_mask &= ~slab_bit;
    ss->empty_count--;
}

// Phase 3d-C: Hot/Cold index update.
// Rebuilds hot_indices[], cold_indices[] and the EMPTY mask/count from the
// current state of the SuperSlab's slabs. Does nothing if `ss` is NULL.
//
// Slabs scanned: indices below all of
//   (1 << lg_size) / SLAB_SIZE, SLABS_PER_SUPERSLAB_MAX and ss->active_slabs.
// Per slab:
//   capacity == 0        -> skipped (uninitialized)
//   ss_is_slab_empty()   -> recorded in empty_mask only (Phase 12-1.1)
//   ss_is_slab_hot()     -> appended to hot_indices[]
//   otherwise            -> appended to cold_indices[]
// At most 16 indices are recorded per array; further slabs of that class are
// silently left out.
static inline void ss_update_hot_cold_indices(SuperSlab* ss) {
    // Number of entries this function will write into each index array.
    enum { INDEX_SLOTS = 16 };

    if (!ss) {
        return;
    }

    // Start from a clean classification; everything below is rebuilt.
    ss->hot_count = 0;
    ss->cold_count = 0;
    ss->empty_mask = 0;   // Phase 12-1.1: reset EMPTY tracking
    ss->empty_count = 0;

    uint32_t scan_limit = (1u << ss->lg_size) / SLAB_SIZE;
    if (scan_limit > SLABS_PER_SUPERSLAB_MAX) {
        scan_limit = SLABS_PER_SUPERSLAB_MAX;
    }

    for (uint32_t idx = 0; idx < scan_limit && idx < ss->active_slabs; idx++) {
        TinySlabMeta* slab = &ss->slabs[idx];

        if (slab->capacity == 0) {
            continue;  // Uninitialized slab
        }

        // Phase 12-1.1: EMPTY slabs are tracked in the mask, not in hot/cold.
        if (ss_is_slab_empty(slab)) {
            ss_mark_slab_empty(ss, (int)idx);
            continue;
        }

        const bool hot = ss_is_slab_hot(slab);
        if (hot && ss->hot_count < INDEX_SLOTS) {
            ss->hot_indices[ss->hot_count++] = (uint8_t)idx;    // high utilization
        } else if (!hot && ss->cold_count < INDEX_SLOTS) {
            ss->cold_indices[ss->cold_count++] = (uint8_t)idx;  // low utilization
        }
    }
}

// Phase 3d-C: zero the hot/cold fields when a SuperSlab is initialized.
// Resets hot_count, cold_count, empty_mask and empty_count (Phase 12-1.1) and
// writes 0 into the first 16 entries of hot_indices[] and cold_indices[].
// Does nothing if `ss` is NULL.
static inline void ss_init_hot_cold(SuperSlab* ss) {
    // Number of entries cleared in each index array.
    enum { INDEX_SLOTS = 16 };

    if (!ss) {
        return;
    }

    // Defensive: clear the index arrays even though the counts below already
    // mark them as empty.
    int slot = 0;
    while (slot < INDEX_SLOTS) {
        ss->hot_indices[slot] = 0;
        ss->cold_indices[slot] = 0;
        slot++;
    }

    ss->hot_count = 0;
    ss->cold_count = 0;

    // Phase 12-1.1: initialize EMPTY tracking.
    ss->empty_mask = 0;
    ss->empty_count = 0;
}

// ============================================================================
// P2.4: Invariant Verification for Debug Builds
// ============================================================================
//
// Invariant: active + tls_cached ≈ used
//
// - active: blocks currently held by user code
// - tls_cached: blocks cached in TLS SLL (returned by user, not yet pushed to slab freelist)
// - used: total blocks carved from slab and distributed
//
// Due to concurrent updates, exact equality is not guaranteed.
// We allow a small tolerance (delta) for race conditions.
//
// ENV: HAKMEM_TINY_INVARIANT_CHECK=1 to enable (disabled by default)
// ============================================================================

// P2.4: Check the per-slab invariant  active + tls_cached ≈ used.
// meta:      slab metadata to check; NULL or capacity == 0 (uninitialized)
//            is treated as "holds".
// tolerance: maximum allowed |active + tls_cached - used|.
// Returns:   true if the deviation is within tolerance, false if violated.
// The three counters are read with relaxed atomic loads, one after another,
// so they are not a single consistent snapshot.
static inline bool ss_verify_slab_invariant(const TinySlabMeta* meta, int tolerance) {
    if (!meta) {
        return true;
    }
    if (meta->capacity == 0) {
        return true;  // Skip uninitialized slabs
    }

    const uint16_t carved = atomic_load_explicit(&meta->used, memory_order_relaxed);
    const uint16_t in_use = atomic_load_explicit(&meta->active, memory_order_relaxed);
    const uint16_t parked = atomic_load_explicit(&meta->tls_cached, memory_order_relaxed);

    const int deviation = ((int)in_use + (int)parked) - (int)carved;
    const int magnitude = (deviation < 0) ? -deviation : deviation;

    return magnitude <= tolerance;
}

// P2.4: Verify the slab invariant for every scanned slab of a SuperSlab.
// ss:        SuperSlab to check; NULL yields 0.
// tolerance: forwarded to ss_verify_slab_invariant().
// Returns:   number of slabs that violate the invariant, or 0 when checking
//            is disabled.
// ENV: HAKMEM_TINY_INVARIANT_CHECK set to a non-empty value not starting with
//      '0' enables checking; the value is read once and cached.
// Slabs scanned: indices below all of (1 << lg_size) / SLAB_SIZE,
//      SLABS_PER_SUPERSLAB_MAX and ss->active_slabs.
// In builds without NDEBUG each violation is also reported on stderr.
static inline int ss_verify_superslab_invariants(const SuperSlab* ss, int tolerance) {
    // Tri-state cache of the ENV gate: -1 = not read yet, 0 = off, 1 = on.
    static int g_invariant_check = -1;
    if (__builtin_expect(g_invariant_check == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_INVARIANT_CHECK");
        g_invariant_check = (e && *e && *e != '0') ? 1 : 0;
    }

    if (!g_invariant_check) return 0;  // Disabled by ENV
    if (!ss) return 0;

    int violations = 0;
    uint32_t max_slabs = (1u << ss->lg_size) / SLAB_SIZE;
    if (max_slabs > SLABS_PER_SUPERSLAB_MAX) {
        max_slabs = SLABS_PER_SUPERSLAB_MAX;
    }

    for (uint32_t i = 0; i < max_slabs && i < ss->active_slabs; i++) {
        const TinySlabMeta* meta = &ss->slabs[i];
        if (ss_verify_slab_invariant(meta, tolerance)) {
            continue;
        }
        violations++;
#ifndef NDEBUG
        // Load each counter exactly once so the printed sum always equals the
        // printed active + tls_cached. These loads happen after the check
        // above, so under concurrent updates they may differ from the values
        // that triggered the violation.
        const unsigned used = atomic_load_explicit(&meta->used, memory_order_relaxed);
        const unsigned active = atomic_load_explicit(&meta->active, memory_order_relaxed);
        const unsigned tls_cached = atomic_load_explicit(&meta->tls_cached, memory_order_relaxed);
        // All arguments are unsigned int to match the %u conversions.
        fprintf(stderr, "[P2.4] Invariant VIOLATION: slab[%u] used=%u active=%u tls_cached=%u (sum=%u)\n",
                (unsigned)i, used, active, tls_cached, active + tls_cached);
#endif
    }

    return violations;
}

// P2.4: Debug dump of slab state for troubleshooting
// ENV: HAKMEM_TINY_INVARIANT_DUMP=1 to enable periodic dumps
static inline void ss_dump_slab_state(const SuperSlab* ss, int slab_idx) {
#ifndef NDEBUG
    static int g_dump_enabled = -1;
    if (__builtin_expect(g_dump_enabled == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_INVARIANT_DUMP");
        g_dump_enabled = (e && *e && *e != '0') ? 1 : 0;
    }
    if (!g_dump_enabled) return;
    if (!ss || slab_idx < 0 || slab_idx >= (int)ss->active_slabs) return;

    const TinySlabMeta* meta = &ss->slabs[slab_idx];
    fprintf(stderr, "[P2.4-DUMP] slab[%d]: used=%u active=%u tls_cached=%u capacity=%u class=%u\n",
            slab_idx, meta->used,
            atomic_load_explicit(&meta->active, memory_order_relaxed),
            atomic_load_explicit(&meta->tls_cached, memory_order_relaxed),
            meta->capacity, meta->class_idx);
#else
    (void)ss;
    (void)slab_idx;
#endif
}

#endif // SS_HOT_COLD_BOX_H
-												Phase 3d-C: Hot/Cold Slab Split - SuperSlab cache locality optimization (baseline established)

Goal: Improve L1D cache hit rate via hot/cold slab separation

Implementation:
- Added hot/cold fields to SuperSlab (superslab_types.h)
  - hot_indices[16] / cold_indices[16]: Index arrays for hot/cold slabs
  - hot_count / cold_count: Number of slabs in each category
- Created ss_hot_cold_box.h: Hot/Cold Split Box API
  - ss_is_slab_hot(): Utilization-based hot classification (>50% usage)
  - ss_update_hot_cold_indices(): Rebuild index arrays on slab activation
  - ss_init_hot_cold(): Initialize fields on SuperSlab creation
- Updated hakmem_tiny_superslab.c:
  - Initialize hot/cold fields in superslab creation (line 786-792)
  - Update hot/cold indices on slab activation (line 1130)
  - Include ss_hot_cold_box.h (line 7)

Architecture:
- Strategy: Hot slabs (high utilization) prioritized for allocation
- Expected: +8-12% from improved cache line locality
- Note: Refill path optimization (hot-first scan) deferred to future commit

Testing:
- Build: Success (LTO warnings are pre-existing)
- 10K ops sanity test: PASS (1.4M ops/s)
- Baseline established for Phase C-8 benchmark comparison

Phase 3d sequence:
- Phase A: SlabMeta Box boundary (38552c3f3) ✅
- Phase B: TLS Cache Merge (9b0d74640) ✅
- Phase C: Hot/Cold Split (current) ✅

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-20 07:44:07 +09:00
+								// ss_hot_cold_box.h - Phase 3d-C: Hot/Cold Slab Split Box
 								// Purpose: Cache locality optimization via hot/cold slab separation
 								// License: MIT
 								// Date: 2025-11-20
 								#ifndef SS_HOT_COLD_BOX_H
 								#define SS_HOT_COLD_BOX_H
 								#include "../superslab/superslab_types.h"
 								#include <stdbool.h>
-												P1.3: Add meta->active for TLS SLL tracking

Add active field to TinySlabMeta to track blocks currently held by
users (not in TLS SLL or freelist caches). This enables accurate
empty slab detection that accounts for TLS SLL cached blocks.

Changes:
- superslab_types.h: Add _Atomic uint16_t active field
- ss_allocation_box.c, hakmem_tiny_superslab.c: Initialize active=0
- tiny_free_fast_v2.inc.h: Decrement active on TLS SLL push
- tiny_alloc_fast.inc.h: Add tiny_active_track_alloc() helper,
  increment active on TLS SLL pop (all code paths)
- ss_hot_cold_box.h: ss_is_slab_empty() uses active when enabled

All tracking is ENV-gated: HAKMEM_TINY_ACTIVE_TRACK=1 to enable.
Default is off for zero performance impact.

Invariant: active = used - tls_cached (active <= used)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-28 13:53:45 +09:00
+								#include <stdlib.h>  // P1.3: for getenv()
-												P2: TLS SLL Redesign - class_map default, tls_cached tracking, conditional header restore

This commit completes the P2 phase of the Tiny Pool TLS SLL redesign to fix the
Header/Next pointer conflict that was causing ~30% crash rates.

Changes:
- P2.1: Make class_map lookup the default (ENV: HAKMEM_TINY_NO_CLASS_MAP=1 for legacy)
- P2.2: Add meta->tls_cached field to track blocks cached in TLS SLL
- P2.3: Make Header restoration conditional in tiny_next_store() (default: skip)
- P2.4: Add invariant verification functions (active + tls_cached ≈ used)
- P0.4: Document new ENV variables in ENV_VARS.md

New ENV variables:
- HAKMEM_TINY_ACTIVE_TRACK=1: Enable active/tls_cached tracking (~1% overhead)
- HAKMEM_TINY_NO_CLASS_MAP=1: Disable class_map (legacy mode)
- HAKMEM_TINY_RESTORE_HEADER=1: Force header restoration (legacy mode)
- HAKMEM_TINY_INVARIANT_CHECK=1: Enable invariant verification (debug)
- HAKMEM_TINY_INVARIANT_DUMP=1: Enable periodic state dumps (debug)

Benchmark results (bench_tiny_hot_hakmem 64B):
- Default (class_map ON): 84.49 M ops/sec
- ACTIVE_TRACK=1: 83.62 M ops/sec (-1%)
- NO_CLASS_MAP=1 (legacy): 85.06 M ops/sec
- MT performance: +21-28% vs system allocator

No crashes observed. All tests passed.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-28 14:11:37 +09:00
+								#include <stdio.h>   // P2.4: for fprintf() in debug output
-												Phase 3d-C: Hot/Cold Slab Split - SuperSlab cache locality optimization (baseline established)

Goal: Improve L1D cache hit rate via hot/cold slab separation

Implementation:
- Added hot/cold fields to SuperSlab (superslab_types.h)
  - hot_indices[16] / cold_indices[16]: Index arrays for hot/cold slabs
  - hot_count / cold_count: Number of slabs in each category
- Created ss_hot_cold_box.h: Hot/Cold Split Box API
  - ss_is_slab_hot(): Utilization-based hot classification (>50% usage)
  - ss_update_hot_cold_indices(): Rebuild index arrays on slab activation
  - ss_init_hot_cold(): Initialize fields on SuperSlab creation
- Updated hakmem_tiny_superslab.c:
  - Initialize hot/cold fields in superslab creation (line 786-792)
  - Update hot/cold indices on slab activation (line 1130)
  - Include ss_hot_cold_box.h (line 7)

Architecture:
- Strategy: Hot slabs (high utilization) prioritized for allocation
- Expected: +8-12% from improved cache line locality
- Note: Refill path optimization (hot-first scan) deferred to future commit

Testing:
- Build: Success (LTO warnings are pre-existing)
- 10K ops sanity test: PASS (1.4M ops/s)
- Baseline established for Phase C-8 benchmark comparison

Phase 3d sequence:
- Phase A: SlabMeta Box boundary (38552c3f3) ✅
- Phase B: TLS Cache Merge (9b0d74640) ✅
- Phase C: Hot/Cold Split (current) ✅

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-20 07:44:07 +09:00
 								// ============================================================================
 								// Phase 3d-C: Hot/Cold Split Box API
 								// ============================================================================
 								//
 								// Goal: Improve L1D cache hit rate by separating hot (high utilization) and
 								//       cold (low utilization) slabs within a SuperSlab.
 								//
 								// Strategy:
 								// - Hot slabs (used > 50%): Prioritized for allocation → better cache locality
 								// - Cold slabs (used ≤ 50%): Used as fallback → delayed deallocation
 								//
 								// Expected: +8-12% throughput from improved cache line locality
 								//
 								// Box Contract:
 								// - ss_is_slab_hot(): Returns true if slab should be considered "hot"
 								// - ss_update_hot_cold_indices(): Rebuilds hot/cold index arrays
 								// - ss_init_hot_cold(): Initializes hot/cold fields on SuperSlab creation
 								//
 								// ============================================================================
 								// Phase 3d-C: Hot/Cold判定閾値
 								#define HOT_UTILIZATION_THRESHOLD 50  // 使用率50%以上でホット判定
-												Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s)

Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab
for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead.

## Changes

### 1. SuperSlab Structure (core/superslab/superslab_types.h)
- Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0)
- Added `empty_count` (uint8_t): Quick check for EMPTY slab availability

### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h)
- Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY
- Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority)
- Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated
- Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs
- Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count

### 3. Free Path Integration (core/box/free_local_box.c)
- After `meta->used--`, check if `meta->used == 0`
- If true, call `ss_mark_slab_empty()` to update empty_mask
- Enables immediate EMPTY detection on every free operation

### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c)
- New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs
- Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries)
- Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()`
- Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead)
- ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing)
- ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs)

## Performance Results

```
Benchmark: Random Mixed 256B (100K iterations)

OFF (default):  10.2M ops/s (baseline)
ON  (ENV=1):    11.5M ops/s (+13.0% improvement) ✅
```

## Expected Impact (from Task-sensei analysis)

**Current bottleneck**:
- Stage 1: 2-5% hit rate (free list broken)
- Stage 2: 3-8% hit rate (rare UNUSED)
- Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck

**Expected with Phase 12-1.1**:
- Stage 0.5: 20-40% hit rate (EMPTY scan)
- Stage 1-2: 20-30% hit rate (combined)
- Stage 3: 30-50% hit rate (significantly reduced)

**Theoretical max**: 25M → 55-70M ops/s (+120-180%)

## Current Gap Analysis

**Observed**: 11.5M ops/s (+13%)
**Expected**: 55-70M ops/s (+120-180%)
**Gap**: Performance regression or missing complementary optimizations

Possible causes:
1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change
2. EMPTY scan overhead (16 SuperSlabs × empty_count check)
3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.)
4. Stage 0.5 too conservative (scan_limit=16, should be higher?)

## Usage

```bash
# Enable EMPTY reuse optimization
export HAKMEM_SS_EMPTY_REUSE=1

# Optional: increase scan limit (trade-off: throughput vs latency)
export HAKMEM_SS_EMPTY_SCAN_LIMIT=32

./bench_random_mixed_hakmem 100000 256 42
```

## Next Steps

**Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M)
**Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect
**Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning)

## Files Modified

Core implementation:
- `core/superslab/superslab_types.h` - empty_mask/empty_count fields
- `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API
- `core/box/free_local_box.c` - Free path EMPTY detection
- `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan

Documentation:
- `CURRENT_TASK.md` - Task-sensei investigation report

---

🎯 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task-sensei (investigation & design analysis)

											
										
										
											2025-11-21 04:56:48 +09:00
+								// Phase 12-1.1: EMPTY判定ロジック（最優先再利用）
-												P1.3: Add meta->active for TLS SLL tracking

Add active field to TinySlabMeta to track blocks currently held by
users (not in TLS SLL or freelist caches). This enables accurate
empty slab detection that accounts for TLS SLL cached blocks.

Changes:
- superslab_types.h: Add _Atomic uint16_t active field
- ss_allocation_box.c, hakmem_tiny_superslab.c: Initialize active=0
- tiny_free_fast_v2.inc.h: Decrement active on TLS SLL push
- tiny_alloc_fast.inc.h: Add tiny_active_track_alloc() helper,
  increment active on TLS SLL pop (all code paths)
- ss_hot_cold_box.h: ss_is_slab_empty() uses active when enabled

All tracking is ENV-gated: HAKMEM_TINY_ACTIVE_TRACK=1 to enable.
Default is off for zero performance impact.

Invariant: active = used - tls_cached (active <= used)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-28 13:53:45 +09:00
+								// P1.3: ENV gate for active-based empty detection
 								// ENV: HAKMEM_TINY_ACTIVE_TRACK=1 → use active, else use used
 								// Returns: true if slab is completely EMPTY (highest reuse priority)
-												Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s)

Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab
for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead.

## Changes

### 1. SuperSlab Structure (core/superslab/superslab_types.h)
- Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0)
- Added `empty_count` (uint8_t): Quick check for EMPTY slab availability

### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h)
- Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY
- Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority)
- Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated
- Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs
- Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count

### 3. Free Path Integration (core/box/free_local_box.c)
- After `meta->used--`, check if `meta->used == 0`
- If true, call `ss_mark_slab_empty()` to update empty_mask
- Enables immediate EMPTY detection on every free operation

### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c)
- New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs
- Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries)
- Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()`
- Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead)
- ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing)
- ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs)

## Performance Results

```
Benchmark: Random Mixed 256B (100K iterations)

OFF (default):  10.2M ops/s (baseline)
ON  (ENV=1):    11.5M ops/s (+13.0% improvement) ✅
```

## Expected Impact (from Task-sensei analysis)

**Current bottleneck**:
- Stage 1: 2-5% hit rate (free list broken)
- Stage 2: 3-8% hit rate (rare UNUSED)
- Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck

**Expected with Phase 12-1.1**:
- Stage 0.5: 20-40% hit rate (EMPTY scan)
- Stage 1-2: 20-30% hit rate (combined)
- Stage 3: 30-50% hit rate (significantly reduced)

**Theoretical max**: 25M → 55-70M ops/s (+120-180%)

## Current Gap Analysis

**Observed**: 11.5M ops/s (+13%)
**Expected**: 55-70M ops/s (+120-180%)
**Gap**: Performance regression or missing complementary optimizations

Possible causes:
1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change
2. EMPTY scan overhead (16 SuperSlabs × empty_count check)
3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.)
4. Stage 0.5 too conservative (scan_limit=16, should be higher?)

## Usage

```bash
# Enable EMPTY reuse optimization
export HAKMEM_SS_EMPTY_REUSE=1

# Optional: increase scan limit (trade-off: throughput vs latency)
export HAKMEM_SS_EMPTY_SCAN_LIMIT=32

./bench_random_mixed_hakmem 100000 256 42
```

## Next Steps

**Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M)
**Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect
**Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning)

## Files Modified

Core implementation:
- `core/superslab/superslab_types.h` - empty_mask/empty_count fields
- `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API
- `core/box/free_local_box.c` - Free path EMPTY detection
- `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan

Documentation:
- `CURRENT_TASK.md` - Task-sensei investigation report

---

🎯 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task-sensei (investigation & design analysis)

											
										
										
											2025-11-21 04:56:48 +09:00
+								static inline bool ss_is_slab_empty(const TinySlabMeta* meta) {
-												P1.3: Add meta->active for TLS SLL tracking

Add active field to TinySlabMeta to track blocks currently held by
users (not in TLS SLL or freelist caches). This enables accurate
empty slab detection that accounts for TLS SLL cached blocks.

Changes:
- superslab_types.h: Add _Atomic uint16_t active field
- ss_allocation_box.c, hakmem_tiny_superslab.c: Initialize active=0
- tiny_free_fast_v2.inc.h: Decrement active on TLS SLL push
- tiny_alloc_fast.inc.h: Add tiny_active_track_alloc() helper,
  increment active on TLS SLL pop (all code paths)
- ss_hot_cold_box.h: ss_is_slab_empty() uses active when enabled

All tracking is ENV-gated: HAKMEM_TINY_ACTIVE_TRACK=1 to enable.
Default is off for zero performance impact.

Invariant: active = used - tls_cached (active <= used)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-28 13:53:45 +09:00
+								    if (meta->capacity == 0) return false;
 								    // P1.3: Use active-based empty detection if enabled
 								    static int g_use_active = -1;
 								    if (__builtin_expect(g_use_active == -1, 0)) {
 								        const char* e = getenv("HAKMEM_TINY_ACTIVE_TRACK");
 								        g_use_active = (e && *e && *e != '0') ? 1 : 0;
 								    }
 								    if (g_use_active) {
 								        // P1.3: active == 0 means all blocks returned by user (even if some in TLS SLL)
 								        uint16_t act = atomic_load_explicit(&meta->active, memory_order_relaxed);
 								        return (act == 0);
 								    } else {
 								        // Legacy: used == 0 (doesn't account for TLS SLL)
 								        return (meta->used == 0);
 								    }
-												Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s)

Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab
for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead.

## Changes

### 1. SuperSlab Structure (core/superslab/superslab_types.h)
- Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0)
- Added `empty_count` (uint8_t): Quick check for EMPTY slab availability

### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h)
- Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY
- Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority)
- Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated
- Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs
- Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count

### 3. Free Path Integration (core/box/free_local_box.c)
- After `meta->used--`, check if `meta->used == 0`
- If true, call `ss_mark_slab_empty()` to update empty_mask
- Enables immediate EMPTY detection on every free operation

### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c)
- New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs
- Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries)
- Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()`
- Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead)
- ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing)
- ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs)

## Performance Results

```
Benchmark: Random Mixed 256B (100K iterations)

OFF (default):  10.2M ops/s (baseline)
ON  (ENV=1):    11.5M ops/s (+13.0% improvement) ✅
```

## Expected Impact (from Task-sensei analysis)

**Current bottleneck**:
- Stage 1: 2-5% hit rate (free list broken)
- Stage 2: 3-8% hit rate (rare UNUSED)
- Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck

**Expected with Phase 12-1.1**:
- Stage 0.5: 20-40% hit rate (EMPTY scan)
- Stage 1-2: 20-30% hit rate (combined)
- Stage 3: 30-50% hit rate (significantly reduced)

**Theoretical max**: 25M → 55-70M ops/s (+120-180%)

## Current Gap Analysis

**Observed**: 11.5M ops/s (+13%)
**Expected**: 55-70M ops/s (+120-180%)
**Gap**: Performance regression or missing complementary optimizations

Possible causes:
1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change
2. EMPTY scan overhead (16 SuperSlabs × empty_count check)
3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.)
4. Stage 0.5 too conservative (scan_limit=16, should be higher?)

## Usage

```bash
# Enable EMPTY reuse optimization
export HAKMEM_SS_EMPTY_REUSE=1

# Optional: increase scan limit (trade-off: throughput vs latency)
export HAKMEM_SS_EMPTY_SCAN_LIMIT=32

./bench_random_mixed_hakmem 100000 256 42
```

## Next Steps

**Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M)
**Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect
**Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning)

## Files Modified

Core implementation:
- `core/superslab/superslab_types.h` - empty_mask/empty_count fields
- `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API
- `core/box/free_local_box.c` - Free path EMPTY detection
- `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan

Documentation:
- `CURRENT_TASK.md` - Task-sensei investigation report

---

🎯 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task-sensei (investigation & design analysis)

											
										
										
											2025-11-21 04:56:48 +09:00
+								}
-												Phase 3d-C: Hot/Cold Slab Split - SuperSlab cache locality optimization (baseline established)

Goal: Improve L1D cache hit rate via hot/cold slab separation

Implementation:
- Added hot/cold fields to SuperSlab (superslab_types.h)
  - hot_indices[16] / cold_indices[16]: Index arrays for hot/cold slabs
  - hot_count / cold_count: Number of slabs in each category
- Created ss_hot_cold_box.h: Hot/Cold Split Box API
  - ss_is_slab_hot(): Utilization-based hot classification (>50% usage)
  - ss_update_hot_cold_indices(): Rebuild index arrays on slab activation
  - ss_init_hot_cold(): Initialize fields on SuperSlab creation
- Updated hakmem_tiny_superslab.c:
  - Initialize hot/cold fields in superslab creation (line 786-792)
  - Update hot/cold indices on slab activation (line 1130)
  - Include ss_hot_cold_box.h (line 7)

Architecture:
- Strategy: Hot slabs (high utilization) prioritized for allocation
- Expected: +8-12% from improved cache line locality
- Note: Refill path optimization (hot-first scan) deferred to future commit

Testing:
- Build: Success (LTO warnings are pre-existing)
- 10K ops sanity test: PASS (1.4M ops/s)
- Baseline established for Phase C-8 benchmark comparison

Phase 3d sequence:
- Phase A: SlabMeta Box boundary (38552c3f3) ✅
- Phase B: TLS Cache Merge (9b0d74640) ✅
- Phase C: Hot/Cold Split (current) ✅

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-20 07:44:07 +09:00
+								// Phase 3d-C: Hot classification logic
 								// Returns: true if slab is "hot" (high utilization, should be prioritized)
 								static inline bool ss_is_slab_hot(const TinySlabMeta* meta) {
 								    // Contract: `meta` must be non-NULL (it is dereferenced unconditionally).
 								    // Heuristic: utilization strictly above HOT_UTILIZATION_THRESHOLD percent => hot.
 								    // Rationale (original note): high utilization suggests frequent access, so the
 								    // slab is worth keeping cache-resident.
 								    if (meta->capacity == 0) {
 								        return false;  // Uninitialized slab
 								    }
 								    // Compare by cross-multiplication rather than `used * 100 / capacity`:
 								    // integer division truncates, which classified e.g. 129/256 (50.4%) as
 								    // "not above 50%". Widening first also rules out overflow of the products.
 								    const unsigned long long used_scaled =
 								        (unsigned long long)meta->used * 100u;
 								    const unsigned long long hot_floor =
 								        (unsigned long long)meta->capacity * HOT_UTILIZATION_THRESHOLD;
 								    return used_scaled > hot_floor;
 								}
-												Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s)

Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab
for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead.

## Changes

### 1. SuperSlab Structure (core/superslab/superslab_types.h)
- Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0)
- Added `empty_count` (uint8_t): Quick check for EMPTY slab availability

### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h)
- Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY
- Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority)
- Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated
- Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs
- Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count

### 3. Free Path Integration (core/box/free_local_box.c)
- After `meta->used--`, check if `meta->used == 0`
- If true, call `ss_mark_slab_empty()` to update empty_mask
- Enables immediate EMPTY detection on every free operation

### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c)
- New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs
- Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries)
- Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()`
- Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead)
- ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing)
- ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs)

## Performance Results

```
Benchmark: Random Mixed 256B (100K iterations)

OFF (default):  10.2M ops/s (baseline)
ON  (ENV=1):    11.5M ops/s (+13.0% improvement) ✅
```

## Expected Impact (from Task-sensei analysis)

**Current bottleneck**:
- Stage 1: 2-5% hit rate (free list broken)
- Stage 2: 3-8% hit rate (rare UNUSED)
- Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck

**Expected with Phase 12-1.1**:
- Stage 0.5: 20-40% hit rate (EMPTY scan)
- Stage 1-2: 20-30% hit rate (combined)
- Stage 3: 30-50% hit rate (significantly reduced)

**Theoretical max**: 25M → 55-70M ops/s (+120-180%)

## Current Gap Analysis

**Observed**: 11.5M ops/s (+13%)
**Expected**: 55-70M ops/s (+120-180%)
**Gap**: Performance regression or missing complementary optimizations

Possible causes:
1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change
2. EMPTY scan overhead (16 SuperSlabs × empty_count check)
3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.)
4. Stage 0.5 too conservative (scan_limit=16, should be higher?)

## Usage

```bash
# Enable EMPTY reuse optimization
export HAKMEM_SS_EMPTY_REUSE=1

# Optional: increase scan limit (trade-off: throughput vs latency)
export HAKMEM_SS_EMPTY_SCAN_LIMIT=32

./bench_random_mixed_hakmem 100000 256 42
```

## Next Steps

**Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M)
**Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect
**Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning)

## Files Modified

Core implementation:
- `core/superslab/superslab_types.h` - empty_mask/empty_count fields
- `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API
- `core/box/free_local_box.c` - Free path EMPTY detection
- `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan

Documentation:
- `CURRENT_TASK.md` - Task-sensei investigation report

---

🎯 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task-sensei (investigation & design analysis)

											
										
										
											2025-11-21 04:56:48 +09:00
+								// Phase 12-1.1: EMPTY mask update helper
 								// Marks a slab as EMPTY (highest reuse priority)
 								static inline void ss_mark_slab_empty(SuperSlab* ss, int slab_idx) {
 								    // Sets the slab's bit in ss->empty_mask and bumps ss->empty_count.
 								    // No-op for a NULL SuperSlab or an index outside
 								    // [0, SLABS_PER_SUPERSLAB_MAX).
 								    if (!ss) {
 								        return;
 								    }
 								    if (slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) {
 								        return;
 								    }
 								    const uint32_t slab_bit = 1u << slab_idx;
 								    if ((ss->empty_mask & slab_bit) != 0) {
 								        return;  // Already marked: leave empty_count in step with the mask.
 								    }
 								    ss->empty_mask |= slab_bit;
 								    ss->empty_count++;
 								}
 								// Phase 12-1.1: EMPTY mask clear helper
 								// Removes a slab from EMPTY state (when reactivated)
 								static inline void ss_clear_slab_empty(SuperSlab* ss, int slab_idx) {
 								    // Clears the slab's bit in ss->empty_mask and decrements ss->empty_count.
 								    // No-op for a NULL SuperSlab or an index outside
 								    // [0, SLABS_PER_SUPERSLAB_MAX).
 								    if (!ss) {
 								        return;
 								    }
 								    if (slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) {
 								        return;
 								    }
 								    const uint32_t slab_bit = 1u << slab_idx;
 								    if ((ss->empty_mask & slab_bit) == 0) {
 								        return;  // Not marked EMPTY: nothing to clear, count untouched.
 								    }
 								    ss->empty_mask &= ~slab_bit;
 								    ss->empty_count--;
 								}
-												Phase 3d-C: Hot/Cold Slab Split - SuperSlab cache locality optimization (baseline established)

Goal: Improve L1D cache hit rate via hot/cold slab separation

Implementation:
- Added hot/cold fields to SuperSlab (superslab_types.h)
  - hot_indices[16] / cold_indices[16]: Index arrays for hot/cold slabs
  - hot_count / cold_count: Number of slabs in each category
- Created ss_hot_cold_box.h: Hot/Cold Split Box API
  - ss_is_slab_hot(): Utilization-based hot判定 (>50% usage)
  - ss_update_hot_cold_indices(): Rebuild index arrays on slab activation
  - ss_init_hot_cold(): Initialize fields on SuperSlab creation
- Updated hakmem_tiny_superslab.c:
  - Initialize hot/cold fields in superslab creation (line 786-792)
  - Update hot/cold indices on slab activation (line 1130)
  - Include ss_hot_cold_box.h (line 7)

Architecture:
- Strategy: Hot slabs (high utilization) prioritized for allocation
- Expected: +8-12% from improved cache line locality
- Note: Refill path optimization (hot優先スキャン) deferred to future commit

Testing:
- Build: Success (LTO warnings are pre-existing)
- 10K ops sanity test: PASS (1.4M ops/s)
- Baseline established for Phase C-8 benchmark comparison

Phase 3d sequence:
- Phase A: SlabMeta Box boundary (38552c3f3) ✅
- Phase B: TLS Cache Merge (9b0d74640) ✅
- Phase C: Hot/Cold Split (current) ✅

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-20 07:44:07 +09:00
+								// Phase 3d-C: Hot/Cold index update
 								// Rebuilds hot_indices[] and cold_indices[] arrays based on current slab state
 								static inline void ss_update_hot_cold_indices(SuperSlab* ss) {
 								    if (!ss) return;
 								    ss->hot_count = 0;
 								    ss->cold_count = 0;
-												Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s)

Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab
for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead.

## Changes

### 1. SuperSlab Structure (core/superslab/superslab_types.h)
- Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0)
- Added `empty_count` (uint8_t): Quick check for EMPTY slab availability

### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h)
- Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY
- Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority)
- Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated
- Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs
- Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count

### 3. Free Path Integration (core/box/free_local_box.c)
- After `meta->used--`, check if `meta->used == 0`
- If true, call `ss_mark_slab_empty()` to update empty_mask
- Enables immediate EMPTY detection on every free operation

### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c)
- New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs
- Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries)
- Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()`
- Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead)
- ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing)
- ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs)

## Performance Results

```
Benchmark: Random Mixed 256B (100K iterations)

OFF (default):  10.2M ops/s (baseline)
ON  (ENV=1):    11.5M ops/s (+13.0% improvement) ✅
```

## Expected Impact (from Task-sensei analysis)

**Current bottleneck**:
- Stage 1: 2-5% hit rate (free list broken)
- Stage 2: 3-8% hit rate (rare UNUSED)
- Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck

**Expected with Phase 12-1.1**:
- Stage 0.5: 20-40% hit rate (EMPTY scan)
- Stage 1-2: 20-30% hit rate (combined)
- Stage 3: 30-50% hit rate (significantly reduced)

**Theoretical max**: 25M → 55-70M ops/s (+120-180%)

## Current Gap Analysis

**Observed**: 11.5M ops/s (+13%)
**Expected**: 55-70M ops/s (+120-180%)
**Gap**: Performance regression or missing complementary optimizations

Possible causes:
1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change
2. EMPTY scan overhead (16 SuperSlabs × empty_count check)
3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.)
4. Stage 0.5 too conservative (scan_limit=16, should be higher?)

## Usage

```bash
# Enable EMPTY reuse optimization
export HAKMEM_SS_EMPTY_REUSE=1

# Optional: increase scan limit (trade-off: throughput vs latency)
export HAKMEM_SS_EMPTY_SCAN_LIMIT=32

./bench_random_mixed_hakmem 100000 256 42
```

## Next Steps

**Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M)
**Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect
**Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning)

## Files Modified

Core implementation:
- `core/superslab/superslab_types.h` - empty_mask/empty_count fields
- `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API
- `core/box/free_local_box.c` - Free path EMPTY detection
- `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan

Documentation:
- `CURRENT_TASK.md` - Task-sensei investigation report

---

🎯 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task-sensei (investigation & design analysis)

											
										
										
											2025-11-21 04:56:48 +09:00
+								    // Phase 12-1.1: Reset empty tracking
 								    ss->empty_mask = 0;
 								    ss->empty_count = 0;
-												Phase 3d-C: Hot/Cold Slab Split - SuperSlab cache locality optimization (baseline established)

Goal: Improve L1D cache hit rate via hot/cold slab separation

Implementation:
- Added hot/cold fields to SuperSlab (superslab_types.h)
  - hot_indices[16] / cold_indices[16]: Index arrays for hot/cold slabs
  - hot_count / cold_count: Number of slabs in each category
- Created ss_hot_cold_box.h: Hot/Cold Split Box API
  - ss_is_slab_hot(): Utilization-based hot判定 (>50% usage)
  - ss_update_hot_cold_indices(): Rebuild index arrays on slab activation
  - ss_init_hot_cold(): Initialize fields on SuperSlab creation
- Updated hakmem_tiny_superslab.c:
  - Initialize hot/cold fields in superslab creation (line 786-792)
  - Update hot/cold indices on slab activation (line 1130)
  - Include ss_hot_cold_box.h (line 7)

Architecture:
- Strategy: Hot slabs (high utilization) prioritized for allocation
- Expected: +8-12% from improved cache line locality
- Note: Refill path optimization (hot優先スキャン) deferred to future commit

Testing:
- Build: Success (LTO warnings are pre-existing)
- 10K ops sanity test: PASS (1.4M ops/s)
- Baseline established for Phase C-8 benchmark comparison

Phase 3d sequence:
- Phase A: SlabMeta Box boundary (38552c3f3) ✅
- Phase B: TLS Cache Merge (9b0d74640) ✅
- Phase C: Hot/Cold Split (current) ✅

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-20 07:44:07 +09:00
 								    uint32_t max_slabs = (1u << ss->lg_size) / SLAB_SIZE;
 								    if (max_slabs > SLABS_PER_SUPERSLAB_MAX) {
 								        max_slabs = SLABS_PER_SUPERSLAB_MAX;
 								    }
-												Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s)

Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab
for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead.

## Changes

### 1. SuperSlab Structure (core/superslab/superslab_types.h)
- Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0)
- Added `empty_count` (uint8_t): Quick check for EMPTY slab availability

### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h)
- Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY
- Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority)
- Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated
- Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs
- Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count

### 3. Free Path Integration (core/box/free_local_box.c)
- After `meta->used--`, check if `meta->used == 0`
- If true, call `ss_mark_slab_empty()` to update empty_mask
- Enables immediate EMPTY detection on every free operation

### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c)
- New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs
- Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries)
- Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()`
- Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead)
- ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing)
- ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs)

## Performance Results

```
Benchmark: Random Mixed 256B (100K iterations)

OFF (default):  10.2M ops/s (baseline)
ON  (ENV=1):    11.5M ops/s (+13.0% improvement) ✅
```

## Expected Impact (from Task-sensei analysis)

**Current bottleneck**:
- Stage 1: 2-5% hit rate (free list broken)
- Stage 2: 3-8% hit rate (rare UNUSED)
- Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck

**Expected with Phase 12-1.1**:
- Stage 0.5: 20-40% hit rate (EMPTY scan)
- Stage 1-2: 20-30% hit rate (combined)
- Stage 3: 30-50% hit rate (significantly reduced)

**Theoretical max**: 25M → 55-70M ops/s (+120-180%)

## Current Gap Analysis

**Observed**: 11.5M ops/s (+13%)
**Expected**: 55-70M ops/s (+120-180%)
**Gap**: Performance regression or missing complementary optimizations

Possible causes:
1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change
2. EMPTY scan overhead (16 SuperSlabs × empty_count check)
3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.)
4. Stage 0.5 too conservative (scan_limit=16, should be higher?)

## Usage

```bash
# Enable EMPTY reuse optimization
export HAKMEM_SS_EMPTY_REUSE=1

# Optional: increase scan limit (trade-off: throughput vs latency)
export HAKMEM_SS_EMPTY_SCAN_LIMIT=32

./bench_random_mixed_hakmem 100000 256 42
```

## Next Steps

**Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M)
**Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect
**Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning)

## Files Modified

Core implementation:
- `core/superslab/superslab_types.h` - empty_mask/empty_count fields
- `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API
- `core/box/free_local_box.c` - Free path EMPTY detection
- `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan

Documentation:
- `CURRENT_TASK.md` - Task-sensei investigation report

---

🎯 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task-sensei (investigation & design analysis)

											
										
										
											2025-11-21 04:56:48 +09:00
+								    // Scan active slabs and classify as EMPTY / hot / cold
-												Phase 3d-C: Hot/Cold Slab Split - SuperSlab cache locality optimization (baseline established)

Goal: Improve L1D cache hit rate via hot/cold slab separation

Implementation:
- Added hot/cold fields to SuperSlab (superslab_types.h)
  - hot_indices[16] / cold_indices[16]: Index arrays for hot/cold slabs
  - hot_count / cold_count: Number of slabs in each category
- Created ss_hot_cold_box.h: Hot/Cold Split Box API
  - ss_is_slab_hot(): Utilization-based hot判定 (>50% usage)
  - ss_update_hot_cold_indices(): Rebuild index arrays on slab activation
  - ss_init_hot_cold(): Initialize fields on SuperSlab creation
- Updated hakmem_tiny_superslab.c:
  - Initialize hot/cold fields in superslab creation (line 786-792)
  - Update hot/cold indices on slab activation (line 1130)
  - Include ss_hot_cold_box.h (line 7)

Architecture:
- Strategy: Hot slabs (high utilization) prioritized for allocation
- Expected: +8-12% from improved cache line locality
- Note: Refill path optimization (hot優先スキャン) deferred to future commit

Testing:
- Build: Success (LTO warnings are pre-existing)
- 10K ops sanity test: PASS (1.4M ops/s)
- Baseline established for Phase C-8 benchmark comparison

Phase 3d sequence:
- Phase A: SlabMeta Box boundary (38552c3f3) ✅
- Phase B: TLS Cache Merge (9b0d74640) ✅
- Phase C: Hot/Cold Split (current) ✅

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-20 07:44:07 +09:00
+								    for (uint32_t i = 0; i < max_slabs && i < ss->active_slabs; i++) {
 								        TinySlabMeta* meta = &ss->slabs[i];
 								        // Skip uninitialized slabs (capacity == 0)
 								        if (meta->capacity == 0) {
 								            continue;
 								        }
-												Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s)

Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab
for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead.

## Changes

### 1. SuperSlab Structure (core/superslab/superslab_types.h)
- Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0)
- Added `empty_count` (uint8_t): Quick check for EMPTY slab availability

### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h)
- Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY
- Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority)
- Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated
- Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs
- Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count

### 3. Free Path Integration (core/box/free_local_box.c)
- After `meta->used--`, check if `meta->used == 0`
- If true, call `ss_mark_slab_empty()` to update empty_mask
- Enables immediate EMPTY detection on every free operation

### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c)
- New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs
- Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries)
- Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()`
- Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead)
- ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing)
- ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs)

## Performance Results

```
Benchmark: Random Mixed 256B (100K iterations)

OFF (default):  10.2M ops/s (baseline)
ON  (ENV=1):    11.5M ops/s (+13.0% improvement) ✅
```

## Expected Impact (from Task-sensei analysis)

**Current bottleneck**:
- Stage 1: 2-5% hit rate (free list broken)
- Stage 2: 3-8% hit rate (rare UNUSED)
- Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck

**Expected with Phase 12-1.1**:
- Stage 0.5: 20-40% hit rate (EMPTY scan)
- Stage 1-2: 20-30% hit rate (combined)
- Stage 3: 30-50% hit rate (significantly reduced)

**Theoretical max**: 25M → 55-70M ops/s (+120-180%)

## Current Gap Analysis

**Observed**: 11.5M ops/s (+13%)
**Expected**: 55-70M ops/s (+120-180%)
**Gap**: Performance regression or missing complementary optimizations

Possible causes:
1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change
2. EMPTY scan overhead (16 SuperSlabs × empty_count check)
3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.)
4. Stage 0.5 too conservative (scan_limit=16, should be higher?)

## Usage

```bash
# Enable EMPTY reuse optimization
export HAKMEM_SS_EMPTY_REUSE=1

# Optional: increase scan limit (trade-off: throughput vs latency)
export HAKMEM_SS_EMPTY_SCAN_LIMIT=32

./bench_random_mixed_hakmem 100000 256 42
```

## Next Steps

**Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M)
**Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect
**Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning)

## Files Modified

Core implementation:
- `core/superslab/superslab_types.h` - empty_mask/empty_count fields
- `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API
- `core/box/free_local_box.c` - Free path EMPTY detection
- `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan

Documentation:
- `CURRENT_TASK.md` - Task-sensei investigation report

---

🎯 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task-sensei (investigation & design analysis)

											
										
										
											2025-11-21 04:56:48 +09:00
+								        // Phase 12-1.1: EMPTY slabs have highest reuse priority
 								        if (ss_is_slab_empty(meta)) {
 								            ss_mark_slab_empty(ss, (int)i);
 								            continue;  // Don't add to hot/cold arrays
 								        }
-												Phase 3d-C: Hot/Cold Slab Split - SuperSlab cache locality optimization (baseline established)

Goal: Improve L1D cache hit rate via hot/cold slab separation

Implementation:
- Added hot/cold fields to SuperSlab (superslab_types.h)
  - hot_indices[16] / cold_indices[16]: Index arrays for hot/cold slabs
  - hot_count / cold_count: Number of slabs in each category
- Created ss_hot_cold_box.h: Hot/Cold Split Box API
  - ss_is_slab_hot(): Utilization-based hot判定 (>50% usage)
  - ss_update_hot_cold_indices(): Rebuild index arrays on slab activation
  - ss_init_hot_cold(): Initialize fields on SuperSlab creation
- Updated hakmem_tiny_superslab.c:
  - Initialize hot/cold fields in superslab creation (line 786-792)
  - Update hot/cold indices on slab activation (line 1130)
  - Include ss_hot_cold_box.h (line 7)

Architecture:
- Strategy: Hot slabs (high utilization) prioritized for allocation
- Expected: +8-12% from improved cache line locality
- Note: Refill path optimization (hot優先スキャン) deferred to future commit

Testing:
- Build: Success (LTO warnings are pre-existing)
- 10K ops sanity test: PASS (1.4M ops/s)
- Baseline established for Phase C-8 benchmark comparison

Phase 3d sequence:
- Phase A: SlabMeta Box boundary (38552c3f3) ✅
- Phase B: TLS Cache Merge (9b0d74640) ✅
- Phase C: Hot/Cold Split (current) ✅

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-20 07:44:07 +09:00
+								        if (ss_is_slab_hot(meta)) {
 								            // Hot slab: high utilization
 								            if (ss->hot_count < 16) {
 								                ss->hot_indices[ss->hot_count++] = (uint8_t)i;
 								            }
 								        } else {
 								            // Cold slab: low utilization
 								            if (ss->cold_count < 16) {
 								                ss->cold_indices[ss->cold_count++] = (uint8_t)i;
 								            }
 								        }
 								    }
 								}
 								// Phase 3d-C: Zero the hot/cold fields when a SuperSlab is initialized
 								static inline void ss_init_hot_cold(SuperSlab* ss) {
 								    if (!ss) return;
 								    ss->hot_count = 0;
 								    ss->cold_count = 0;
-												Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s)

Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab
for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead.

## Changes

### 1. SuperSlab Structure (core/superslab/superslab_types.h)
- Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0)
- Added `empty_count` (uint8_t): Quick check for EMPTY slab availability

### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h)
- Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY
- Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority)
- Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated
- Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs
- Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count

### 3. Free Path Integration (core/box/free_local_box.c)
- After `meta->used--`, check if `meta->used == 0`
- If true, call `ss_mark_slab_empty()` to update empty_mask
- Enables immediate EMPTY detection on every free operation

### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c)
- New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs
- Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries)
- Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()`
- Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead)
- ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing)
- ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs)

## Performance Results

```
Benchmark: Random Mixed 256B (100K iterations)

OFF (default):  10.2M ops/s (baseline)
ON  (ENV=1):    11.5M ops/s (+13.0% improvement) ✅
```

## Expected Impact (from Task-sensei analysis)

**Current bottleneck**:
- Stage 1: 2-5% hit rate (free list broken)
- Stage 2: 3-8% hit rate (rare UNUSED)
- Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck

**Expected with Phase 12-1.1**:
- Stage 0.5: 20-40% hit rate (EMPTY scan)
- Stage 1-2: 20-30% hit rate (combined)
- Stage 3: 30-50% hit rate (significantly reduced)

**Theoretical max**: 25M → 55-70M ops/s (+120-180%)

## Current Gap Analysis

**Observed**: 11.5M ops/s (+13%)
**Expected**: 55-70M ops/s (+120-180%)
**Gap**: Performance regression or missing complementary optimizations

Possible causes:
1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change
2. EMPTY scan overhead (16 SuperSlabs × empty_count check)
3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.)
4. Stage 0.5 too conservative (scan_limit=16, should be higher?)

## Usage

```bash
# Enable EMPTY reuse optimization
export HAKMEM_SS_EMPTY_REUSE=1

# Optional: increase scan limit (trade-off: throughput vs latency)
export HAKMEM_SS_EMPTY_SCAN_LIMIT=32

./bench_random_mixed_hakmem 100000 256 42
```

## Next Steps

**Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M)
**Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect
**Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning)

## Files Modified

Core implementation:
- `core/superslab/superslab_types.h` - empty_mask/empty_count fields
- `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API
- `core/box/free_local_box.c` - Free path EMPTY detection
- `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan

Documentation:
- `CURRENT_TASK.md` - Task-sensei investigation report

---

🎯 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task-sensei (investigation & design analysis)

											
										
										
											2025-11-21 04:56:48 +09:00
+								    // Phase 12-1.1: Initialize EMPTY tracking
 								    ss->empty_mask = 0;
 								    ss->empty_count = 0;
-												Phase 3d-C: Hot/Cold Slab Split - SuperSlab cache locality optimization (baseline established)

Goal: Improve L1D cache hit rate via hot/cold slab separation

Implementation:
- Added hot/cold fields to SuperSlab (superslab_types.h)
  - hot_indices[16] / cold_indices[16]: Index arrays for hot/cold slabs
  - hot_count / cold_count: Number of slabs in each category
- Created ss_hot_cold_box.h: Hot/Cold Split Box API
  - ss_is_slab_hot(): Utilization-based hot classification (>50% usage)
  - ss_update_hot_cold_indices(): Rebuild index arrays on slab activation
  - ss_init_hot_cold(): Initialize fields on SuperSlab creation
- Updated hakmem_tiny_superslab.c:
  - Initialize hot/cold fields in superslab creation (line 786-792)
  - Update hot/cold indices on slab activation (line 1130)
  - Include ss_hot_cold_box.h (line 7)

Architecture:
- Strategy: Hot slabs (high utilization) prioritized for allocation
- Expected: +8-12% from improved cache line locality
- Note: Refill path optimization (hot-first scan) deferred to future commit

Testing:
- Build: Success (LTO warnings are pre-existing)
- 10K ops sanity test: PASS (1.4M ops/s)
- Baseline established for Phase C-8 benchmark comparison

Phase 3d sequence:
- Phase A: SlabMeta Box boundary (38552c3f3) ✅
- Phase B: TLS Cache Merge (9b0d74640) ✅
- Phase C: Hot/Cold Split (current) ✅

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-20 07:44:07 +09:00
 								    // Initialize index arrays to 0 (defensive programming)
 								    for (int i = 0; i < 16; i++) {
 								        ss->hot_indices[i] = 0;
 								        ss->cold_indices[i] = 0;
 								    }
 								}
-												P2: TLS SLL Redesign - class_map default, tls_cached tracking, conditional header restore

This commit completes the P2 phase of the Tiny Pool TLS SLL redesign to fix the
Header/Next pointer conflict that was causing ~30% crash rates.

Changes:
- P2.1: Make class_map lookup the default (ENV: HAKMEM_TINY_NO_CLASS_MAP=1 for legacy)
- P2.2: Add meta->tls_cached field to track blocks cached in TLS SLL
- P2.3: Make Header restoration conditional in tiny_next_store() (default: skip)
- P2.4: Add invariant verification functions (active + tls_cached ≈ used)
- P0.4: Document new ENV variables in ENV_VARS.md

New ENV variables:
- HAKMEM_TINY_ACTIVE_TRACK=1: Enable active/tls_cached tracking (~1% overhead)
- HAKMEM_TINY_NO_CLASS_MAP=1: Disable class_map (legacy mode)
- HAKMEM_TINY_RESTORE_HEADER=1: Force header restoration (legacy mode)
- HAKMEM_TINY_INVARIANT_CHECK=1: Enable invariant verification (debug)
- HAKMEM_TINY_INVARIANT_DUMP=1: Enable periodic state dumps (debug)

Benchmark results (bench_tiny_hot_hakmem 64B):
- Default (class_map ON): 84.49 M ops/sec
- ACTIVE_TRACK=1: 83.62 M ops/sec (-1%)
- NO_CLASS_MAP=1 (legacy): 85.06 M ops/sec
- MT performance: +21-28% vs system allocator

No crashes observed. All tests passed.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-28 14:11:37 +09:00
+								// ============================================================================
 								// P2.4: Invariant Verification for Debug Builds
 								// ============================================================================
 								//
 								// Invariant: active + tls_cached ≈ used
 								//
 								// - active: blocks currently held by user code
 								// - tls_cached: blocks cached in TLS SLL (returned by user, not yet pushed to slab freelist)
 								// - used: total blocks carved from slab and distributed
 								//
 								// Due to concurrent updates, exact equality is not guaranteed.
 								// We allow a small tolerance (delta) for race conditions.
 								//
 								// ENV: HAKMEM_TINY_INVARIANT_CHECK=1 to enable (disabled by default)
 								// ============================================================================
 								// P2.4: Check a single slab against the accounting invariant
 								//       active + tls_cached ≈ used.
 								//
 								// meta:      slab metadata to inspect; NULL or capacity == 0 is treated as
 								//            "nothing to verify" and reported as valid.
 								// tolerance: largest |(active + tls_cached) - used| still accepted. The
 								//            original note suggests 2 to absorb TLS lag.
 								// Returns:   true when the deviation is within tolerance, false otherwise.
 								//
 								// The three counters are read with relaxed atomic loads, so they are
 								// independent snapshots rather than one consistent view.
 								static inline bool ss_verify_slab_invariant(const TinySlabMeta* meta, int tolerance) {
 								    if (!meta) {
 								        return true;  // No slab to check
 								    }
 								    if (meta->capacity == 0) {
 								        return true;  // Uninitialized slab: skip
 								    }
 								    const uint16_t n_used = atomic_load_explicit(&meta->used, memory_order_relaxed);
 								    const uint16_t n_active = atomic_load_explicit(&meta->active, memory_order_relaxed);
 								    const uint16_t n_cached = atomic_load_explicit(&meta->tls_cached, memory_order_relaxed);
 								    // Widen to int before subtracting so the difference may go negative.
 								    const int accounted = (int)n_active + (int)n_cached;
 								    const int expected = (int)n_used;
 								    const int deviation = (accounted < expected) ? (expected - accounted)
 								                                                 : (accounted - expected);
 								    return deviation <= tolerance;
 								}
 								// P2.4: Verify the active + tls_cached ≈ used invariant for the slabs of a
 								//       SuperSlab.
 								//
 								// ss:        SuperSlab to inspect; NULL yields 0.
 								// tolerance: forwarded unchanged to ss_verify_slab_invariant().
 								// Returns:   number of slabs that violate the invariant, or 0 when checking
 								//            is disabled.
 								//
 								// ENV: HAKMEM_TINY_INVARIANT_CHECK — checking runs only when this is set to a
 								//      non-empty value that does not start with '0'. The variable is read
 								//      once, on the first call, and the result is cached in a function-local
 								//      static.
 								//
 								// Only the first min(slab count derived from lg_size,
 								// SLABS_PER_SUPERSLAB_MAX, ss->active_slabs) entries of ss->slabs are
 								// examined.
 								static inline int ss_verify_superslab_invariants(const SuperSlab* ss, int tolerance) {
 								    static int g_invariant_check = -1;
 								    if (__builtin_expect(g_invariant_check == -1, 0)) {
 								        const char* e = getenv("HAKMEM_TINY_INVARIANT_CHECK");
 								        g_invariant_check = (e && *e && *e != '0') ? 1 : 0;
 								    }
 								    if (!g_invariant_check) return 0;  // Disabled by ENV
 								    if (!ss) return 0;
 								    int violations = 0;
 								    // Slab count implied by the SuperSlab size, clamped to the array bound.
 								    uint32_t max_slabs = (1u << ss->lg_size) / SLAB_SIZE;
 								    if (max_slabs > SLABS_PER_SUPERSLAB_MAX) {
 								        max_slabs = SLABS_PER_SUPERSLAB_MAX;
 								    }
 								    for (uint32_t i = 0; i < max_slabs && i < ss->active_slabs; i++) {
 								        const TinySlabMeta* meta = &ss->slabs[i];
 								        if (ss_verify_slab_invariant(meta, tolerance)) {
 								            continue;
 								        }
 								        violations++;
 								#ifndef NDEBUG
 								        {
 								            // Snapshot each counter exactly once so the printed sum always
 								            // equals the printed active + tls_cached, and pass real
 								            // unsigned values to the %u conversions.
 								            const unsigned used = atomic_load_explicit(&meta->used, memory_order_relaxed);
 								            const unsigned active = atomic_load_explicit(&meta->active, memory_order_relaxed);
 								            const unsigned tls_cached = atomic_load_explicit(&meta->tls_cached, memory_order_relaxed);
 								            fprintf(stderr, "[P2.4] Invariant VIOLATION: slab[%u] used=%u active=%u tls_cached=%u (sum=%u)\n",
 								                    (unsigned)i, used, active, tls_cached, active + tls_cached);
 								        }
 								#endif
 								    }
 								    return violations;
 								}
 								// P2.4: Print the counters of one slab to stderr for troubleshooting.
 								//
 								// ss:       SuperSlab that owns the slab; NULL is ignored.
 								// slab_idx: index into ss->slabs; values outside [0, ss->active_slabs) are
 								//           ignored.
 								//
 								// ENV: HAKMEM_TINY_INVARIANT_DUMP — output is produced only when this is set
 								//      to a non-empty value that does not start with '0' (read once, then
 								//      cached). When NDEBUG is defined the function compiles to a no-op.
 								static inline void ss_dump_slab_state(const SuperSlab* ss, int slab_idx) {
 								#ifdef NDEBUG
 								    // Release build: nothing to do, just silence unused-parameter warnings.
 								    (void)ss;
 								    (void)slab_idx;
 								#else
 								    static int s_dump_on = -1;  // -1 = environment not consulted yet
 								    if (__builtin_expect(s_dump_on == -1, 0)) {
 								        const char* env = getenv("HAKMEM_TINY_INVARIANT_DUMP");
 								        int enabled = 0;
 								        if (env && *env && *env != '0') {
 								            enabled = 1;
 								        }
 								        s_dump_on = enabled;
 								    }
 								    if (!s_dump_on) {
 								        return;
 								    }
 								    if (!ss) {
 								        return;
 								    }
 								    if (slab_idx < 0) {
 								        return;
 								    }
 								    if (slab_idx >= (int)ss->active_slabs) {
 								        return;
 								    }
 								    const TinySlabMeta* meta = &ss->slabs[slab_idx];
 								    fprintf(stderr, "[P2.4-DUMP] slab[%d]: used=%u active=%u tls_cached=%u capacity=%u class=%u\n",
 								            slab_idx, meta->used,
 								            atomic_load_explicit(&meta->active, memory_order_relaxed),
 								            atomic_load_explicit(&meta->tls_cached, memory_order_relaxed),
 								            meta->capacity, meta->class_idx);
 								#endif
 								}
-												Phase 3d-C: Hot/Cold Slab Split - SuperSlab cache locality optimization (baseline established)

Goal: Improve L1D cache hit rate via hot/cold slab separation

Implementation:
- Added hot/cold fields to SuperSlab (superslab_types.h)
  - hot_indices[16] / cold_indices[16]: Index arrays for hot/cold slabs
  - hot_count / cold_count: Number of slabs in each category
- Created ss_hot_cold_box.h: Hot/Cold Split Box API
  - ss_is_slab_hot(): Utilization-based hot classification (>50% usage)
  - ss_update_hot_cold_indices(): Rebuild index arrays on slab activation
  - ss_init_hot_cold(): Initialize fields on SuperSlab creation
- Updated hakmem_tiny_superslab.c:
  - Initialize hot/cold fields in superslab creation (line 786-792)
  - Update hot/cold indices on slab activation (line 1130)
  - Include ss_hot_cold_box.h (line 7)

Architecture:
- Strategy: Hot slabs (high utilization) prioritized for allocation
- Expected: +8-12% from improved cache line locality
- Note: Refill path optimization (hot-first scan) deferred to future commit

Testing:
- Build: Success (LTO warnings are pre-existing)
- 10K ops sanity test: PASS (1.4M ops/s)
- Baseline established for Phase C-8 benchmark comparison

Phase 3d sequence:
- Phase A: SlabMeta Box boundary (38552c3f3) ✅
- Phase B: TLS Cache Merge (9b0d74640) ✅
- Phase C: Hot/Cold Split (current) ✅

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-20 07:44:07 +09:00
+								#endif // SS_HOT_COLD_BOX_H