Files
hakmem/core/box/ss_hot_cold_box.h

254 lines
9.3 KiB
C
Raw Normal View History

// ss_hot_cold_box.h - Phase 3d-C: Hot/Cold Slab Split Box
// Purpose: Cache locality optimization via hot/cold slab separation
// License: MIT
// Date: 2025-11-20
#ifndef SS_HOT_COLD_BOX_H
#define SS_HOT_COLD_BOX_H
#include "../superslab/superslab_types.h"
#include <stdbool.h>
#include <stdlib.h> // P1.3: for getenv()
#include <stdio.h> // P2.4: for fprintf() in debug output
// ============================================================================
// Phase 3d-C: Hot/Cold Split Box API
// ============================================================================
//
// Goal: Improve L1D cache hit rate by separating hot (high utilization) and
// cold (low utilization) slabs within a SuperSlab.
//
// Strategy:
// - Hot slabs (used > 50%): Prioritized for allocation → better cache locality
// - Cold slabs (used ≤ 50%): Used as fallback → delayed deallocation
//
// Expected: +8-12% throughput from improved cache line locality
//
// Box Contract:
// - ss_is_slab_hot(): Returns true if slab should be considered "hot"
// - ss_update_hot_cold_indices(): Rebuilds hot/cold index arrays
// - ss_init_hot_cold(): Initializes hot/cold fields on SuperSlab creation
//
// ============================================================================
// Phase 3d-C: Hot/Cold判定閾値
#define HOT_UTILIZATION_THRESHOLD 50 // 使用率50%以上でホット判定
Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s) Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead. ## Changes ### 1. SuperSlab Structure (core/superslab/superslab_types.h) - Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0) - Added `empty_count` (uint8_t): Quick check for EMPTY slab availability ### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h) - Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY - Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority) - Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated - Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs - Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count ### 3. Free Path Integration (core/box/free_local_box.c) - After `meta->used--`, check if `meta->used == 0` - If true, call `ss_mark_slab_empty()` to update empty_mask - Enables immediate EMPTY detection on every free operation ### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c) - New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs - Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries) - Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()` - Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead) - ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing) - ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs) ## Performance Results ``` Benchmark: Random Mixed 256B (100K iterations) OFF (default): 10.2M ops/s (baseline) ON (ENV=1): 11.5M ops/s (+13.0% improvement) ✅ ``` ## Expected Impact (from Task-sensei analysis) **Current bottleneck**: - Stage 1: 2-5% hit rate (free list broken) - Stage 2: 3-8% hit rate (rare UNUSED) - Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck **Expected with Phase 12-1.1**: - Stage 0.5: 20-40% hit rate (EMPTY scan) - Stage 1-2: 20-30% hit rate (combined) - Stage 3: 30-50% hit rate (significantly reduced) **Theoretical max**: 25M → 55-70M ops/s (+120-180%) ## Current Gap Analysis **Observed**: 11.5M ops/s (+13%) **Expected**: 55-70M ops/s (+120-180%) **Gap**: Performance regression or missing complementary optimizations Possible causes: 1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change 2. EMPTY scan overhead (16 SuperSlabs × empty_count check) 3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.) 4. Stage 0.5 too conservative (scan_limit=16, should be higher?) ## Usage ```bash # Enable EMPTY reuse optimization export HAKMEM_SS_EMPTY_REUSE=1 # Optional: increase scan limit (trade-off: throughput vs latency) export HAKMEM_SS_EMPTY_SCAN_LIMIT=32 ./bench_random_mixed_hakmem 100000 256 42 ``` ## Next Steps **Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M) **Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect **Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning) ## Files Modified Core implementation: - `core/superslab/superslab_types.h` - empty_mask/empty_count fields - `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API - `core/box/free_local_box.c` - Free path EMPTY detection - `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan Documentation: - `CURRENT_TASK.md` - Task-sensei investigation report --- 🎯 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Task-sensei (investigation & design analysis)
2025-11-21 04:56:48 +09:00
// Phase 12-1.1: EMPTY判定ロジック最優先再利用
// P1.3: ENV gate for active-based empty detection
// ENV: HAKMEM_TINY_ACTIVE_TRACK=1 → use active, else use used
// Returns: true if slab is completely EMPTY (highest reuse priority)
Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s) Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead. ## Changes ### 1. SuperSlab Structure (core/superslab/superslab_types.h) - Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0) - Added `empty_count` (uint8_t): Quick check for EMPTY slab availability ### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h) - Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY - Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority) - Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated - Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs - Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count ### 3. Free Path Integration (core/box/free_local_box.c) - After `meta->used--`, check if `meta->used == 0` - If true, call `ss_mark_slab_empty()` to update empty_mask - Enables immediate EMPTY detection on every free operation ### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c) - New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs - Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries) - Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()` - Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead) - ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing) - ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs) ## Performance Results ``` Benchmark: Random Mixed 256B (100K iterations) OFF (default): 10.2M ops/s (baseline) ON (ENV=1): 11.5M ops/s (+13.0% improvement) ✅ ``` ## Expected Impact (from Task-sensei analysis) **Current bottleneck**: - Stage 1: 2-5% hit rate (free list broken) - Stage 2: 3-8% hit rate (rare UNUSED) - Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck **Expected with Phase 12-1.1**: - Stage 0.5: 20-40% hit rate (EMPTY scan) - Stage 1-2: 20-30% hit rate (combined) - Stage 3: 30-50% hit rate (significantly reduced) **Theoretical max**: 25M → 55-70M ops/s (+120-180%) ## Current Gap Analysis **Observed**: 11.5M ops/s (+13%) **Expected**: 55-70M ops/s (+120-180%) **Gap**: Performance regression or missing complementary optimizations Possible causes: 1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change 2. EMPTY scan overhead (16 SuperSlabs × empty_count check) 3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.) 4. Stage 0.5 too conservative (scan_limit=16, should be higher?) ## Usage ```bash # Enable EMPTY reuse optimization export HAKMEM_SS_EMPTY_REUSE=1 # Optional: increase scan limit (trade-off: throughput vs latency) export HAKMEM_SS_EMPTY_SCAN_LIMIT=32 ./bench_random_mixed_hakmem 100000 256 42 ``` ## Next Steps **Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M) **Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect **Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning) ## Files Modified Core implementation: - `core/superslab/superslab_types.h` - empty_mask/empty_count fields - `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API - `core/box/free_local_box.c` - Free path EMPTY detection - `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan Documentation: - `CURRENT_TASK.md` - Task-sensei investigation report --- 🎯 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Task-sensei (investigation & design analysis)
2025-11-21 04:56:48 +09:00
static inline bool ss_is_slab_empty(const TinySlabMeta* meta) {
if (meta->capacity == 0) return false;
// P1.3: Use active-based empty detection if enabled
static int g_use_active = -1;
if (__builtin_expect(g_use_active == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_ACTIVE_TRACK");
g_use_active = (e && *e && *e != '0') ? 1 : 0;
}
if (g_use_active) {
// P1.3: active == 0 means all blocks returned by user (even if some in TLS SLL)
uint16_t act = atomic_load_explicit(&meta->active, memory_order_relaxed);
return (act == 0);
} else {
// Legacy: used == 0 (doesn't account for TLS SLL)
return (meta->used == 0);
}
Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s) Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead. ## Changes ### 1. SuperSlab Structure (core/superslab/superslab_types.h) - Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0) - Added `empty_count` (uint8_t): Quick check for EMPTY slab availability ### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h) - Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY - Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority) - Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated - Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs - Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count ### 3. Free Path Integration (core/box/free_local_box.c) - After `meta->used--`, check if `meta->used == 0` - If true, call `ss_mark_slab_empty()` to update empty_mask - Enables immediate EMPTY detection on every free operation ### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c) - New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs - Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries) - Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()` - Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead) - ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing) - ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs) ## Performance Results ``` Benchmark: Random Mixed 256B (100K iterations) OFF (default): 10.2M ops/s (baseline) ON (ENV=1): 11.5M ops/s (+13.0% improvement) ✅ ``` ## Expected Impact (from Task-sensei analysis) **Current bottleneck**: - Stage 1: 2-5% hit rate (free list broken) - Stage 2: 3-8% hit rate (rare UNUSED) - Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck **Expected with Phase 12-1.1**: - Stage 0.5: 20-40% hit rate (EMPTY scan) - Stage 1-2: 20-30% hit rate (combined) - Stage 3: 30-50% hit rate (significantly reduced) **Theoretical max**: 25M → 55-70M ops/s (+120-180%) ## Current Gap Analysis **Observed**: 11.5M ops/s (+13%) **Expected**: 55-70M ops/s (+120-180%) **Gap**: Performance regression or missing complementary optimizations Possible causes: 1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change 2. EMPTY scan overhead (16 SuperSlabs × empty_count check) 3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.) 4. Stage 0.5 too conservative (scan_limit=16, should be higher?) ## Usage ```bash # Enable EMPTY reuse optimization export HAKMEM_SS_EMPTY_REUSE=1 # Optional: increase scan limit (trade-off: throughput vs latency) export HAKMEM_SS_EMPTY_SCAN_LIMIT=32 ./bench_random_mixed_hakmem 100000 256 42 ``` ## Next Steps **Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M) **Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect **Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning) ## Files Modified Core implementation: - `core/superslab/superslab_types.h` - empty_mask/empty_count fields - `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API - `core/box/free_local_box.c` - Free path EMPTY detection - `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan Documentation: - `CURRENT_TASK.md` - Task-sensei investigation report --- 🎯 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Task-sensei (investigation & design analysis)
2025-11-21 04:56:48 +09:00
}
// Phase 3d-C: Hot判定ロジック
// Returns: true if slab is "hot" (high utilization, should be prioritized)
static inline bool ss_is_slab_hot(const TinySlabMeta* meta) {
// ヒューリスティック: 使用率 > 50% → ホット
// 理由: 使用率が高い = 頻繁にアクセスされている = キャッシュに載せたい
if (meta->capacity == 0) {
return false; // Uninitialized slab
}
return (meta->used * 100 / meta->capacity) > HOT_UTILIZATION_THRESHOLD;
}
Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s) Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead. ## Changes ### 1. SuperSlab Structure (core/superslab/superslab_types.h) - Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0) - Added `empty_count` (uint8_t): Quick check for EMPTY slab availability ### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h) - Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY - Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority) - Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated - Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs - Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count ### 3. Free Path Integration (core/box/free_local_box.c) - After `meta->used--`, check if `meta->used == 0` - If true, call `ss_mark_slab_empty()` to update empty_mask - Enables immediate EMPTY detection on every free operation ### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c) - New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs - Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries) - Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()` - Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead) - ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing) - ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs) ## Performance Results ``` Benchmark: Random Mixed 256B (100K iterations) OFF (default): 10.2M ops/s (baseline) ON (ENV=1): 11.5M ops/s (+13.0% improvement) ✅ ``` ## Expected Impact (from Task-sensei analysis) **Current bottleneck**: - Stage 1: 2-5% hit rate (free list broken) - Stage 2: 3-8% hit rate (rare UNUSED) - Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck **Expected with Phase 12-1.1**: - Stage 0.5: 20-40% hit rate (EMPTY scan) - Stage 1-2: 20-30% hit rate (combined) - Stage 3: 30-50% hit rate (significantly reduced) **Theoretical max**: 25M → 55-70M ops/s (+120-180%) ## Current Gap Analysis **Observed**: 11.5M ops/s (+13%) **Expected**: 55-70M ops/s (+120-180%) **Gap**: Performance regression or missing complementary optimizations Possible causes: 1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change 2. EMPTY scan overhead (16 SuperSlabs × empty_count check) 3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.) 4. Stage 0.5 too conservative (scan_limit=16, should be higher?) ## Usage ```bash # Enable EMPTY reuse optimization export HAKMEM_SS_EMPTY_REUSE=1 # Optional: increase scan limit (trade-off: throughput vs latency) export HAKMEM_SS_EMPTY_SCAN_LIMIT=32 ./bench_random_mixed_hakmem 100000 256 42 ``` ## Next Steps **Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M) **Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect **Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning) ## Files Modified Core implementation: - `core/superslab/superslab_types.h` - empty_mask/empty_count fields - `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API - `core/box/free_local_box.c` - Free path EMPTY detection - `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan Documentation: - `CURRENT_TASK.md` - Task-sensei investigation report --- 🎯 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Task-sensei (investigation & design analysis)
2025-11-21 04:56:48 +09:00
// Phase 12-1.1: EMPTY mask更新ヘルパー
// Marks a slab as EMPTY (highest reuse priority)
static inline void ss_mark_slab_empty(SuperSlab* ss, int slab_idx) {
if (!ss || slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) return;
uint32_t bit = (1u << slab_idx);
if (!(ss->empty_mask & bit)) {
ss->empty_mask |= bit;
ss->empty_count++;
}
}
// Phase 12-1.1: EMPTY mask クリアヘルパー
// Removes a slab from EMPTY state (when reactivated)
static inline void ss_clear_slab_empty(SuperSlab* ss, int slab_idx) {
if (!ss || slab_idx < 0 || slab_idx >= SLABS_PER_SUPERSLAB_MAX) return;
uint32_t bit = (1u << slab_idx);
if (ss->empty_mask & bit) {
ss->empty_mask &= ~bit;
ss->empty_count--;
}
}
// Phase 3d-C: Hot/Cold インデックス更新
// Rebuilds hot_indices[] and cold_indices[] arrays based on current slab state
static inline void ss_update_hot_cold_indices(SuperSlab* ss) {
if (!ss) return;
ss->hot_count = 0;
ss->cold_count = 0;
Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s) Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead. ## Changes ### 1. SuperSlab Structure (core/superslab/superslab_types.h) - Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0) - Added `empty_count` (uint8_t): Quick check for EMPTY slab availability ### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h) - Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY - Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority) - Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated - Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs - Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count ### 3. Free Path Integration (core/box/free_local_box.c) - After `meta->used--`, check if `meta->used == 0` - If true, call `ss_mark_slab_empty()` to update empty_mask - Enables immediate EMPTY detection on every free operation ### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c) - New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs - Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries) - Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()` - Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead) - ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing) - ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs) ## Performance Results ``` Benchmark: Random Mixed 256B (100K iterations) OFF (default): 10.2M ops/s (baseline) ON (ENV=1): 11.5M ops/s (+13.0% improvement) ✅ ``` ## Expected Impact (from Task-sensei analysis) **Current bottleneck**: - Stage 1: 2-5% hit rate (free list broken) - Stage 2: 3-8% hit rate (rare UNUSED) - Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck **Expected with Phase 12-1.1**: - Stage 0.5: 20-40% hit rate (EMPTY scan) - Stage 1-2: 20-30% hit rate (combined) - Stage 3: 30-50% hit rate (significantly reduced) **Theoretical max**: 25M → 55-70M ops/s (+120-180%) ## Current Gap Analysis **Observed**: 11.5M ops/s (+13%) **Expected**: 55-70M ops/s (+120-180%) **Gap**: Performance regression or missing complementary optimizations Possible causes: 1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change 2. EMPTY scan overhead (16 SuperSlabs × empty_count check) 3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.) 4. Stage 0.5 too conservative (scan_limit=16, should be higher?) ## Usage ```bash # Enable EMPTY reuse optimization export HAKMEM_SS_EMPTY_REUSE=1 # Optional: increase scan limit (trade-off: throughput vs latency) export HAKMEM_SS_EMPTY_SCAN_LIMIT=32 ./bench_random_mixed_hakmem 100000 256 42 ``` ## Next Steps **Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M) **Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect **Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning) ## Files Modified Core implementation: - `core/superslab/superslab_types.h` - empty_mask/empty_count fields - `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API - `core/box/free_local_box.c` - Free path EMPTY detection - `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan Documentation: - `CURRENT_TASK.md` - Task-sensei investigation report --- 🎯 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Task-sensei (investigation & design analysis)
2025-11-21 04:56:48 +09:00
// Phase 12-1.1: Reset empty tracking
ss->empty_mask = 0;
ss->empty_count = 0;
uint32_t max_slabs = (1u << ss->lg_size) / SLAB_SIZE;
if (max_slabs > SLABS_PER_SUPERSLAB_MAX) {
max_slabs = SLABS_PER_SUPERSLAB_MAX;
}
Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s) Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead. ## Changes ### 1. SuperSlab Structure (core/superslab/superslab_types.h) - Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0) - Added `empty_count` (uint8_t): Quick check for EMPTY slab availability ### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h) - Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY - Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority) - Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated - Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs - Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count ### 3. Free Path Integration (core/box/free_local_box.c) - After `meta->used--`, check if `meta->used == 0` - If true, call `ss_mark_slab_empty()` to update empty_mask - Enables immediate EMPTY detection on every free operation ### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c) - New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs - Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries) - Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()` - Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead) - ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing) - ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs) ## Performance Results ``` Benchmark: Random Mixed 256B (100K iterations) OFF (default): 10.2M ops/s (baseline) ON (ENV=1): 11.5M ops/s (+13.0% improvement) ✅ ``` ## Expected Impact (from Task-sensei analysis) **Current bottleneck**: - Stage 1: 2-5% hit rate (free list broken) - Stage 2: 3-8% hit rate (rare UNUSED) - Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck **Expected with Phase 12-1.1**: - Stage 0.5: 20-40% hit rate (EMPTY scan) - Stage 1-2: 20-30% hit rate (combined) - Stage 3: 30-50% hit rate (significantly reduced) **Theoretical max**: 25M → 55-70M ops/s (+120-180%) ## Current Gap Analysis **Observed**: 11.5M ops/s (+13%) **Expected**: 55-70M ops/s (+120-180%) **Gap**: Performance regression or missing complementary optimizations Possible causes: 1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change 2. EMPTY scan overhead (16 SuperSlabs × empty_count check) 3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.) 4. Stage 0.5 too conservative (scan_limit=16, should be higher?) ## Usage ```bash # Enable EMPTY reuse optimization export HAKMEM_SS_EMPTY_REUSE=1 # Optional: increase scan limit (trade-off: throughput vs latency) export HAKMEM_SS_EMPTY_SCAN_LIMIT=32 ./bench_random_mixed_hakmem 100000 256 42 ``` ## Next Steps **Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M) **Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect **Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning) ## Files Modified Core implementation: - `core/superslab/superslab_types.h` - empty_mask/empty_count fields - `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API - `core/box/free_local_box.c` - Free path EMPTY detection - `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan Documentation: - `CURRENT_TASK.md` - Task-sensei investigation report --- 🎯 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Task-sensei (investigation & design analysis)
2025-11-21 04:56:48 +09:00
// Scan active slabs and classify as EMPTY / hot / cold
for (uint32_t i = 0; i < max_slabs && i < ss->active_slabs; i++) {
TinySlabMeta* meta = &ss->slabs[i];
// Skip uninitialized slabs (capacity == 0)
if (meta->capacity == 0) {
continue;
}
Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s) Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead. ## Changes ### 1. SuperSlab Structure (core/superslab/superslab_types.h) - Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0) - Added `empty_count` (uint8_t): Quick check for EMPTY slab availability ### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h) - Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY - Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority) - Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated - Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs - Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count ### 3. Free Path Integration (core/box/free_local_box.c) - After `meta->used--`, check if `meta->used == 0` - If true, call `ss_mark_slab_empty()` to update empty_mask - Enables immediate EMPTY detection on every free operation ### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c) - New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs - Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries) - Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()` - Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead) - ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing) - ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs) ## Performance Results ``` Benchmark: Random Mixed 256B (100K iterations) OFF (default): 10.2M ops/s (baseline) ON (ENV=1): 11.5M ops/s (+13.0% improvement) ✅ ``` ## Expected Impact (from Task-sensei analysis) **Current bottleneck**: - Stage 1: 2-5% hit rate (free list broken) - Stage 2: 3-8% hit rate (rare UNUSED) - Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck **Expected with Phase 12-1.1**: - Stage 0.5: 20-40% hit rate (EMPTY scan) - Stage 1-2: 20-30% hit rate (combined) - Stage 3: 30-50% hit rate (significantly reduced) **Theoretical max**: 25M → 55-70M ops/s (+120-180%) ## Current Gap Analysis **Observed**: 11.5M ops/s (+13%) **Expected**: 55-70M ops/s (+120-180%) **Gap**: Performance regression or missing complementary optimizations Possible causes: 1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change 2. EMPTY scan overhead (16 SuperSlabs × empty_count check) 3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.) 4. Stage 0.5 too conservative (scan_limit=16, should be higher?) ## Usage ```bash # Enable EMPTY reuse optimization export HAKMEM_SS_EMPTY_REUSE=1 # Optional: increase scan limit (trade-off: throughput vs latency) export HAKMEM_SS_EMPTY_SCAN_LIMIT=32 ./bench_random_mixed_hakmem 100000 256 42 ``` ## Next Steps **Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M) **Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect **Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning) ## Files Modified Core implementation: - `core/superslab/superslab_types.h` - empty_mask/empty_count fields - `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API - `core/box/free_local_box.c` - Free path EMPTY detection - `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan Documentation: - `CURRENT_TASK.md` - Task-sensei investigation report --- 🎯 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Task-sensei (investigation & design analysis)
2025-11-21 04:56:48 +09:00
// Phase 12-1.1: EMPTY slabs have highest reuse priority
if (ss_is_slab_empty(meta)) {
ss_mark_slab_empty(ss, (int)i);
continue; // Don't add to hot/cold arrays
}
if (ss_is_slab_hot(meta)) {
// Hot slab: high utilization
if (ss->hot_count < 16) {
ss->hot_indices[ss->hot_count++] = (uint8_t)i;
}
} else {
// Cold slab: low utilization
if (ss->cold_count < 16) {
ss->cold_indices[ss->cold_count++] = (uint8_t)i;
}
}
}
}
// Phase 3d-C: SuperSlab初期化時にhot/cold fieldsをゼロクリア
static inline void ss_init_hot_cold(SuperSlab* ss) {
if (!ss) return;
ss->hot_count = 0;
ss->cold_count = 0;
Phase 12-1.1: EMPTY Slab Detection + Immediate Reuse (+13% improvement, 10.2M→11.5M ops/s) Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead. ## Changes ### 1. SuperSlab Structure (core/superslab/superslab_types.h) - Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0) - Added `empty_count` (uint8_t): Quick check for EMPTY slab availability ### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h) - Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY - Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority) - Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated - Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs - Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count ### 3. Free Path Integration (core/box/free_local_box.c) - After `meta->used--`, check if `meta->used == 0` - If true, call `ss_mark_slab_empty()` to update empty_mask - Enables immediate EMPTY detection on every free operation ### 4. Shared Pool Stage 0.5 (core/hakmem_shared_pool.c) - New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs - Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries) - Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()` - Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead) - ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing) - ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs) ## Performance Results ``` Benchmark: Random Mixed 256B (100K iterations) OFF (default): 10.2M ops/s (baseline) ON (ENV=1): 11.5M ops/s (+13.0% improvement) ✅ ``` ## Expected Impact (from Task-sensei analysis) **Current bottleneck**: - Stage 1: 2-5% hit rate (free list broken) - Stage 2: 3-8% hit rate (rare UNUSED) - Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck **Expected with Phase 12-1.1**: - Stage 0.5: 20-40% hit rate (EMPTY scan) - Stage 1-2: 20-30% hit rate (combined) - Stage 3: 30-50% hit rate (significantly reduced) **Theoretical max**: 25M → 55-70M ops/s (+120-180%) ## Current Gap Analysis **Observed**: 11.5M ops/s (+13%) **Expected**: 55-70M ops/s (+120-180%) **Gap**: Performance regression or missing complementary optimizations Possible causes: 1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change 2. EMPTY scan overhead (16 SuperSlabs × empty_count check) 3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.) 4. Stage 0.5 too conservative (scan_limit=16, should be higher?) ## Usage ```bash # Enable EMPTY reuse optimization export HAKMEM_SS_EMPTY_REUSE=1 # Optional: increase scan limit (trade-off: throughput vs latency) export HAKMEM_SS_EMPTY_SCAN_LIMIT=32 ./bench_random_mixed_hakmem 100000 256 42 ``` ## Next Steps **Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M) **Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect **Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning) ## Files Modified Core implementation: - `core/superslab/superslab_types.h` - empty_mask/empty_count fields - `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API - `core/box/free_local_box.c` - Free path EMPTY detection - `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan Documentation: - `CURRENT_TASK.md` - Task-sensei investigation report --- 🎯 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Task-sensei (investigation & design analysis)
2025-11-21 04:56:48 +09:00
// Phase 12-1.1: Initialize EMPTY tracking
ss->empty_mask = 0;
ss->empty_count = 0;
// Initialize index arrays to 0 (defensive programming)
for (int i = 0; i < 16; i++) {
ss->hot_indices[i] = 0;
ss->cold_indices[i] = 0;
}
}
// ============================================================================
// P2.4: Invariant Verification for Debug Builds
// ============================================================================
//
// Invariant: active + tls_cached ≈ used
//
// - active: blocks currently held by user code
// - tls_cached: blocks cached in TLS SLL (returned by user, not yet pushed to slab freelist)
// - used: total blocks carved from slab and distributed
//
// Due to concurrent updates, exact equality is not guaranteed.
// We allow a small tolerance (delta) for race conditions.
//
// ENV: HAKMEM_TINY_INVARIANT_CHECK=1 to enable (disabled by default)
// ============================================================================
// P2.4: Verify slab invariant: active + tls_cached ≈ used
// Returns: true if invariant holds within tolerance, false if violated
// tolerance: maximum allowed deviation (default: 2 for TLS lag)
static inline bool ss_verify_slab_invariant(const TinySlabMeta* meta, int tolerance) {
if (!meta || meta->capacity == 0) return true; // Skip uninitialized slabs
uint16_t used = atomic_load_explicit(&meta->used, memory_order_relaxed);
uint16_t active = atomic_load_explicit(&meta->active, memory_order_relaxed);
uint16_t tls_cached = atomic_load_explicit(&meta->tls_cached, memory_order_relaxed);
int sum = (int)active + (int)tls_cached;
int diff = sum - (int)used;
if (diff < 0) diff = -diff; // abs(diff)
return (diff <= tolerance);
}
// P2.4: Verify all slab invariants in a SuperSlab
// Returns: count of slabs that violate the invariant
// ENV: HAKMEM_TINY_INVARIANT_CHECK=1 to enable checking
static inline int ss_verify_superslab_invariants(const SuperSlab* ss, int tolerance) {
static int g_invariant_check = -1;
if (__builtin_expect(g_invariant_check == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_INVARIANT_CHECK");
g_invariant_check = (e && *e && *e != '0') ? 1 : 0;
}
if (!g_invariant_check) return 0; // Disabled by ENV
if (!ss) return 0;
int violations = 0;
uint32_t max_slabs = (1u << ss->lg_size) / SLAB_SIZE;
if (max_slabs > SLABS_PER_SUPERSLAB_MAX) {
max_slabs = SLABS_PER_SUPERSLAB_MAX;
}
for (uint32_t i = 0; i < max_slabs && i < ss->active_slabs; i++) {
const TinySlabMeta* meta = &ss->slabs[i];
if (!ss_verify_slab_invariant(meta, tolerance)) {
violations++;
#ifndef NDEBUG
// Debug output for violations
fprintf(stderr, "[P2.4] Invariant VIOLATION: slab[%u] used=%u active=%u tls_cached=%u (sum=%u)\n",
i, meta->used,
atomic_load_explicit(&meta->active, memory_order_relaxed),
atomic_load_explicit(&meta->tls_cached, memory_order_relaxed),
atomic_load_explicit(&meta->active, memory_order_relaxed) +
atomic_load_explicit(&meta->tls_cached, memory_order_relaxed));
#endif
}
}
return violations;
}
// P2.4: Debug dump of slab state for troubleshooting
// ENV: HAKMEM_TINY_INVARIANT_DUMP=1 to enable periodic dumps
static inline void ss_dump_slab_state(const SuperSlab* ss, int slab_idx) {
#ifndef NDEBUG
static int g_dump_enabled = -1;
if (__builtin_expect(g_dump_enabled == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_INVARIANT_DUMP");
g_dump_enabled = (e && *e && *e != '0') ? 1 : 0;
}
if (!g_dump_enabled) return;
if (!ss || slab_idx < 0 || slab_idx >= (int)ss->active_slabs) return;
const TinySlabMeta* meta = &ss->slabs[slab_idx];
fprintf(stderr, "[P2.4-DUMP] slab[%d]: used=%u active=%u tls_cached=%u capacity=%u class=%u\n",
slab_idx, meta->used,
atomic_load_explicit(&meta->active, memory_order_relaxed),
atomic_load_explicit(&meta->tls_cached, memory_order_relaxed),
meta->capacity, meta->class_idx);
#else
(void)ss;
(void)slab_idx;
#endif
}
#endif // SS_HOT_COLD_BOX_H