#include "hakmem_shared_pool_internal.h"
#include "hakmem_debug_master.h"
#include "hakmem_stats_master.h"
#include "box/ss_slab_meta_box.h"
#include "box/ss_hot_cold_box.h"
#include "box/pagefault_telemetry_box.h"
#include "box/tls_sll_drain_box.h"
#include "box/tls_slab_reuse_guard_box.h"
#include "box/ss_tier_box.h"       // P-Tier: Tier filtering support
#include "hakmem_policy.h"
#include "hakmem_env_cache.h"      // Priority-2: ENV cache
#include "front/tiny_warm_pool.h"  // Warm Pool: Prefill during registry scans

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>    // uint64_t counters below
#include <stdatomic.h>
#include <pthread.h>   // pthread_mutex_lock/unlock on g_shared_pool.alloc_lock

// ============================================================================
// Performance Measurement: Shared Pool Lock Contention (ENV-gated)
// ============================================================================
// Global atomic counters for lock contention measurement.
// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF)
_Atomic uint64_t g_sp_stage2_lock_acquired_global = 0;
_Atomic uint64_t g_sp_stage3_lock_acquired_global = 0;
_Atomic uint64_t g_sp_alloc_lock_contention_global = 0;

// Per-class lock acquisition statistics (observes lock load per Tiny class)
_Atomic uint64_t g_sp_stage2_lock_acquired_by_class[TINY_NUM_CLASSES_SS] = {0};
_Atomic uint64_t g_sp_stage3_lock_acquired_by_class[TINY_NUM_CLASSES_SS] = {0};

// Check whether measurement is enabled (result cached after the first call)
static inline int sp_measure_enabled(void) {
    static int g_measure = -1;
    if (__builtin_expect(g_measure == -1, 0)) {
        const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE");
        g_measure = (e && *e && *e != '0') ? 1 : 0;
        if (g_measure == 1) {
            // When measurement is ON, also enable per-class stage stats
            // (Stage 1/2/3 hit counts are accumulated into g_sp_stage*_hits).
            extern int g_sp_stage_stats_enabled;
            g_sp_stage_stats_enabled = 1;
        }
    }
    return g_measure;
}

// Print statistics (defined at the bottom of this file)
void shared_pool_print_measurements(void);
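// Usage sketch (illustrative, not part of the build): enable the ENV switch at
// launch and dump the counters once at process exit. The atexit() wiring is an
// assumption about how a host application might consume these stats; the only
// facts relied on are the HAKMEM_MEASURE_UNIFIED_CACHE variable and the
// shared_pool_print_measurements() declaration above.
//
//     $ HAKMEM_MEASURE_UNIFIED_CACHE=1 ./my_benchmark
//
//     /* in application init code (hypothetical): */
//     #include <stdlib.h>
//     static void dump_sp_stats(void) { shared_pool_print_measurements(); }
//     ...
//     atexit(dump_sp_stats);
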
// Stage 0.5: EMPTY slab direct scan (registry-based EMPTY reuse)
// Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
// avoid Stage 3 (mmap) when freed slabs are available.
//
// WARM POOL OPTIMIZATION:
// - During the registry scan, prefill the warm pool with HOT SuperSlabs
// - This eliminates future registry scans for cache misses
// - Expected gain: +40-50% by reducing O(N) scan overhead
static inline int
sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out, int dbg_acquire)
{
    // Priority-2: Use cached ENV
    int empty_reuse_enabled = HAK_ENV_SS_EMPTY_REUSE();
    if (!empty_reuse_enabled) {
        return -1;
    }

    extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
    extern int g_super_reg_class_size[TINY_NUM_CLASSES];

    int reg_size = (class_idx < TINY_NUM_CLASSES) ? g_super_reg_class_size[class_idx] : 0;
    // Priority-2: Use cached ENV
    int scan_limit = HAK_ENV_SS_EMPTY_SCAN_LIMIT();
    if (scan_limit > reg_size) scan_limit = reg_size;

    // Stage 0.5 hit counters for visualization
    static _Atomic uint64_t stage05_hits = 0;
    static _Atomic uint64_t stage05_attempts = 0;
    atomic_fetch_add_explicit(&stage05_attempts, 1, memory_order_relaxed);

    // Initialize warm pool on first use (per-thread, one-time)
    tiny_warm_pool_init_once();

    // Track the first usable EMPTY slab found during this acquire call; keep
    // scanning afterwards so the warm pool gets prefilled.
    SuperSlab* primary_result = NULL;
    int primary_slab_idx = -1;

    for (int i = 0; i < scan_limit; i++) {
        SuperSlab* ss = g_super_reg_by_class[class_idx][i];
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
        // P-Tier: Skip DRAINING tier SuperSlabs
        if (!ss_tier_is_hot(ss)) continue;
        if (ss->empty_count == 0) continue; // No EMPTY slabs in this SS

        // WARM POOL PREFILL: Add HOT SuperSlabs to warm pool (if not already the primary result).
        // This is low-cost during the registry scan and avoids future expensive scans.
        // Phase 1: Increase threshold from 4 to 12 to match TINY_WARM_POOL_MAX_PER_CLASS
        if (ss != primary_result && tiny_warm_pool_count(class_idx) < 12) {
            tiny_warm_pool_push(class_idx, ss);
            // Track prefilled SuperSlabs for metrics
            g_warm_pool_stats[class_idx].prefilled++;
        }

        uint32_t mask = ss->empty_mask;
        while (mask) {
            int empty_idx = __builtin_ctz(mask);
            mask &= (mask - 1); // clear lowest bit

            TinySlabMeta* meta = &ss->slabs[empty_idx];
            if (meta->capacity > 0 && meta->used == 0) {
                tiny_tls_slab_reuse_guard(ss);
                ss_clear_slab_empty(ss, empty_idx);

                meta->class_idx = (uint8_t)class_idx;
                ss->class_map[empty_idx] = (uint8_t)class_idx;

#if !HAKMEM_BUILD_RELEASE
                if (dbg_acquire == 1) {
                    fprintf(stderr,
                            "[SP_ACQUIRE_STAGE0.5_EMPTY] class=%d reusing EMPTY slab (ss=%p slab=%d empty_count=%u warm_pool_size=%d)\n",
                            class_idx, (void*)ss, empty_idx, ss->empty_count, tiny_warm_pool_count(class_idx));
                }
#else
                (void)dbg_acquire;
#endif

                // Store the primary result but continue scanning to prefill the warm pool
                if (primary_result == NULL) {
                    primary_result = ss;
                    primary_slab_idx = empty_idx;
                    *ss_out = ss;
                    *slab_idx_out = empty_idx;
                    sp_stage_stats_init();
                    if (g_sp_stage_stats_enabled) {
                        atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
                    }
                    atomic_fetch_add_explicit(&stage05_hits, 1, memory_order_relaxed);
                }
            }
        }
    }

    if (primary_result != NULL) {
        // Stage 0.5 hit rate visualization (every 100 hits)
        uint64_t hits = atomic_load_explicit(&stage05_hits, memory_order_relaxed);
        if (hits % 100 == 1) {
            uint64_t attempts = atomic_load_explicit(&stage05_attempts, memory_order_relaxed);
            fprintf(stderr, "[STAGE0.5_STATS] hits=%llu attempts=%llu rate=%.1f%% (scan_limit=%d warm_pool=%d)\n",
                    (unsigned long long)hits, (unsigned long long)attempts,
                    (double)hits * 100.0 / (double)attempts, scan_limit, tiny_warm_pool_count(class_idx));
        }
        return 0;
    }

    return -1;
}
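// Consumer-side sketch (illustrative, not part of this file): the warm pool
// filled above is meant to be drained by the TLS refill path before it falls
// back to another registry scan. tiny_warm_pool_pop() is an assumed helper in
// front/tiny_warm_pool.h; if the real API differs, treat this as pseudocode.
//
//     SuperSlab* warm_ss = tiny_warm_pool_pop(class_idx);  // O(1), no registry walk
//     if (warm_ss) {
//         /* carve new blocks directly from warm_ss */
//     } else {
//         /* cold path: sp_acquire_from_empty_scan() / superslab_refill() */
//     }
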
int
shared_pool_acquire_slab(int class_idx, SuperSlab** ss_out, int* slab_idx_out)
{
    // Phase 12: SP-SLOT Box - 3-Stage Acquire Logic
    //
    // Stage 1: Reuse EMPTY slots from per-class free list (EMPTY→ACTIVE)
    // Stage 2: Find UNUSED slots in existing SuperSlabs
    // Stage 3: Get new SuperSlab (LRU pop or mmap)
    //
    // Invariants:
    // - On success: *ss_out != NULL, 0 <= *slab_idx_out < total_slots
    // - The chosen slab has meta->class_idx == class_idx
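    // Caller-side sketch (illustrative; names outside this file are assumptions):
    // a TLS refill path is expected to consume this API roughly as follows.
    //
    //     SuperSlab* ss = NULL;
    //     int slab_idx = -1;
    //     if (shared_pool_acquire_slab(class_idx, &ss, &slab_idx) == 0) {
    //         /* ss->slabs[slab_idx] now belongs to class_idx; carve blocks from it */
    //     } else {
    //         /* allocation failure: propagate OOM to the allocator front end */
    //     }
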
    if (!ss_out || !slab_idx_out) {
        return -1;
    }
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return -1;
    }

    shared_pool_init();

    // Debug logging / stage stats
#if !HAKMEM_BUILD_RELEASE
    // Priority-2: Use cached ENV
    int dbg_acquire = HAK_ENV_SS_ACQUIRE_DEBUG();
#else
    static const int dbg_acquire = 0;
#endif
    sp_stage_stats_init();

stage1_retry_after_tension_drain:
    // ========== Stage 0.5 (Phase 12-1.1): EMPTY slab direct scan ==========
    // Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
    // avoid Stage 3 (mmap) when freed slabs are available.
    if (sp_acquire_from_empty_scan(class_idx, ss_out, slab_idx_out, dbg_acquire) == 0) {
        return 0;
    }

    // ========== Stage 1 (Lock-Free): Try to reuse EMPTY slots ==========
    // P0-4: Lock-free pop from per-class free list (no mutex needed!)
    // Best case: same class freed a slot, reuse it immediately (cache-hot)
    SharedSSMeta* reuse_meta = NULL;
    int reuse_slot_idx = -1;

    if (sp_freelist_pop_lockfree(class_idx, &reuse_meta, &reuse_slot_idx)) {
        // Found an EMPTY slot on the lock-free list!
        // Now acquire the mutex ONLY for slot activation and metadata update.

        // P0 instrumentation: count lock acquisitions
        lock_stats_init();
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_acquire_count, 1);
            atomic_fetch_add(&g_lock_acquire_slab_count, 1);
        }

        pthread_mutex_lock(&g_shared_pool.alloc_lock);

        // P0.3: Guard against TLS SLL orphaned pointers before reusing the slab.
        // RACE FIX: Load the SuperSlab pointer atomically BEFORE the guard (consistency).
        SuperSlab* ss_guard = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed);
        if (ss_guard) {
            tiny_tls_slab_reuse_guard(ss_guard);
            // P-Tier: Skip DRAINING tier SuperSlabs
            if (!ss_tier_is_hot(ss_guard)) {
                // DRAINING SuperSlab - skip this slot and fall through to Stage 2
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                goto stage2_fallback;
            }
        }

        // Activate the slot under the mutex (slot state transitions require protection)
        if (sp_slot_mark_active(reuse_meta, reuse_slot_idx, class_idx) == 0) {
            // RACE FIX: Load the SuperSlab pointer atomically (consistency)
            SuperSlab* ss = atomic_load_explicit(&reuse_meta->ss, memory_order_relaxed);

            // RACE FIX: Check whether the SuperSlab was freed (NULL pointer).
            // This can happen if Thread A freed the SuperSlab after pushing the slot
            // to the freelist, but Thread B popped the stale slot before the freelist
            // was cleared.
            if (!ss) {
                // SuperSlab freed - skip and fall through to Stage 2/3
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                goto stage2_fallback;
            }

#if !HAKMEM_BUILD_RELEASE
            if (dbg_acquire == 1) {
                fprintf(stderr, "[SP_ACQUIRE_STAGE1_LOCKFREE] class=%d reusing EMPTY slot (ss=%p slab=%d)\n",
                        class_idx, (void*)ss, reuse_slot_idx);
            }
#endif

            // Update SuperSlab metadata
            ss->slab_bitmap |= (1u << reuse_slot_idx);
            ss_slab_meta_class_idx_set(ss, reuse_slot_idx, (uint8_t)class_idx);

            if (ss->active_slabs == 0) {
                // Was empty, now active again
                ss->active_slabs = 1;
                g_shared_pool.active_count++;
            }
            // Track per-class active slots (approximate, under alloc_lock)
            if (class_idx < TINY_NUM_CLASSES_SS) {
                g_shared_pool.class_active_slots[class_idx]++;
            }

            // Update hint
            g_shared_pool.class_hints[class_idx] = ss;

            *ss_out = ss;
            *slab_idx_out = reuse_slot_idx;

            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            if (g_sp_stage_stats_enabled) {
                atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
            }
            return 0; // ✅ Stage 1 (lock-free) success
        }

        // Slot activation failed (race condition?) - release the lock and fall through
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    }

stage2_fallback:
    // ========== Stage 2 (Lock-Free): Try to claim UNUSED slots ==========
    // P0 Optimization: Try the class hint FIRST for the fast path (same-class locality).
    // This reduces the metadata scan from 100% to ~10% when hints are effective.
    {
        SuperSlab* hint_ss = g_shared_pool.class_hints[class_idx];
        if (__builtin_expect(hint_ss != NULL, 1)) {
            // P-Tier: Skip DRAINING tier SuperSlabs
            if (!ss_tier_is_hot(hint_ss)) {
                // Clear stale hint pointing to a DRAINING SuperSlab
                g_shared_pool.class_hints[class_idx] = NULL;
                goto stage2_scan;
            }

            // P0 Optimization: O(1) lookup via cached pointer (avoids metadata scan)
            SharedSSMeta* hint_meta = hint_ss->shared_meta;
            if (__builtin_expect(hint_meta != NULL, 1)) {
                // Try lock-free claiming on the hint SuperSlab first
                int claimed_idx = sp_slot_claim_lockfree(hint_meta, class_idx);
                if (__builtin_expect(claimed_idx >= 0, 1)) {
                    // Fast path success! No need to scan all metadata.
                    SuperSlab* ss = atomic_load_explicit(&hint_meta->ss, memory_order_acquire);
                    if (__builtin_expect(ss != NULL, 1)) {
#if !HAKMEM_BUILD_RELEASE
                        if (dbg_acquire == 1) {
                            fprintf(stderr, "[SP_ACQUIRE_STAGE2_HINT] class=%d claimed UNUSED slot from hint (ss=%p slab=%d)\n",
                                    class_idx, (void*)ss, claimed_idx);
                        }
#endif

                        // P0 instrumentation: count lock acquisitions
                        lock_stats_init();
                        if (g_lock_stats_enabled == 1) {
                            atomic_fetch_add(&g_lock_acquire_count, 1);
                            atomic_fetch_add(&g_lock_acquire_slab_count, 1);
                        }

                        pthread_mutex_lock(&g_shared_pool.alloc_lock);

                        // Performance measurement: count Stage 2 lock acquisitions
                        if (__builtin_expect(sp_measure_enabled(), 0)) {
                            atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global,
                                                      1, memory_order_relaxed);
                            atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global,
                                                      1, memory_order_relaxed);
                            atomic_fetch_add_explicit(
                                &g_sp_stage2_lock_acquired_by_class[class_idx],
                                1, memory_order_relaxed);
                        }

                        // Update SuperSlab metadata under the mutex
                        ss->slab_bitmap |= (1u << claimed_idx);
                        ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);

                        if (ss->active_slabs == 0) {
                            ss->active_slabs = 1;
                            g_shared_pool.active_count++;
                        }
                        if (class_idx < TINY_NUM_CLASSES_SS) {
                            g_shared_pool.class_active_slots[class_idx]++;
                        }

                        // Hint is still good, no need to update
                        *ss_out = ss;
                        *slab_idx_out = claimed_idx;
                        sp_fix_geometry_if_needed(ss, claimed_idx, class_idx);

                        if (g_lock_stats_enabled == 1) {
                            atomic_fetch_add(&g_lock_release_count, 1);
                        }
                        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                        if (g_sp_stage_stats_enabled) {
                            atomic_fetch_add(&g_sp_stage2_hits[class_idx], 1);
                        }
                        return 0; // ✅ Stage 2 (hint fast path) success
                    }
                }
            }
        }
    }

stage2_scan:
    // P0-5: Lock-free atomic CAS claiming (no mutex needed for the slot state transition!)
    // RACE FIX: Read ss_meta_count atomically (now properly declared as _Atomic).
    // No cast needed! memory_order_acquire synchronizes with the release in sp_meta_find_or_create.
    uint32_t meta_count = atomic_load_explicit(
        &g_shared_pool.ss_meta_count,
        memory_order_acquire
    );

    for (uint32_t i = 0; i < meta_count; i++) {
        SharedSSMeta* meta = &g_shared_pool.ss_metadata[i];

        // RACE FIX: Load the SuperSlab pointer atomically BEFORE claiming.
        // Use memory_order_acquire to synchronize with the release in sp_meta_find_or_create.
        SuperSlab* ss_preflight = atomic_load_explicit(&meta->ss, memory_order_acquire);
        if (!ss_preflight) {
            // SuperSlab was freed - skip this entry
            continue;
        }

        // P-Tier: Skip DRAINING tier SuperSlabs
        if (!ss_tier_is_hot(ss_preflight)) {
            continue;
        }

        // Try lock-free claiming (UNUSED → ACTIVE via CAS)
        int claimed_idx = sp_slot_claim_lockfree(meta, class_idx);
        if (claimed_idx >= 0) {
            // RACE FIX: Load the SuperSlab pointer atomically again after claiming.
            // Use memory_order_acquire to synchronize with the release in sp_meta_find_or_create.
            SuperSlab* ss = atomic_load_explicit(&meta->ss, memory_order_acquire);
            if (!ss) {
                // SuperSlab was freed between claiming and loading - skip this entry
                continue;
            }

#if !HAKMEM_BUILD_RELEASE
            if (dbg_acquire == 1) {
                fprintf(stderr, "[SP_ACQUIRE_STAGE2_LOCKFREE] class=%d claimed UNUSED slot (ss=%p slab=%d)\n",
                        class_idx, (void*)ss, claimed_idx);
            }
#endif

            // P0 instrumentation: count lock acquisitions
            lock_stats_init();
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_acquire_count, 1);
                atomic_fetch_add(&g_lock_acquire_slab_count, 1);
            }

            pthread_mutex_lock(&g_shared_pool.alloc_lock);

            // Performance measurement: count Stage 2 scan lock acquisitions
            if (__builtin_expect(sp_measure_enabled(), 0)) {
                atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global,
                                          1, memory_order_relaxed);
                atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global,
                                          1, memory_order_relaxed);
                atomic_fetch_add_explicit(
                    &g_sp_stage2_lock_acquired_by_class[class_idx],
                    1, memory_order_relaxed);
            }

            // Update SuperSlab metadata under the mutex
            ss->slab_bitmap |= (1u << claimed_idx);
            ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);

            if (ss->active_slabs == 0) {
                ss->active_slabs = 1;
                g_shared_pool.active_count++;
            }
            if (class_idx < TINY_NUM_CLASSES_SS) {
                g_shared_pool.class_active_slots[class_idx]++;
            }

            // Update hint
            g_shared_pool.class_hints[class_idx] = ss;

            *ss_out = ss;
            *slab_idx_out = claimed_idx;
            sp_fix_geometry_if_needed(ss, claimed_idx, class_idx);

            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            if (g_sp_stage_stats_enabled) {
                atomic_fetch_add(&g_sp_stage2_hits[class_idx], 1);
            }
            return 0; // ✅ Stage 2 (lock-free) success
        }

        // Claim failed (no UNUSED slots in this meta) - continue to the next SuperSlab
    }

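    // Shape of the lock-free claim used above (illustrative sketch only; the real
    // sp_slot_claim_lockfree() lives in the SP-SLOT box and the constant/enum/field
    // names below are assumptions): walk the per-SuperSlab slot states and CAS
    // UNUSED → ACTIVE, returning the first index whose CAS succeeds.
    //
    //     for (int i = 0; i < SP_SLOTS_PER_SS; i++) {          // assumed constant
    //         uint8_t expected = SP_SLOT_UNUSED;               // assumed enum
    //         if (atomic_compare_exchange_strong(&meta->slot_state[i],
    //                                            &expected, SP_SLOT_ACTIVE)) {
    //             return i;   // this thread now owns slot i
    //         }
    //     }
    //     return -1;          // no UNUSED slot in this SuperSlab
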
    // ========== Tension-Based Drain: Try to create EMPTY slots before Stage 3 ==========
    // If the TLS SLL has accumulated blocks, drain them to enable EMPTY slot detection.
    // This can avoid allocating new SuperSlabs by reusing EMPTY slots in Stage 1.
    // ENV: HAKMEM_TINY_TENSION_DRAIN_ENABLE=0 to disable (default=1)
    // ENV: HAKMEM_TINY_TENSION_DRAIN_THRESHOLD=N to set threshold (default=1024)
    {
        // Priority-2: Use cached ENV
        int tension_drain_enabled = HAK_ENV_TINY_TENSION_DRAIN_ENABLE();
        uint32_t tension_threshold = (uint32_t)HAK_ENV_TINY_TENSION_DRAIN_THRESHOLD();

        if (tension_drain_enabled) {
            extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
            extern uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size);

            uint32_t sll_count = (class_idx < TINY_NUM_CLASSES) ? g_tls_sll[class_idx].count : 0;

            if (sll_count >= tension_threshold) {
                // Drain all blocks to maximize EMPTY slot creation
                uint32_t drained = tiny_tls_sll_drain(class_idx, 0); // 0 = drain all

                if (drained > 0) {
                    // Retry Stage 1 (EMPTY reuse) after the drain:
                    // some slabs might have become EMPTY (meta->used == 0).
                    goto stage1_retry_after_tension_drain;
                }
            }
        }
    }

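    // Tuning sketch for the tension-drain knobs above (illustrative; the defaults
    // quoted come from the comments above, not from fresh measurements):
    //
    //     # drain as soon as 256 blocks pile up in the TLS SLL (default 1024)
    //     HAKMEM_TINY_TENSION_DRAIN_THRESHOLD=256 ./my_app
    //
    //     # turn the mechanism off entirely
    //     HAKMEM_TINY_TENSION_DRAIN_ENABLE=0 ./my_app
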
    // ========== Stage 3: Mutex-protected fallback (new SuperSlab allocation) ==========
    // All existing SuperSlabs have no UNUSED slots → need a new SuperSlab.
    // P0 instrumentation: count lock acquisitions
    lock_stats_init();
    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_acquire_count, 1);
        atomic_fetch_add(&g_lock_acquire_slab_count, 1);
    }

    pthread_mutex_lock(&g_shared_pool.alloc_lock);

    // Performance measurement: count Stage 3 lock acquisitions
    if (__builtin_expect(sp_measure_enabled(), 0)) {
        atomic_fetch_add_explicit(&g_sp_stage3_lock_acquired_global,
                                  1, memory_order_relaxed);
        atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global,
                                  1, memory_order_relaxed);
        atomic_fetch_add_explicit(&g_sp_stage3_lock_acquired_by_class[class_idx],
                                  1, memory_order_relaxed);
    }

    // ========== Stage 3: Get new SuperSlab ==========
    // Try the LRU cache first, then mmap.
    SuperSlab* new_ss = NULL;

    // Stage 3a: Try LRU cache
    extern SuperSlab* hak_ss_lru_pop(uint8_t size_class);
    new_ss = hak_ss_lru_pop((uint8_t)class_idx);

    int from_lru = (new_ss != NULL);

    // Stage 3b: If LRU miss, allocate a new SuperSlab
    if (!new_ss) {
        // Release the alloc_lock to avoid deadlock with the registry during SuperSlab allocation
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);

        SuperSlab* allocated_ss = sp_internal_allocate_superslab(class_idx);

        // Re-acquire the alloc_lock
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_acquire_count, 1);
            atomic_fetch_add(&g_lock_acquire_slab_count, 1); // This is part of the acquisition path
        }
        pthread_mutex_lock(&g_shared_pool.alloc_lock);

        if (!allocated_ss) {
            // Allocation failed; return now.
            if (g_lock_stats_enabled == 1) {
                atomic_fetch_add(&g_lock_release_count, 1);
            }
            pthread_mutex_unlock(&g_shared_pool.alloc_lock);
            return -1; // Out of memory
        }

        new_ss = allocated_ss;

        // Add the newly allocated SuperSlab to the shared pool's internal array
        if (g_shared_pool.total_count >= g_shared_pool.capacity) {
            shared_pool_ensure_capacity_unlocked(g_shared_pool.total_count + 1);
            if (g_shared_pool.total_count >= g_shared_pool.capacity) {
                // Pool table expansion failed; leave the SuperSlab alive (registry-owned),
                // but do not treat it as part of shared_pool.
                // This is a critical error, so return early.
                if (g_lock_stats_enabled == 1) {
                    atomic_fetch_add(&g_lock_release_count, 1);
                }
                pthread_mutex_unlock(&g_shared_pool.alloc_lock);
                return -1;
            }
        }
        g_shared_pool.slabs[g_shared_pool.total_count] = new_ss;
        g_shared_pool.total_count++;
    }

#if !HAKMEM_BUILD_RELEASE
    if (dbg_acquire == 1 && new_ss) {
        fprintf(stderr, "[SP_ACQUIRE_STAGE3] class=%d new SuperSlab (ss=%p from_lru=%d)\n",
                class_idx, (void*)new_ss, from_lru);
    }
#endif

    if (!new_ss) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1; // ❌ Out of memory
    }

    // Phase 9-2: The learning-layer soft cap that used to be consulted before
    // creating a new SuperSlab has been removed so the Shared Pool can fully
    // replace the Legacy Backend. We now rely on LRU eviction and EMPTY
    // recycling to manage memory pressure.

    // Create metadata for this new SuperSlab
    SharedSSMeta* new_meta = sp_meta_find_or_create(new_ss);
    if (!new_meta) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1; // ❌ Metadata allocation failed
    }

    // Assign the first slot to this class
    int first_slot = 0;
    if (sp_slot_mark_active(new_meta, first_slot, class_idx) != 0) {
        if (g_lock_stats_enabled == 1) {
            atomic_fetch_add(&g_lock_release_count, 1);
        }
        pthread_mutex_unlock(&g_shared_pool.alloc_lock);
        return -1; // ❌ Should not happen
    }

    // Update SuperSlab metadata
    new_ss->slab_bitmap |= (1u << first_slot);
    ss_slab_meta_class_idx_set(new_ss, first_slot, (uint8_t)class_idx);
    new_ss->active_slabs = 1;
    g_shared_pool.active_count++;
    if (class_idx < TINY_NUM_CLASSES_SS) {
        g_shared_pool.class_active_slots[class_idx]++;
    }

    // Update hint
    g_shared_pool.class_hints[class_idx] = new_ss;

    *ss_out = new_ss;
    *slab_idx_out = first_slot;
    sp_fix_geometry_if_needed(new_ss, first_slot, class_idx);

    if (g_lock_stats_enabled == 1) {
        atomic_fetch_add(&g_lock_release_count, 1);
    }
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);
    if (g_sp_stage_stats_enabled) {
        atomic_fetch_add(&g_sp_stage3_hits[class_idx], 1);
    }
    return 0; // ✅ Stage 3 success
}

// ============================================================================
// Performance Measurement: Print Statistics
// ============================================================================
void shared_pool_print_measurements(void) {
    if (!sp_measure_enabled()) {
        return; // Measurement disabled
    }

    uint64_t stage2 = atomic_load_explicit(&g_sp_stage2_lock_acquired_global,
                                           memory_order_relaxed);
    uint64_t stage3 = atomic_load_explicit(&g_sp_stage3_lock_acquired_global,
                                           memory_order_relaxed);
    uint64_t total_locks = atomic_load_explicit(&g_sp_alloc_lock_contention_global,
                                                memory_order_relaxed);

    if (total_locks == 0) {
        fprintf(stderr, "\n========================================\n");
        fprintf(stderr, "Shared Pool Contention Statistics\n");
        fprintf(stderr, "========================================\n");
        fprintf(stderr, "No lock acquisitions recorded\n");
        fprintf(stderr, "========================================\n\n");
        return;
    }

    double stage2_pct = (100.0 * stage2) / total_locks;
    double stage3_pct = (100.0 * stage3) / total_locks;

    fprintf(stderr, "\n========================================\n");
    fprintf(stderr, "Shared Pool Contention Statistics\n");
    fprintf(stderr, "========================================\n");
    fprintf(stderr, "Stage 2 Locks: %llu (%.1f%%)\n",
            (unsigned long long)stage2, stage2_pct);
    fprintf(stderr, "Stage 3 Locks: %llu (%.1f%%)\n",
            (unsigned long long)stage3, stage3_pct);
    fprintf(stderr, "Total Contention: %llu lock acquisitions\n",
            (unsigned long long)total_locks);

    // Per-class breakdown (Tiny classes 0-7; C5–C7 are the ones to watch)
    fprintf(stderr, "\nPer-class Shared Pool Locks (Stage2/Stage3):\n");
    for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
        uint64_t s2c = atomic_load_explicit(
            &g_sp_stage2_lock_acquired_by_class[cls],
            memory_order_relaxed);
        uint64_t s3c = atomic_load_explicit(
            &g_sp_stage3_lock_acquired_by_class[cls],
            memory_order_relaxed);
        uint64_t tc = s2c + s3c;
        if (tc == 0) {
            continue; // skip classes with no lock acquisitions
        }
        fprintf(stderr,
                " C%d: Stage2=%llu Stage3=%llu Total=%llu\n",
                cls,
                (unsigned long long)s2c,
                (unsigned long long)s3c,
                (unsigned long long)tc);
    }

    fprintf(stderr, "========================================\n\n");
}