Files
hakmem/core/hakmem_tiny_stats.c

683 lines
31 KiB
C
Raw Normal View History

// hakmem_tiny_stats.c
// Phase 2, Module 1: Statistics and Debug Functions
// Extracted from hakmem_tiny.c (lines 4348-4728, non-contiguous)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <stdatomic.h>
#include <unistd.h>
#include "hakmem_tiny.h"
#include "hakmem_tiny_config.h" // extern g_tiny_class_sizes
#include "hakmem_tiny_stats_api.h"
P0 Optimization: Shared Pool fast path with O(1) metadata lookup Performance Results: - Throughput: 2.66M ops/s → 3.8M ops/s (+43% improvement) - sp_meta_find_or_create: O(N) linear scan → O(1) direct pointer - Stage 2 metadata scan: 100% → 10-20% (80-90% reduction via hints) Core Optimizations: 1. O(1) Metadata Lookup (superslab_types.h) - Added `shared_meta` pointer field to SuperSlab struct - Eliminates O(N) linear search through ss_metadata[] array - First access: O(N) scan + cache | Subsequent: O(1) direct return 2. sp_meta_find_or_create Fast Path (hakmem_shared_pool.c) - Check cached ss->shared_meta first before linear scan - Cache pointer after successful linear scan for future lookups - Reduces 7.8% CPU hotspot to near-zero for hot paths 3. Stage 2 Class Hints Fast Path (hakmem_shared_pool_acquire.c) - Try class_hints[class_idx] FIRST before full metadata scan - Uses O(1) ss->shared_meta lookup for hint validation - __builtin_expect() for branch prediction optimization - 80-90% of acquire calls now skip full metadata scan 4. 
Proper Initialization (ss_allocation_box.c) - Initialize shared_meta = NULL in superslab_allocate() - Ensures correct NULL-check semantics for new SuperSlabs Additional Improvements: - Updated ptr_trace and debug ring for release build efficiency - Enhanced ENV variable documentation and analysis - Added learner_env_box.h for configuration management - Various Box optimizations for reduced overhead Thread Safety: - All atomic operations use correct memory ordering - shared_meta cached under mutex protection - Lock-free Stage 2 uses proper CAS with acquire/release semantics Testing: - Benchmark: 1M iterations, 3.8M ops/s stable - Build: Clean compile RELEASE=0 and RELEASE=1 - No crashes, memory leaks, or correctness issues Next Optimization Candidates: - P1: Per-SuperSlab free slot bitmap for O(1) slot claiming - P2: Reduce Stage 2 critical section size - P3: Page pre-faulting (MAP_POPULATE) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 16:21:54 +09:00
#include "hakmem_stats_master.h" // Phase 4d: Master stats control
// Cached value of the HAKMEM_TINY_DUMP_ATEXIT_ONLY environment switch.
// -1 = not resolved yet, 0 = off, 1 = on.
static int g_dump_atexit_only = -1; // env: HAKMEM_TINY_DUMP_ATEXIT_ONLY=1
// Forward declaration of local dump (defined later in this file)
static void hak_tiny_refill_counters_dump(void);
// Resolve g_dump_atexit_only from the environment the first time it is needed.
// Once the flag holds anything other than -1 this function does nothing, so
// the environment is consulted at most once per resolved value.
static void hak_tiny_stats_init_flags(void) {
    if (g_dump_atexit_only != -1) {
        return; // already resolved
    }
    const char* env_value = getenv("HAKMEM_TINY_DUMP_ATEXIT_ONLY");
    if (env_value != NULL && atoi(env_value) != 0) {
        g_dump_atexit_only = 1;
    } else {
        // Unset, empty, non-numeric, or "0" all disable the flag.
        g_dump_atexit_only = 0;
    }
}
// Dump every counter set right now: the refill counters first, followed by
// the extended debug counters (which only print when compiled in).
void hak_tiny_dump_all_counters_now(void) {
    hak_tiny_refill_counters_dump();  // minimal set
    hak_tiny_debug_counters_dump();   // extended set
}
#include "hakmem_tiny_superslab.h"
#include "hakmem_config.h"
#include "hakmem_tiny_stats.h"
// ============================================================================
// Phase 8.1: Public Statistics API (lines 4348-4415)
// ============================================================================
void hak_tiny_get_stats(uint64_t* alloc_count, uint64_t* free_count, uint64_t* slab_count) {
if (!g_tiny_initialized) return;
#ifdef HAKMEM_ENABLE_STATS
// Flush TLS batches to global counters for accurate stats
stats_flush_all();
#endif
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
if (alloc_count) alloc_count[i] = g_tiny_pool.alloc_count[i];
if (free_count) free_count[i] = g_tiny_pool.free_count[i];
if (slab_count) slab_count[i] = g_tiny_pool.slab_count[i];
}
}
// Print a per-class table of Tiny Pool counters (allocs, frees, slabs) to
// stdout. When HAKMEM_BUILD_DEBUG is enabled a second table with the TLS
// hit/miss/spill counters is printed as well.
//
// If the tiny pool has not been initialized, a single notice line is printed
// and nothing else.
//
// NOTE(review): unlike hak_tiny_get_stats(), this function does not call
// stats_flush_all() before reading the counters, so it prints whatever the
// global counters hold at the time of the call.
void hak_tiny_print_stats(void) {
    if (!g_tiny_initialized) {
        printf("Tiny Pool not initialized\n");
        return;
    }
    printf("\n");
    printf("Tiny Pool Statistics\n");
    printf("========================================\n");
    printf("Class | Size | Allocs | Frees | Slabs\n");
    printf("------|--------|---------|---------|-------\n");
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        // Counters are widened to unsigned long long (matching the debug
        // table below) so 64-bit values are not truncated on platforms where
        // unsigned long is only 32 bits. The size is cast to size_t so the
        // %zu conversion is correct regardless of the table's element type.
        printf(" %d | %4zuB | %7llu | %7llu | %5llu\n",
               i,
               (size_t)g_tiny_class_sizes[i],
               (unsigned long long)g_tiny_pool.alloc_count[i],
               (unsigned long long)g_tiny_pool.free_count[i],
               (unsigned long long)g_tiny_pool.slab_count[i]);
    }
    printf("========================================\n");
    printf("\n");
#if HAKMEM_BUILD_DEBUG
    printf("TLS Debug Counters (hit/miss/spill) per class\n");
    printf("---------------------------------------------\n");
    printf("Class | Hit | Miss | SpillSS | SpillOwner | SpillMag | SpillReq\n");
    printf("------+-----------+-----------+-----------+-----------+-----------+-----------\n");
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        printf(" %d | %9llu | %9llu | %9llu | %9llu | %9llu | %9llu\n",
               i,
               (unsigned long long)g_tls_hit_count[i],
               (unsigned long long)g_tls_miss_count[i],
               (unsigned long long)g_tls_spill_ss_count[i],
               (unsigned long long)g_tls_spill_owner_count[i],
               (unsigned long long)g_tls_spill_mag_count[i],
               (unsigned long long)g_tls_spill_requeue_count[i]);
    }
    printf("---------------------------------------------\n\n");
    printf("Observation Snapshot: removed (obs pipeline retired)\n\n");
#endif
}
// ============================================================================
// Phase 8.2: Memory Profiling Debug (toggle with HAKMEM_DEBUG_MEMORY)
// ============================================================================
#ifdef HAKMEM_DEBUG_MEMORY
// NOTE: count_active_superslabs and hak_tiny_print_memory_profile are currently disabled
#else
// Placeholder used when HAKMEM_DEBUG_MEMORY is not defined: keeps the symbol
// available to callers while doing nothing.
void hak_tiny_print_memory_profile(void) {
    /* intentionally empty */
}
#endif // HAKMEM_DEBUG_MEMORY
// ============================================================================
// Debug Print Functions (always available, gated by HAKMEM_DEBUG_COUNTERS)
// ============================================================================
// Debug print for Ultra Tiny counters.
// Currently an empty function in every configuration: the counters it would
// report are not tracked yet, so the HAKMEM_DEBUG_COUNTERS section below
// contains only a sketch of the intended output.
void hak_tiny_ultra_debug_dump(void) {
#if HAKMEM_DEBUG_COUNTERS
    // NOTE: Ultra Tiny counters (pop_hits, refills, resets, sll_count) are
    // currently not tracked. Enable the sketch below once these variables
    // are implemented.
    /*
    fprintf(stderr, "\n[Ultra Tiny Debug]\n");
    fprintf(stderr, "class, pop_hits, refills, resets, sll_count\n");
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        fprintf(stderr, "%d,%llu,%llu,%llu,%u\n",
                i,
                (unsigned long long)g_ultra_pop_hits[i],
                (unsigned long long)g_ultra_refill_calls[i],
                (unsigned long long)g_ultra_resets[i],
                (unsigned)g_tls_sll[i].count);
    }
    */
#endif
    /* no-op when HAKMEM_DEBUG_COUNTERS is off */
}
// Debug print for normal path counters (SLL/MAG/FRONT/SUPER).
// With HAKMEM_DEBUG_COUNTERS enabled the dump is additionally gated at run
// time by HAKMEM_TINY_PATH_DEBUG (any non-zero integer enables it). The
// counters themselves are not tracked yet, so nothing is printed either way.
void hak_tiny_path_debug_dump(void) {
#if HAKMEM_DEBUG_COUNTERS
    const char* flag = getenv("HAKMEM_TINY_PATH_DEBUG");
    int enabled = (flag != NULL) && (atoi(flag) != 0);
    if (!enabled) {
        return;
    }
    // NOTE: Path debug counters (sll_pop, mag_pop, etc.) are currently not
    // tracked. Enable the sketch below once these variables are implemented.
    /*
    fprintf(stderr, "\n[Tiny Path Debug]\n");
    fprintf(stderr, "class, sll_pop, mag_pop, front_pop, superslab, refills\n");
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu\n",
                i,
                (unsigned long long)g_path_sll_pop[i],
                (unsigned long long)g_path_mag_pop[i],
                (unsigned long long)g_path_front_pop[i],
                (unsigned long long)g_path_superslab[i],
                (unsigned long long)g_path_refill_calls[i]);
    }
    */
#else
    (void)getenv; // suppress unused warnings when compiled out
#endif
}
// Debug print for extended counters (slow/bin/bump/spec)
void hak_tiny_debug_counters_dump(void) {
#if HAKMEM_DEBUG_COUNTERS
ENV cleanup: Add RELEASE guards to DEBUG ENV variables (14 vars) Added compile-time guards (#if HAKMEM_BUILD_RELEASE) to eliminate DEBUG ENV variable overhead in RELEASE builds. Variables guarded (14 total): - HAKMEM_TINY_TRACE_RING, HAKMEM_TINY_DUMP_RING_ATEXIT - HAKMEM_TINY_RF_TRACE, HAKMEM_TINY_MAILBOX_TRACE - HAKMEM_TINY_MAILBOX_TRACE_LIMIT, HAKMEM_TINY_MAILBOX_SLOWDISC - HAKMEM_TINY_MAILBOX_SLOWDISC_PERIOD - HAKMEM_SS_PREWARM_DEBUG, HAKMEM_SS_FREE_DEBUG - HAKMEM_TINY_FRONT_METRICS, HAKMEM_TINY_FRONT_DUMP - HAKMEM_TINY_COUNTERS_DUMP, HAKMEM_TINY_REFILL_DUMP - HAKMEM_PTR_TRACE_DUMP, HAKMEM_PTR_TRACE_VERBOSE Files modified (9 core files): - core/tiny_debug_ring.c (ring trace/dump) - core/box/mailbox_box.c (mailbox trace + slowdisc) - core/tiny_refill.h (refill trace) - core/hakmem_tiny_superslab.c (superslab debug) - core/box/ss_allocation_box.c (allocation debug) - core/tiny_superslab_free.inc.h (free debug) - core/box/front_metrics_box.c (frontend metrics) - core/hakmem_tiny_stats.c (stats dump) - core/ptr_trace.h (pointer trace) Bug fixes during implementation: 1. mailbox_box.c - Fixed variable scope (moved 'used' outside guard) 2. hakmem_tiny_stats.c - Fixed incomplete declarations (on1, on2) Impact: - Binary size: -85KB total - bench_random_mixed_hakmem: 319K → 305K (-14K, -4.4%) - larson_hakmem: 380K → 309K (-71K, -18.7%) - Performance: No regression (16.9-17.9M ops/s maintained) - Functional: All tests pass (Random Mixed + Larson) - Behavior: DEBUG ENV vars correctly ignored in RELEASE builds Testing: - Build: Clean compilation (warnings only, pre-existing) - 100K Random Mixed: 16.9-17.9M ops/s (PASS) - 10K Larson: 25.9M ops/s (PASS) - DEBUG ENV verification: Correctly ignored (PASS) Result: 14 DEBUG ENV variables now have zero overhead in RELEASE builds. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 03:41:07 +09:00
#if HAKMEM_BUILD_RELEASE
P0 Optimization: Shared Pool fast path with O(1) metadata lookup Performance Results: - Throughput: 2.66M ops/s → 3.8M ops/s (+43% improvement) - sp_meta_find_or_create: O(N) linear scan → O(1) direct pointer - Stage 2 metadata scan: 100% → 10-20% (80-90% reduction via hints) Core Optimizations: 1. O(1) Metadata Lookup (superslab_types.h) - Added `shared_meta` pointer field to SuperSlab struct - Eliminates O(N) linear search through ss_metadata[] array - First access: O(N) scan + cache | Subsequent: O(1) direct return 2. sp_meta_find_or_create Fast Path (hakmem_shared_pool.c) - Check cached ss->shared_meta first before linear scan - Cache pointer after successful linear scan for future lookups - Reduces 7.8% CPU hotspot to near-zero for hot paths 3. Stage 2 Class Hints Fast Path (hakmem_shared_pool_acquire.c) - Try class_hints[class_idx] FIRST before full metadata scan - Uses O(1) ss->shared_meta lookup for hint validation - __builtin_expect() for branch prediction optimization - 80-90% of acquire calls now skip full metadata scan 4. 
Proper Initialization (ss_allocation_box.c) - Initialize shared_meta = NULL in superslab_allocate() - Ensures correct NULL-check semantics for new SuperSlabs Additional Improvements: - Updated ptr_trace and debug ring for release build efficiency - Enhanced ENV variable documentation and analysis - Added learner_env_box.h for configuration management - Various Box optimizations for reduced overhead Thread Safety: - All atomic operations use correct memory ordering - shared_meta cached under mutex protection - Lock-free Stage 2 uses proper CAS with acquire/release semantics Testing: - Benchmark: 1M iterations, 3.8M ops/s stable - Build: Clean compile RELEASE=0 and RELEASE=1 - No crashes, memory leaks, or correctness issues Next Optimization Candidates: - P1: Per-SuperSlab free slot bitmap for O(1) slot claiming - P2: Reduce Stage 2 critical section size - P3: Page pre-faulting (MAP_POPULATE) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 16:21:54 +09:00
return;
ENV cleanup: Add RELEASE guards to DEBUG ENV variables (14 vars) Added compile-time guards (#if HAKMEM_BUILD_RELEASE) to eliminate DEBUG ENV variable overhead in RELEASE builds. Variables guarded (14 total): - HAKMEM_TINY_TRACE_RING, HAKMEM_TINY_DUMP_RING_ATEXIT - HAKMEM_TINY_RF_TRACE, HAKMEM_TINY_MAILBOX_TRACE - HAKMEM_TINY_MAILBOX_TRACE_LIMIT, HAKMEM_TINY_MAILBOX_SLOWDISC - HAKMEM_TINY_MAILBOX_SLOWDISC_PERIOD - HAKMEM_SS_PREWARM_DEBUG, HAKMEM_SS_FREE_DEBUG - HAKMEM_TINY_FRONT_METRICS, HAKMEM_TINY_FRONT_DUMP - HAKMEM_TINY_COUNTERS_DUMP, HAKMEM_TINY_REFILL_DUMP - HAKMEM_PTR_TRACE_DUMP, HAKMEM_PTR_TRACE_VERBOSE Files modified (9 core files): - core/tiny_debug_ring.c (ring trace/dump) - core/box/mailbox_box.c (mailbox trace + slowdisc) - core/tiny_refill.h (refill trace) - core/hakmem_tiny_superslab.c (superslab debug) - core/box/ss_allocation_box.c (allocation debug) - core/tiny_superslab_free.inc.h (free debug) - core/box/front_metrics_box.c (frontend metrics) - core/hakmem_tiny_stats.c (stats dump) - core/ptr_trace.h (pointer trace) Bug fixes during implementation: 1. mailbox_box.c - Fixed variable scope (moved 'used' outside guard) 2. hakmem_tiny_stats.c - Fixed incomplete declarations (on1, on2) Impact: - Binary size: -85KB total - bench_random_mixed_hakmem: 319K → 305K (-14K, -4.4%) - larson_hakmem: 380K → 309K (-71K, -18.7%) - Performance: No regression (16.9-17.9M ops/s maintained) - Functional: All tests pass (Random Mixed + Larson) - Behavior: DEBUG ENV vars correctly ignored in RELEASE builds Testing: - Build: Clean compilation (warnings only, pre-existing) - 100K Random Mixed: 16.9-17.9M ops/s (PASS) - 10K Larson: 25.9M ops/s (PASS) - DEBUG ENV verification: Correctly ignored (PASS) Result: 14 DEBUG ENV variables now have zero overhead in RELEASE builds. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 03:41:07 +09:00
#else
P0 Optimization: Shared Pool fast path with O(1) metadata lookup Performance Results: - Throughput: 2.66M ops/s → 3.8M ops/s (+43% improvement) - sp_meta_find_or_create: O(N) linear scan → O(1) direct pointer - Stage 2 metadata scan: 100% → 10-20% (80-90% reduction via hints) Core Optimizations: 1. O(1) Metadata Lookup (superslab_types.h) - Added `shared_meta` pointer field to SuperSlab struct - Eliminates O(N) linear search through ss_metadata[] array - First access: O(N) scan + cache | Subsequent: O(1) direct return 2. sp_meta_find_or_create Fast Path (hakmem_shared_pool.c) - Check cached ss->shared_meta first before linear scan - Cache pointer after successful linear scan for future lookups - Reduces 7.8% CPU hotspot to near-zero for hot paths 3. Stage 2 Class Hints Fast Path (hakmem_shared_pool_acquire.c) - Try class_hints[class_idx] FIRST before full metadata scan - Uses O(1) ss->shared_meta lookup for hint validation - __builtin_expect() for branch prediction optimization - 80-90% of acquire calls now skip full metadata scan 4. 
Proper Initialization (ss_allocation_box.c) - Initialize shared_meta = NULL in superslab_allocate() - Ensures correct NULL-check semantics for new SuperSlabs Additional Improvements: - Updated ptr_trace and debug ring for release build efficiency - Enhanced ENV variable documentation and analysis - Added learner_env_box.h for configuration management - Various Box optimizations for reduced overhead Thread Safety: - All atomic operations use correct memory ordering - shared_meta cached under mutex protection - Lock-free Stage 2 uses proper CAS with acquire/release semantics Testing: - Benchmark: 1M iterations, 3.8M ops/s stable - Build: Clean compile RELEASE=0 and RELEASE=1 - No crashes, memory leaks, or correctness issues Next Optimization Candidates: - P1: Per-SuperSlab free slot bitmap for O(1) slot claiming - P2: Reduce Stage 2 critical section size - P3: Page pre-faulting (MAP_POPULATE) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 16:21:54 +09:00
if (!hak_stats_check("HAKMEM_TINY_COUNTERS_DUMP", "counters")) {
return;
}
ENV cleanup: Add RELEASE guards to DEBUG ENV variables (14 vars) Added compile-time guards (#if HAKMEM_BUILD_RELEASE) to eliminate DEBUG ENV variable overhead in RELEASE builds. Variables guarded (14 total): - HAKMEM_TINY_TRACE_RING, HAKMEM_TINY_DUMP_RING_ATEXIT - HAKMEM_TINY_RF_TRACE, HAKMEM_TINY_MAILBOX_TRACE - HAKMEM_TINY_MAILBOX_TRACE_LIMIT, HAKMEM_TINY_MAILBOX_SLOWDISC - HAKMEM_TINY_MAILBOX_SLOWDISC_PERIOD - HAKMEM_SS_PREWARM_DEBUG, HAKMEM_SS_FREE_DEBUG - HAKMEM_TINY_FRONT_METRICS, HAKMEM_TINY_FRONT_DUMP - HAKMEM_TINY_COUNTERS_DUMP, HAKMEM_TINY_REFILL_DUMP - HAKMEM_PTR_TRACE_DUMP, HAKMEM_PTR_TRACE_VERBOSE Files modified (9 core files): - core/tiny_debug_ring.c (ring trace/dump) - core/box/mailbox_box.c (mailbox trace + slowdisc) - core/tiny_refill.h (refill trace) - core/hakmem_tiny_superslab.c (superslab debug) - core/box/ss_allocation_box.c (allocation debug) - core/tiny_superslab_free.inc.h (free debug) - core/box/front_metrics_box.c (frontend metrics) - core/hakmem_tiny_stats.c (stats dump) - core/ptr_trace.h (pointer trace) Bug fixes during implementation: 1. mailbox_box.c - Fixed variable scope (moved 'used' outside guard) 2. hakmem_tiny_stats.c - Fixed incomplete declarations (on1, on2) Impact: - Binary size: -85KB total - bench_random_mixed_hakmem: 319K → 305K (-14K, -4.4%) - larson_hakmem: 380K → 309K (-71K, -18.7%) - Performance: No regression (16.9-17.9M ops/s maintained) - Functional: All tests pass (Random Mixed + Larson) - Behavior: DEBUG ENV vars correctly ignored in RELEASE builds Testing: - Build: Clean compilation (warnings only, pre-existing) - 100K Random Mixed: 16.9-17.9M ops/s (PASS) - 10K Larson: 25.9M ops/s (PASS) - DEBUG ENV verification: Correctly ignored (PASS) Result: 14 DEBUG ENV variables now have zero overhead in RELEASE builds. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 03:41:07 +09:00
#endif
// NOTE: Extended counters (alloc_slow, bitmap_scans, etc.) are currently not tracked
// Uncomment when these variables are implemented
/*
fprintf(stderr, "\n[Tiny Extended Counters]\n");
fprintf(stderr, "class, alloc_slow, ss_refill, bitmap_scans, bin_pops, bump_hits, bump_arms, spec_calls, spec_hits\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n",
i,
(unsigned long long)g_alloc_slow_calls[i],
(unsigned long long)g_superslab_refill_calls_dbg[i],
(unsigned long long)g_bitmap_scan_calls[i],
(unsigned long long)g_bgbin_pops[i],
(unsigned long long)g_bump_hits[i],
(unsigned long long)g_bump_arms[i],
(unsigned long long)g_spec_calls[i],
(unsigned long long)g_spec_hits[i]);
}
*/
// SuperSlab adopt/publish debug
extern unsigned long long g_ss_publish_dbg[];
extern unsigned long long g_ss_adopt_dbg[];
fprintf(stderr, "\n[SS Adopt/Publish Counters]\n");
fprintf(stderr, "class, ss_publish, ss_adopt\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu\n", i,
(unsigned long long)g_ss_publish_dbg[i],
(unsigned long long)g_ss_adopt_dbg[i]);
}
// Refill-stage counters
extern unsigned long long g_rf_total_calls[];
extern unsigned long long g_rf_hit_bench[];
extern unsigned long long g_rf_hit_hot[];
CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消 **問題:** - Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走) - System/mimalloc は 4T で 33.52M ops/s 正常動作 - SS OFF + Remote OFF でも 4T で SEGV **根本原因: (Task agent ultrathink 調査結果)** ``` CRASH: mov (%r15),%r13 R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS) ``` Worker スレッドの TLS 変数が未初期化: - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし - pthread_create() で生成されたスレッドでゼロ初期化されない - NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV **修正内容:** 全 TLS 配列に明示的初期化子 `= {0}` を追加: 1. **core/hakmem_tiny.c:** - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}` - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}` - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}` - `g_tls_bcur[TINY_NUM_CLASSES] = {0}` - `g_tls_bend[TINY_NUM_CLASSES] = {0}` 2. **core/tiny_fastcache.c:** - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}` 3. **core/hakmem_tiny_magazine.c:** - `g_tls_mags[TINY_NUM_CLASSES] = {0}` 4. **core/tiny_sticky.c:** - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}` **効果:** ``` Before: 1T: 2.09M ✅ | 4T: SEGV 💀 After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消) ``` **テスト:** ```bash # 1 thread: 完走 ./larson_hakmem 2 8 128 1024 1 12345 1 → Throughput = 2,407,597 ops/s ✅ # 4 threads: 完走(以前は SEGV) ./larson_hakmem 2 8 128 1024 1 12345 4 → Throughput = 4,192,155 ops/s ✅ ``` **調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00
extern unsigned long long g_rf_hit_ready[];
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_hit_ss[];
extern unsigned long long g_rf_hit_reg[];
extern unsigned long long g_rf_mmap_calls[];
fprintf(stderr, "\n[Refill Stage Counters]\n");
CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消 **問題:** - Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走) - System/mimalloc は 4T で 33.52M ops/s 正常動作 - SS OFF + Remote OFF でも 4T で SEGV **根本原因: (Task agent ultrathink 調査結果)** ``` CRASH: mov (%r15),%r13 R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS) ``` Worker スレッドの TLS 変数が未初期化: - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし - pthread_create() で生成されたスレッドでゼロ初期化されない - NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV **修正内容:** 全 TLS 配列に明示的初期化子 `= {0}` を追加: 1. **core/hakmem_tiny.c:** - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}` - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}` - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}` - `g_tls_bcur[TINY_NUM_CLASSES] = {0}` - `g_tls_bend[TINY_NUM_CLASSES] = {0}` 2. **core/tiny_fastcache.c:** - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}` 3. **core/hakmem_tiny_magazine.c:** - `g_tls_mags[TINY_NUM_CLASSES] = {0}` 4. **core/tiny_sticky.c:** - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}` **効果:** ``` Before: 1T: 2.09M ✅ | 4T: SEGV 💀 After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消) ``` **テスト:** ```bash # 1 thread: 完走 ./larson_hakmem 2 8 128 1024 1 12345 1 → Throughput = 2,407,597 ops/s ✅ # 4 threads: 完走(以前は SEGV) ./larson_hakmem 2 8 128 1024 1 12345 4 → Throughput = 4,192,155 ops/s ✅ ``` **調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00
fprintf(stderr, "class, total, ready, bench, hot, slab, ss, reg, mmap\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消 **問題:** - Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走) - System/mimalloc は 4T で 33.52M ops/s 正常動作 - SS OFF + Remote OFF でも 4T で SEGV **根本原因: (Task agent ultrathink 調査結果)** ``` CRASH: mov (%r15),%r13 R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS) ``` Worker スレッドの TLS 変数が未初期化: - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし - pthread_create() で生成されたスレッドでゼロ初期化されない - NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV **修正内容:** 全 TLS 配列に明示的初期化子 `= {0}` を追加: 1. **core/hakmem_tiny.c:** - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}` - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}` - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}` - `g_tls_bcur[TINY_NUM_CLASSES] = {0}` - `g_tls_bend[TINY_NUM_CLASSES] = {0}` 2. **core/tiny_fastcache.c:** - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}` 3. **core/hakmem_tiny_magazine.c:** - `g_tls_mags[TINY_NUM_CLASSES] = {0}` 4. **core/tiny_sticky.c:** - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}` **効果:** ``` Before: 1T: 2.09M ✅ | 4T: SEGV 💀 After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消) ``` **テスト:** ```bash # 1 thread: 完走 ./larson_hakmem 2 8 128 1024 1 12345 1 → Throughput = 2,407,597 ops/s ✅ # 4 threads: 完走(以前は SEGV) ./larson_hakmem 2 8 128 1024 1 12345 4 → Throughput = 4,192,155 ops/s ✅ ``` **調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_rf_total_calls[i],
CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消 **問題:** - Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走) - System/mimalloc は 4T で 33.52M ops/s 正常動作 - SS OFF + Remote OFF でも 4T で SEGV **根本原因: (Task agent ultrathink 調査結果)** ``` CRASH: mov (%r15),%r13 R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS) ``` Worker スレッドの TLS 変数が未初期化: - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし - pthread_create() で生成されたスレッドでゼロ初期化されない - NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV **修正内容:** 全 TLS 配列に明示的初期化子 `= {0}` を追加: 1. **core/hakmem_tiny.c:** - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}` - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}` - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}` - `g_tls_bcur[TINY_NUM_CLASSES] = {0}` - `g_tls_bend[TINY_NUM_CLASSES] = {0}` 2. **core/tiny_fastcache.c:** - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}` 3. **core/hakmem_tiny_magazine.c:** - `g_tls_mags[TINY_NUM_CLASSES] = {0}` 4. **core/tiny_sticky.c:** - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}` **効果:** ``` Before: 1T: 2.09M ✅ | 4T: SEGV 💀 After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消) ``` **テスト:** ```bash # 1 thread: 完走 ./larson_hakmem 2 8 128 1024 1 12345 1 → Throughput = 2,407,597 ops/s ✅ # 4 threads: 完走(以前は SEGV) ./larson_hakmem 2 8 128 1024 1 12345 4 → Throughput = 4,192,155 ops/s ✅ ``` **調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00
(unsigned long long)g_rf_hit_ready[i],
(unsigned long long)g_rf_hit_bench[i],
(unsigned long long)g_rf_hit_hot[i],
(unsigned long long)g_rf_hit_slab[i],
(unsigned long long)g_rf_hit_ss[i],
(unsigned long long)g_rf_hit_reg[i],
(unsigned long long)g_rf_mmap_calls[i]);
}
CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消 **問題:** - Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走) - System/mimalloc は 4T で 33.52M ops/s 正常動作 - SS OFF + Remote OFF でも 4T で SEGV **根本原因: (Task agent ultrathink 調査結果)** ``` CRASH: mov (%r15),%r13 R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS) ``` Worker スレッドの TLS 変数が未初期化: - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし - pthread_create() で生成されたスレッドでゼロ初期化されない - NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV **修正内容:** 全 TLS 配列に明示的初期化子 `= {0}` を追加: 1. **core/hakmem_tiny.c:** - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}` - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}` - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}` - `g_tls_bcur[TINY_NUM_CLASSES] = {0}` - `g_tls_bend[TINY_NUM_CLASSES] = {0}` 2. **core/tiny_fastcache.c:** - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}` 3. **core/hakmem_tiny_magazine.c:** - `g_tls_mags[TINY_NUM_CLASSES] = {0}` 4. **core/tiny_sticky.c:** - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}` **効果:** ``` Before: 1T: 2.09M ✅ | 4T: SEGV 💀 After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消) ``` **テスト:** ```bash # 1 thread: 完走 ./larson_hakmem 2 8 128 1024 1 12345 1 → Throughput = 2,407,597 ops/s ✅ # 4 threads: 完走(以前は SEGV) ./larson_hakmem 2 8 128 1024 1 12345 4 → Throughput = 4,192,155 ops/s ✅ ``` **調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00
// Refill item sources (freelist vs carve)
extern unsigned long long g_rf_freelist_items[];
extern unsigned long long g_rf_carve_items[];
fprintf(stderr, "\n[Refill Item Sources]\n");
fprintf(stderr, "class, freelist_items, carve_items\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu\n", i,
(unsigned long long)g_rf_freelist_items[i],
(unsigned long long)g_rf_carve_items[i]);
}
// Refill item sources (freelist vs carve)
extern unsigned long long g_rf_freelist_items[];
extern unsigned long long g_rf_carve_items[];
fprintf(stderr, "\n[Refill Item Sources]\n");
fprintf(stderr, "class, freelist_items, carve_items\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu\n", i,
(unsigned long long)g_rf_freelist_items[i],
(unsigned long long)g_rf_carve_items[i]);
}
// Diagnostic: refill early return counters
extern unsigned long long g_rf_early_no_ss[];
extern unsigned long long g_rf_early_no_meta[];
extern unsigned long long g_rf_early_no_room[];
extern unsigned long long g_rf_early_want_zero[];
fprintf(stderr, "\n[Refill Early Returns - Diagnostic]\n");
fprintf(stderr, "class, no_ss, no_meta, no_room, want_zero\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_rf_early_no_ss[i],
(unsigned long long)g_rf_early_no_meta[i],
(unsigned long long)g_rf_early_no_room[i],
(unsigned long long)g_rf_early_want_zero[i]);
}
// Slab-ring counters
extern unsigned long long g_slab_publish_dbg[];
extern unsigned long long g_slab_adopt_dbg[];
extern unsigned long long g_slab_requeue_dbg[];
extern unsigned long long g_slab_miss_dbg[];
fprintf(stderr, "\n[Slab Adopt/Publish Counters]\n");
fprintf(stderr, "class, slab_publish, slab_adopt, slab_requeue, slab_miss\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_slab_publish_dbg[i],
(unsigned long long)g_slab_adopt_dbg[i],
(unsigned long long)g_slab_requeue_dbg[i],
(unsigned long long)g_slab_miss_dbg[i]);
}
// Publish-side counters
extern unsigned long long g_pub_bench_hits[];
extern unsigned long long g_pub_hot_hits[];
extern unsigned long long g_pub_mail_hits[];
fprintf(stderr, "\n[Publish Hits]\n");
fprintf(stderr, "class, pub_mail, pub_bench, pub_hot\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu\n", i,
(unsigned long long)g_pub_mail_hits[i],
(unsigned long long)g_pub_bench_hits[i],
(unsigned long long)g_pub_hot_hits[i]);
}
CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消 **問題:** - Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走) - System/mimalloc は 4T で 33.52M ops/s 正常動作 - SS OFF + Remote OFF でも 4T で SEGV **根本原因: (Task agent ultrathink 調査結果)** ``` CRASH: mov (%r15),%r13 R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS) ``` Worker スレッドの TLS 変数が未初期化: - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし - pthread_create() で生成されたスレッドでゼロ初期化されない - NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV **修正内容:** 全 TLS 配列に明示的初期化子 `= {0}` を追加: 1. **core/hakmem_tiny.c:** - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}` - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}` - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}` - `g_tls_bcur[TINY_NUM_CLASSES] = {0}` - `g_tls_bend[TINY_NUM_CLASSES] = {0}` 2. **core/tiny_fastcache.c:** - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}` 3. **core/hakmem_tiny_magazine.c:** - `g_tls_mags[TINY_NUM_CLASSES] = {0}` 4. **core/tiny_sticky.c:** - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}` **効果:** ``` Before: 1T: 2.09M ✅ | 4T: SEGV 💀 After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消) ``` **テスト:** ```bash # 1 thread: 完走 ./larson_hakmem 2 8 128 1024 1 12345 1 → Throughput = 2,407,597 ops/s ✅ # 4 threads: 完走(以前は SEGV) ./larson_hakmem 2 8 128 1024 1 12345 4 → Throughput = 4,192,155 ops/s ✅ ``` **調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00
// Front Gate Breakdown (SFC/SLL/Quick/Mag)
extern unsigned long long g_front_sfc_hit[];
extern unsigned long long g_front_sll_hit[];
extern unsigned long long g_front_quick_hit[];
extern unsigned long long g_front_mag_hit[];
fprintf(stderr, "\n[Front Gate Breakdown]\n");
fprintf(stderr, "class, sfc_hit, sll_hit, quick_hit, mag_hit\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_front_sfc_hit[i],
(unsigned long long)g_front_sll_hit[i],
(unsigned long long)g_front_quick_hit[i],
(unsigned long long)g_front_mag_hit[i]);
}
// Free Triggers (first-free / remote transition)
extern unsigned long long g_first_free_transitions[];
extern unsigned long long g_remote_free_transitions[];
fprintf(stderr, "\n[Free Triggers]\n");
fprintf(stderr, "class, first_free, remote_transition\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu\n", i,
(unsigned long long)g_first_free_transitions[i],
(unsigned long long)g_remote_free_transitions[i]);
}
// Adopt/Registry Gate
extern unsigned long long g_adopt_gate_calls[];
extern unsigned long long g_adopt_gate_success[];
extern unsigned long long g_reg_scan_attempts[];
extern unsigned long long g_reg_scan_hits[];
fprintf(stderr, "\n[Adopt/Registry Gate]\n");
fprintf(stderr, "class, adopt_calls, adopt_success, reg_scans, reg_hits\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_adopt_gate_calls[i],
(unsigned long long)g_adopt_gate_success[i],
(unsigned long long)g_reg_scan_attempts[i],
(unsigned long long)g_reg_scan_hits[i]);
}
// SuperSlab Registry (per-class sizes)
extern int g_super_reg_class_size[];
fprintf(stderr, "\n[SuperSlab Registry]\n");
fprintf(stderr, "class, reg_size\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%d\n", i, g_super_reg_class_size[i]);
}
extern unsigned long long g_fast_push_hits[];
extern unsigned long long g_fast_push_full[];
extern unsigned long long g_fast_push_disabled[];
extern unsigned long long g_fast_push_zero_cap[];
extern unsigned long long g_fast_push_gate_disabled[];
extern unsigned long long g_fast_push_gate_zero_cap[];
extern unsigned long long g_fast_spare_attempts[];
extern unsigned long long g_fast_spare_disabled[];
extern unsigned long long g_fast_spare_empty[];
extern unsigned long long g_fast_spare_lookup_fail[];
extern unsigned long long g_fast_spare_bad_index[];
extern unsigned long long g_fast_lookup_ss[];
extern unsigned long long g_fast_lookup_slab[];
extern unsigned long long g_fast_lookup_none;
fprintf(stderr, "\n[Fast Cache Debug]\n");
fprintf(stderr, "class, push_hits, push_full, push_disabled, push_zero_cap, gate_disabled, gate_zero_cap, spare_attempts, spare_disabled, spare_empty, spare_lookup_fail, spare_bad_index, lookup_ss, lookup_slab\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_fast_push_hits[i],
(unsigned long long)g_fast_push_full[i],
(unsigned long long)g_fast_push_disabled[i],
(unsigned long long)g_fast_push_zero_cap[i],
(unsigned long long)g_fast_push_gate_disabled[i],
(unsigned long long)g_fast_push_gate_zero_cap[i],
(unsigned long long)g_fast_spare_attempts[i],
(unsigned long long)g_fast_spare_disabled[i],
(unsigned long long)g_fast_spare_empty[i],
(unsigned long long)g_fast_spare_lookup_fail[i],
(unsigned long long)g_fast_spare_bad_index[i],
(unsigned long long)g_fast_lookup_ss[i],
(unsigned long long)g_fast_lookup_slab[i]);
}
fprintf(stderr, "lookup_none,%llu\n", (unsigned long long)g_fast_lookup_none);
extern uint64_t g_ss_cache_hits[];
extern uint64_t g_ss_cache_misses[];
extern uint64_t g_ss_cache_puts[];
extern uint64_t g_ss_cache_drops[];
extern uint64_t g_ss_cache_precharged[];
extern uint64_t g_superslabs_reused;
extern uint64_t g_superslabs_cached;
fprintf(stderr, "\n[SS Cache Stats]\n");
fprintf(stderr, "class, cache_hits, cache_misses, cache_puts, cache_drops, precharged\n");
for (int i = 0; i < 8; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_ss_cache_hits[i],
(unsigned long long)g_ss_cache_misses[i],
(unsigned long long)g_ss_cache_puts[i],
(unsigned long long)g_ss_cache_drops[i],
(unsigned long long)g_ss_cache_precharged[i]);
}
fprintf(stderr, "cache_reused=%llu cache_cached=%llu\n",
(unsigned long long)g_superslabs_reused,
(unsigned long long)g_superslabs_cached);
// Free pipeline
extern unsigned long long g_free_via_ss_local[];
extern unsigned long long g_free_via_ss_remote[];
extern unsigned long long g_free_via_tls_sll[];
extern unsigned long long g_free_via_mag[];
extern unsigned long long g_free_via_fast_tls[];
extern unsigned long long g_free_via_fastcache[];
extern unsigned long long g_fast_spare_flush[];
fprintf(stderr, "\n[Free Pipeline]\n");
fprintf(stderr, "class, ss_local, ss_remote, fast_tls, fast_cache, tls_sll, magazine, fast_spare_flush\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_free_via_ss_local[i],
(unsigned long long)g_free_via_ss_remote[i],
(unsigned long long)g_free_via_fast_tls[i],
(unsigned long long)g_free_via_fastcache[i],
(unsigned long long)g_free_via_tls_sll[i],
(unsigned long long)g_free_via_mag[i],
(unsigned long long)g_fast_spare_flush[i]);
}
// Publish pipeline
extern unsigned long long g_pub_notify_calls[];
extern unsigned long long g_pub_same_empty[];
extern unsigned long long g_remote_transitions[];
extern unsigned long long g_mailbox_register_calls[];
extern unsigned long long g_mailbox_slow_discoveries[];
fprintf(stderr, "\n[Publish Pipeline]\n");
fprintf(stderr, "class, notify_calls, same_empty_pubs, remote_transitions, mailbox_reg_calls, mailbox_slow_disc\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_pub_notify_calls[i],
(unsigned long long)g_pub_same_empty[i],
(unsigned long long)g_remote_transitions[i],
(unsigned long long)g_mailbox_register_calls[i],
(unsigned long long)g_mailbox_slow_discoveries[i]);
}
extern unsigned long long g_fast_push_hits[];
extern unsigned long long g_fast_push_full[];
extern unsigned long long g_fast_push_disabled[];
extern unsigned long long g_fast_push_zero_cap[];
extern unsigned long long g_fast_push_gate_disabled[];
extern unsigned long long g_fast_push_gate_zero_cap[];
extern unsigned long long g_fast_spare_attempts[];
extern unsigned long long g_fast_spare_disabled[];
extern unsigned long long g_fast_spare_empty[];
extern unsigned long long g_fast_spare_lookup_fail[];
extern unsigned long long g_fast_spare_bad_index[];
extern unsigned long long g_fast_lookup_ss[];
extern unsigned long long g_fast_lookup_slab[];
extern unsigned long long g_fast_lookup_none;
fprintf(stderr, "\n[Fast Cache Debug]\n");
fprintf(stderr, "class, push_hits, push_full, push_disabled, push_zero_cap, gate_disabled, gate_zero_cap, spare_attempts, spare_disabled, spare_empty, spare_lookup_fail, spare_bad_index, lookup_ss, lookup_slab\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_fast_push_hits[i],
(unsigned long long)g_fast_push_full[i],
(unsigned long long)g_fast_push_disabled[i],
(unsigned long long)g_fast_push_zero_cap[i],
(unsigned long long)g_fast_push_gate_disabled[i],
(unsigned long long)g_fast_push_gate_zero_cap[i],
(unsigned long long)g_fast_spare_attempts[i],
(unsigned long long)g_fast_spare_disabled[i],
(unsigned long long)g_fast_spare_empty[i],
(unsigned long long)g_fast_spare_lookup_fail[i],
(unsigned long long)g_fast_spare_bad_index[i],
(unsigned long long)g_fast_lookup_ss[i],
(unsigned long long)g_fast_lookup_slab[i]);
}
fprintf(stderr, "lookup_none,%llu\n", (unsigned long long)g_fast_lookup_none);
// Refill timing (ns)
extern unsigned long long g_rf_time_total_ns[];
extern unsigned long long g_rf_time_hot_ns[];
extern unsigned long long g_rf_time_bench_ns[];
extern unsigned long long g_rf_time_mail_ns[];
extern unsigned long long g_rf_time_slab_ns[];
extern unsigned long long g_rf_time_ss_ns[];
extern unsigned long long g_rf_time_reg_ns[];
extern unsigned long long g_rf_time_mmap_ns[];
fprintf(stderr, "\n[Refill Time (ns)]\n");
fprintf(stderr, "class, total, hot, bench, mail, slab, ss, reg, mmap\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_rf_time_total_ns[i],
(unsigned long long)g_rf_time_hot_ns[i],
(unsigned long long)g_rf_time_bench_ns[i],
(unsigned long long)g_rf_time_mail_ns[i],
(unsigned long long)g_rf_time_slab_ns[i],
(unsigned long long)g_rf_time_ss_ns[i],
(unsigned long long)g_rf_time_reg_ns[i],
(unsigned long long)g_rf_time_mmap_ns[i]);
}
#endif
}
// Always-available: Refill stage counters dump (env: HAKMEM_TINY_REFILL_DUMP=1 or reuse HAKMEM_TINY_COUNTERS_DUMP)
static void hak_tiny_refill_counters_dump(void) {
hak_tiny_stats_init_flags();
ENV cleanup: Add RELEASE guards to DEBUG ENV variables (14 vars) Added compile-time guards (#if HAKMEM_BUILD_RELEASE) to eliminate DEBUG ENV variable overhead in RELEASE builds. Variables guarded (14 total): - HAKMEM_TINY_TRACE_RING, HAKMEM_TINY_DUMP_RING_ATEXIT - HAKMEM_TINY_RF_TRACE, HAKMEM_TINY_MAILBOX_TRACE - HAKMEM_TINY_MAILBOX_TRACE_LIMIT, HAKMEM_TINY_MAILBOX_SLOWDISC - HAKMEM_TINY_MAILBOX_SLOWDISC_PERIOD - HAKMEM_SS_PREWARM_DEBUG, HAKMEM_SS_FREE_DEBUG - HAKMEM_TINY_FRONT_METRICS, HAKMEM_TINY_FRONT_DUMP - HAKMEM_TINY_COUNTERS_DUMP, HAKMEM_TINY_REFILL_DUMP - HAKMEM_PTR_TRACE_DUMP, HAKMEM_PTR_TRACE_VERBOSE Files modified (9 core files): - core/tiny_debug_ring.c (ring trace/dump) - core/box/mailbox_box.c (mailbox trace + slowdisc) - core/tiny_refill.h (refill trace) - core/hakmem_tiny_superslab.c (superslab debug) - core/box/ss_allocation_box.c (allocation debug) - core/tiny_superslab_free.inc.h (free debug) - core/box/front_metrics_box.c (frontend metrics) - core/hakmem_tiny_stats.c (stats dump) - core/ptr_trace.h (pointer trace) Bug fixes during implementation: 1. mailbox_box.c - Fixed variable scope (moved 'used' outside guard) 2. hakmem_tiny_stats.c - Fixed incomplete declarations (on1, on2) Impact: - Binary size: -85KB total - bench_random_mixed_hakmem: 319K → 305K (-14K, -4.4%) - larson_hakmem: 380K → 309K (-71K, -18.7%) - Performance: No regression (16.9-17.9M ops/s maintained) - Functional: All tests pass (Random Mixed + Larson) - Behavior: DEBUG ENV vars correctly ignored in RELEASE builds Testing: - Build: Clean compilation (warnings only, pre-existing) - 100K Random Mixed: 16.9-17.9M ops/s (PASS) - 10K Larson: 25.9M ops/s (PASS) - DEBUG ENV verification: Correctly ignored (PASS) Result: 14 DEBUG ENV variables now have zero overhead in RELEASE builds. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 03:41:07 +09:00
#if HAKMEM_BUILD_RELEASE
P0 Optimization: Shared Pool fast path with O(1) metadata lookup Performance Results: - Throughput: 2.66M ops/s → 3.8M ops/s (+43% improvement) - sp_meta_find_or_create: O(N) linear scan → O(1) direct pointer - Stage 2 metadata scan: 100% → 10-20% (80-90% reduction via hints) Core Optimizations: 1. O(1) Metadata Lookup (superslab_types.h) - Added `shared_meta` pointer field to SuperSlab struct - Eliminates O(N) linear search through ss_metadata[] array - First access: O(N) scan + cache | Subsequent: O(1) direct return 2. sp_meta_find_or_create Fast Path (hakmem_shared_pool.c) - Check cached ss->shared_meta first before linear scan - Cache pointer after successful linear scan for future lookups - Reduces 7.8% CPU hotspot to near-zero for hot paths 3. Stage 2 Class Hints Fast Path (hakmem_shared_pool_acquire.c) - Try class_hints[class_idx] FIRST before full metadata scan - Uses O(1) ss->shared_meta lookup for hint validation - __builtin_expect() for branch prediction optimization - 80-90% of acquire calls now skip full metadata scan 4. 
Proper Initialization (ss_allocation_box.c) - Initialize shared_meta = NULL in superslab_allocate() - Ensures correct NULL-check semantics for new SuperSlabs Additional Improvements: - Updated ptr_trace and debug ring for release build efficiency - Enhanced ENV variable documentation and analysis - Added learner_env_box.h for configuration management - Various Box optimizations for reduced overhead Thread Safety: - All atomic operations use correct memory ordering - shared_meta cached under mutex protection - Lock-free Stage 2 uses proper CAS with acquire/release semantics Testing: - Benchmark: 1M iterations, 3.8M ops/s stable - Build: Clean compile RELEASE=0 and RELEASE=1 - No crashes, memory leaks, or correctness issues Next Optimization Candidates: - P1: Per-SuperSlab free slot bitmap for O(1) slot claiming - P2: Reduce Stage 2 critical section size - P3: Page pre-faulting (MAP_POPULATE) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 16:21:54 +09:00
return;
ENV cleanup: Add RELEASE guards to DEBUG ENV variables (14 vars) Added compile-time guards (#if HAKMEM_BUILD_RELEASE) to eliminate DEBUG ENV variable overhead in RELEASE builds. Variables guarded (14 total): - HAKMEM_TINY_TRACE_RING, HAKMEM_TINY_DUMP_RING_ATEXIT - HAKMEM_TINY_RF_TRACE, HAKMEM_TINY_MAILBOX_TRACE - HAKMEM_TINY_MAILBOX_TRACE_LIMIT, HAKMEM_TINY_MAILBOX_SLOWDISC - HAKMEM_TINY_MAILBOX_SLOWDISC_PERIOD - HAKMEM_SS_PREWARM_DEBUG, HAKMEM_SS_FREE_DEBUG - HAKMEM_TINY_FRONT_METRICS, HAKMEM_TINY_FRONT_DUMP - HAKMEM_TINY_COUNTERS_DUMP, HAKMEM_TINY_REFILL_DUMP - HAKMEM_PTR_TRACE_DUMP, HAKMEM_PTR_TRACE_VERBOSE Files modified (9 core files): - core/tiny_debug_ring.c (ring trace/dump) - core/box/mailbox_box.c (mailbox trace + slowdisc) - core/tiny_refill.h (refill trace) - core/hakmem_tiny_superslab.c (superslab debug) - core/box/ss_allocation_box.c (allocation debug) - core/tiny_superslab_free.inc.h (free debug) - core/box/front_metrics_box.c (frontend metrics) - core/hakmem_tiny_stats.c (stats dump) - core/ptr_trace.h (pointer trace) Bug fixes during implementation: 1. mailbox_box.c - Fixed variable scope (moved 'used' outside guard) 2. hakmem_tiny_stats.c - Fixed incomplete declarations (on1, on2) Impact: - Binary size: -85KB total - bench_random_mixed_hakmem: 319K → 305K (-14K, -4.4%) - larson_hakmem: 380K → 309K (-71K, -18.7%) - Performance: No regression (16.9-17.9M ops/s maintained) - Functional: All tests pass (Random Mixed + Larson) - Behavior: DEBUG ENV vars correctly ignored in RELEASE builds Testing: - Build: Clean compilation (warnings only, pre-existing) - 100K Random Mixed: 16.9-17.9M ops/s (PASS) - 10K Larson: 25.9M ops/s (PASS) - DEBUG ENV verification: Correctly ignored (PASS) Result: 14 DEBUG ENV variables now have zero overhead in RELEASE builds. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 03:41:07 +09:00
#else
P0 Optimization: Shared Pool fast path with O(1) metadata lookup Performance Results: - Throughput: 2.66M ops/s → 3.8M ops/s (+43% improvement) - sp_meta_find_or_create: O(N) linear scan → O(1) direct pointer - Stage 2 metadata scan: 100% → 10-20% (80-90% reduction via hints) Core Optimizations: 1. O(1) Metadata Lookup (superslab_types.h) - Added `shared_meta` pointer field to SuperSlab struct - Eliminates O(N) linear search through ss_metadata[] array - First access: O(N) scan + cache | Subsequent: O(1) direct return 2. sp_meta_find_or_create Fast Path (hakmem_shared_pool.c) - Check cached ss->shared_meta first before linear scan - Cache pointer after successful linear scan for future lookups - Reduces 7.8% CPU hotspot to near-zero for hot paths 3. Stage 2 Class Hints Fast Path (hakmem_shared_pool_acquire.c) - Try class_hints[class_idx] FIRST before full metadata scan - Uses O(1) ss->shared_meta lookup for hint validation - __builtin_expect() for branch prediction optimization - 80-90% of acquire calls now skip full metadata scan 4. 
Proper Initialization (ss_allocation_box.c) - Initialize shared_meta = NULL in superslab_allocate() - Ensures correct NULL-check semantics for new SuperSlabs Additional Improvements: - Updated ptr_trace and debug ring for release build efficiency - Enhanced ENV variable documentation and analysis - Added learner_env_box.h for configuration management - Various Box optimizations for reduced overhead Thread Safety: - All atomic operations use correct memory ordering - shared_meta cached under mutex protection - Lock-free Stage 2 uses proper CAS with acquire/release semantics Testing: - Benchmark: 1M iterations, 3.8M ops/s stable - Build: Clean compile RELEASE=0 and RELEASE=1 - No crashes, memory leaks, or correctness issues Next Optimization Candidates: - P1: Per-SuperSlab free slot bitmap for O(1) slot claiming - P2: Reduce Stage 2 critical section size - P3: Page pre-faulting (MAP_POPULATE) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 16:21:54 +09:00
if (!hak_stats_check("HAKMEM_TINY_REFILL_DUMP", "refill") &&
!hak_stats_check("HAKMEM_TINY_COUNTERS_DUMP", "counters")) {
return;
}
ENV cleanup: Add RELEASE guards to DEBUG ENV variables (14 vars) Added compile-time guards (#if HAKMEM_BUILD_RELEASE) to eliminate DEBUG ENV variable overhead in RELEASE builds. Variables guarded (14 total): - HAKMEM_TINY_TRACE_RING, HAKMEM_TINY_DUMP_RING_ATEXIT - HAKMEM_TINY_RF_TRACE, HAKMEM_TINY_MAILBOX_TRACE - HAKMEM_TINY_MAILBOX_TRACE_LIMIT, HAKMEM_TINY_MAILBOX_SLOWDISC - HAKMEM_TINY_MAILBOX_SLOWDISC_PERIOD - HAKMEM_SS_PREWARM_DEBUG, HAKMEM_SS_FREE_DEBUG - HAKMEM_TINY_FRONT_METRICS, HAKMEM_TINY_FRONT_DUMP - HAKMEM_TINY_COUNTERS_DUMP, HAKMEM_TINY_REFILL_DUMP - HAKMEM_PTR_TRACE_DUMP, HAKMEM_PTR_TRACE_VERBOSE Files modified (9 core files): - core/tiny_debug_ring.c (ring trace/dump) - core/box/mailbox_box.c (mailbox trace + slowdisc) - core/tiny_refill.h (refill trace) - core/hakmem_tiny_superslab.c (superslab debug) - core/box/ss_allocation_box.c (allocation debug) - core/tiny_superslab_free.inc.h (free debug) - core/box/front_metrics_box.c (frontend metrics) - core/hakmem_tiny_stats.c (stats dump) - core/ptr_trace.h (pointer trace) Bug fixes during implementation: 1. mailbox_box.c - Fixed variable scope (moved 'used' outside guard) 2. hakmem_tiny_stats.c - Fixed incomplete declarations (on1, on2) Impact: - Binary size: -85KB total - bench_random_mixed_hakmem: 319K → 305K (-14K, -4.4%) - larson_hakmem: 380K → 309K (-71K, -18.7%) - Performance: No regression (16.9-17.9M ops/s maintained) - Functional: All tests pass (Random Mixed + Larson) - Behavior: DEBUG ENV vars correctly ignored in RELEASE builds Testing: - Build: Clean compilation (warnings only, pre-existing) - 100K Random Mixed: 16.9-17.9M ops/s (PASS) - 10K Larson: 25.9M ops/s (PASS) - DEBUG ENV verification: Correctly ignored (PASS) Result: 14 DEBUG ENV variables now have zero overhead in RELEASE builds. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-27 03:41:07 +09:00
#endif
extern unsigned long long g_rf_total_calls[];
extern unsigned long long g_rf_hit_bench[];
extern unsigned long long g_rf_hit_hot[];
extern unsigned long long g_rf_hit_mail[];
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_hit_ss[];
extern unsigned long long g_rf_hit_reg[];
extern unsigned long long g_rf_mmap_calls[];
fprintf(stderr, "\n[Refill Stage Counters]\n");
fprintf(stderr, "class, total, mail, bench, hot, slab, ss, reg, mmap\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_rf_total_calls[i],
(unsigned long long)g_rf_hit_mail[i],
(unsigned long long)g_rf_hit_bench[i],
(unsigned long long)g_rf_hit_hot[i],
(unsigned long long)g_rf_hit_slab[i],
(unsigned long long)g_rf_hit_ss[i],
(unsigned long long)g_rf_hit_reg[i],
(unsigned long long)g_rf_mmap_calls[i]);
}
// Diagnostic: refill early return counters
extern unsigned long long g_rf_early_no_ss[];
extern unsigned long long g_rf_early_no_meta[];
extern unsigned long long g_rf_early_no_room[];
extern unsigned long long g_rf_early_want_zero[];
fprintf(stderr, "\n[Refill Early Returns - Diagnostic]\n");
fprintf(stderr, "class, no_ss, no_meta, no_room, want_zero\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_rf_early_no_ss[i],
(unsigned long long)g_rf_early_no_meta[i],
(unsigned long long)g_rf_early_no_room[i],
(unsigned long long)g_rf_early_want_zero[i]);
}
// Publish-side counters (always available)
extern unsigned long long g_pub_bench_hits[];
extern unsigned long long g_pub_hot_hits[];
extern unsigned long long g_pub_mail_hits[];
fprintf(stderr, "\n[Publish Hits]\n");
fprintf(stderr, "class, pub_mail, pub_bench, pub_hot\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu\n", i,
(unsigned long long)g_pub_mail_hits[i],
(unsigned long long)g_pub_bench_hits[i],
(unsigned long long)g_pub_hot_hits[i]);
}
extern uint64_t g_ss_cache_hits[];
extern uint64_t g_ss_cache_misses[];
extern uint64_t g_ss_cache_puts[];
extern uint64_t g_ss_cache_drops[];
extern uint64_t g_ss_cache_precharged[];
extern uint64_t g_superslabs_reused;
extern uint64_t g_superslabs_cached;
fprintf(stderr, "\n[SS Cache Stats]\n");
fprintf(stderr, "class, cache_hits, cache_misses, cache_puts, cache_drops, precharged\n");
for (int i = 0; i < 8; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_ss_cache_hits[i],
(unsigned long long)g_ss_cache_misses[i],
(unsigned long long)g_ss_cache_puts[i],
(unsigned long long)g_ss_cache_drops[i],
(unsigned long long)g_ss_cache_precharged[i]);
}
fprintf(stderr, "cache_reused=%llu cache_cached=%llu\n",
(unsigned long long)g_superslabs_reused,
(unsigned long long)g_superslabs_cached);
// Free pipeline
extern unsigned long long g_free_via_ss_local[];
extern unsigned long long g_free_via_ss_remote[];
extern unsigned long long g_free_via_tls_sll[];
extern unsigned long long g_free_via_mag[];
extern unsigned long long g_free_via_fast_tls[];
extern unsigned long long g_free_via_fastcache[];
extern unsigned long long g_fast_spare_flush[];
fprintf(stderr, "\n[Free Pipeline]\n");
fprintf(stderr, "class, ss_local, ss_remote, fast_tls, fast_cache, tls_sll, magazine, fast_spare_flush\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_free_via_ss_local[i],
(unsigned long long)g_free_via_ss_remote[i],
(unsigned long long)g_free_via_fast_tls[i],
(unsigned long long)g_free_via_fastcache[i],
(unsigned long long)g_free_via_tls_sll[i],
(unsigned long long)g_free_via_mag[i],
(unsigned long long)g_fast_spare_flush[i]);
}
// Publish pipeline
extern unsigned long long g_pub_notify_calls[];
extern unsigned long long g_pub_same_empty[];
extern unsigned long long g_remote_transitions[];
extern unsigned long long g_mailbox_register_calls[];
extern unsigned long long g_mailbox_slow_discoveries[];
fprintf(stderr, "\n[Publish Pipeline]\n");
fprintf(stderr, "class, notify_calls, same_empty_pubs, remote_transitions, mailbox_reg_calls, mailbox_slow_disc\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_pub_notify_calls[i],
(unsigned long long)g_pub_same_empty[i],
(unsigned long long)g_remote_transitions[i],
(unsigned long long)g_mailbox_register_calls[i],
(unsigned long long)g_mailbox_slow_discoveries[i]);
}
extern unsigned long long g_fast_push_hits[];
extern unsigned long long g_fast_push_full[];
extern unsigned long long g_fast_push_disabled[];
extern unsigned long long g_fast_push_zero_cap[];
extern unsigned long long g_fast_push_gate_disabled[];
extern unsigned long long g_fast_push_gate_zero_cap[];
extern unsigned long long g_fast_spare_attempts[];
extern unsigned long long g_fast_spare_disabled[];
extern unsigned long long g_fast_spare_empty[];
extern unsigned long long g_fast_spare_lookup_fail[];
extern unsigned long long g_fast_spare_bad_index[];
extern unsigned long long g_fast_lookup_ss[];
extern unsigned long long g_fast_lookup_slab[];
extern unsigned long long g_fast_lookup_none;
fprintf(stderr, "\n[Fast Cache Debug]\n");
fprintf(stderr, "class, push_hits, push_full, push_disabled, push_zero_cap, gate_disabled, gate_zero_cap, spare_attempts, spare_disabled, spare_empty, spare_lookup_fail, spare_bad_index, lookup_ss, lookup_slab\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_fast_push_hits[i],
(unsigned long long)g_fast_push_full[i],
(unsigned long long)g_fast_push_disabled[i],
(unsigned long long)g_fast_push_zero_cap[i],
(unsigned long long)g_fast_push_gate_disabled[i],
(unsigned long long)g_fast_push_gate_zero_cap[i],
(unsigned long long)g_fast_spare_attempts[i],
(unsigned long long)g_fast_spare_disabled[i],
(unsigned long long)g_fast_spare_empty[i],
(unsigned long long)g_fast_spare_lookup_fail[i],
(unsigned long long)g_fast_spare_bad_index[i],
(unsigned long long)g_fast_lookup_ss[i],
(unsigned long long)g_fast_lookup_slab[i]);
}
fprintf(stderr, "lookup_none,%llu\n", (unsigned long long)g_fast_lookup_none);
// Refill timing (ns)
extern unsigned long long g_rf_time_total_ns[];
extern unsigned long long g_rf_time_hot_ns[];
extern unsigned long long g_rf_time_bench_ns[];
extern unsigned long long g_rf_time_mail_ns[];
extern unsigned long long g_rf_time_slab_ns[];
extern unsigned long long g_rf_time_ss_ns[];
extern unsigned long long g_rf_time_reg_ns[];
extern unsigned long long g_rf_time_mmap_ns[];
fprintf(stderr, "\n[Refill Time (ns)]\n");
fprintf(stderr, "class, total, hot, bench, mail, slab, ss, reg, mmap\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
fprintf(stderr, "%d,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", i,
(unsigned long long)g_rf_time_total_ns[i],
(unsigned long long)g_rf_time_hot_ns[i],
(unsigned long long)g_rf_time_bench_ns[i],
(unsigned long long)g_rf_time_mail_ns[i],
(unsigned long long)g_rf_time_slab_ns[i],
(unsigned long long)g_rf_time_ss_ns[i],
(unsigned long long)g_rf_time_reg_ns[i],
(unsigned long long)g_rf_time_mmap_ns[i]);
}
}
__attribute__((destructor))
static void hak_tiny_stats_auto_dump(void) {
// Dump at exit if enabled or atexit-only requested
hak_tiny_stats_init_flags();
P0 Optimization: Shared Pool fast path with O(1) metadata lookup Performance Results: - Throughput: 2.66M ops/s → 3.8M ops/s (+43% improvement) - sp_meta_find_or_create: O(N) linear scan → O(1) direct pointer - Stage 2 metadata scan: 100% → 10-20% (80-90% reduction via hints) Core Optimizations: 1. O(1) Metadata Lookup (superslab_types.h) - Added `shared_meta` pointer field to SuperSlab struct - Eliminates O(N) linear search through ss_metadata[] array - First access: O(N) scan + cache | Subsequent: O(1) direct return 2. sp_meta_find_or_create Fast Path (hakmem_shared_pool.c) - Check cached ss->shared_meta first before linear scan - Cache pointer after successful linear scan for future lookups - Reduces 7.8% CPU hotspot to near-zero for hot paths 3. Stage 2 Class Hints Fast Path (hakmem_shared_pool_acquire.c) - Try class_hints[class_idx] FIRST before full metadata scan - Uses O(1) ss->shared_meta lookup for hint validation - __builtin_expect() for branch prediction optimization - 80-90% of acquire calls now skip full metadata scan 4. 
Proper Initialization (ss_allocation_box.c) - Initialize shared_meta = NULL in superslab_allocate() - Ensures correct NULL-check semantics for new SuperSlabs Additional Improvements: - Updated ptr_trace and debug ring for release build efficiency - Enhanced ENV variable documentation and analysis - Added learner_env_box.h for configuration management - Various Box optimizations for reduced overhead Thread Safety: - All atomic operations use correct memory ordering - shared_meta cached under mutex protection - Lock-free Stage 2 uses proper CAS with acquire/release semantics Testing: - Benchmark: 1M iterations, 3.8M ops/s stable - Build: Clean compile RELEASE=0 and RELEASE=1 - No crashes, memory leaks, or correctness issues Next Optimization Candidates: - P1: Per-SuperSlab free slot bitmap for O(1) slot claiming - P2: Reduce Stage 2 critical section size - P3: Page pre-faulting (MAP_POPULATE) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 16:21:54 +09:00
if (g_dump_atexit_only || hak_stats_dump_enabled()) {
// Force dump regardless of individual envs when atexit-only
// or when master HAKMEM_STATS(_DUMP) requests global dump.
hak_tiny_dump_all_counters_now();
} else {
hak_tiny_refill_counters_dump();
}
}