diff --git a/Makefile b/Makefile index d7199a43..49007628 100644 --- a/Makefile +++ b/Makefile @@ -219,12 +219,12 @@ LDFLAGS += $(EXTRA_LDFLAGS) # Targets TARGET = test_hakmem -OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/box/tiny_env_box.o core/box/wrapper_env_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o +OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/box/tiny_env_box.o core/box/tiny_route_box.o core/box/wrapper_env_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o OBJS = $(OBJS_BASE) # Shared library SHARED_LIB = libhakmem.so -SHARED_OBJS = hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o hakmem_ucb1_shared.o hakmem_bigcache_shared.o hakmem_pool_shared.o hakmem_l25_pool_shared.o hakmem_site_rules_shared.o hakmem_tiny_shared.o core/box/ss_allocation_box_shared.o superslab_stats_shared.o superslab_cache_shared.o superslab_ace_shared.o superslab_slab_shared.o superslab_backend_shared.o core/superslab_head_stub_shared.o hakmem_smallmid_shared.o hakmem_smallmid_superslab_shared.o core/box/superslab_expansion_box_shared.o core/box/integrity_box_shared.o core/box/mailbox_box_shared.o core/box/front_gate_box_shared.o core/box/front_gate_classifier_shared.o core/box/free_publish_box_shared.o core/box/capacity_box_shared.o core/box/carve_push_box_shared.o core/box/prewarm_box_shared.o core/box/ss_hot_prewarm_box_shared.o core/box/front_metrics_box_shared.o core/box/bench_fast_box_shared.o core/box/ss_addr_map_box_shared.o core/box/slab_recycling_box_shared.o core/box/pagefault_telemetry_box_shared.o core/box/tiny_sizeclass_hist_box_shared.o core/box/tiny_env_box_shared.o core/box/wrapper_env_box_shared.o core/page_arena_shared.o core/front/tiny_unified_cache_shared.o core/tiny_alloc_fast_push_shared.o core/link_stubs_shared.o core/tiny_failfast_shared.o tiny_sticky_shared.o tiny_remote_shared.o tiny_publish_shared.o tiny_debug_ring_shared.o hakmem_tiny_magazine_shared.o hakmem_tiny_stats_shared.o hakmem_tiny_sfc_shared.o hakmem_tiny_query_shared.o hakmem_tiny_rss_shared.o hakmem_tiny_registry_shared.o hakmem_tiny_remote_target_shared.o hakmem_tiny_bg_spill_shared.o tiny_adaptive_sizing_shared.o hakmem_super_registry_shared.o hakmem_shared_pool_shared.o hakmem_shared_pool_acquire_shared.o hakmem_shared_pool_release_shared.o hakmem_elo_shared.o hakmem_batch_shared.o hakmem_p2_shared.o hakmem_sizeclass_dist_shared.o hakmem_evo_shared.o hakmem_debug_shared.o hakmem_sys_shared.o hakmem_whale_shared.o hakmem_policy_shared.o hakmem_ace_shared.o hakmem_ace_stats_shared.o hakmem_ace_controller_shared.o hakmem_ace_metrics_shared.o hakmem_ace_ucb1_shared.o hakmem_prof_shared.o hakmem_learner_shared.o hakmem_size_hist_shared.o hakmem_learn_log_shared.o hakmem_syscall_shared.o tiny_fastcache_shared.o +SHARED_OBJS = hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o hakmem_ucb1_shared.o hakmem_bigcache_shared.o hakmem_pool_shared.o hakmem_l25_pool_shared.o hakmem_site_rules_shared.o hakmem_tiny_shared.o core/box/ss_allocation_box_shared.o superslab_stats_shared.o superslab_cache_shared.o superslab_ace_shared.o superslab_slab_shared.o superslab_backend_shared.o core/superslab_head_stub_shared.o hakmem_smallmid_shared.o hakmem_smallmid_superslab_shared.o core/box/superslab_expansion_box_shared.o core/box/integrity_box_shared.o core/box/mailbox_box_shared.o core/box/front_gate_box_shared.o core/box/front_gate_classifier_shared.o core/box/free_publish_box_shared.o core/box/capacity_box_shared.o core/box/carve_push_box_shared.o core/box/prewarm_box_shared.o core/box/ss_hot_prewarm_box_shared.o core/box/front_metrics_box_shared.o core/box/bench_fast_box_shared.o core/box/ss_addr_map_box_shared.o core/box/slab_recycling_box_shared.o core/box/pagefault_telemetry_box_shared.o core/box/tiny_sizeclass_hist_box_shared.o core/box/tiny_env_box_shared.o core/box/tiny_route_box_shared.o core/box/wrapper_env_box_shared.o core/page_arena_shared.o core/front/tiny_unified_cache_shared.o core/tiny_alloc_fast_push_shared.o core/link_stubs_shared.o core/tiny_failfast_shared.o tiny_sticky_shared.o tiny_remote_shared.o tiny_publish_shared.o tiny_debug_ring_shared.o hakmem_tiny_magazine_shared.o hakmem_tiny_stats_shared.o hakmem_tiny_sfc_shared.o hakmem_tiny_query_shared.o hakmem_tiny_rss_shared.o hakmem_tiny_registry_shared.o hakmem_tiny_remote_target_shared.o hakmem_tiny_bg_spill_shared.o tiny_adaptive_sizing_shared.o hakmem_super_registry_shared.o hakmem_shared_pool_shared.o hakmem_shared_pool_acquire_shared.o hakmem_shared_pool_release_shared.o hakmem_elo_shared.o hakmem_batch_shared.o hakmem_p2_shared.o hakmem_sizeclass_dist_shared.o hakmem_evo_shared.o hakmem_debug_shared.o hakmem_sys_shared.o hakmem_whale_shared.o hakmem_policy_shared.o hakmem_ace_shared.o hakmem_ace_stats_shared.o hakmem_ace_controller_shared.o hakmem_ace_metrics_shared.o hakmem_ace_ucb1_shared.o hakmem_prof_shared.o hakmem_learner_shared.o hakmem_size_hist_shared.o hakmem_learn_log_shared.o hakmem_syscall_shared.o tiny_fastcache_shared.o # Pool TLS Phase 1 (enable with POOL_TLS_PHASE1=1) ifeq ($(POOL_TLS_PHASE1),1) @@ -251,7 +251,7 @@ endif # Benchmark targets BENCH_HAKMEM = bench_allocators_hakmem BENCH_SYSTEM = bench_allocators_system -BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/box/tiny_env_box.o core/box/wrapper_env_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o +BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/box/tiny_env_box.o core/box/tiny_route_box.o core/box/wrapper_env_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE) ifeq ($(POOL_TLS_PHASE1),1) BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o diff --git a/bench_random_mixed.c b/bench_random_mixed.c index 2a8dd0f6..756a245b 100644 --- a/bench_random_mixed.c +++ b/bench_random_mixed.c @@ -148,6 +148,14 @@ int main(int argc, char** argv){ // Phase 20-2: Print BenchFast stats (verify pool wasn't exhausted) bench_fast_stats(); + // Production Performance Measurements (ENV-gated: HAKMEM_MEASURE_UNIFIED_CACHE=1) + extern void unified_cache_print_measurements(void); + extern void tls_sll_print_measurements(void); + extern void shared_pool_print_measurements(void); + unified_cache_print_measurements(); + tls_sll_print_measurements(); + shared_pool_print_measurements(); + // Phase 21-1: Ring cache - DELETED (A/B test: OFF is faster) // extern void ring_cache_print_stats(void); // ring_cache_print_stats(); diff --git a/core/box/tls_sll_box.h b/core/box/tls_sll_box.h index 62093539..4bb5ead2 100644 --- a/core/box/tls_sll_box.h +++ b/core/box/tls_sll_box.h @@ -41,6 +41,28 @@ #include "tiny_next_ptr_box.h" #include "tiny_header_box.h" // Header Box: Single Source of Truth for header operations +// ============================================================================ +// Performance Measurement: TLS SLL Hit Rate (ENV-gated) +// ============================================================================ +// Global atomic counters for TLS SLL performance measurement +// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF) +extern _Atomic uint64_t g_tls_sll_push_count_global; +extern _Atomic uint64_t g_tls_sll_pop_count_global; +extern _Atomic uint64_t g_tls_sll_pop_empty_count_global; + +// Print statistics function +void tls_sll_print_measurements(void); + +// Check if measurement is enabled (inline for hot path) +static inline int tls_sll_measure_enabled(void) { + static int g_measure = -1; + if (__builtin_expect(g_measure == -1, 0)) { + const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE"); + g_measure = (e && *e && *e != '0') ? 1 : 0; + } + return g_measure; +} + // Per-thread debug shadow: last successful push base per class (release-safe) // Changed to extern to share across TUs (defined in hakmem_tiny.c) extern __thread hak_base_ptr_t s_tls_sll_last_push[TINY_NUM_CLASSES]; @@ -797,6 +819,11 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t g_tls_sll[class_idx].count = cur + 1; s_tls_sll_last_push[class_idx] = ptr; + // Performance measurement: count push operations + if (__builtin_expect(tls_sll_measure_enabled(), 0)) { + atomic_fetch_add_explicit(&g_tls_sll_push_count_global, 1, memory_order_relaxed); + } + #if !HAKMEM_BUILD_RELEASE // Trace TLS SLL push (debug only) extern void ptr_trace_record_impl(int event, void* ptr, int class_idx, uint64_t op_num, @@ -845,6 +872,10 @@ static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const ch hak_base_ptr_t base = g_tls_sll[class_idx].head; if (hak_base_is_null(base)) { + // Performance measurement: count empty pops + if (__builtin_expect(tls_sll_measure_enabled(), 0)) { + atomic_fetch_add_explicit(&g_tls_sll_pop_empty_count_global, 1, memory_order_relaxed); + } return false; } void* raw_base = HAK_BASE_TO_RAW(base); @@ -1138,6 +1169,12 @@ static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const ch #endif *out = base; + + // Performance measurement: count successful pops + if (__builtin_expect(tls_sll_measure_enabled(), 0)) { + atomic_fetch_add_explicit(&g_tls_sll_pop_count_global, 1, memory_order_relaxed); + } + return true; } diff --git a/core/front/tiny_unified_cache.c b/core/front/tiny_unified_cache.c index 9f7b6daa..be8419e5 100644 --- a/core/front/tiny_unified_cache.c +++ b/core/front/tiny_unified_cache.c @@ -11,6 +11,40 @@ #include #include #include +#include + +// ============================================================================ +// Performance Measurement: Unified Cache (ENV-gated) +// ============================================================================ +// Global atomic counters for unified cache performance measurement +// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF) +_Atomic uint64_t g_unified_cache_hits_global = 0; +_Atomic uint64_t g_unified_cache_misses_global = 0; +_Atomic uint64_t g_unified_cache_refill_cycles_global = 0; + +// Helper: Get cycle count (x86_64 rdtsc) +static inline uint64_t read_tsc(void) { +#if defined(__x86_64__) || defined(_M_X64) + uint32_t lo, hi; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#else + // Fallback to clock_gettime for non-x86 platforms + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; +#endif +} + +// Check if measurement is enabled (cached) +static inline int unified_cache_measure_enabled(void) { + static int g_measure = -1; + if (__builtin_expect(g_measure == -1, 0)) { + const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE"); + g_measure = (e && *e && *e != '0') ? 1 : 0; + } + return g_measure; +} // Phase 23-E: Forward declarations extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c @@ -294,6 +328,13 @@ static inline int unified_refill_validate_base(int class_idx, // Returns: BASE pointer (first block, wrapped), or NULL-wrapped if failed // Design: Direct carve from SuperSlab to array (no TLS SLL intermediate layer) hak_base_ptr_t unified_cache_refill(int class_idx) { + // Measure refill cost if enabled + uint64_t start_cycles = 0; + int measure = unified_cache_measure_enabled(); + if (measure) { + start_cycles = read_tsc(); + } + TinyTLSSlab* tls = &g_tls_slabs[class_idx]; // Step 1: Ensure SuperSlab available @@ -443,5 +484,51 @@ hak_base_ptr_t unified_cache_refill(int class_idx) { g_unified_cache_miss[class_idx]++; #endif + // Measure refill cycles + if (measure) { + uint64_t end_cycles = read_tsc(); + uint64_t delta = end_cycles - start_cycles; + atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global, delta, memory_order_relaxed); + atomic_fetch_add_explicit(&g_unified_cache_misses_global, 1, memory_order_relaxed); + } + return HAK_BASE_FROM_RAW(first); // Return first block (BASE pointer) } + +// ============================================================================ +// Performance Measurement: Print Statistics +// ============================================================================ +void unified_cache_print_measurements(void) { + if (!unified_cache_measure_enabled()) { + return; // Measurement disabled, nothing to print + } + + uint64_t hits = atomic_load_explicit(&g_unified_cache_hits_global, memory_order_relaxed); + uint64_t misses = atomic_load_explicit(&g_unified_cache_misses_global, memory_order_relaxed); + uint64_t refill_cycles = atomic_load_explicit(&g_unified_cache_refill_cycles_global, memory_order_relaxed); + + uint64_t total = hits + misses; + if (total == 0) { + fprintf(stderr, "\n========================================\n"); + fprintf(stderr, "Unified Cache Statistics\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "No operations recorded (measurement may be disabled)\n"); + fprintf(stderr, "========================================\n\n"); + return; + } + + double hit_rate = (100.0 * hits) / total; + double avg_refill_cycles = misses > 0 ? (double)refill_cycles / misses : 0.0; + + // Estimate time at 1GHz (conservative, most modern CPUs are 2-4GHz) + double avg_refill_us = avg_refill_cycles / 1000.0; + + fprintf(stderr, "\n========================================\n"); + fprintf(stderr, "Unified Cache Statistics\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "Hits: %llu\n", (unsigned long long)hits); + fprintf(stderr, "Misses: %llu\n", (unsigned long long)misses); + fprintf(stderr, "Hit Rate: %.1f%%\n", hit_rate); + fprintf(stderr, "Avg Refill Cycles: %.0f (est. %.2fus @ 1GHz)\n", avg_refill_cycles, avg_refill_us); + fprintf(stderr, "========================================\n\n"); +} diff --git a/core/front/tiny_unified_cache.h b/core/front/tiny_unified_cache.h index 0b0eeb75..1b0b05e5 100644 --- a/core/front/tiny_unified_cache.h +++ b/core/front/tiny_unified_cache.h @@ -25,11 +25,34 @@ #include #include #include +#include #include "../hakmem_build_flags.h" #include "../hakmem_tiny_config.h" // For TINY_NUM_CLASSES #include "../box/ptr_type_box.h" // Phantom pointer types (BASE/USER) #include "../box/tiny_front_config_box.h" // Phase 8-Step1: Config macros +// ============================================================================ +// Performance Measurement: Unified Cache (ENV-gated) +// ============================================================================ +// Global atomic counters for production performance measurement +// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF) +extern _Atomic uint64_t g_unified_cache_hits_global; +extern _Atomic uint64_t g_unified_cache_misses_global; +extern _Atomic uint64_t g_unified_cache_refill_cycles_global; + +// Print statistics function +void unified_cache_print_measurements(void); + +// Check if measurement is enabled (inline for hot path) +static inline int unified_cache_measure_check(void) { + static int g_measure = -1; + if (__builtin_expect(g_measure == -1, 0)) { + const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE"); + g_measure = (e && *e && *e != '0') ? 1 : 0; + } + return g_measure; +} + // ============================================================================ // Unified Cache Structure (per class) // ============================================================================ @@ -242,6 +265,10 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) { #if !HAKMEM_BUILD_RELEASE g_unified_cache_hit[class_idx]++; #endif + // Performance measurement: count cache hits + if (__builtin_expect(unified_cache_measure_check(), 0)) { + atomic_fetch_add_explicit(&g_unified_cache_hits_global, 1, memory_order_relaxed); + } return HAK_BASE_FROM_RAW(base); // Hit! (2-3 cache misses total) } diff --git a/core/hakmem_shared_pool_acquire.c b/core/hakmem_shared_pool_acquire.c index 82eb6332..c4182a24 100644 --- a/core/hakmem_shared_pool_acquire.c +++ b/core/hakmem_shared_pool_acquire.c @@ -14,6 +14,28 @@ #include #include +// ============================================================================ +// Performance Measurement: Shared Pool Lock Contention (ENV-gated) +// ============================================================================ +// Global atomic counters for lock contention measurement +// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF) +_Atomic uint64_t g_sp_stage2_lock_acquired_global = 0; +_Atomic uint64_t g_sp_stage3_lock_acquired_global = 0; +_Atomic uint64_t g_sp_alloc_lock_contention_global = 0; + +// Check if measurement is enabled (cached) +static inline int sp_measure_enabled(void) { + static int g_measure = -1; + if (__builtin_expect(g_measure == -1, 0)) { + const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE"); + g_measure = (e && *e && *e != '0') ? 1 : 0; + } + return g_measure; +} + +// Print statistics function +void shared_pool_print_measurements(void); + // Stage 0.5: EMPTY slab direct scan(registry ベースの EMPTY 再利用) // Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to // avoid Stage 3 (mmap) when freed slabs are available. @@ -266,6 +288,12 @@ stage2_fallback: pthread_mutex_lock(&g_shared_pool.alloc_lock); + // Performance measurement: count Stage 2 lock acquisitions + if (__builtin_expect(sp_measure_enabled(), 0)) { + atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global, 1, memory_order_relaxed); + atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed); + } + // Update SuperSlab metadata under mutex ss->slab_bitmap |= (1u << claimed_idx); ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx); @@ -349,6 +377,12 @@ stage2_scan: pthread_mutex_lock(&g_shared_pool.alloc_lock); + // Performance measurement: count Stage 2 scan lock acquisitions + if (__builtin_expect(sp_measure_enabled(), 0)) { + atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global, 1, memory_order_relaxed); + atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed); + } + // Update SuperSlab metadata under mutex ss->slab_bitmap |= (1u << claimed_idx); ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx); @@ -421,6 +455,12 @@ stage2_scan: pthread_mutex_lock(&g_shared_pool.alloc_lock); + // Performance measurement: count Stage 3 lock acquisitions + if (__builtin_expect(sp_measure_enabled(), 0)) { + atomic_fetch_add_explicit(&g_sp_stage3_lock_acquired_global, 1, memory_order_relaxed); + atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed); + } + // ========== Stage 3: Get new SuperSlab ========== // Try LRU cache first, then mmap SuperSlab* new_ss = NULL; @@ -541,3 +581,39 @@ stage2_scan: } return 0; // ✅ Stage 3 success } + +// ============================================================================ +// Performance Measurement: Print Statistics +// ============================================================================ +void shared_pool_print_measurements(void) { + if (!sp_measure_enabled()) { + return; // Measurement disabled + } + + uint64_t stage2 = atomic_load_explicit(&g_sp_stage2_lock_acquired_global, memory_order_relaxed); + uint64_t stage3 = atomic_load_explicit(&g_sp_stage3_lock_acquired_global, memory_order_relaxed); + uint64_t total_locks = atomic_load_explicit(&g_sp_alloc_lock_contention_global, memory_order_relaxed); + + if (total_locks == 0) { + fprintf(stderr, "\n========================================\n"); + fprintf(stderr, "Shared Pool Contention Statistics\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "No lock acquisitions recorded\n"); + fprintf(stderr, "========================================\n\n"); + return; + } + + double stage2_pct = (100.0 * stage2) / total_locks; + double stage3_pct = (100.0 * stage3) / total_locks; + + fprintf(stderr, "\n========================================\n"); + fprintf(stderr, "Shared Pool Contention Statistics\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "Stage 2 Locks: %llu (%.1f%%)\n", + (unsigned long long)stage2, stage2_pct); + fprintf(stderr, "Stage 3 Locks: %llu (%.1f%%)\n", + (unsigned long long)stage3, stage3_pct); + fprintf(stderr, "Total Contention: %llu lock acquisitions\n", + (unsigned long long)total_locks); + fprintf(stderr, "========================================\n\n"); +} diff --git a/core/hakmem_tiny.c b/core/hakmem_tiny.c index 2f8fdbfc..9dd79611 100644 --- a/core/hakmem_tiny.c +++ b/core/hakmem_tiny.c @@ -647,7 +647,49 @@ static void tiny_tls_sll_diag_atexit(void) { } -// ============================================================================ +// ============================================================================ +// Performance Measurement: TLS SLL Statistics Print Function +// ============================================================================ +void tls_sll_print_measurements(void) { + // Check if measurement is enabled + static int g_measure = -1; + if (g_measure == -1) { + const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE"); + g_measure = (e && *e && *e != '0') ? 1 : 0; + } + if (!g_measure) { + return; // Measurement disabled + } + + uint64_t pushes = atomic_load_explicit(&g_tls_sll_push_count_global, memory_order_relaxed); + uint64_t pops = atomic_load_explicit(&g_tls_sll_pop_count_global, memory_order_relaxed); + uint64_t pop_empty = atomic_load_explicit(&g_tls_sll_pop_empty_count_global, memory_order_relaxed); + + uint64_t total_pop_attempts = pops + pop_empty; + if (total_pop_attempts == 0 && pushes == 0) { + fprintf(stderr, "\n========================================\n"); + fprintf(stderr, "TLS SLL Statistics\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "No operations recorded\n"); + fprintf(stderr, "========================================\n\n"); + return; + } + + double hit_rate = total_pop_attempts > 0 ? (100.0 * pops) / total_pop_attempts : 0.0; + double empty_rate = total_pop_attempts > 0 ? (100.0 * pop_empty) / total_pop_attempts : 0.0; + + fprintf(stderr, "\n========================================\n"); + fprintf(stderr, "TLS SLL Statistics\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "Total Pushes: %llu\n", (unsigned long long)pushes); + fprintf(stderr, "Total Pops: %llu\n", (unsigned long long)pops); + fprintf(stderr, "Pop Empty Count: %llu (%.1f%% of pops)\n", + (unsigned long long)pop_empty, empty_rate); + fprintf(stderr, "Hit Rate: %.1f%%\n", hit_rate); + fprintf(stderr, "========================================\n\n"); +} + +// ============================================================================ // ACE Learning Layer & Tiny Guard - EXTRACTED to hakmem_tiny_ace_guard_box.inc -// ============================================================================ +// ============================================================================ #include "hakmem_tiny_ace_guard_box.inc" diff --git a/core/hakmem_tiny_tls_state_box.inc b/core/hakmem_tiny_tls_state_box.inc index 7ed9d820..13551a5a 100644 --- a/core/hakmem_tiny_tls_state_box.inc +++ b/core/hakmem_tiny_tls_state_box.inc @@ -1,3 +1,13 @@ +// ============================================================================ +// Performance Measurement: TLS SLL Hit Rate (ENV-gated) +// ============================================================================ +// Global atomic counters for TLS SLL performance measurement +// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF) +#include +_Atomic uint64_t g_tls_sll_push_count_global = 0; +_Atomic uint64_t g_tls_sll_pop_count_global = 0; +_Atomic uint64_t g_tls_sll_pop_empty_count_global = 0; + // Hot-path cheap sampling counter to avoid rand() in allocation path // Phase 9.4: TLS single-linked freelist (mimalloc-inspired) for hottest classes (≤128B/≤256B) int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable