Remove unused Mid MT layer
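Delete the legacy Mid MT allocator (8-32KB, mimalloc-style per-thread segments): remove hakmem_mid_mt.c and hakmem_mid_mt.h, drop mid_free_route_box.h, strip hakmem_mid_mt.o / hakmem_mid_mt_shared.o from all Makefile object lists, and remove the mid_mt_init() call, the HAKMEM_MID_MT_ENABLE gate in hak_alloc_at(), and the mid_free_route_try() check in free().
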
Makefile
@@ -218,12 +218,12 @@ LDFLAGS += $(EXTRA_LDFLAGS)
 
 # Targets
 TARGET = test_hakmem
-OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o superslab_allocate.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o superslab_head.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/ss_tls_hint_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o
+OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o superslab_allocate.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o superslab_head.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/ss_tls_hint_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o
 OBJS = $(OBJS_BASE)
 
 # Shared library
 SHARED_LIB = libhakmem.so
-SHARED_OBJS = hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o hakmem_ucb1_shared.o hakmem_bigcache_shared.o hakmem_pool_shared.o hakmem_l25_pool_shared.o hakmem_site_rules_shared.o hakmem_tiny_shared.o superslab_allocate_shared.o superslab_stats_shared.o superslab_cache_shared.o superslab_ace_shared.o superslab_slab_shared.o superslab_backend_shared.o superslab_head_shared.o hakmem_smallmid_shared.o hakmem_smallmid_superslab_shared.o core/box/superslab_expansion_box_shared.o core/box/integrity_box_shared.o core/box/mailbox_box_shared.o core/box/front_gate_box_shared.o core/box/front_gate_classifier_shared.o core/box/free_publish_box_shared.o core/box/capacity_box_shared.o core/box/carve_push_box_shared.o core/box/unified_batch_box_shared.o core/box/prewarm_box_shared.o core/box/ss_hot_prewarm_box_shared.o core/box/front_metrics_box_shared.o core/box/bench_fast_box_shared.o core/box/ss_addr_map_box_shared.o core/box/ss_tls_hint_box_shared.o core/box/slab_recycling_box_shared.o core/box/pagefault_telemetry_box_shared.o core/box/tiny_sizeclass_hist_box_shared.o core/page_arena_shared.o core/front/tiny_unified_cache_shared.o core/tiny_alloc_fast_push_shared.o core/link_stubs_shared.o core/tiny_failfast_shared.o tiny_sticky_shared.o tiny_remote_shared.o tiny_publish_shared.o tiny_debug_ring_shared.o hakmem_tiny_magazine_shared.o hakmem_tiny_stats_shared.o hakmem_tiny_sfc_shared.o hakmem_tiny_query_shared.o hakmem_tiny_rss_shared.o hakmem_tiny_registry_shared.o hakmem_tiny_remote_target_shared.o hakmem_tiny_bg_spill_shared.o tiny_adaptive_sizing_shared.o hakmem_mid_mt_shared.o hakmem_super_registry_shared.o hakmem_shared_pool_shared.o hakmem_shared_pool_acquire_shared.o hakmem_shared_pool_release_shared.o hakmem_elo_shared.o hakmem_batch_shared.o hakmem_p2_shared.o hakmem_sizeclass_dist_shared.o hakmem_evo_shared.o hakmem_debug_shared.o hakmem_sys_shared.o hakmem_whale_shared.o hakmem_policy_shared.o hakmem_ace_shared.o hakmem_ace_stats_shared.o hakmem_ace_controller_shared.o hakmem_ace_metrics_shared.o hakmem_ace_ucb1_shared.o hakmem_prof_shared.o hakmem_learner_shared.o hakmem_size_hist_shared.o hakmem_learn_log_shared.o hakmem_syscall_shared.o tiny_fastcache_shared.o
+SHARED_OBJS = hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o hakmem_ucb1_shared.o hakmem_bigcache_shared.o hakmem_pool_shared.o hakmem_l25_pool_shared.o hakmem_site_rules_shared.o hakmem_tiny_shared.o superslab_allocate_shared.o superslab_stats_shared.o superslab_cache_shared.o superslab_ace_shared.o superslab_slab_shared.o superslab_backend_shared.o superslab_head_shared.o hakmem_smallmid_shared.o hakmem_smallmid_superslab_shared.o core/box/superslab_expansion_box_shared.o core/box/integrity_box_shared.o core/box/mailbox_box_shared.o core/box/front_gate_box_shared.o core/box/front_gate_classifier_shared.o core/box/free_publish_box_shared.o core/box/capacity_box_shared.o core/box/carve_push_box_shared.o core/box/unified_batch_box_shared.o core/box/prewarm_box_shared.o core/box/ss_hot_prewarm_box_shared.o core/box/front_metrics_box_shared.o core/box/bench_fast_box_shared.o core/box/ss_addr_map_box_shared.o core/box/ss_tls_hint_box_shared.o core/box/slab_recycling_box_shared.o core/box/pagefault_telemetry_box_shared.o core/box/tiny_sizeclass_hist_box_shared.o core/page_arena_shared.o core/front/tiny_unified_cache_shared.o core/tiny_alloc_fast_push_shared.o core/link_stubs_shared.o core/tiny_failfast_shared.o tiny_sticky_shared.o tiny_remote_shared.o tiny_publish_shared.o tiny_debug_ring_shared.o hakmem_tiny_magazine_shared.o hakmem_tiny_stats_shared.o hakmem_tiny_sfc_shared.o hakmem_tiny_query_shared.o hakmem_tiny_rss_shared.o hakmem_tiny_registry_shared.o hakmem_tiny_remote_target_shared.o hakmem_tiny_bg_spill_shared.o tiny_adaptive_sizing_shared.o hakmem_super_registry_shared.o hakmem_shared_pool_shared.o hakmem_shared_pool_acquire_shared.o hakmem_shared_pool_release_shared.o hakmem_elo_shared.o hakmem_batch_shared.o hakmem_p2_shared.o hakmem_sizeclass_dist_shared.o hakmem_evo_shared.o hakmem_debug_shared.o hakmem_sys_shared.o hakmem_whale_shared.o hakmem_policy_shared.o hakmem_ace_shared.o hakmem_ace_stats_shared.o hakmem_ace_controller_shared.o hakmem_ace_metrics_shared.o hakmem_ace_ucb1_shared.o hakmem_prof_shared.o hakmem_learner_shared.o hakmem_size_hist_shared.o hakmem_learn_log_shared.o hakmem_syscall_shared.o tiny_fastcache_shared.o
 
 # Pool TLS Phase 1 (enable with POOL_TLS_PHASE1=1)
 ifeq ($(POOL_TLS_PHASE1),1)
@@ -250,7 +250,7 @@ endif
 # Benchmark targets
 BENCH_HAKMEM = bench_allocators_hakmem
 BENCH_SYSTEM = bench_allocators_system
-BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o superslab_allocate.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o superslab_head.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/ss_tls_hint_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o
+BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o superslab_allocate.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o superslab_head.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/ss_tls_hint_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o
 BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE)
 ifeq ($(POOL_TLS_PHASE1),1)
 BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o
@@ -285,7 +285,7 @@ $(TARGET): $(OBJS)
 	@echo "========================================="
 
 # Compile C files
-%.o: %.c hakmem.h hakmem_config.h hakmem_features.h hakmem_internal.h hakmem_bigcache.h hakmem_pool.h hakmem_l25_pool.h hakmem_site_rules.h hakmem_tiny.h hakmem_tiny_superslab.h hakmem_mid_mt.h hakmem_super_registry.h hakmem_elo.h hakmem_batch.h hakmem_p2.h hakmem_sizeclass_dist.h hakmem_evo.h
+%.o: %.c hakmem.h hakmem_config.h hakmem_features.h hakmem_internal.h hakmem_bigcache.h hakmem_pool.h hakmem_l25_pool.h hakmem_site_rules.h hakmem_tiny.h hakmem_tiny_superslab.h hakmem_super_registry.h hakmem_elo.h hakmem_batch.h hakmem_p2.h hakmem_sizeclass_dist.h hakmem_evo.h
 	$(CC) $(CFLAGS) -c -o $@ $<
 
 # Build benchmark programs
@@ -427,7 +427,7 @@ test-box-refactor: box-refactor
 	./larson_hakmem 10 8 128 1024 1 12345 4
 
 # Phase 4: Tiny Pool benchmarks (properly linked with hakmem)
-TINY_BENCH_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o superslab_allocate.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o superslab_head.o hakmem_smallmid.o hakmem_smallmid_superslab.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/free_publish_box.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/ss_tls_hint_box.o core/box/slab_recycling_box.o core/box/tiny_sizeclass_hist_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_unified_cache.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o
+TINY_BENCH_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o superslab_allocate.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o superslab_head.o hakmem_smallmid.o hakmem_smallmid_superslab.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/free_publish_box.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/ss_tls_hint_box.o core/box/slab_recycling_box.o core/box/tiny_sizeclass_hist_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_unified_cache.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o
 TINY_BENCH_OBJS = $(TINY_BENCH_OBJS_BASE)
 ifeq ($(POOL_TLS_PHASE1),1)
 TINY_BENCH_OBJS += pool_tls.o pool_refill.o core/pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o

@@ -106,14 +106,6 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
 
     hkm_size_hist_record(size);
 
-    // Legacy Mid MT allocator (Phase 5) is disabled by default to favor ACE/Pool.
-    // Enable via HAKMEM_MID_MT_ENABLE=1 when running legacy benchmarks.
-    static int g_mid_mt_enabled = -1;
-    if (__builtin_expect(g_mid_mt_enabled < 0, 0)) {
-        const char* e = getenv("HAKMEM_MID_MT_ENABLE");
-        g_mid_mt_enabled = (e && *e && *e != '0') ? 1 : 0;
-    }
-
 #ifdef HAKMEM_POOL_TLS_PHASE1
     // Phase 1: Ultra-fast Pool TLS for 8KB-52KB range
     if (size >= 8192 && size <= 53248) {
@@ -124,18 +116,6 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
     }
 #endif
 
-    if (__builtin_expect(g_mid_mt_enabled && mid_is_in_range(size), 0)) {
-#if HAKMEM_DEBUG_TIMING
-        HKM_TIME_START(t_mid);
-#endif
-        void* mid_ptr = mid_mt_alloc(size);
-#if HAKMEM_DEBUG_TIMING
-        HKM_TIME_END(HKM_CAT_POOL_GET, t_mid);
-#endif
-        // PERF_OPT: likely hint - mid allocations usually succeed
-        if (__builtin_expect(mid_ptr != NULL, 1)) return mid_ptr;
-    }
-
 #if HAKMEM_FEATURE_EVOLUTION
     if (g_evo_sample_mask > 0) {
         static _Atomic uint64_t tick_counter = 0;

@@ -72,7 +72,6 @@ static void hak_init_impl(void) {
     hkm_whale_init();
 
-    // NEW Phase Hybrid: Initialize Mid Range MT allocator (8-32KB, mimalloc-style)
-    mid_mt_init();
-
     // NEW Phase 6.8: Initialize configuration system (replaces init_free_policy + init_thp_policy)
     hak_config_init();

@@ -33,7 +33,6 @@ void* realloc(void* ptr, size_t size) {
 #include "../hakmem_pool.h"            // Mid registry lookup (failsafe for headerless Mid)
 #include "../front/malloc_tiny_fast.h" // Phase 26: Front Gate Unification
 #include "tiny_front_config_box.h"     // Phase 4-Step3: Compile-time config for dead code elimination
-#include "mid_free_route_box.h"        // Phase 5-Step2: Mid MT free routing fix
 
 // malloc wrapper - intercepts system malloc() calls
 __thread uint64_t g_malloc_total_calls = 0;
@@ -226,11 +225,6 @@ void free(void* ptr) {
     }
 #endif
 
-    // Phase 5-Step2: Mid Free Route Box (BEFORE classify_ptr)
-    // Quick fix for 19x free() slowdown: Try Mid MT registry first
-    // If found, route directly to mid_mt_free() and return
-    if (mid_free_route_try(ptr)) return;
-
     // Classify pointer BEFORE early libc fallbacks to avoid misrouting Tiny pointers
     // This is safe: classifier uses header probe and registry; does not allocate.
     int is_hakmem_owned = 0;

mid_free_route_box.h
@@ -1,109 +0,0 @@
-/**
- * mid_free_route_box.h
- *
- * Box: Mid Free Route Box
- * Responsibility: Route Mid MT allocations to correct free path
- * Contract: Try Mid MT registry lookup, return success/failure
- *
- * Part of Phase 5-Step2 fix for 19x free() slowdown
- *
- * Problem:
- *   - Mid MT allocator registers chunks in MidGlobalRegistry
- *   - Free path searches Pool's mid_desc registry (different registry!)
- *   - Result: 100% lookup failure → 4x cascading lookups → 19x slower
- *
- * Solution:
- *   - Add Mid MT registry lookup BEFORE Pool registry lookup
- *   - Route directly to mid_mt_free() if found
- *   - Fall through to existing path if not found
- *
- * Performance Impact:
- *   - Before: 1.42 M ops/s (19x slower than system malloc)
- *   - After: 14-21 M ops/s (Option B quick fix, 10-15x improvement)
- *
- * Created: 2025-11-29 (Phase 5-Step2 Mid MT Gap Fix)
- */
-
-#ifndef MID_FREE_ROUTE_BOX_H
-#define MID_FREE_ROUTE_BOX_H
-
-#include "../hakmem_mid_mt.h"
-#include <stdbool.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// ============================================================================
-// Box Contract: Mid MT Free Routing
-// ============================================================================
-
-/**
- * mid_free_route_try - Try Mid MT free path first
- *
- * @param ptr  Pointer to free
- * @return     true if handled by Mid MT, false to fall through
- *
- * Phase 6-B: Header-based detection (lock-free!)
- *
- * Box Responsibilities:
- *   1. Read MidMTHeader from ptr - sizeof(MidMTHeader)
- *   2. Check magic number (0xAB42)
- *   3. If valid: Call mid_mt_free() and return true
- *   4. If invalid: Return false (let existing path handle it)
- *
- * Box Guarantees:
- *   - Zero side effects if returning false
- *   - Correct free if returning true
- *   - Thread-safe (lock-free header read)
- *
- * Performance:
- *   - Before (Phase 5): O(log N) registry lookup + mutex = ~50 cycles (13.98% CPU)
- *   - After (Phase 6-B): O(1) header read + magic check = ~2 cycles (0.01% CPU)
- *   - Expected improvement: +17-27% throughput
- *
- * Usage Example:
- *   void free(void* ptr) {
- *       if (mid_free_route_try(ptr)) return;  // Mid MT handled
- *       // Fall through to existing free path...
- *   }
- */
-__attribute__((always_inline))
-static inline bool mid_free_route_try(void* ptr) {
-    if (!ptr) return false;  // NULL ptr, not Mid MT
-
-    // Phase 6-B: Read header for O(1) detection (no mutex!)
-    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
-    MidMTHeader* hdr = (MidMTHeader*)block;
-
-    // Check magic number to identify Mid MT allocation
-    if (hdr->magic == MID_MT_MAGIC) {
-        // Valid Mid MT allocation, route to mid_mt_free()
-        // Pass block_size from header (no size needed from caller!)
-        mid_mt_free(ptr, hdr->block_size);
-        return true;  // Handled
-    }
-
-    // Not a Mid MT allocation, fall through to existing path
-    return false;
-}
-
-// ============================================================================
-// Box Observability (Debug/Profiling)
-// ============================================================================
-
-#if MID_DEBUG
-/**
- * mid_free_route_stats - Print Mid Free Route Box statistics
- *
- * Only available in debug builds (MID_DEBUG=1)
- * Tracks hit/miss rates for performance analysis
- */
-void mid_free_route_stats(void);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // MID_FREE_ROUTE_BOX_H

@@ -16,7 +16,6 @@
 #include "hakmem_tiny.h"            // NEW Phase 6.12: Tiny Pool (≤1KB)
 #include "hakmem_tiny_superslab.h"  // NEW Phase 7.6: SuperSlab for Tiny Pool
 #include "tiny_fastcache.h"         // NEW Phase 6-3: Tiny Fast Path (System tcache style)
-#include "hakmem_mid_mt.h"          // NEW Phase Hybrid: Mid Range MT (8-32KB, mimalloc-style)
 #include "hakmem_super_registry.h"  // NEW Phase 1: SuperSlab Registry (mincore elimination)
 #include "hakmem_elo.h"             // NEW: ELO Strategy Selection (Phase 6.2)
 #include "hakmem_ace_stats.h"       // NEW: ACE lightweight stats (avoid implicit decl warnings)

hakmem_mid_mt.c
@@ -1,451 +0,0 @@
-/**
- * hakmem_mid_mt.c
- *
- * Mid Range Multi-threaded Allocator Implementation (8-32KB)
- * mimalloc-style per-thread segment for optimal MT performance
- *
- * Design:
- *   - Per-thread segments (TLS) for lock-free allocation
- *   - Global registry for segment lookup during free()
- *   - 64KB chunks with bump + free list allocation
- *   - Phase 1: Local free only (remote free = memory leak, acceptable for benchmarking)
- *   - Phase 2: Will add atomic remote free list
- */
-
-#include "hakmem_mid_mt.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <assert.h>
-#include <stdatomic.h>
-
-// Use likely/unlikely hints for branch prediction
-#ifndef likely
-#define likely(x)   __builtin_expect(!!(x), 1)
-#endif
-#ifndef unlikely
-#define unlikely(x) __builtin_expect(!!(x), 0)
-#endif
-
-// ============================================================================
-// Global and TLS Variables
-// ============================================================================
-
-// TLS: Each thread has independent segments (lock-free!)
-__thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES] = {0};
-
-// Phase 6-B: Registry removed (no longer needed with header-based free)
-
-// Statistics (if enabled)
-#if MID_ENABLE_STATS
-MidStats g_mid_stats = {0};
-#endif
-
-// Initialization flag
-static volatile int g_mid_initialized = 0;
-static pthread_mutex_t g_init_lock = PTHREAD_MUTEX_INITIALIZER;
-
-// ============================================================================
-// Forward Declarations
-// ============================================================================
-
-static bool segment_refill(MidThreadSegment* seg, int class_idx);
-static void* segment_alloc(MidThreadSegment* seg, int class_idx);
-static void segment_free_local(MidThreadSegment* seg, void* ptr);
-static void* chunk_allocate(size_t chunk_size);
-static void chunk_deallocate(void* chunk, size_t chunk_size);
-// Phase 6-B: Registry functions removed (header-based free instead)
-
-// ============================================================================
-// Chunk Management (mmap/munmap wrappers)
-// ============================================================================
-
-/**
- * chunk_allocate - Allocate a new chunk via mmap
- *
- * @param chunk_size  Size of chunk (typically 64KB)
- * @return            Chunk base address, or NULL on failure
- */
-static void* chunk_allocate(size_t chunk_size) {
-    void* chunk = mmap(
-        NULL,
-        chunk_size,
-        PROT_READ | PROT_WRITE,
-        MAP_PRIVATE | MAP_ANONYMOUS,
-        -1,
-        0
-    );
-
-    if (chunk == MAP_FAILED) {
-        MID_LOG("ERROR: mmap failed for chunk_size=%zu", chunk_size);
-        return NULL;
-    }
-
-    MID_LOG("Chunk allocated: %p, size=%zu", chunk, chunk_size);
-    return chunk;
-}
-
-/**
- * chunk_deallocate - Free chunk via munmap
- *
- * @param chunk       Chunk base address
- * @param chunk_size  Size of chunk
- */
-static void chunk_deallocate(void* chunk, size_t chunk_size) {
-    if (!chunk) return;
-
-    int ret = munmap(chunk, chunk_size);
-    if (ret != 0) {
-        MID_LOG("ERROR: munmap failed for chunk=%p, size=%zu", chunk, chunk_size);
-    } else {
-        MID_LOG("Chunk deallocated: %p, size=%zu", chunk, chunk_size);
-    }
-}
-
-// ============================================================================
-// Segment Operations
-// ============================================================================
-
-/**
- * segment_refill - Allocate new chunk and setup segment
- *
- * Called when segment is exhausted (rare, ~0.1% of allocations)
- *
- * Phase 6-B: No longer registers chunks (header-based free instead)
- *
- * @return true on success, false on OOM
- */
-static bool segment_refill(MidThreadSegment* seg, int class_idx) {
-    size_t block_size = mid_class_to_size(class_idx);
-    size_t chunk_size = MID_CHUNK_SIZE;
-
-    // Allocate new chunk via mmap
-    void* chunk = chunk_allocate(chunk_size);
-    if (!chunk) {
-        return false;
-    }
-
-    // Phase 6-B: No registry add (header-based free doesn't need registry)
-
-    // Setup segment
-    seg->chunk_base = chunk;
-    seg->chunk_size = chunk_size;
-    seg->block_size = block_size;
-    seg->current = chunk;
-    seg->end = (uint8_t*)chunk + chunk_size;
-    seg->capacity = chunk_size / block_size;
-    seg->refill_count++;
-
-    MID_LOG("Segment refill: class=%d, block_size=%zu, capacity=%u, chunk=%p",
-            class_idx, block_size, seg->capacity, chunk);
-
-    return true;
-}
-
-/**
- * segment_alloc - Allocate from segment (fast path)
- *
- * PERFORMANCE: Force inline for maximum speed
- *
- * Fast path priority:
- *   1. Free list (most common, ~90-95% hit rate)
- *   2. Bump allocation (when free list empty)
- *   3. Refill (when segment exhausted)
- *
- * Phase 6-B: Now writes MidMTHeader for lock-free free()
- *
- * @return Allocated pointer (after header), or NULL on OOM
- */
-static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) __attribute__((always_inline));
-static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
-    void* block;  // Block start (includes header space)
-    size_t block_size = seg->block_size;
-
-    // === Path 0: First allocation - need refill ===
-    // CRITICAL FIX: TLS is zero-initialized, so chunk_base == NULL on first call
-    if (unlikely(seg->chunk_base == NULL)) {
-        if (!segment_refill(seg, class_idx)) {
-            return NULL;  // OOM
-        }
-        block_size = seg->block_size;  // Update after refill
-    }
-
-    // === Path 1: Free list (fastest, ~4-5 instructions) ===
-    // Note: Free list stores next pointer at block start (overwrites header when freed)
-    block = seg->free_list;
-    if (likely(block != NULL)) {
-        seg->free_list = *(void**)block;  // Pop from free list
-        seg->used_count++;
-        seg->alloc_count++;
-
-        // Phase 6-B: Write header before returning
-        MidMTHeader* hdr = (MidMTHeader*)block;
-        hdr->block_size = (uint32_t)block_size;
-        hdr->class_idx = (uint16_t)class_idx;
-        hdr->magic = MID_MT_MAGIC;
-
-        return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
-    }
-
-    // === Path 2: Bump allocation (fast, ~6-8 instructions) ===
-    block = seg->current;
-    void* next = (uint8_t*)block + block_size;
-
-    if (likely(next <= seg->end)) {
-        seg->current = next;
-        seg->used_count++;
-        seg->alloc_count++;
-
-        // Phase 6-B: Write header before returning
-        MidMTHeader* hdr = (MidMTHeader*)block;
-        hdr->block_size = (uint32_t)block_size;
-        hdr->class_idx = (uint16_t)class_idx;
-        hdr->magic = MID_MT_MAGIC;
-
-        return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
-    }
-
-    // === Path 3: Refill (slow, called ~once per 64KB) ===
-    if (!segment_refill(seg, class_idx)) {
-        return NULL;  // OOM
-    }
-
-    // Retry after refill
-    block = seg->current;
-    block_size = seg->block_size;  // Update after refill
-    seg->current = (uint8_t*)block + block_size;
-    seg->used_count++;
-    seg->alloc_count++;
-
-    // Phase 6-B: Write header before returning
-    MidMTHeader* hdr = (MidMTHeader*)block;
-    hdr->block_size = (uint32_t)block_size;
-    hdr->class_idx = (uint16_t)class_idx;
-    hdr->magic = MID_MT_MAGIC;
-
-    return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
-}
-
-/**
- * segment_free_local - Free to local segment (same thread)
- *
- * @param seg  Segment to free to
- * @param ptr  Pointer to free (user pointer, after header)
- *
- * Phase 6-B: Adjusted for header-based allocation
- */
-static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
-    // Phase 6-B: Get block start (before header)
-    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
-
-    // Push to free list (lock-free, local operation)
-    // Note: Overwrites header with next pointer (header no longer needed after free)
-    *(void**)block = seg->free_list;
-    seg->free_list = block;
-    seg->used_count--;
-    seg->free_count++;
-
-#if MID_ENABLE_STATS
-    __sync_fetch_and_add(&g_mid_stats.local_frees, 1);
-#endif
-}
-
-// ============================================================================
-// Public API
-// ============================================================================
-
-/**
- * mid_mt_init - Initialize Mid Range MT allocator
- *
- * Thread-safe, idempotent
- *
- * Phase 6-B: Simplified (no registry initialization)
- */
-void mid_mt_init(void) {
-    if (g_mid_initialized) return;
-
-    pthread_mutex_lock(&g_init_lock);
-
-    if (!g_mid_initialized) {
-        // Phase 6-B: No registry initialization (header-based free)
-
-#if MID_ENABLE_STATS
-        memset(&g_mid_stats, 0, sizeof(g_mid_stats));
-#endif
-
-        g_mid_initialized = 1;
-
-        MID_LOG("Mid MT allocator initialized (Phase 6-B: header-based)");
-    }
-
-    pthread_mutex_unlock(&g_init_lock);
-}
-
-/**
- * mid_mt_alloc - Allocate memory from Mid Range pool (8-32KB)
- *
- * Thread-safe, lock-free (uses TLS)
- */
-void* mid_mt_alloc(size_t size) {
-    // Validate size range (Phase 16: dynamic min size based on Tiny's max)
-    if (unlikely(size < mid_get_min_size() || size > MID_MAX_SIZE)) {
-        return NULL;
-    }
-
-    // Initialize if needed (thread-safe)
-    if (unlikely(!g_mid_initialized)) {
-        mid_mt_init();
-    }
-
-    // Get size class
-    int class_idx = mid_size_to_class(size);
-    if (unlikely(class_idx < 0)) {
-        return NULL;
-    }
-
-    // Get thread-local segment
-    MidThreadSegment* seg = &g_mid_segments[class_idx];
-
-    // Allocate from segment (fast path)
-    void* p = segment_alloc(seg, class_idx);
-
-#if MID_ENABLE_STATS
-    if (p) {
-        __sync_fetch_and_add(&g_mid_stats.total_allocs, 1);
-    }
-#endif
-
-    return p;
-}
-
-/**
- * mid_mt_free - Free memory allocated by mid_mt_alloc
- *
- * Phase 6-B: Header-based free (lock-free, no registry lookup!)
- *   - Reads MidMTHeader to get block metadata (O(1), ~2 cycles)
- *   - Eliminates pthread_mutex_lock/unlock (13.98% CPU overhead)
- *   - Expected: +17-27% throughput improvement
- *
- * Local free (same thread): Ultra-fast, lock-free
- * Remote free (cross-thread): NOT IMPLEMENTED (memory leak, Phase 2 will add atomic remote free list)
- */
-void mid_mt_free(void* ptr, size_t size) {
-    if (unlikely(!ptr)) return;
-
-#if MID_ENABLE_STATS
-    __sync_fetch_and_add(&g_mid_stats.total_frees, 1);
-#endif
-
-    // Phase 6-B: Read header for O(1) metadata lookup (no mutex!)
-    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
-    MidMTHeader* hdr = (MidMTHeader*)block;
-
-    // Validate header magic (sanity check)
-    if (unlikely(hdr->magic != MID_MT_MAGIC)) {
-        MID_LOG("ERROR: Invalid Mid MT magic 0x%X (expected 0x%X) for ptr %p",
-                hdr->magic, MID_MT_MAGIC, ptr);
-        return;
-    }
-
-    // Get metadata from header (no registry lookup!)
-    int class_idx = hdr->class_idx;
-
-    // Validate class_idx
-    if (unlikely(class_idx < 0 || class_idx >= MID_NUM_CLASSES)) {
-        MID_LOG("ERROR: Invalid class_idx %d in header for ptr %p", class_idx, ptr);
-        return;
-    }
-
-    // Get thread-local segment for this size class
-    MidThreadSegment* seg = &g_mid_segments[class_idx];
-
-    // === Fast path: Check if block belongs to current segment ===
-    // Note: Check block (not ptr), since segment tracks block addresses
-    if (likely(seg->chunk_base != NULL &&
-               block >= seg->chunk_base &&
-               block < seg->end)) {
-        // Local free (same thread, lock-free)
-        segment_free_local(seg, ptr);
-        return;
-    }
-
-    // === Slow path: Remote free (cross-thread) ===
-    // Phase 1: NOT IMPLEMENTED
-    // We would need to find the owning segment and push to its remote free list.
-    //
-    // For Phase 1 (benchmarking), we accept this memory leak.
-    // bench_mid_mt_gap uses single-threaded workload, so remote frees never happen.
-
-    MID_LOG("WARNING: Remote free not implemented, leaking %p (block_size=%u, class=%d)",
-            ptr, hdr->block_size, class_idx);
-
-#if MID_ENABLE_STATS
-    __sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
-#endif
-
-    // TODO Phase 2: Implement remote free
-    // segment_free_remote(ptr, hdr->block_size, class_idx);
-}
-
-/**
- * mid_mt_thread_exit - Cleanup thread-local segments
- *
- * Called on thread exit to release resources
- *
- * Phase 6-B: No registry cleanup needed (header-based free)
- */
-void mid_mt_thread_exit(void) {
-    MID_LOG("Thread exit cleanup");
-
-    // Free all chunks from this thread's segments
-    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
-        MidThreadSegment* seg = &g_mid_segments[class_idx];
-
-        if (seg->chunk_base) {
-            // Phase 6-B: No registry remove (no registry exists)
-
-            // Deallocate chunk
-            chunk_deallocate(seg->chunk_base, seg->chunk_size);
-
-            // Clear segment
-            memset(seg, 0, sizeof(MidThreadSegment));
-        }
-    }
-}
-
-// ============================================================================
-// Statistics (Debug/Profiling)
-// ============================================================================
-
-#if MID_ENABLE_STATS
-
-void mid_mt_print_stats(void) {
-    printf("\n=== Mid Range MT Statistics ===\n");
-    printf("Total allocations:  %lu\n", g_mid_stats.total_allocs);
-    printf("Total frees:        %lu\n", g_mid_stats.total_frees);
-    printf("Local frees:        %lu (%.1f%%)\n",
-           g_mid_stats.local_frees,
-           100.0 * g_mid_stats.local_frees / (g_mid_stats.total_frees + 1));
-    printf("Remote frees:       %lu (%.1f%%)\n",
-           g_mid_stats.remote_frees,
-           100.0 * g_mid_stats.remote_frees / (g_mid_stats.total_frees + 1));
-    printf("Registry lookups:   %lu\n", g_mid_stats.registry_lookups);
-    printf("\n");
-
-    // Per-segment stats
-    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
-        MidThreadSegment* seg = &g_mid_segments[class_idx];
-        if (seg->alloc_count > 0) {
-            printf("Class %d (%zu bytes):\n", class_idx, mid_class_to_size(class_idx));
-            printf("  Allocations: %lu\n", seg->alloc_count);
-            printf("  Frees:       %lu\n", seg->free_count);
-            printf("  Refills:     %u\n", seg->refill_count);
-            printf("  Used count:  %u / %u\n", seg->used_count, seg->capacity);
-        }
-    }
-    printf("\n");
-}
-
-#endif  // MID_ENABLE_STATS

hakmem_mid_mt.h
@@ -1,287 +0,0 @@
-/**
- * hakmem_mid_mt.h
- *
- * Mid Range Multi-threaded Allocator (1-32KB)
- * mimalloc-style per-thread segment design for optimal MT performance
- *
- * Part of Hybrid Approach:
- *   - ≤1023B: Tiny Pool (header-based, C7 usable size)
- *   - 1-32KB: Mid MT (this module, mimalloc-style per-thread)
- *   - ≥64KB: Large Pool (learning-based, ELO strategies)
- *
- * Created: 2025-11-01
- * Goal: 46M → 100-120M ops/s (2.2-2.6x improvement)
- */
-
-#ifndef HAKMEM_MID_MT_H
-#define HAKMEM_MID_MT_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <pthread.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// ============================================================================
-// Size Classes
-// ============================================================================
-
-#define MID_SIZE_CLASS_8K   0  // 8KB blocks
-#define MID_SIZE_CLASS_16K  1  // 16KB blocks
-#define MID_SIZE_CLASS_32K  2  // 32KB blocks
-#define MID_NUM_CLASSES     3  // Total number of size classes
-
-// ============================================================================
-// Phase 6-B: Header-based Allocation (Lock-free Free)
-// ============================================================================
-
-/**
- * MidMTHeader - Per-allocation header for lock-free free()
- *
- * Prepended to each Mid MT allocation for O(1) metadata lookup.
- * Eliminates need for global registry + mutex (13.98% CPU overhead).
- *
- * Memory Layout:
- *   [MidMTHeader: 8 bytes][User data: block_size - 8 bytes]
- *   ^                     ^
- *   block                 returned to user
- *
- * Performance:
- *   - Before: pthread_mutex_lock (8.12%) + unlock (5.86%) = 13.98% CPU
- *   - After: Simple header read (~2 cycles) = 0.01% CPU
- *   - Expected: +17-27% throughput improvement
- */
-typedef struct MidMTHeader {
-    uint32_t block_size;  // Block size (8192/16384/32768)
-    uint16_t class_idx;   // Size class index (0-2)
-    uint16_t magic;       // Magic number for validation
-} MidMTHeader;
-
-#define MID_MT_MAGIC 0xAB42  // Mid MT allocation marker
-
-// Phase 13: Close Tiny/Mid gap.
-// Phase 16: Dynamic Mid min size - must start where Tiny ends
-// Tiny max size is configurable via HAKMEM_TINY_MAX_CLASS:
-//   - HAKMEM_TINY_MAX_CLASS=7 (default) → Tiny up to 1023B → Mid starts at 1024B
-//   - HAKMEM_TINY_MAX_CLASS=5 → Tiny up to 255B → Mid starts at 256B
-#include "hakmem_tiny.h"  // For tiny_get_max_size()
-
-#define MID_MIN_SIZE_STATIC (1024)       // Static fallback (C7 default)
-#define MID_MAX_SIZE        (32 * 1024)  // 32KB
-
-static inline size_t mid_get_min_size(void) {
-    // Phase 5-Step2 FIX: Use static 1024 instead of tiny_get_max_size() + 1
-    // Bug: tiny_get_max_size() returns 2047 (C7 usable), making min = 2048
-    // This caused 1KB-2KB allocations to fall through to mmap() (100-1000x slower!)
-    // Fix: Use MID_MIN_SIZE_STATIC (1024) to align with actual Tiny/Mid boundary
-    return MID_MIN_SIZE_STATIC;  // 1024 = TINY_MAX_SIZE
-}
-#define MID_CHUNK_SIZE (4 * 1024 * 1024)  // 4MB chunks (same as mimalloc segments)
-
-// ============================================================================
-// Data Structures
-// ============================================================================
-
-/**
- * MidThreadSegment - Per-thread segment for lock-free allocation
- *
- * Memory layout optimized for cache line alignment (64 bytes)
- *   - Cache line 0: Fast path fields (free_list, current, end, used_count)
- *   - Cache line 1: Metadata (chunk_base, sizes, capacity)
- *   - Cache line 2: Statistics (optional, for debugging)
- */
-typedef struct MidThreadSegment {
-    // === Fast Path (Cache line 0) ===
-    void*    free_list;     // Free objects linked list (NULL if empty)
-    void*    current;       // Bump allocation pointer
-    void*    end;           // End of current chunk
-    uint32_t used_count;    // Number of allocated blocks
-    uint32_t padding0;      // Alignment padding
-
-    // === Metadata (Cache line 1) ===
-    void*    chunk_base;    // Base address of current chunk
-    size_t   chunk_size;    // Size of chunk (typically 64KB)
-    size_t   block_size;    // Size of each block (8KB/16KB/32KB)
-    uint32_t capacity;      // Total blocks in chunk
-    uint32_t padding1;      // Alignment padding
-
-    // === Statistics (Cache line 2) ===
-    uint64_t alloc_count;   // Total allocations
-    uint64_t free_count;    // Total frees
-    uint32_t refill_count;  // Number of chunk refills
-    uint32_t padding2;      // Alignment padding
-
-} __attribute__((aligned(64))) MidThreadSegment;
-
-// Phase 6-B: Registry structures removed (header-based free instead)
-
-// ============================================================================
-// Global Variables
-// ============================================================================
-
-// TLS: Each thread has its own segments (lock-free!)
-extern __thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES];
-
-// ============================================================================
-// API Functions
-// ============================================================================
-
-/**
- * mid_mt_init - Initialize Mid Range MT allocator
- *
- * Call once at startup (thread-safe, idempotent)
- */
-void mid_mt_init(void);
-
-/**
- * mid_mt_alloc - Allocate memory from Mid Range pool
- *
- * @param size  Allocation size (must be mid_get_min_size() ≤ size ≤ MID_MAX_SIZE)
- *              Phase 16: Range adjusts dynamically based on Tiny's max size
- *              Default: 1024B-32KB, can expand to 256B-32KB if Tiny reduced to C0-C5
- * @return      Allocated pointer (aligned to block_size), or NULL on failure
- *
- * Thread-safety: Lock-free (uses TLS)
- * Performance: O(1) fast path, O(1) amortized
- *
- * Fast path:
- *   1. Check free_list (most common, ~4-5 instructions)
- *   2. Bump allocation if free_list empty (~6-8 instructions)
- *   3. Refill chunk if segment exhausted (rare, ~0.1%)
- */
-void* mid_mt_alloc(size_t size);
-
-/**
- * mid_mt_free - Free memory allocated by mid_mt_alloc
- *
- * @param ptr   Pointer to free (must be from mid_mt_alloc)
- * @param size  Original allocation size (for size class lookup)
- *
- * Thread-safety: Lock-free if freeing to own thread's segment
- *                Requires registry lock if remote free (cross-thread)
- * Performance: O(1) local free, O(log N) remote free (registry lookup)
- *
- * Note: Phase 1 implementation does not handle remote free (memory leak)
- *       Phase 2 will implement per-segment atomic remote free list
- */
-void mid_mt_free(void* ptr, size_t size);
-
-/**
- * mid_mt_thread_exit - Cleanup thread-local segments
- *
- * Called on thread exit to release resources
- * Should be registered via pthread_key_create or __attribute__((destructor))
- */
-void mid_mt_thread_exit(void);
-
-// Phase 6-B: mid_registry_lookup() removed (header-based free instead)
-
-// ============================================================================
-// Inline Helper Functions
-// ============================================================================
-
-/**
- * mid_size_to_class - Convert size to size class index
- *
- * @param size  Allocation size
- * @return      Size class index (0-2), or -1 if out of range
- */
-static inline int mid_size_to_class(size_t size) {
-    if (size <= 8192)  return MID_SIZE_CLASS_8K;
-    if (size <= 16384) return MID_SIZE_CLASS_16K;
-    if (size <= 32768) return MID_SIZE_CLASS_32K;
-    return -1;  // Out of range
-}
-
-/**
- * mid_class_to_size - Convert size class to block size
- *
- * @param class_idx  Size class index (0-2)
- * @return           Block size in bytes
- */
-static inline size_t mid_class_to_size(int class_idx) {
-    static const size_t sizes[MID_NUM_CLASSES] = {
-        8192,   // 8KB
-        16384,  // 16KB
-        32768   // 32KB
-    };
-    return (class_idx >= 0 && class_idx < MID_NUM_CLASSES) ? sizes[class_idx] : 0;
-}
-
-/**
- * mid_is_in_range - Check if size is in Mid Range pool range
- *
- * @param size  Allocation size
- * @return      true if (tiny_max+1) ≤ size ≤ 32KB
- *
- * Phase 16: Dynamic range - adjusts based on Tiny's max size
- * PERF_OPT: Force inline to eliminate function call overhead in hot path
- */
-__attribute__((always_inline))
-static inline bool mid_is_in_range(size_t size) {
-    return (size >= mid_get_min_size() && size <= MID_MAX_SIZE);
-}
-
-// ============================================================================
-// Configuration (can be overridden via environment variables)
-// ============================================================================
-
-// Default chunk size (64KB)
-#ifndef MID_DEFAULT_CHUNK_SIZE
-#define MID_DEFAULT_CHUNK_SIZE (64 * 1024)
-#endif
-
-// Initial registry capacity
-#ifndef MID_REGISTRY_INITIAL_CAPACITY
-#define MID_REGISTRY_INITIAL_CAPACITY 64
-#endif
-
-// Enable/disable statistics collection
-#ifndef MID_ENABLE_STATS
-#define MID_ENABLE_STATS 0  // DISABLED for performance
-#endif
-
-// Enable/disable debug logging
-#ifndef MID_DEBUG
-#define MID_DEBUG 0  // DISABLE for performance testing
-#endif
-
-#if MID_DEBUG
-#include <stdio.h>
-#define MID_LOG(fmt, ...) fprintf(stderr, "[MID_MT] " fmt "\n", ##__VA_ARGS__)
-#else
-#define MID_LOG(fmt, ...) ((void)0)
-#endif
-
-// ============================================================================
-// Statistics (Debug/Profiling)
-// ============================================================================
-
-#if MID_ENABLE_STATS
-
-/**
- * MidStats - Global statistics for profiling
- */
-typedef struct MidStats {
-    uint64_t total_allocs;      // Total allocations
-    uint64_t total_frees;       // Total frees
-    uint64_t total_refills;     // Total chunk refills
-    uint64_t local_frees;       // Local frees (same thread)
-    uint64_t remote_frees;      // Remote frees (cross-thread)
-    uint64_t registry_lookups;  // Registry lookups
-} MidStats;
-
-extern MidStats g_mid_stats;
-
-void mid_mt_print_stats(void);
-
-#endif  // MID_ENABLE_STATS
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // HAKMEM_MID_MT_H