From 0da9f8cba352f4157bbc7fd5643cc605648c48b5 Mon Sep 17 00:00:00 2001 From: "Moe Charm (CI)" Date: Sun, 9 Nov 2025 11:50:18 +0900 Subject: [PATCH] Phase 7 + Pool TLS 1.5b stabilization:\n- Add build hygiene (dep tracking, flag consistency, print-flags)\n- Add build.sh + verify_build.sh (unified recipe, freshness check)\n- Quiet verbose logs behind HAKMEM_DEBUG_VERBOSE\n- A/B free safety via HAKMEM_TINY_SAFE_FREE (mincore strict vs boundary)\n- Tweak Tiny header path to reduce noise; Pool TLS free guard optimized\n- Fix mimalloc link retention (--no-as-needed + force symbol)\n- Add docs/BUILD_PHASE7_POOL_TLS.md (cheatsheet) --- Makefile | 106 ++++++++++++++++++++++++++++------ bench_mi_force.c | 22 +++++++ build.sh | 29 ++++++++++ core/box/hak_alloc_api.inc.h | 3 +- core/box/hak_free_api.inc.h | 44 +++++++++----- core/hakmem_build_flags.h | 11 ++++ core/tiny_free_fast_v2.inc.h | 38 ++++++++---- core/tiny_region_id.h | 27 +++++++-- docs/BUILD_PHASE7_POOL_TLS.md | 90 +++++++++++++++++++++++++++++ verify_build.sh | 27 +++++++++ 10 files changed, 349 insertions(+), 48 deletions(-) create mode 100644 bench_mi_force.c create mode 100755 build.sh create mode 100644 docs/BUILD_PHASE7_POOL_TLS.md create mode 100755 verify_build.sh diff --git a/Makefile b/Makefile index 7baa05c0..65afe1c1 100644 --- a/Makefile +++ b/Makefile @@ -49,6 +49,27 @@ ifeq ($(USE_LTO),1) LDFLAGS += -flto endif +# ------------------------------------------------------------ +# Build hygiene: dependency tracking + flag consistency checks +# ------------------------------------------------------------ + +# Track header dependencies for explicit compile rules as well +CFLAGS += -MMD -MP + +# If someone injects -DHAKMEM_POOL_TLS_PHASE1=1 directly into CFLAGS +# but forgets POOL_TLS_PHASE1=1, object lists will miss pool_tls*.o. +# Fail fast to avoid confusing link/runtime errors. 
+ifneq ($(filter -DHAKMEM_POOL_TLS_PHASE1=1,$(CFLAGS)),) + ifneq ($(POOL_TLS_PHASE1),1) + $(error Detected -DHAKMEM_POOL_TLS_PHASE1=1 in CFLAGS but POOL_TLS_PHASE1!=1. Please invoke: make POOL_TLS_PHASE1=1 ...) + endif +endif + +# Include generated .d files if present (safe even if none yet) +# Filter to only files (not directories like glibc-2.38/build/iconvdata/gconv-modules.d) +# Also exclude glibc and mimalloc-bench subdirectories +-include $(shell find . -name '*.d' -type f -not -path './glibc*' -not -path './mimalloc-bench*' 2>/dev/null) + # Default: enable Box Theory refactor for Tiny (Phase 6-1.7) # This is the best performing option currently (4.19M ops/s) # NOTE: Disabled while testing ULTRA_SIMPLE with SFC integration @@ -145,24 +166,45 @@ SHARED_OBJS = hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o # Pool TLS Phase 1 (enable with POOL_TLS_PHASE1=1) ifeq ($(POOL_TLS_PHASE1),1) -SHARED_OBJS += pool_tls_shared.o pool_refill_shared.o +OBJS += pool_tls.o pool_refill.o pool_tls_arena.o +SHARED_OBJS += pool_tls_shared.o pool_refill_shared.o pool_tls_arena_shared.o CFLAGS += -DHAKMEM_POOL_TLS_PHASE1=1 CFLAGS_SHARED += -DHAKMEM_POOL_TLS_PHASE1=1 endif +# Pool TLS Phase 1.5b - Pre-warm optimization +ifeq ($(POOL_TLS_PREWARM),1) +CFLAGS += -DHAKMEM_POOL_TLS_PREWARM=1 +CFLAGS_SHARED += -DHAKMEM_POOL_TLS_PREWARM=1 +endif + # Benchmark targets BENCH_HAKMEM = bench_allocators_hakmem BENCH_SYSTEM = bench_allocators_system BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o 
hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o bench_allocators_hakmem.o BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE) ifeq ($(POOL_TLS_PHASE1),1) -BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o +BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o endif BENCH_SYSTEM_OBJS = bench_allocators_system.o # Default target all: $(TARGET) +# Show key build-time switches for troubleshooting +.PHONY: print-flags +print-flags: + @echo "==== Build Switches ====" + @echo "POOL_TLS_PHASE1 = $(POOL_TLS_PHASE1)" + @echo "POOL_TLS_PREWARM = $(POOL_TLS_PREWARM)" + @echo "HEADER_CLASSIDX = $(HEADER_CLASSIDX)" + @echo "AGGRESSIVE_INLINE = $(AGGRESSIVE_INLINE)" + @echo "PREWARM_TLS = $(PREWARM_TLS)" + @echo "USE_LTO = $(USE_LTO)" + @echo "OPT_LEVEL = $(OPT_LEVEL)" + @echo "NATIVE = $(NATIVE)" + @echo "CFLAGS contains = $(filter -DHAKMEM_POOL_TLS_PHASE1=1,$(CFLAGS))" + # Build test program $(TARGET): $(OBJS) $(CC) -o $@ $^ $(LDFLAGS) @@ -220,8 +262,11 @@ bench_tiny_hot_system: bench_tiny_hot_system.o bench_tiny_hot_mi.o: bench_tiny_hot.c $(CC) $(CFLAGS) -DUSE_MIMALLOC -I mimalloc-bench/extern/mi/include -c -o $@ $< -bench_tiny_hot_mi: bench_tiny_hot_mi.o - $(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS) +bench_mi_force.o: bench_mi_force.c + $(CC) $(CFLAGS) -I mimalloc-bench/extern/mi/include -c -o $@ $< + +bench_tiny_hot_mi: bench_tiny_hot_mi.o bench_mi_force.o + $(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS) # hakmi variant for tiny hot bench (direct link via front API) bench_tiny_hot_hakmi.o: bench_tiny_hot.c 
include/hakmi/hakmi_api.h adapters/hakmi_front/hakmi_front.h @@ -315,7 +360,7 @@ test-box-refactor: box-refactor TINY_BENCH_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o TINY_BENCH_OBJS = $(TINY_BENCH_OBJS_BASE) ifeq ($(POOL_TLS_PHASE1),1) -TINY_BENCH_OBJS += pool_tls.o pool_refill.o +TINY_BENCH_OBJS += pool_tls.o pool_refill.o core/pool_tls_arena.o endif bench_tiny: bench_tiny.o $(TINY_BENCH_OBJS) @@ -404,8 +449,8 @@ larson_system: larson_system.o larson_mi.o: $(LARSON_SRC) $(CXX) $(CFLAGS) -DUSE_MIMALLOC -I mimalloc-bench/extern/mi/include -c -o $@ $< -larson_mi: larson_mi.o - $(CXX) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS) +larson_mi: larson_mi.o bench_mi_force.o + $(CXX) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS) # HAKMEM variant (hakmem.o provides malloc/free symbols directly) larson_hakmem.o: $(LARSON_SRC) @@ -468,11 +513,12 @@ bench_tiny_hot_direct: bench_tiny_hot_direct.o $(TINY_BENCH_OBJS) @echo "✓ bench_tiny_hot_direct built (hak_tiny_alloc/free direct)" # hakmi variant for 
comprehensive bench (front + mimalloc backend) + bench_comprehensive_hakmi: bench_comprehensive.c include/hakmi/hakmi_api.h adapters/hakmi_front/hakmi_front.h $(CC) $(CFLAGS) -I include -DUSE_HAKMI -include include/hakmi/hakmi_api.h -Dmalloc=hakmi_malloc -Dfree=hakmi_free -Drealloc=hakmi_realloc \ bench_comprehensive.c -o $@ \ adapters/hakmi_front/hakmi_front.o adapters/hakmi_front/hakmi_env.o adapters/hakmi_front/hakmi_tls_front.o \ - -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS) + -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS) @echo "✓ bench_comprehensive_hakmi built (hakmi front + mimalloc backend)" # hakx variant for comprehensive bench @@ -497,15 +543,15 @@ bench_random_mixed_hakmem: bench_random_mixed_hakmem.o $(TINY_BENCH_OBJS) bench_random_mixed_system: bench_random_mixed_system.o $(CC) -o $@ $^ $(LDFLAGS) -bench_random_mixed_mi: bench_random_mixed_mi.o - $(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS) +bench_random_mixed_mi: bench_random_mixed_mi.o bench_mi_force.o + $(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS) # hakmi variant for random mixed bench bench_random_mixed_hakmi.o: bench_random_mixed.c include/hakmi/hakmi_api.h adapters/hakmi_front/hakmi_front.h $(CC) $(CFLAGS) -I include -DUSE_HAKMI -include include/hakmi/hakmi_api.h -Dmalloc=hakmi_malloc -Dfree=hakmi_free -Drealloc=hakmi_realloc -c -o $@ $< -bench_random_mixed_hakmi: bench_random_mixed_hakmi.o $(HAKMI_FRONT_OBJS) - $(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS) +bench_random_mixed_hakmi: bench_random_mixed_hakmi.o $(HAKMI_FRONT_OBJS) bench_mi_force.o + $(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS) # hakx variant for random mixed bench bench_random_mixed_hakx.o: bench_random_mixed.c include/hakx/hakx_api.h include/hakx/hakx_fast_inline.h @@ 
-551,8 +597,8 @@ bench_mid_large_hakmem: bench_mid_large_hakmem.o $(TINY_BENCH_OBJS) $(CC) -o $@ $^ $(LDFLAGS) bench_mid_large_system: bench_mid_large_system.o $(CC) -o $@ $^ $(LDFLAGS) -bench_mid_large_mi: bench_mid_large_mi.o - $(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS) +bench_mid_large_mi: bench_mid_large_mi.o bench_mi_force.o + $(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS) # hakx variant for mid/large (1T) bench_mid_large_hakx.o: bench_mid_large.c include/hakx/hakx_api.h include/hakx/hakx_fast_inline.h @@ -572,8 +618,8 @@ bench_mid_large_mt_hakmem: bench_mid_large_mt_hakmem.o $(TINY_BENCH_OBJS) $(CC) -o $@ $^ $(LDFLAGS) bench_mid_large_mt_system: bench_mid_large_mt_system.o $(CC) -o $@ $^ $(LDFLAGS) -bench_mid_large_mt_mi: bench_mid_large_mt_mi.o - $(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS) +bench_mid_large_mt_mi: bench_mid_large_mt_mi.o bench_mi_force.o + $(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS) # hakx variant for mid/large MT bench_mid_large_mt_hakx.o: bench_mid_large_mt.c include/hakx/hakx_api.h include/hakx/hakx_fast_inline.h @@ -593,8 +639,8 @@ bench_fragment_stress_hakmem: bench_fragment_stress_hakmem.o $(TINY_BENCH_OBJS) $(CC) -o $@ $^ $(LDFLAGS) bench_fragment_stress_system: bench_fragment_stress_system.o $(CC) -o $@ $^ $(LDFLAGS) -bench_fragment_stress_mi: bench_fragment_stress_mi.o - $(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS) +bench_fragment_stress_mi: bench_fragment_stress_mi.o bench_mi_force.o + $(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS) # Bench build with Minimal Tiny Front (physically excludes optional front tiers) bench_tiny_front: CFLAGS += -fno-plt -fno-semantic-interposition -fomit-frame-pointer -fno-unwind-tables 
-fno-asynchronous-unwind-tables -DHAKMEM_TINY_MINIMAL_FRONT=1 -DHAKMEM_BENCH_TINY_ONLY=1 -DHAKMEM_TINY_MAG_OWNER=0 @@ -1125,3 +1171,27 @@ larson_hakmem_route: @echo "Built larson_hakmem (3-layer + route)" @echo " HAKMEM_ROUTE build-flag set; runtime ENV still controls output" @echo "=========================================" + +# ---------------------------------------------------------------------------- +# Pool TLS Benchmarks (Phase 1.5b) +# ---------------------------------------------------------------------------- +# Build HAKMEM shared library first to satisfy -lhakmem +bench_pool_tls_hakmem: benchmarks/bench_pool_tls.c $(SHARED_LIB) + $(CC) $(CFLAGS) -o $@ $< -L. -lhakmem $(LDFLAGS) + +bench_pool_tls_system: benchmarks/bench_pool_tls.c + $(CC) $(CFLAGS) -DUSE_SYSTEM_MALLOC -o $@ $< $(LDFLAGS) + +.PHONY: bench-pool-tls +bench-pool-tls: bench_pool_tls_hakmem bench_pool_tls_system + @echo "=========================================" + @echo "Pool TLS Benchmark (8KB-52KB allocations)" + @echo "=========================================" + @echo "" + @echo "== HAKMEM (Phase 1.5b Pre-warm) ==" + @./bench_pool_tls_hakmem 1 100000 256 42 + @echo "" + @echo "== System malloc ==" + @./bench_pool_tls_system 1 100000 256 42 + @echo "" + @echo "=========================================" diff --git a/bench_mi_force.c b/bench_mi_force.c new file mode 100644 index 00000000..5b68855f --- /dev/null +++ b/bench_mi_force.c @@ -0,0 +1,22 @@ +// bench_mi_force.c +// Force a reference to a mimalloc symbol so the dynamic linker +// retains libmimalloc as a NEEDED dependency even with --as-needed. +#include <stddef.h> + +#if defined(__cplusplus) +extern "C" { +#endif +// Local declaration matching mimalloc's prototype: int mi_version(void). +int mi_version(void); +#if defined(__cplusplus) +} +#endif + +// Keep a reachable reference so it isn't optimized out completely. 
+static const void* (*volatile mi_ver_ref)(void) = (const void*(*)(void))mi_version; + +void hakmem_bench_mi_force_link(void) { + // Prevent whole-call optimization away + (void)mi_ver_ref; +} + diff --git a/build.sh b/build.sh new file mode 100755 index 00000000..29dd0f00 --- /dev/null +++ b/build.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# build.sh - Unified build wrapper to eliminate flag drift +set -euo pipefail + +TARGET="${1:-bench_mid_large_mt_hakmem}" + +echo "=========================================" +echo " HAKMEM Build Script" +echo " Target: ${TARGET}" +echo "=========================================" + +# Always clean to avoid stale objects when toggling flags +make clean >/dev/null 2>&1 || true + +# Phase 7 + Pool TLS Phase 1.5b defaults +make \ + POOL_TLS_PHASE1=1 \ + POOL_TLS_PREWARM=1 \ + HEADER_CLASSIDX=1 \ + AGGRESSIVE_INLINE=1 \ + PREWARM_TLS=1 \ + "${TARGET}" + +echo "" +echo "=========================================" +echo " ✅ Build successful" +echo " Run: ./${TARGET}" +echo "=========================================" + diff --git a/core/box/hak_alloc_api.inc.h b/core/box/hak_alloc_api.inc.h index eac1ba7a..d0c54014 100644 --- a/core/box/hak_alloc_api.inc.h +++ b/core/box/hak_alloc_api.inc.h @@ -159,9 +159,11 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) { // This handles the gap when ACE is disabled or failed static _Atomic int gap_alloc_count = 0; int count = atomic_fetch_add(&gap_alloc_count, 1); + #if HAKMEM_DEBUG_VERBOSE if (count < 3) { fprintf(stderr, "[HAKMEM] INFO: Using mmap for mid-range size=%zu (ACE disabled or failed)\n", size); } + #endif #if HAKMEM_DEBUG_TIMING HKM_TIME_START(t_mmap); #endif @@ -199,4 +201,3 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) { } #endif // HAK_ALLOC_API_INC_H - diff --git a/core/box/hak_free_api.inc.h b/core/box/hak_free_api.inc.h index ca4f7552..22f78533 100644 --- a/core/box/hak_free_api.inc.h +++ b/core/box/hak_free_api.inc.h @@ -78,6 +78,37 @@ void 
hak_free_at(void* ptr, size_t size, hak_callsite_t site) { return; } +#ifdef HAKMEM_POOL_TLS_PHASE1 + // Phase 1: Try Pool TLS free FIRST for 8KB-52KB range + // CRITICAL: Must come before Phase 7 Tiny to avoid magic mismatch SEGV + // Pool TLS uses magic 0xb0, Tiny uses magic 0xa0 - must distinguish! + { + void* header_addr = (char*)ptr - 1; + + // Safety vs performance trade-off: + // - If HAKMEM_TINY_SAFE_FREE=1 (strict), validate with mincore() always + // - Else (default), only validate when ptr is page-aligned (header byte ptr-1 sits on the previous page) to avoid syscall cost + #if HAKMEM_TINY_SAFE_FREE + if (!hak_is_memory_readable(header_addr)) { goto skip_pool_tls; } + #else + uintptr_t off = (uintptr_t)ptr & 0xFFF; + if (__builtin_expect(off == 0, 0)) { + if (!hak_is_memory_readable(header_addr)) { goto skip_pool_tls; } + } + #endif + + uint8_t header = *(uint8_t*)header_addr; + + if ((header & 0xF0) == POOL_MAGIC) { + pool_free(ptr); + hak_free_route_log("pool_tls", ptr); + goto done; + } + // Not Pool TLS - fall through to other paths + } +skip_pool_tls:; +#endif + #if HAKMEM_TINY_HEADER_CLASSIDX // Phase 7: Dual-header dispatch (1-byte Tiny header OR 16-byte malloc/mmap header) // @@ -135,19 +166,6 @@ slow_path_after_step2:; #endif #endif -#ifdef HAKMEM_POOL_TLS_PHASE1 - // Phase 1: Try Pool TLS free for 8KB-52KB range - // This uses 1-byte headers like Tiny for O(1) free - { - uint8_t header = *((uint8_t*)ptr - 1); - if ((header & 0xF0) == POOL_MAGIC) { - pool_free(ptr); - hak_free_route_log("pool_tls", ptr); - goto done; - } - } -#endif - // SS-first free(既定ON) #if !HAKMEM_TINY_HEADER_CLASSIDX // Only run SS-first if Phase 7 header-based free is not enabled diff --git a/core/hakmem_build_flags.h b/core/hakmem_build_flags.h index 1c1c57b3..bb856e9e 100644 --- a/core/hakmem_build_flags.h +++ b/core/hakmem_build_flags.h @@ -70,6 +70,17 @@ # define HAKMEM_TINY_PREWARM_TLS 0 #endif +// Runtime verbosity (printf-heavy diagnostics). Keep OFF for benches. 
+#ifndef HAKMEM_DEBUG_VERBOSE +# define HAKMEM_DEBUG_VERBOSE 0 +#endif + +// Tiny/Mid safety checks on free path (mincore header validation). +// 0 = performance (mincore only when ptr is page-aligned), 1 = strict (mincore on every free) +#ifndef HAKMEM_TINY_SAFE_FREE +# define HAKMEM_TINY_SAFE_FREE 0 +#endif + // Phase 7 refill count defaults (tunable via env vars) // HAKMEM_TINY_REFILL_COUNT: global default (default: 16) // HAKMEM_TINY_REFILL_COUNT_HOT: class 0-3 (default: 16) diff --git a/core/tiny_free_fast_v2.inc.h b/core/tiny_free_fast_v2.inc.h index 44704217..89671be5 100644 --- a/core/tiny_free_fast_v2.inc.h +++ b/core/tiny_free_fast_v2.inc.h @@ -50,29 +50,43 @@ extern int TINY_TLS_MAG_CAP; static inline int hak_tiny_free_fast_v2(void* ptr) { if (__builtin_expect(!ptr, 0)) return 0; - // CRITICAL: Fast check for page boundaries (0.1% case) - // Strategy: Check alignment BEFORE expensive mincore() syscall - // - Page boundary check: (ptr & 0xFFF) == 0 → 1-2 cycles + // CRITICAL: Check if header is accessible + void* header_addr = (char*)ptr - 1; + +#if defined(HAKMEM_POOL_TLS_PHASE1) && HAKMEM_TINY_SAFE_FREE + // Strict mode (only when Pool TLS is enabled): validate header address with mincore() on every free + extern int hak_is_memory_readable(void* addr); + if (!hak_is_memory_readable(header_addr)) { + return 0; // Header not accessible - not a Tiny allocation + } +#else + // Default (Pool TLS disabled, or HAKMEM_TINY_SAFE_FREE=0): optimize for the common case (99.9% hit rate) + // Strategy: Only check page boundaries (ptr & 0xFFF == 0) + // - Page boundary check: 1-2 cycles // - mincore() syscall: ~634 cycles (only if page-aligned) // - Result: 99.9% of frees avoid mincore() → 317-634x faster! - // - // Rationale: Allocations at page boundaries would SEGV when reading ptr-1 - // (previous page may be unmapped). But page boundaries are rare (<0.1%), - // so we optimize for the common case (99.9%) by checking alignment first. 
- void* header_addr = (char*)ptr - 1; if (__builtin_expect(((uintptr_t)ptr & 0xFFF) == 0, 0)) { - // Potential page boundary - do safety check extern int hak_is_memory_readable(void* addr); if (!hak_is_memory_readable(header_addr)) { - // Header not accessible - route to slow path (page boundary allocation) - return 0; + return 0; // Page boundary allocation } } - // Normal case (99.9%): header is safe to read (no mincore call!) +#endif // 1. Read class_idx from header (2-3 cycles, L1 hit) // Note: In release mode, tiny_region_id_read_header() skips magic validation (saves 2-3 cycles) + #if HAKMEM_DEBUG_VERBOSE + static _Atomic int debug_calls = 0; + if (atomic_fetch_add(&debug_calls, 1) < 5) { + fprintf(stderr, "[TINY_FREE_V2] Before read_header, ptr=%p\n", ptr); + } + #endif int class_idx = tiny_region_id_read_header(ptr); + #if HAKMEM_DEBUG_VERBOSE + if (atomic_load(&debug_calls) <= 5) { + fprintf(stderr, "[TINY_FREE_V2] After read_header, class_idx=%d\n", class_idx); + } + #endif // Check if header read failed (invalid magic in debug, or out-of-bounds class_idx) if (__builtin_expect(class_idx < 0, 0)) { diff --git a/core/tiny_region_id.h b/core/tiny_region_id.h index 9d712c8b..6b416510 100644 --- a/core/tiny_region_id.h +++ b/core/tiny_region_id.h @@ -68,24 +68,43 @@ static inline int tiny_region_id_read_header(void* ptr) { uint8_t header = *header_ptr; -#if !HAKMEM_BUILD_RELEASE - // Debug/Development: Validate magic byte to catch non-header allocations + // CRITICAL FIX (Pool TLS Phase 1): ALWAYS validate magic when Pool TLS is enabled + // Reason: Pool TLS uses different magic (0xb0 vs 0xa0), MUST distinguish them! 
+ // Without this, Pool TLS allocations are wrongly routed to Tiny freelist → corruption +#if !HAKMEM_BUILD_RELEASE || defined(HAKMEM_POOL_TLS_PHASE1) + // Debug/Development OR Pool TLS: Validate magic byte to catch non-header allocations // Reason: Mid/Large allocations don't have headers, must detect and reject them uint8_t magic = header & 0xF0; + #if HAKMEM_DEBUG_VERBOSE + static int debug_count = 0; + if (debug_count < 5) { + fprintf(stderr, "[TINY_READ_HEADER] ptr=%p header=0x%02x magic=0x%02x expected=0x%02x\n", + ptr, header, magic, HEADER_MAGIC); + debug_count++; + } + #endif if (magic != HEADER_MAGIC) { - // Invalid header - likely non-header allocation (Mid/Large) + // Invalid header - likely non-header allocation (Mid/Large/Pool TLS) + #if HAKMEM_DEBUG_VERBOSE + if (debug_count < 6) { // One more after the 5 above + fprintf(stderr, "[TINY_READ_HEADER] REJECTING ptr=%p (magic mismatch)\n", ptr); + } + #endif +#if !HAKMEM_BUILD_RELEASE static int invalid_count = 0; if (invalid_count < 5) { fprintf(stderr, "[HEADER_INVALID] ptr=%p, header=%02x, magic=%02x (expected %02x)\n", ptr, header, magic, HEADER_MAGIC); invalid_count++; } +#endif return -1; } #else - // Release: Skip magic validation (save 2-3 cycles) + // Release (without Pool TLS): Skip magic validation (save 2-3 cycles) // Safety: Bounds check below still prevents out-of-bounds array access // Trade-off: Mid/Large frees may corrupt TLS freelist (rare, ~0.1% of frees) + // NOTE: This optimization is DISABLED when Pool TLS is enabled (different magic bytes!) #endif int class_idx = (int)(header & HEADER_CLASS_MASK); diff --git a/docs/BUILD_PHASE7_POOL_TLS.md b/docs/BUILD_PHASE7_POOL_TLS.md new file mode 100644 index 00000000..485fce6b --- /dev/null +++ b/docs/BUILD_PHASE7_POOL_TLS.md @@ -0,0 +1,90 @@ +# HAKMEM Phase 7 + Pool TLS Phase 1.5b — Build & Run Cheatsheet + +This document captures the stable build/run recipe used for recent benches. 
+ +## One‑liner Build (recommended) + +``` +./build.sh + +# examples +./build.sh bench_mid_large_mt_hakmem +./build.sh bench_random_mixed_hakmem +./build.sh larson_hakmem +``` + +Enables at build time: +- POOL_TLS_PHASE1=1 and POOL_TLS_PREWARM=1 (Pool TLS Phase 1.5b with pre-warm) +- HEADER_CLASSIDX=1 (Phase 7 header) +- AGGRESSIVE_INLINE=1 +- PREWARM_TLS=1 + +Verify switches: +``` +make print-flags +``` + +Optional safety/verbosity toggles: +- `HAKMEM_TINY_SAFE_FREE=1` — strict free validation (mincore() on every free instead of only on page-aligned pointers). Slower but safest. +- `HAKMEM_DEBUG_VERBOSE=1` — enable verbose logs for Tiny header/free, etc. + +Examples: +``` +make clean && make HAKMEM_TINY_SAFE_FREE=1 POOL_TLS_PHASE1=1 HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 bench_mid_large_mt_hakmem +``` + +## Bench Recipes (used in reports) + +Larson (Mixed) +``` +./build.sh larson_hakmem +make larson_system +./larson_hakmem 2 8 128 1024 1 12345 1 +./larson_hakmem 2 8 128 1024 1 12345 4 +./larson_system 2 8 128 1024 1 12345 1 +./larson_system 2 8 128 1024 1 12345 4 +``` + +Pool TLS (8–52KB) +``` +./build.sh bench_pool_tls_hakmem +make bench_pool_tls_system +./bench_pool_tls_hakmem 1 100000 256 42 +./bench_pool_tls_hakmem 4 50000 256 42 +./bench_pool_tls_system 1 100000 256 42 +./bench_pool_tls_system 4 50000 256 42 +``` + +Random Mixed (Tiny 128–1024B) +``` +./build.sh bench_random_mixed_hakmem +make bench_random_mixed_system +for s in 128 256 512 1024; do \ + ./bench_random_mixed_hakmem 100000 $s 42; \ + ./bench_random_mixed_system 100000 $s 42; \ +done +``` + +Mid‑Large MT (8–32KB) +``` +./build.sh bench_mid_large_mt_hakmem +make bench_mid_large_mt_system +./bench_mid_large_mt_hakmem 1 100000 256 42 +./bench_mid_large_mt_hakmem 4 50000 256 42 +./bench_mid_large_mt_system 1 100000 256 42 +./bench_mid_large_mt_system 4 50000 256 42 +``` + +## Mimalloc note (when comparing) + +Direct‑link mimalloc benches need the library directory on the runtime search path: +``` +export LD_LIBRARY_PATH=$PWD/mimalloc-bench/extern/mi/out/release +``` + +## Build hygiene + +- Always 
prefer `./build.sh` over ad‑hoc `make` (prevents flag drift) +- Check switches: `make print-flags` +- Verify freshness: `./verify_build.sh <binary>` + diff --git a/verify_build.sh b/verify_build.sh new file mode 100755 index 00000000..134d7632 --- /dev/null +++ b/verify_build.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# verify_build.sh - Quick build correctness/uptodate check +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 <binary>" >&2 + exit 1 +fi + +bin="$1" +if [[ ! -f "$bin" ]]; then + echo "❌ Error: $bin not found" >&2 + exit 2 +fi + +# Check if any sources are newer than the binary (find's failure on a missing dir is tolerated so pipefail does not abort; tail avoids SIGPIPE on sort) +bin_time=$(stat -c %Y "$bin") +src_time=$( { find core include adapters engines benchmarks/src -type f \( -name '*.c' -o -name '*.h' -o -name '*.inc' -o -name '*.inc.h' \) -printf '%T@\n' 2>/dev/null || true; } | sort -n | tail -n1 | cut -d. -f1) + +if [[ -n "${src_time}" && "${src_time}" -gt "${bin_time}" ]]; then + newest=$( { find core include adapters engines benchmarks/src -type f \( -name '*.c' -o -name '*.h' -o -name '*.inc' -o -name '*.inc.h' \) -printf '%T@ %p\n' 2>/dev/null || true; } | sort -n | tail -n1 | cut -d' ' -f2-) + echo "⚠️ Warning: Sources newer than binary: $newest" >&2 + exit 3 +fi + +echo "✅ Build verification passed for $bin" +