Phase 7 + Pool TLS 1.5b stabilization:\n- Add build hygiene (dep tracking, flag consistency, print-flags)\n- Add build.sh + verify_build.sh (unified recipe, freshness check)\n- Quiet verbose logs behind HAKMEM_DEBUG_VERBOSE\n- A/B free safety via HAKMEM_TINY_SAFE_FREE (mincore strict vs boundary)\n- Tweak Tiny header path to reduce noise; Pool TLS free guard optimized\n- Fix mimalloc link retention (--no-as-needed + force symbol)\n- Add docs/BUILD_PHASE7_POOL_TLS.md (cheatsheet)
This commit is contained in:
106
Makefile
106
Makefile
@ -49,6 +49,27 @@ ifeq ($(USE_LTO),1)
|
||||
LDFLAGS += -flto
|
||||
endif
|
||||
|
||||
# ------------------------------------------------------------
# Build hygiene: dependency tracking + flag consistency checks
# ------------------------------------------------------------

# Track header dependencies for explicit compile rules as well.
# -MMD writes a .d file next to each compiled output; -MP adds phony targets
# for every header so that deleting a header does not break the build.
CFLAGS += -MMD -MP

# If someone injects -DHAKMEM_POOL_TLS_PHASE1=1 directly into CFLAGS
# but forgets POOL_TLS_PHASE1=1, object lists will miss pool_tls*.o.
# Fail fast to avoid confusing link/runtime errors.
ifneq ($(filter -DHAKMEM_POOL_TLS_PHASE1=1,$(CFLAGS)),)
ifneq ($(POOL_TLS_PHASE1),1)
$(error Detected -DHAKMEM_POOL_TLS_PHASE1=1 in CFLAGS but POOL_TLS_PHASE1!=1. Please invoke: make POOL_TLS_PHASE1=1 ...)
endif
endif

# Include generated .d files if present (safe even if none yet).
# Only regular files are included (not directories such as
# glibc-2.38/build/iconvdata/gconv-modules.d).
# The glibc* and mimalloc-bench* subtrees are pruned, so find does not even
# descend into them; this runs on every make invocation, so skipping those
# trees keeps Makefile parsing fast while selecting the same set of files.
-include $(shell find . \( -path './glibc*' -o -path './mimalloc-bench*' \) -prune -o -type f -name '*.d' -print 2>/dev/null)
|
||||
|
||||
# Default: enable Box Theory refactor for Tiny (Phase 6-1.7)
|
||||
# This is the best performing option currently (4.19M ops/s)
|
||||
# NOTE: Disabled while testing ULTRA_SIMPLE with SFC integration
|
||||
@ -145,24 +166,45 @@ SHARED_OBJS = hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o
|
||||
|
||||
# Pool TLS Phase 1 (enable with POOL_TLS_PHASE1=1)
|
||||
ifeq ($(POOL_TLS_PHASE1),1)
|
||||
SHARED_OBJS += pool_tls_shared.o pool_refill_shared.o
|
||||
OBJS += pool_tls.o pool_refill.o pool_tls_arena.o
|
||||
SHARED_OBJS += pool_tls_shared.o pool_refill_shared.o pool_tls_arena_shared.o
|
||||
CFLAGS += -DHAKMEM_POOL_TLS_PHASE1=1
|
||||
CFLAGS_SHARED += -DHAKMEM_POOL_TLS_PHASE1=1
|
||||
endif
|
||||
|
||||
# Pool TLS Phase 1.5b - Pre-warm optimization
|
||||
ifeq ($(POOL_TLS_PREWARM),1)
|
||||
CFLAGS += -DHAKMEM_POOL_TLS_PREWARM=1
|
||||
CFLAGS_SHARED += -DHAKMEM_POOL_TLS_PREWARM=1
|
||||
endif
|
||||
|
||||
# Benchmark targets
|
||||
BENCH_HAKMEM = bench_allocators_hakmem
|
||||
BENCH_SYSTEM = bench_allocators_system
|
||||
BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o bench_allocators_hakmem.o
|
||||
BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE)
|
||||
ifeq ($(POOL_TLS_PHASE1),1)
|
||||
BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o
|
||||
BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o
|
||||
endif
|
||||
BENCH_SYSTEM_OBJS = bench_allocators_system.o
|
||||
|
||||
# Default target
|
||||
all: $(TARGET)
|
||||
|
||||
# Show key build-time switches for troubleshooting
|
||||
.PHONY: print-flags
|
||||
print-flags:
|
||||
@echo "==== Build Switches ===="
|
||||
@echo "POOL_TLS_PHASE1 = $(POOL_TLS_PHASE1)"
|
||||
@echo "POOL_TLS_PREWARM = $(POOL_TLS_PREWARM)"
|
||||
@echo "HEADER_CLASSIDX = $(HEADER_CLASSIDX)"
|
||||
@echo "AGGRESSIVE_INLINE = $(AGGRESSIVE_INLINE)"
|
||||
@echo "PREWARM_TLS = $(PREWARM_TLS)"
|
||||
@echo "USE_LTO = $(USE_LTO)"
|
||||
@echo "OPT_LEVEL = $(OPT_LEVEL)"
|
||||
@echo "NATIVE = $(NATIVE)"
|
||||
@echo "CFLAGS contains = $(filter -DHAKMEM_POOL_TLS_PHASE1=1,$(CFLAGS))"
|
||||
|
||||
# Build test program
|
||||
$(TARGET): $(OBJS)
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
@ -220,8 +262,11 @@ bench_tiny_hot_system: bench_tiny_hot_system.o
|
||||
bench_tiny_hot_mi.o: bench_tiny_hot.c
|
||||
$(CC) $(CFLAGS) -DUSE_MIMALLOC -I mimalloc-bench/extern/mi/include -c -o $@ $<
|
||||
|
||||
bench_tiny_hot_mi: bench_tiny_hot_mi.o
|
||||
$(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS)
|
||||
bench_mi_force.o: bench_mi_force.c
|
||||
$(CC) $(CFLAGS) -I mimalloc-bench/extern/mi/include -c -o $@ $<
|
||||
|
||||
bench_tiny_hot_mi: bench_tiny_hot_mi.o bench_mi_force.o
|
||||
$(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS)
|
||||
|
||||
# hakmi variant for tiny hot bench (direct link via front API)
|
||||
bench_tiny_hot_hakmi.o: bench_tiny_hot.c include/hakmi/hakmi_api.h adapters/hakmi_front/hakmi_front.h
|
||||
@ -315,7 +360,7 @@ test-box-refactor: box-refactor
|
||||
TINY_BENCH_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o
|
||||
TINY_BENCH_OBJS = $(TINY_BENCH_OBJS_BASE)
|
||||
ifeq ($(POOL_TLS_PHASE1),1)
|
||||
TINY_BENCH_OBJS += pool_tls.o pool_refill.o
|
||||
TINY_BENCH_OBJS += pool_tls.o pool_refill.o core/pool_tls_arena.o
|
||||
endif
|
||||
|
||||
bench_tiny: bench_tiny.o $(TINY_BENCH_OBJS)
|
||||
@ -404,8 +449,8 @@ larson_system: larson_system.o
|
||||
larson_mi.o: $(LARSON_SRC)
|
||||
$(CXX) $(CFLAGS) -DUSE_MIMALLOC -I mimalloc-bench/extern/mi/include -c -o $@ $<
|
||||
|
||||
larson_mi: larson_mi.o
|
||||
$(CXX) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS)
|
||||
larson_mi: larson_mi.o bench_mi_force.o
|
||||
$(CXX) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS)
|
||||
|
||||
# HAKMEM variant (hakmem.o provides malloc/free symbols directly)
|
||||
larson_hakmem.o: $(LARSON_SRC)
|
||||
@ -468,11 +513,12 @@ bench_tiny_hot_direct: bench_tiny_hot_direct.o $(TINY_BENCH_OBJS)
|
||||
@echo "✓ bench_tiny_hot_direct built (hak_tiny_alloc/free direct)"
|
||||
|
||||
# hakmi variant for comprehensive bench (front + mimalloc backend)
|
||||
|
||||
bench_comprehensive_hakmi: bench_comprehensive.c include/hakmi/hakmi_api.h adapters/hakmi_front/hakmi_front.h
|
||||
$(CC) $(CFLAGS) -I include -DUSE_HAKMI -include include/hakmi/hakmi_api.h -Dmalloc=hakmi_malloc -Dfree=hakmi_free -Drealloc=hakmi_realloc \
|
||||
bench_comprehensive.c -o $@ \
|
||||
adapters/hakmi_front/hakmi_front.o adapters/hakmi_front/hakmi_env.o adapters/hakmi_front/hakmi_tls_front.o \
|
||||
-L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS)
|
||||
-Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS)
|
||||
@echo "✓ bench_comprehensive_hakmi built (hakmi front + mimalloc backend)"
|
||||
|
||||
# hakx variant for comprehensive bench
|
||||
@ -497,15 +543,15 @@ bench_random_mixed_hakmem: bench_random_mixed_hakmem.o $(TINY_BENCH_OBJS)
|
||||
bench_random_mixed_system: bench_random_mixed_system.o
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
|
||||
bench_random_mixed_mi: bench_random_mixed_mi.o
|
||||
$(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS)
|
||||
bench_random_mixed_mi: bench_random_mixed_mi.o bench_mi_force.o
|
||||
$(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS)
|
||||
|
||||
# hakmi variant for random mixed bench
|
||||
bench_random_mixed_hakmi.o: bench_random_mixed.c include/hakmi/hakmi_api.h adapters/hakmi_front/hakmi_front.h
|
||||
$(CC) $(CFLAGS) -I include -DUSE_HAKMI -include include/hakmi/hakmi_api.h -Dmalloc=hakmi_malloc -Dfree=hakmi_free -Drealloc=hakmi_realloc -c -o $@ $<
|
||||
|
||||
bench_random_mixed_hakmi: bench_random_mixed_hakmi.o $(HAKMI_FRONT_OBJS)
|
||||
$(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS)
|
||||
bench_random_mixed_hakmi: bench_random_mixed_hakmi.o $(HAKMI_FRONT_OBJS) bench_mi_force.o
|
||||
$(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS)
|
||||
|
||||
# hakx variant for random mixed bench
|
||||
bench_random_mixed_hakx.o: bench_random_mixed.c include/hakx/hakx_api.h include/hakx/hakx_fast_inline.h
|
||||
@ -551,8 +597,8 @@ bench_mid_large_hakmem: bench_mid_large_hakmem.o $(TINY_BENCH_OBJS)
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
bench_mid_large_system: bench_mid_large_system.o
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
bench_mid_large_mi: bench_mid_large_mi.o
|
||||
$(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS)
|
||||
bench_mid_large_mi: bench_mid_large_mi.o bench_mi_force.o
|
||||
$(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS)
|
||||
|
||||
# hakx variant for mid/large (1T)
|
||||
bench_mid_large_hakx.o: bench_mid_large.c include/hakx/hakx_api.h include/hakx/hakx_fast_inline.h
|
||||
@ -572,8 +618,8 @@ bench_mid_large_mt_hakmem: bench_mid_large_mt_hakmem.o $(TINY_BENCH_OBJS)
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
bench_mid_large_mt_system: bench_mid_large_mt_system.o
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
bench_mid_large_mt_mi: bench_mid_large_mt_mi.o
|
||||
$(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS)
|
||||
bench_mid_large_mt_mi: bench_mid_large_mt_mi.o bench_mi_force.o
|
||||
$(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS)
|
||||
|
||||
# hakx variant for mid/large MT
|
||||
bench_mid_large_mt_hakx.o: bench_mid_large_mt.c include/hakx/hakx_api.h include/hakx/hakx_fast_inline.h
|
||||
@ -593,8 +639,8 @@ bench_fragment_stress_hakmem: bench_fragment_stress_hakmem.o $(TINY_BENCH_OBJS)
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
bench_fragment_stress_system: bench_fragment_stress_system.o
|
||||
$(CC) -o $@ $^ $(LDFLAGS)
|
||||
bench_fragment_stress_mi: bench_fragment_stress_mi.o
|
||||
$(CC) -o $@ $^ -L mimalloc-bench/extern/mi/out/release -lmimalloc $(LDFLAGS)
|
||||
bench_fragment_stress_mi: bench_fragment_stress_mi.o bench_mi_force.o
|
||||
$(CC) -o $@ $^ -Wl,--no-as-needed -L mimalloc-bench/extern/mi/out/release -lmimalloc -Wl,--as-needed $(LDFLAGS)
|
||||
|
||||
# Bench build with Minimal Tiny Front (physically excludes optional front tiers)
|
||||
bench_tiny_front: CFLAGS += -fno-plt -fno-semantic-interposition -fomit-frame-pointer -fno-unwind-tables -fno-asynchronous-unwind-tables -DHAKMEM_TINY_MINIMAL_FRONT=1 -DHAKMEM_BENCH_TINY_ONLY=1 -DHAKMEM_TINY_MAG_OWNER=0
|
||||
@ -1125,3 +1171,27 @@ larson_hakmem_route:
|
||||
@echo "Built larson_hakmem (3-layer + route)"
|
||||
@echo " HAKMEM_ROUTE build-flag set; runtime ENV still controls output"
|
||||
@echo "========================================="
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Pool TLS Benchmarks (Phase 1.5b)
|
||||
# ----------------------------------------------------------------------------
|
||||
# Build HAKMEM shared library first to satisfy -lhakmem
|
||||
bench_pool_tls_hakmem: benchmarks/bench_pool_tls.c $(SHARED_LIB)
|
||||
$(CC) $(CFLAGS) -o $@ $< -L. -lhakmem $(LDFLAGS)
|
||||
|
||||
bench_pool_tls_system: benchmarks/bench_pool_tls.c
|
||||
$(CC) $(CFLAGS) -DUSE_SYSTEM_MALLOC -o $@ $< $(LDFLAGS)
|
||||
|
||||
.PHONY: bench-pool-tls
|
||||
bench-pool-tls: bench_pool_tls_hakmem bench_pool_tls_system
|
||||
@echo "========================================="
|
||||
@echo "Pool TLS Benchmark (8KB-52KB allocations)"
|
||||
@echo "========================================="
|
||||
@echo ""
|
||||
@echo "== HAKMEM (Phase 1.5b Pre-warm) =="
|
||||
@./bench_pool_tls_hakmem 1 100000 256 42
|
||||
@echo ""
|
||||
@echo "== System malloc =="
|
||||
@./bench_pool_tls_system 1 100000 256 42
|
||||
@echo ""
|
||||
@echo "========================================="
|
||||
|
||||
22
bench_mi_force.c
Normal file
22
bench_mi_force.c
Normal file
@ -0,0 +1,22 @@
|
||||
// bench_mi_force.c
// Force a reference to a mimalloc symbol so the dynamic linker
// retains libmimalloc as a NEEDED dependency even with --as-needed.
//
// This object is linked into the *_mi benchmark binaries. It only needs to
// take the address of one exported mimalloc function; nothing here is meant
// to be called on a hot path.
#include <stddef.h>

#if defined(__cplusplus)
extern "C" {
#endif
// Local declaration so this file does not depend on the mimalloc include path.
// It must match mimalloc.h, where mi_version() returns int.
int mi_version(void);
#if defined(__cplusplus)
}
#endif

// The initializer creates the actual reference to mi_version.
// The pointer is volatile so reads of it cannot be folded away.
static int (*volatile mi_ver_ref)(void) = mi_version;

// Externally visible function that reads mi_ver_ref, which keeps the static
// pointer (and therefore the mi_version reference) in the object file.
// Takes no arguments, returns nothing, and does not call into mimalloc.
void hakmem_bench_mi_force_link(void) {
    // Volatile read only; the value is intentionally discarded.
    (void)mi_ver_ref;
}
|
||||
|
||||
29
build.sh
Executable file
29
build.sh
Executable file
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env bash
# build.sh - Unified build wrapper to eliminate flag drift
#
# Usage: ./build.sh [make-target]
#   make-target defaults to bench_mid_large_mt_hakmem.
# Runs `make clean` (failures ignored), then builds the target with the
# fixed set of switches listed in BUILD_SWITCHES below.
set -euo pipefail

TARGET="${1:-bench_mid_large_mt_hakmem}"

RULE="========================================="

# Print the given lines framed by a horizontal rule above and below.
banner() {
  printf '%s\n' "${RULE}" "$@" "${RULE}"
}

# Phase 7 + Pool TLS Phase 1.5b defaults
BUILD_SWITCHES=(
  POOL_TLS_PHASE1=1
  POOL_TLS_PREWARM=1
  HEADER_CLASSIDX=1
  AGGRESSIVE_INLINE=1
  PREWARM_TLS=1
)

banner " HAKMEM Build Script" " Target: ${TARGET}"

# Always clean to avoid stale objects when toggling flags.
# A failing clean is deliberately ignored.
if ! make clean >/dev/null 2>&1; then
  :
fi

make "${BUILD_SWITCHES[@]}" "${TARGET}"

printf '\n'
banner " ✅ Build successful" " Run: ./${TARGET}"
|
||||
|
||||
@ -159,9 +159,11 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
|
||||
// This handles the gap when ACE is disabled or failed
|
||||
static _Atomic int gap_alloc_count = 0;
|
||||
int count = atomic_fetch_add(&gap_alloc_count, 1);
|
||||
#if HAKMEM_DEBUG_VERBOSE
|
||||
if (count < 3) {
|
||||
fprintf(stderr, "[HAKMEM] INFO: Using mmap for mid-range size=%zu (ACE disabled or failed)\n", size);
|
||||
}
|
||||
#endif
|
||||
#if HAKMEM_DEBUG_TIMING
|
||||
HKM_TIME_START(t_mmap);
|
||||
#endif
|
||||
@ -199,4 +201,3 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
|
||||
}
|
||||
|
||||
#endif // HAK_ALLOC_API_INC_H
|
||||
|
||||
|
||||
@ -78,6 +78,37 @@ void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef HAKMEM_POOL_TLS_PHASE1
    // Phase 1: Try Pool TLS free FIRST for 8KB-52KB range
    // CRITICAL: Must come before Phase 7 Tiny to avoid magic mismatch SEGV
    // Pool TLS uses magic 0xb0, Tiny uses magic 0xa0 - must distinguish!
    {
        // The 1-byte header lives immediately before the user pointer.
        void* header_addr = (char*)ptr - 1;

        // Safety vs performance trade-off:
        // - If HAKMEM_TINY_SAFE_FREE=1 (strict), validate with mincore() always
        // - Else (default), only validate on page-boundary risk to avoid syscall cost
#if HAKMEM_TINY_SAFE_FREE
        if (!hak_is_memory_readable(header_addr)) { goto skip_pool_tls; }
#else
        // The header can only sit on a different (possibly unmapped) page than
        // ptr when ptr itself is page-aligned, i.e. header_addr is the last
        // byte of the previous page. Test ptr, not header_addr: checking
        // (header_addr & 0xFFF) == 0 would fire when the header is the first
        // byte of ptr's own page and miss the dangerous case.
        // NOTE(review): 0xFFF assumes 4 KiB pages - confirm for other targets.
        if (__builtin_expect(((uintptr_t)ptr & 0xFFF) == 0, 0)) {
            if (!hak_is_memory_readable(header_addr)) { goto skip_pool_tls; }
        }
#endif

        uint8_t header = *(uint8_t*)header_addr;

        // The high nibble carries the allocator magic.
        if ((header & 0xF0) == POOL_MAGIC) {
            pool_free(ptr);
            hak_free_route_log("pool_tls", ptr);
            goto done;
        }
        // Not Pool TLS - fall through to other paths
    }
    // Empty statement so the label is valid whatever follows it.
skip_pool_tls: ;
#endif
|
||||
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
// Phase 7: Dual-header dispatch (1-byte Tiny header OR 16-byte malloc/mmap header)
|
||||
//
|
||||
@ -135,19 +166,6 @@ slow_path_after_step2:;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef HAKMEM_POOL_TLS_PHASE1
|
||||
// Phase 1: Try Pool TLS free for 8KB-52KB range
|
||||
// This uses 1-byte headers like Tiny for O(1) free
|
||||
{
|
||||
uint8_t header = *((uint8_t*)ptr - 1);
|
||||
if ((header & 0xF0) == POOL_MAGIC) {
|
||||
pool_free(ptr);
|
||||
hak_free_route_log("pool_tls", ptr);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// SS-first free(既定ON)
|
||||
#if !HAKMEM_TINY_HEADER_CLASSIDX
|
||||
// Only run SS-first if Phase 7 header-based free is not enabled
|
||||
|
||||
@ -70,6 +70,17 @@
|
||||
# define HAKMEM_TINY_PREWARM_TLS 0
|
||||
#endif
|
||||
|
||||
// Runtime verbosity (printf-heavy diagnostics). Keep OFF for benches.
|
||||
#ifndef HAKMEM_DEBUG_VERBOSE
|
||||
# define HAKMEM_DEBUG_VERBOSE 0
|
||||
#endif
|
||||
|
||||
// Tiny/Mid safety checks on free path (mincore header validation).
|
||||
// 0 = performance (boundary-only), 1 = strict (mincore for all)
|
||||
#ifndef HAKMEM_TINY_SAFE_FREE
|
||||
# define HAKMEM_TINY_SAFE_FREE 0
|
||||
#endif
|
||||
|
||||
// Phase 7 refill count defaults (tunable via env vars)
|
||||
// HAKMEM_TINY_REFILL_COUNT: global default (default: 16)
|
||||
// HAKMEM_TINY_REFILL_COUNT_HOT: class 0-3 (default: 16)
|
||||
|
||||
@ -50,29 +50,43 @@ extern int TINY_TLS_MAG_CAP;
|
||||
static inline int hak_tiny_free_fast_v2(void* ptr) {
|
||||
if (__builtin_expect(!ptr, 0)) return 0;
|
||||
|
||||
// CRITICAL: Fast check for page boundaries (0.1% case)
|
||||
// Strategy: Check alignment BEFORE expensive mincore() syscall
|
||||
// - Page boundary check: (ptr & 0xFFF) == 0 → 1-2 cycles
|
||||
// - mincore() syscall: ~634 cycles (only if page-aligned)
|
||||
// - Result: 99.9% of frees avoid mincore() → 317-634x faster!
|
||||
//
|
||||
// Rationale: Allocations at page boundaries would SEGV when reading ptr-1
|
||||
// (previous page may be unmapped). But page boundaries are rare (<0.1%),
|
||||
// so we optimize for the common case (99.9%) by checking alignment first.
|
||||
// CRITICAL: Check if header is accessible
|
||||
void* header_addr = (char*)ptr - 1;
|
||||
if (__builtin_expect(((uintptr_t)ptr & 0xFFF) == 0, 0)) {
|
||||
// Potential page boundary - do safety check
|
||||
|
||||
#if defined(HAKMEM_POOL_TLS_PHASE1) && HAKMEM_TINY_SAFE_FREE
|
||||
// Strict mode: validate header address with mincore() on every free
|
||||
extern int hak_is_memory_readable(void* addr);
|
||||
if (!hak_is_memory_readable(header_addr)) {
|
||||
// Header not accessible - route to slow path (page boundary allocation)
|
||||
return 0;
|
||||
return 0; // Header not accessible - not a Tiny allocation
|
||||
}
|
||||
#else
|
||||
// Default (strict mode off, or Pool TLS disabled): optimize for the common case (99.9% hit rate)
|
||||
// Strategy: Only check page boundaries (ptr & 0xFFF == 0)
|
||||
// - Page boundary check: 1-2 cycles
|
||||
// - mincore() syscall: ~634 cycles (only if page-aligned)
|
||||
// - Result: 99.9% of frees avoid mincore() → 317-634x faster!
|
||||
if (__builtin_expect(((uintptr_t)ptr & 0xFFF) == 0, 0)) {
|
||||
extern int hak_is_memory_readable(void* addr);
|
||||
if (!hak_is_memory_readable(header_addr)) {
|
||||
return 0; // Page boundary allocation
|
||||
}
|
||||
}
|
||||
// Normal case (99.9%): header is safe to read (no mincore call!)
|
||||
#endif
|
||||
|
||||
// 1. Read class_idx from header (2-3 cycles, L1 hit)
|
||||
// Note: In release mode, tiny_region_id_read_header() skips magic validation (saves 2-3 cycles)
|
||||
#if HAKMEM_DEBUG_VERBOSE
|
||||
static _Atomic int debug_calls = 0;
|
||||
if (atomic_fetch_add(&debug_calls, 1) < 5) {
|
||||
fprintf(stderr, "[TINY_FREE_V2] Before read_header, ptr=%p\n", ptr);
|
||||
}
|
||||
#endif
|
||||
int class_idx = tiny_region_id_read_header(ptr);
|
||||
#if HAKMEM_DEBUG_VERBOSE
|
||||
if (atomic_load(&debug_calls) <= 5) {
|
||||
fprintf(stderr, "[TINY_FREE_V2] After read_header, class_idx=%d\n", class_idx);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Check if header read failed (invalid magic in debug, or out-of-bounds class_idx)
|
||||
if (__builtin_expect(class_idx < 0, 0)) {
|
||||
|
||||
@ -68,24 +68,43 @@ static inline int tiny_region_id_read_header(void* ptr) {
|
||||
|
||||
uint8_t header = *header_ptr;
|
||||
|
||||
#if !HAKMEM_BUILD_RELEASE
|
||||
// Debug/Development: Validate magic byte to catch non-header allocations
|
||||
// CRITICAL FIX (Pool TLS Phase 1): ALWAYS validate magic when Pool TLS is enabled
|
||||
// Reason: Pool TLS uses different magic (0xb0 vs 0xa0), MUST distinguish them!
|
||||
// Without this, Pool TLS allocations are wrongly routed to Tiny freelist → corruption
|
||||
#if !HAKMEM_BUILD_RELEASE || defined(HAKMEM_POOL_TLS_PHASE1)
|
||||
// Debug/Development OR Pool TLS: Validate magic byte to catch non-header allocations
|
||||
// Reason: Mid/Large allocations don't have headers, must detect and reject them
|
||||
uint8_t magic = header & 0xF0;
|
||||
#if HAKMEM_DEBUG_VERBOSE
|
||||
static int debug_count = 0;
|
||||
if (debug_count < 5) {
|
||||
fprintf(stderr, "[TINY_READ_HEADER] ptr=%p header=0x%02x magic=0x%02x expected=0x%02x\n",
|
||||
ptr, header, magic, HEADER_MAGIC);
|
||||
debug_count++;
|
||||
}
|
||||
#endif
|
||||
if (magic != HEADER_MAGIC) {
|
||||
// Invalid header - likely non-header allocation (Mid/Large)
|
||||
// Invalid header - likely non-header allocation (Mid/Large/Pool TLS)
|
||||
#if HAKMEM_DEBUG_VERBOSE
|
||||
if (debug_count < 6) { // One more after the 5 above
|
||||
fprintf(stderr, "[TINY_READ_HEADER] REJECTING ptr=%p (magic mismatch)\n", ptr);
|
||||
}
|
||||
#endif
|
||||
#if !HAKMEM_BUILD_RELEASE
|
||||
static int invalid_count = 0;
|
||||
if (invalid_count < 5) {
|
||||
fprintf(stderr, "[HEADER_INVALID] ptr=%p, header=%02x, magic=%02x (expected %02x)\n",
|
||||
ptr, header, magic, HEADER_MAGIC);
|
||||
invalid_count++;
|
||||
}
|
||||
#endif
|
||||
return -1;
|
||||
}
|
||||
#else
|
||||
// Release: Skip magic validation (save 2-3 cycles)
|
||||
// Release (without Pool TLS): Skip magic validation (save 2-3 cycles)
|
||||
// Safety: Bounds check below still prevents out-of-bounds array access
|
||||
// Trade-off: Mid/Large frees may corrupt TLS freelist (rare, ~0.1% of frees)
|
||||
// NOTE: This optimization is DISABLED when Pool TLS is enabled (different magic bytes!)
|
||||
#endif
|
||||
|
||||
int class_idx = (int)(header & HEADER_CLASS_MASK);
|
||||
|
||||
90
docs/BUILD_PHASE7_POOL_TLS.md
Normal file
90
docs/BUILD_PHASE7_POOL_TLS.md
Normal file
@ -0,0 +1,90 @@
|
||||
# HAKMEM Phase 7 + Pool TLS Phase 1.5b — Build & Run Cheatsheet
|
||||
|
||||
This document captures the stable build/run recipe used for recent benches.
|
||||
|
||||
## One‑liner Build (recommended)
|
||||
|
||||
```
|
||||
./build.sh <target>
|
||||
|
||||
# examples
|
||||
./build.sh bench_mid_large_mt_hakmem
|
||||
./build.sh bench_random_mixed_hakmem
|
||||
./build.sh larson_hakmem
|
||||
```
|
||||
|
||||
Enables at build time:
|
||||
- POOL_TLS_PHASE1=1 (Pool TLS Phase 1)
- POOL_TLS_PREWARM=1 (Pool TLS Phase 1.5b pre-warm)
|
||||
- HEADER_CLASSIDX=1 (Phase 7 header)
|
||||
- AGGRESSIVE_INLINE=1
|
||||
- PREWARM_TLS=1
|
||||
|
||||
Verify switches:
|
||||
```
|
||||
make print-flags
|
||||
```
|
||||
|
||||
Optional safety/verbosity toggles:
|
||||
- `HAKMEM_TINY_SAFE_FREE=1` — strict free validation (mincore on all frees). Slower but safest.
|
||||
- `HAKMEM_DEBUG_VERBOSE=1` — enable verbose logs for Tiny header/free, etc.
|
||||
|
||||
Examples:
|
||||
```
|
||||
make clean && make HAKMEM_TINY_SAFE_FREE=1 POOL_TLS_PHASE1=1 HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 bench_mid_large_mt_hakmem
|
||||
```
|
||||
|
||||
## Bench Recipes (used in reports)
|
||||
|
||||
Larson (Mixed)
|
||||
```
|
||||
./build.sh larson_hakmem
|
||||
make larson_system
|
||||
./larson_hakmem 2 8 128 1024 1 12345 1
|
||||
./larson_hakmem 2 8 128 1024 1 12345 4
|
||||
./larson_system 2 8 128 1024 1 12345 1
|
||||
./larson_system 2 8 128 1024 1 12345 4
|
||||
```
|
||||
|
||||
Pool TLS (8–52KB)
|
||||
```
|
||||
./build.sh bench_pool_tls_hakmem
|
||||
make bench_pool_tls_system
|
||||
./bench_pool_tls_hakmem 1 100000 256 42
|
||||
./bench_pool_tls_hakmem 4 50000 256 42
|
||||
./bench_pool_tls_system 1 100000 256 42
|
||||
./bench_pool_tls_system 4 50000 256 42
|
||||
```
|
||||
|
||||
Random Mixed (Tiny 128–1024B)
|
||||
```
|
||||
./build.sh bench_random_mixed_hakmem
|
||||
make bench_random_mixed_system
|
||||
for s in 128 256 512 1024; do \
|
||||
./bench_random_mixed_hakmem 100000 $s 42; \
|
||||
./bench_random_mixed_system 100000 $s 42; \
|
||||
done
|
||||
```
|
||||
|
||||
Mid‑Large MT (8–32KB)
|
||||
```
|
||||
./build.sh bench_mid_large_mt_hakmem
|
||||
make bench_mid_large_mt_system
|
||||
./bench_mid_large_mt_hakmem 1 100000 256 42
|
||||
./bench_mid_large_mt_hakmem 4 50000 256 42
|
||||
./bench_mid_large_mt_system 1 100000 256 42
|
||||
./bench_mid_large_mt_system 4 50000 256 42
|
||||
```
|
||||
|
||||
## Mimalloc note (when comparing)
|
||||
|
||||
Direct‑link mimalloc benches require runtime path:
|
||||
```
|
||||
export LD_LIBRARY_PATH=$PWD/mimalloc-bench/extern/mi/out/release
|
||||
```
|
||||
|
||||
## Build hygiene
|
||||
|
||||
- Always prefer `./build.sh` over ad‑hoc `make` (prevents flag drift)
|
||||
- Check switches: `make print-flags`
|
||||
- Verify freshness: `./verify_build.sh <binary>`
|
||||
|
||||
27
verify_build.sh
Executable file
27
verify_build.sh
Executable file
@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env bash
# verify_build.sh - Quick build correctness/uptodate check
#
# Usage: ./verify_build.sh <binary>
#
# Compares the binary's mtime against the newest *.c / *.h / *.inc / *.inc.h
# file under the known source directories.
#
# Exit status:
#   0  binary is at least as new as every source file found
#   1  usage error
#   2  binary not found
#   3  at least one source file is newer than the binary
set -euo pipefail

if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <binary>" >&2
  exit 1
fi

bin="$1"
if [[ ! -f "$bin" ]]; then
  echo "❌ Error: $bin not found" >&2
  exit 2
fi

bin_time=$(stat -c %Y "$bin")

# Only scan source directories that actually exist. A missing directory makes
# find exit non-zero, which under `set -e -o pipefail` would abort this script
# silently before any verdict is printed.
src_dirs=()
for d in core include adapters engines benchmarks/src; do
  if [[ -d "$d" ]]; then
    src_dirs+=("$d")
  fi
done

# Single scan: awk keeps the line with the largest mtime. Unlike
# `sort | head -n1`, awk consumes all of its input, so no stage of the
# pipeline can be killed by SIGPIPE on large trees.
# Output format: "<epoch-seconds.fraction> <path>".
newest_line=""
if [[ ${#src_dirs[@]} -gt 0 ]]; then
  newest_line=$(find "${src_dirs[@]}" -type f \
      \( -name '*.c' -o -name '*.h' -o -name '*.inc' -o -name '*.inc.h' \) \
      -printf '%T@ %p\n' 2>/dev/null \
    | awk '$1 > max { max = $1; line = $0 } END { if (line != "") print line }' \
    || true)
fi

# Check if any sources are newer than the binary
src_time="${newest_line%%.*}"   # integer seconds of the newest source (empty if none)
newest="${newest_line#* }"      # path of the newest source

if [[ -n "${src_time}" && "${src_time}" -gt "${bin_time}" ]]; then
  echo "⚠️ Warning: Sources newer than binary: $newest" >&2
  exit 3
fi

echo "✅ Build verification passed for $bin"
|
||||
|
||||
Reference in New Issue
Block a user