Two-Speed Optimization Part 2: Remove atomic trace counters from hot path
Performance improvements: - lock incl instructions completely removed from malloc/free hot paths - Cache misses reduced from 24.4% → 13.4% of cycles - Throughput: 85M → 89.12M ops/sec (+4.8% improvement) - Cycles/op: 48.8 → 48.25 (-1.1%) Changes in core/box/hak_wrappers.inc.h: - malloc: Guard g_wrap_malloc_trace_count atomic with #if !HAKMEM_BUILD_RELEASE - free: Guard g_wrap_free_trace_count and g_free_wrapper_calls with same guard Debug builds retain full instrumentation via HAK_TRACE. Release builds execute completely clean hot paths without atomic operations. Verified via: - perf report: lock incl instructions gone - perf stat: cycles/op reduced, cache miss % improved - objdump: 0 lock instructions in hot paths Next: Inline unified_cache_refill for additional 3-4 cycles/op improvement 🤖 Generated with Claude Code Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@ -89,10 +89,14 @@ void* malloc(size_t size) {
|
|||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
uint64_t count = atomic_fetch_add(&malloc_count, 1);
|
uint64_t count = atomic_fetch_add(&malloc_count, 1);
|
||||||
#endif
|
#endif
|
||||||
|
#if !HAKMEM_BUILD_RELEASE
|
||||||
|
// Debug-only trace counter: in release builds this atomic increment
|
||||||
|
// is disabled to avoid hot-path cache misses and contention.
|
||||||
static _Atomic int g_wrap_malloc_trace_count = 0;
|
static _Atomic int g_wrap_malloc_trace_count = 0;
|
||||||
if (atomic_fetch_add_explicit(&g_wrap_malloc_trace_count, 1, memory_order_relaxed) < 256) {
|
if (atomic_fetch_add_explicit(&g_wrap_malloc_trace_count, 1, memory_order_relaxed) < 256) {
|
||||||
HAK_TRACE("[wrap_malloc_enter]\n");
|
HAK_TRACE("[wrap_malloc_enter]\n");
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
// NDEBUG: malloc_count increment disabled - removes 27.55% bottleneck
|
// NDEBUG: malloc_count increment disabled - removes 27.55% bottleneck
|
||||||
|
|
||||||
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
||||||
@ -226,11 +230,15 @@ void* malloc(size_t size) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void free(void* ptr) {
|
void free(void* ptr) {
|
||||||
|
#if !HAKMEM_BUILD_RELEASE
|
||||||
|
// Debug-only trace counters; disabled in release to keep free() hot path
|
||||||
|
// free of atomic increments.
|
||||||
static _Atomic int g_wrap_free_trace_count = 0;
|
static _Atomic int g_wrap_free_trace_count = 0;
|
||||||
if (atomic_fetch_add_explicit(&g_wrap_free_trace_count, 1, memory_order_relaxed) < 256) {
|
if (atomic_fetch_add_explicit(&g_wrap_free_trace_count, 1, memory_order_relaxed) < 256) {
|
||||||
HAK_TRACE("[wrap_free_enter]\n");
|
HAK_TRACE("[wrap_free_enter]\n");
|
||||||
}
|
}
|
||||||
atomic_fetch_add_explicit(&g_free_wrapper_calls, 1, memory_order_relaxed);
|
atomic_fetch_add_explicit(&g_free_wrapper_calls, 1, memory_order_relaxed);
|
||||||
|
#endif
|
||||||
if (!ptr) return;
|
if (!ptr) return;
|
||||||
|
|
||||||
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
||||||
|
|||||||
Reference in New Issue
Block a user