Phase 24-26: Hot path atomic telemetry prune (+2.00% cumulative)
Summary:
- Phase 24 (alloc stats): +0.93% GO
- Phase 25 (free stats): +1.07% GO
- Phase 26 (diagnostics): -0.33% NEUTRAL (code cleanliness)
- Total: 11 atomics compiled out, +2.00% improvement

Phase 24: OBSERVE tax prune (tiny_class_stats_box.h)
- Added HAKMEM_TINY_CLASS_STATS_COMPILED (default: 0)
- Wrapped 5 stats functions: uc_miss, warm_hit, shared_lock, tls_carve_*
- Result: +0.93% (baseline 56.675M vs compiled-in 56.151M ops/s)

Phase 25: Tiny free stats prune (tiny_superslab_free.inc.h)
- Added HAKMEM_TINY_FREE_STATS_COMPILED (default: 0)
- Wrapped the g_free_ss_enter atomic in the free hot path
- Result: +1.07% (baseline 57.017M vs compiled-in 56.415M ops/s)

Phase 26: Hot path diagnostic atomics prune
- Added 5 compile gates for low-frequency error counters:
  - HAKMEM_TINY_C7_FREE_COUNT_COMPILED
  - HAKMEM_TINY_HDR_MISMATCH_LOG_COMPILED
  - HAKMEM_TINY_HDR_META_MISMATCH_COMPILED
  - HAKMEM_TINY_METRIC_BAD_CLASS_COMPILED
  - HAKMEM_TINY_HDR_META_FAST_COMPILED
- Result: -0.33% NEUTRAL (within noise; gates kept for code cleanliness)

Alignment with mimalloc principles:
- "No atomics on hot path": telemetry moved to compile-time opt-in
- Fixed per-op tax eliminated
- Production builds: maximum performance (atomics compiled out)
- Research builds: full diagnostics (COMPILED=1)

Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
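All three phases use the same mechanism: each telemetry counter sits behind a #if gate that defaults to 0, so production builds never emit the atomic read-modify-write at all. Below is a minimal sketch of that gate using the Phase 25 names from this message (HAKMEM_TINY_FREE_STATS_COMPILED, g_free_ss_enter); the TINY_FREE_STAT_ENTER wrapper macro and its exact shape are assumptions for illustration, not the repository's actual code.

#include <stdatomic.h>

#ifndef HAKMEM_TINY_FREE_STATS_COMPILED
#define HAKMEM_TINY_FREE_STATS_COMPILED 0  /* default: telemetry compiled out */
#endif

#if HAKMEM_TINY_FREE_STATS_COMPILED
/* Research builds: count every entry into the superslab free path. */
static _Atomic unsigned long g_free_ss_enter;
#define TINY_FREE_STAT_ENTER() \
    atomic_fetch_add_explicit(&g_free_ss_enter, 1, memory_order_relaxed)
#else
/* Production builds: the counter and its atomic RMW disappear entirely,
   leaving zero per-op telemetry tax on the free hot path.
   (TINY_FREE_STAT_ENTER is a hypothetical wrapper name.) */
#define TINY_FREE_STAT_ENTER() ((void)0)
#endif

A research build would presumably opt back in with something like -DHAKMEM_TINY_FREE_STATS_COMPILED=1 on the compiler command line; the exact build integration is assumed here.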
@@ -108,15 +108,17 @@
 //
 __attribute__((always_inline))
 static inline void* tiny_hot_alloc_fast(int class_idx) {
-    // Phase 15 v1: Mode check at entry (once per call, not scattered in hot path)
-    int lifo_mode = tiny_unified_lifo_enabled();
 
     extern __thread TinyUnifiedCache g_unified_cache[];
 
     // TLS cache access (1 cache miss)
     // NOTE: Range check removed - caller (hak_tiny_size_to_class) guarantees valid class_idx
     TinyUnifiedCache* cache = &g_unified_cache[class_idx];
 
 #if HAKMEM_TINY_UNIFIED_LIFO_COMPILED
+    // Phase 15 v1: Mode check at entry (once per call, not scattered in hot path)
+    // Phase 22: Compile-out when disabled (default OFF)
+    int lifo_mode = tiny_unified_lifo_enabled();
+
     // Phase 15 v1: LIFO vs FIFO mode switch
     if (lifo_mode) {
         // === LIFO MODE: Stack-based (LIFO) ===
@@ -134,8 +136,9 @@ static inline void* tiny_hot_alloc_fast(int class_idx) {
         TINY_HOT_METRICS_MISS(class_idx);
         return NULL;
     }
 #endif
 
-    // === FIFO MODE: Ring-based (existing) ===
+    // === FIFO MODE: Ring-based (existing, default) ===
     // Branch 1: Cache empty check (LIKELY hit)
     // Hot path: cache has objects (head != tail)
     // Cold path: cache empty (head == tail) → refill needed
@@ -187,15 +190,17 @@ static inline void* tiny_hot_alloc_fast(int class_idx) {
 //
 __attribute__((always_inline))
 static inline int tiny_hot_free_fast(int class_idx, void* base) {
-    // Phase 15 v1: Mode check at entry (once per call, not scattered in hot path)
-    int lifo_mode = tiny_unified_lifo_enabled();
 
     extern __thread TinyUnifiedCache g_unified_cache[];
 
     // TLS cache access (1 cache miss)
     // NOTE: Range check removed - caller guarantees valid class_idx
     TinyUnifiedCache* cache = &g_unified_cache[class_idx];
 
 #if HAKMEM_TINY_UNIFIED_LIFO_COMPILED
+    // Phase 15 v1: Mode check at entry (once per call, not scattered in hot path)
+    // Phase 22: Compile-out when disabled (default OFF)
+    int lifo_mode = tiny_unified_lifo_enabled();
+
     // Phase 15 v1: LIFO vs FIFO mode switch
     if (lifo_mode) {
         // === LIFO MODE: Stack-based (LIFO) ===
@@ -214,8 +219,9 @@ static inline int tiny_hot_free_fast(int class_idx, void* base) {
 #endif
             return 0; // FULL
         }
 #endif
 
-    // === FIFO MODE: Ring-based (existing) ===
+    // === FIFO MODE: Ring-based (existing, default) ===
     // Calculate next tail (for full check)
     uint16_t next_tail = (cache->tail + 1) & cache->mask;
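For reference, this is roughly the shape the gated fast path above reduces to. The struct layout and names below are simplified assumptions for illustration (the real TinyUnifiedCache has more fields, and the slots array name is invented); only head/tail/mask and the #if gate come from the diff.

#include <stdint.h>

typedef struct {              /* hypothetical, simplified layout */
    uint16_t head, tail, mask;
    void*    slots[64];
} TinyUnifiedCacheSketch;

static inline void* tiny_hot_alloc_fast_sketch(TinyUnifiedCacheSketch* cache) {
#if HAKMEM_TINY_UNIFIED_LIFO_COMPILED
    /* Only research builds pay for the runtime mode check. */
    if (tiny_unified_lifo_enabled()) {
        /* ... LIFO stack pop, elided ... */
    }
#endif
    /* Default (COMPILED=0): the FIFO ring is the only code the compiler
       ever sees on this path -- no call, no branch, no atomic. */
    if (cache->head == cache->tail)
        return NULL;                     /* empty: refill needed (cold path) */
    void* p = cache->slots[cache->head];
    cache->head = (cache->head + 1) & cache->mask;
    return p;
}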