## Summary

Phase 18 v2 attempted instruction-count reduction via conditional compilation:

- Stats collection → no-op
- ENV checks → constant propagation
- Binary size: 653K → 649K (-4K, -0.6%)

Result: NEUTRAL (below GO threshold)

- Throughput: +2.32% (target: +5% minimum) ❌
- Instructions: -5.06% (target: -15% minimum) ❌
- Cycles: -3.26% (positive signal)
- Branches: -8.67% (positive signal)
- Cache-misses: +30% (unexpected, likely layout)

## Analysis

Positive signals:

- Implementation correct (branches -8.67%, instructions -5.06%)
- Binary size reduced (-4K)
- Modest throughput gain (+2.32%)
- Cycle and branch overhead reduced

Negative signals:

- Instruction reduction insufficient (-5.06%, far short of the -15% "smoking gun" threshold)
- Throughput gain below the +5% minimum
- Cache-misses increased (+30%, likely layout noise)

## Verdict

Freeze Phase 18 v2 (weak positive, insufficient for production).

Per user guidance: "If instructions don't drop clearly, continuation value is thin." A -5.06% instruction reduction is marginal; the allocator micro-optimization plateau is confirmed.

## Key Insight

Phase 17 showed:

- IPC = 2.30 (consistent, memory-bound)
- I-cache gap: 55% (Phase 17: 153K → 68K)
- Instruction gap: 48% (Phase 17: 41.3B → 21.5B)

Phase 18 v1/v2 results confirm:

- Layout tweaks are fragile (v1: I-cache +91%)
- Instruction removal yields only a modest benefit (v2: -5.06%)
- The allocator is NOT the bottleneck (IPC constant, memory-limited)

## Recommendation

Do NOT continue Phase 18 micro-optimizations. The next frontier requires a different approach:

1. Architectural redesign (SIMD, lock-free, batching)
2. Memory-layout optimization (cache-friendly structures)
3. Broader profiling (not allocator-focused)

Or: accept that 48M → 85M (75% gap) is achievable with the current architecture.

Files:

- docs/analysis/PHASE18_HOT_TEXT_ISOLATION_2_AB_TEST_RESULTS.md (results)
- CURRENT_TASK.md (Phase 18 complete status)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
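A minimal sketch of the conditional-compilation pattern the summary describes (`STAT_INC` and `ENV_ENABLED` are hypothetical names, not the actual HAKMEM symbols): with `HAKMEM_BENCH_MINIMAL=1`, stat increments compile to nothing and the ENV check collapses to a constant the compiler can propagate, deleting the branches it guards.

```c
/* Sketch only; macro names are illustrative, not the real HAKMEM API. */
#include <stdlib.h>

#if HAKMEM_BENCH_MINIMAL
  /* Stats collection becomes a no-op. */
  #define STAT_INC(counter) ((void)0)
  /* ENV check becomes the constant 0, so the compiler can
     constant-propagate it and eliminate any code it guards. */
  #define ENV_ENABLED(name) 0
#else
  #define STAT_INC(counter) ((counter)++)
  #define ENV_ENABLED(name) (getenv(name) != NULL)
#endif
```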
#ifndef HAK_FRONT_FASTLANE_STATS_BOX_H
#define HAK_FRONT_FASTLANE_STATS_BOX_H

// ============================================================================
// Phase 6: Front FastLane - Stats Box
// ============================================================================
//
// Purpose: Visibility into FastLane hit/fallback rates
//
// Counters (compile-out when HAKMEM_DEBUG_COUNTERS=0):
//
// Malloc:
// - malloc_total: Total try_malloc attempts
// - malloc_hit: Successful FastLane alloc
// - malloc_fallback_*: Fallback reasons (3-6 types)
//
// Free:
// - free_total: Total try_free attempts
// - free_hit: Successful FastLane free
// - free_fallback_*: Fallback reasons (3-6 types)
//
// Output (on exit, if HAKMEM_DEBUG_COUNTERS=1):
// [FRONT_FASTLANE] malloc_total=N hit=N fb_*=N ... free_total=N hit=N fb_*=N ...
//
// Box Theory:
// - L2: Stats layer (compile-out when counters disabled)
// - Zero overhead: No-op macros when HAKMEM_DEBUG_COUNTERS=0
//
// ============================================================================

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#if HAKMEM_DEBUG_COUNTERS

// Stats structure (global, thread-safe via atomics)
typedef struct {
    // Malloc stats
    _Atomic uint64_t malloc_total;          // Total try_malloc calls
    _Atomic uint64_t malloc_hit;            // Successful FastLane alloc
    _Atomic uint64_t malloc_fallback_stub;  // Stub: not implemented yet (Patch 2)
    _Atomic uint64_t malloc_fallback_size;  // Size out of Tiny range
    _Atomic uint64_t malloc_fallback_class; // Class calculation failed
    _Atomic uint64_t malloc_fallback_alloc; // Allocation failed (refill needed)
    _Atomic uint64_t malloc_fallback_other; // Other reasons

    // Free stats
    _Atomic uint64_t free_total;            // Total try_free calls
    _Atomic uint64_t free_hit;              // Successful FastLane free
    _Atomic uint64_t free_fallback_stub;    // Stub: not implemented yet (Patch 2)
    _Atomic uint64_t free_fallback_aligned; // Page-aligned pointer
    _Atomic uint64_t free_fallback_header;  // Invalid header magic
    _Atomic uint64_t free_fallback_class;   // Class out of bounds
    _Atomic uint64_t free_fallback_failure; // Free failed (cold path needed)
    _Atomic uint64_t free_fallback_other;   // Other reasons
} FrontFastLaneStats;

// Global stats instance
static FrontFastLaneStats g_front_fastlane_stats = {0};

// Increment macros (relaxed ordering - stats only)
// Phase 18 v2: BENCH_MINIMAL conditional (no-op when HAKMEM_BENCH_MINIMAL=1)
#if HAKMEM_BENCH_MINIMAL
#define FRONT_FASTLANE_STAT_INC(field) do { (void)0; } while(0)
#else
#define FRONT_FASTLANE_STAT_INC(field) \
    atomic_fetch_add_explicit(&g_front_fastlane_stats.field, 1, memory_order_relaxed)
#endif
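
// Usage sketch (hypothetical call site, not the actual FastLane code):
// counters are bumped on the hot path, and under HAKMEM_BENCH_MINIMAL=1
// every FRONT_FASTLANE_STAT_INC below compiles away to nothing.
//
//   void* hak_front_try_malloc(size_t size) {
//       FRONT_FASTLANE_STAT_INC(malloc_total);
//       if (size > TINY_MAX_SIZE) {                 // hypothetical limit
//           FRONT_FASTLANE_STAT_INC(malloc_fallback_size);
//           return NULL;                            // caller takes slow path
//       }
//       void* p = /* ... FastLane allocation ... */;
//       if (p) FRONT_FASTLANE_STAT_INC(malloc_hit);
//       else   FRONT_FASTLANE_STAT_INC(malloc_fallback_alloc);
//       return p;
//   }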

// Dump stats on exit (call from wrapper destructor or main)
static void front_fastlane_stats_dump(void) {
    uint64_t m_total = atomic_load_explicit(&g_front_fastlane_stats.malloc_total, memory_order_relaxed);
    uint64_t f_total = atomic_load_explicit(&g_front_fastlane_stats.free_total, memory_order_relaxed);

    if (m_total == 0 && f_total == 0) return; // No activity

    // Malloc stats
    uint64_t m_hit      = atomic_load_explicit(&g_front_fastlane_stats.malloc_hit, memory_order_relaxed);
    uint64_t m_fb_stub  = atomic_load_explicit(&g_front_fastlane_stats.malloc_fallback_stub, memory_order_relaxed);
    uint64_t m_fb_size  = atomic_load_explicit(&g_front_fastlane_stats.malloc_fallback_size, memory_order_relaxed);
    uint64_t m_fb_class = atomic_load_explicit(&g_front_fastlane_stats.malloc_fallback_class, memory_order_relaxed);
    uint64_t m_fb_alloc = atomic_load_explicit(&g_front_fastlane_stats.malloc_fallback_alloc, memory_order_relaxed);
    uint64_t m_fb_other = atomic_load_explicit(&g_front_fastlane_stats.malloc_fallback_other, memory_order_relaxed);

    // Free stats
    uint64_t f_hit        = atomic_load_explicit(&g_front_fastlane_stats.free_hit, memory_order_relaxed);
    uint64_t f_fb_stub    = atomic_load_explicit(&g_front_fastlane_stats.free_fallback_stub, memory_order_relaxed);
    uint64_t f_fb_aligned = atomic_load_explicit(&g_front_fastlane_stats.free_fallback_aligned, memory_order_relaxed);
    uint64_t f_fb_header  = atomic_load_explicit(&g_front_fastlane_stats.free_fallback_header, memory_order_relaxed);
    uint64_t f_fb_class   = atomic_load_explicit(&g_front_fastlane_stats.free_fallback_class, memory_order_relaxed);
    uint64_t f_fb_failure = atomic_load_explicit(&g_front_fastlane_stats.free_fallback_failure, memory_order_relaxed);
    uint64_t f_fb_other   = atomic_load_explicit(&g_front_fastlane_stats.free_fallback_other, memory_order_relaxed);

    fprintf(stderr, "[FRONT_FASTLANE] malloc_total=%lu hit=%lu fb_stub=%lu fb_size=%lu fb_class=%lu fb_alloc=%lu fb_other=%lu | "
            "free_total=%lu hit=%lu fb_stub=%lu fb_aligned=%lu fb_header=%lu fb_class=%lu fb_failure=%lu fb_other=%lu\n",
            (unsigned long)m_total, (unsigned long)m_hit,
            (unsigned long)m_fb_stub, (unsigned long)m_fb_size, (unsigned long)m_fb_class,
            (unsigned long)m_fb_alloc, (unsigned long)m_fb_other,
            (unsigned long)f_total, (unsigned long)f_hit,
            (unsigned long)f_fb_stub, (unsigned long)f_fb_aligned, (unsigned long)f_fb_header,
            (unsigned long)f_fb_class, (unsigned long)f_fb_failure, (unsigned long)f_fb_other);
}
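
// Hook sketch (assumes the GCC/Clang destructor attribute; the real wrapper
// may register the dump differently, e.g. atexit(front_fastlane_stats_dump)):
//
//   __attribute__((destructor))
//   static void front_fastlane_stats_dump_at_exit(void) {
//       front_fastlane_stats_dump();
//   }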

#else // HAKMEM_DEBUG_COUNTERS == 0

// No-op macros (zero overhead)
#define FRONT_FASTLANE_STAT_INC(field) do {} while(0)

static inline void front_fastlane_stats_dump(void) {}

#endif // HAKMEM_DEBUG_COUNTERS

#endif // HAK_FRONT_FASTLANE_STATS_BOX_H