Phase 35-39: FAST build optimization complete (+7.13% cumulative)

Phase 35-A: BENCH_MINIMAL gate function elimination (GO +4.39%)
- tiny_front_v3_enabled() → constant true
- tiny_metadata_cache_enabled() → constant 0
- learner_v7_enabled() → constant false
- small_learner_v2_enabled() → constant false

Phase 36: Policy snapshot init-once (GO +0.71%)
- small_policy_v7_snapshot() version check skip in BENCH_MINIMAL
- TLS cache for policy snapshot

Phase 37: Standard TLS cache (NO-GO -0.07%)
- TLS cache for Standard build attempted
- Runtime gate overhead negates benefit

Phase 38: FAST/OBSERVE/Standard workflow established
- make perf_fast, make perf_observe targets
- Scorecard and documentation updates

Phase 39: Hot path gate constantization (GO +1.98%)
- front_gate_unified_enabled() → constant 1
- alloc_dualhot_enabled() → constant 0
- g_bench_fast_front, g_v3_enabled blocks → compile-out
- free_dispatch_stats_enabled() → constant false

Results:
- FAST v3: 56.04M ops/s (47.4% of mimalloc)
- Standard: 53.50M ops/s (45.3% of mimalloc)
- M1 target (50% of mimalloc): FAST v3 needs ~5.5% more throughput (2.6 percentage points remaining)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-16 15:01:56 +09:00
parent 506e724c3b
commit b7085c47e1
22 changed files with 1550 additions and 397 deletions

View File

@ -1,6 +1,7 @@
#ifndef HAKMEM_FREE_DISPATCH_STATS_BOX_H
#define HAKMEM_FREE_DISPATCH_STATS_BOX_H
#include "../hakmem_build_flags.h" // Phase 39: HAKMEM_BENCH_MINIMAL (GO +1.98%)
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
@ -26,13 +27,18 @@ typedef struct FreeDispatchStats {
} FreeDispatchStats;
// ENV gate for free-dispatch statistics.
// Phase 39: under HAKMEM_BENCH_MINIMAL the gate is a fixed false (GO +1.98%).
// Otherwise HAKMEM_FREE_DISPATCH_STATS is read on the first call and the
// result is cached in a function-local static: unset, empty, or a value whose
// first character is '0' means OFF; anything else means ON.
static inline bool free_dispatch_stats_enabled(void) {
#if HAKMEM_BENCH_MINIMAL
    return false;  // FAST v3: constant (stats OFF)
#else
    static int s_state = -1;  // -1 = unresolved, 0 = OFF, 1 = ON
    if (__builtin_expect(s_state == -1, 0)) {
        const char* env = getenv("HAKMEM_FREE_DISPATCH_STATS");
        if (env != NULL && env[0] != '\0' && env[0] != '0') {
            s_state = 1;
        } else {
            s_state = 0;
        }
    }
    return s_state;
#endif
}
// Global stats instance

View File

@ -3,6 +3,7 @@
#ifndef HAK_FREE_API_INC_H
#define HAK_FREE_API_INC_H
#include "../hakmem_build_flags.h" // Phase 39: HAKMEM_BENCH_MINIMAL (GO +1.98%)
#include <sys/mman.h> // For mincore() in AllocHeader safety check
#include "hakmem_tiny_superslab.h" // For SUPERSLAB_MAGIC, SuperSlab
#include "../ptr_trace.h" // Debug: pointer trace immediate dump on libc fallback
@ -112,6 +113,8 @@ void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
#endif
// Bench-only ultra-short path: try header-based tiny fast free first
// Enable with: HAKMEM_BENCH_FAST_FRONT=1
// Phase 39: BENCH_MINIMAL → compile-out (GO +1.98%)
#if !HAKMEM_BENCH_MINIMAL
{
static int g_bench_fast_front = -1;
if (__builtin_expect(g_bench_fast_front == -1, 0)) {
@ -129,6 +132,7 @@ void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
}
#endif
}
#endif
if (!ptr) {
#if HAKMEM_DEBUG_TIMING
@ -168,7 +172,8 @@ void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
case FG_DOMAIN_TINY: {
// Phase FREE-FRONT-V3-2: v3 snapshot routing (optional, default OFF)
// Optimized: No tiny_route_for_class() calls, no redundant ENV checks
#if HAKMEM_TINY_HEADER_CLASSIDX
// Phase 39: BENCH_MINIMAL → compile-out (GO +1.98%)
#if HAKMEM_TINY_HEADER_CLASSIDX && !HAKMEM_BENCH_MINIMAL
{
// Check if v3 snapshot routing is enabled (cached)
static int g_v3_enabled = -1;

View File

@ -0,0 +1,15 @@
// small_policy_snapshot_tls_box.c - Phase 37: Lightweight TLS cache implementation
#include <stdlib.h> // for NULL
#include "small_policy_snapshot_tls_box.h"
// Thread-local storage for the policy snapshot cache (one record per thread).
// Every field starts at its zero value: no cached pointer, version 0, and
// initialized == 0 (not yet populated).
static __thread SmallPolicySnapshotTLSCache g_policy_snapshot_tls_cache = {
    .initialized = 0,
    .cached_version = 0,
    .cached_ptr = NULL
};

// Return the address of the calling thread's cache record.
SmallPolicySnapshotTLSCache* small_policy_snapshot_tls_get(void) {
    SmallPolicySnapshotTLSCache* const self = &g_policy_snapshot_tls_cache;
    return self;
}

View File

@ -0,0 +1,81 @@
// small_policy_snapshot_tls_box.h - Phase 37: Lightweight TLS cache for policy snapshot
//
// Purpose:
// - Reduce fixed tax from global version read in small_policy_v7_snapshot()
// - Fast path: return cached pointer without global memory access
// - Slow path: refresh only when global version changes
//
// Box Theory:
// - Single Responsibility: TLS caching for policy snapshot
// - Reversible: ENV gate HAKMEM_POLICY_SNAPSHOT_TLS (default OFF; set to 1 to enable)
// - Clear Boundary: Only affects small_policy_v7_snapshot() internal
#ifndef SMALL_POLICY_SNAPSHOT_TLS_BOX_H
#define SMALL_POLICY_SNAPSHOT_TLS_BOX_H
#include "../hakmem_build_flags.h"
#include <stdlib.h> // for getenv
#include <stdint.h>
#include <stdbool.h>
// Forward declaration: this header only handles the policy through a pointer.
struct SmallPolicyV7;

// State record for the TLS policy-snapshot cache.
struct SmallPolicySnapshotTLSCache {
    const struct SmallPolicyV7* cached_ptr;  // Cached policy pointer
    uint32_t cached_version;                 // Last seen global version
    int initialized;                         // 0 = not init, 1 = initialized
};
typedef struct SmallPolicySnapshotTLSCache SmallPolicySnapshotTLSCache;
// ENV gate: default OFF (Phase 37 NO-GO: TLS cache has no benefit).
// Set HAKMEM_POLICY_SNAPSHOT_TLS=1 to enable (research only).
// Returns 1 when the TLS cache path is enabled, 0 otherwise.
static inline int policy_snapshot_tls_enabled(void) {
#if HAKMEM_BENCH_MINIMAL
    // BENCH_MINIMAL: always disabled; the simpler Phase 36 path (which skips
    // the version check entirely) is used instead.
    return 0;
#else
    static int s_gate = -1;  // -1 = unresolved, 0 = OFF, 1 = ON
    if (__builtin_expect(s_gate == -1, 0)) {
        const char* env = getenv("HAKMEM_POLICY_SNAPSHOT_TLS");
        // Phase 37 NO-GO: only a value whose first character is '1' turns the
        // cache on; unset or anything else leaves it OFF.
        s_gate = (env != NULL && env[0] == '1') ? 1 : 0;
    }
    return s_gate;
#endif
}
// Get TLS cache (thread-local singleton).
// Returns a pointer to the cache record; the function is only declared here
// and is implemented in small_policy_snapshot_tls_box.c.
SmallPolicySnapshotTLSCache* small_policy_snapshot_tls_get(void);
// Decide whether the TLS cache can be used as-is.
//   cache:          record to inspect (dereferenced unconditionally)
//   global_version: version value to compare the cached one against
// Returns: 1 if the record is initialized and its version equals
//          global_version (cached_ptr may be returned), 0 if a refresh is needed.
static inline int small_policy_snapshot_tls_check(
    SmallPolicySnapshotTLSCache* cache,
    uint32_t global_version
) {
    const int hit = cache->initialized && (cache->cached_version == global_version);
    // Branch hint: a cache hit is the expected outcome.
    return __builtin_expect(hit, 1) ? 1 : 0;
}
// Store a freshly refreshed snapshot in the TLS cache.
//   cache:   record to overwrite
//   ptr:     policy pointer to remember
//   version: version value to remember alongside ptr
// The record is marked initialized afterwards.
static inline void small_policy_snapshot_tls_update(
    SmallPolicySnapshotTLSCache* cache,
    const struct SmallPolicyV7* ptr,
    uint32_t version
) {
    // All three fields are rewritten in a single aggregate assignment.
    *cache = (SmallPolicySnapshotTLSCache){
        .cached_ptr = ptr,
        .cached_version = version,
        .initialized = 1
    };
}
#endif // SMALL_POLICY_SNAPSHOT_TLS_BOX_H

View File

@ -92,6 +92,8 @@ static inline int tiny_alloc_gate_validate(TinyAllocGateContext* ctx)
return 0;
}
if (ctx->class_idx >= 0 && (uint8_t)ctx->class_idx != meta_cls) {
// Phase 34B: Compile-out alloc gate class mismatch counter (default OFF)
#if HAKMEM_ALLOC_GATE_CLS_MIS_COMPILED
static _Atomic uint32_t g_alloc_gate_cls_mis = 0;
uint32_t n = atomic_fetch_add_explicit(&g_alloc_gate_cls_mis, 1, memory_order_relaxed);
if (n < 8) {
@ -105,6 +107,9 @@ static inline int tiny_alloc_gate_validate(TinyAllocGateContext* ctx)
info.slab_idx);
fflush(stderr);
}
#else
(void)0; // No-op when compiled out
#endif
// クラス不一致自体は Fail-Fast せず、ログだけ残す(将来の Guard 差し込みポイント)。
}

View File

@ -1,6 +1,7 @@
// tiny_front_v3_env_box.h - Tiny Front v3 ENV gate & snapshot (guard/UC/header)
#pragma once
#include "../hakmem_build_flags.h" // Phase 35-A: HAKMEM_BENCH_MINIMAL
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
@ -28,6 +29,12 @@ extern TinyFrontV3Snapshot g_tiny_front_v3_snapshot;
extern int g_tiny_front_v3_snapshot_ready;
// ENV gate: default ON (set HAKMEM_TINY_FRONT_V3_ENABLED=0 to disable)
// Phase 35-A: BENCH_MINIMAL mode - compile-time constant (default ON)
#if HAKMEM_BENCH_MINIMAL
static inline bool tiny_front_v3_enabled(void) {
return true; // Fixed ON in bench mode (default behavior)
}
#else
static inline bool tiny_front_v3_enabled(void) {
static int g_enable = -1;
if (__builtin_expect(g_enable == -1, 0)) {
@ -40,6 +47,7 @@ static inline bool tiny_front_v3_enabled(void) {
}
return g_enable != 0;
}
#endif
// Optional: size→class LUT gate (default ON, set HAKMEM_TINY_FRONT_V3_LUT_ENABLED=0 to disable)
static inline bool tiny_front_v3_lut_enabled(void) {

View File

@ -18,6 +18,12 @@
// Forward declare the learner enabled check (to avoid header conflicts)
extern bool small_learner_v2_enabled(void);
// Phase 35-A: BENCH_MINIMAL mode - compile-time constant (default OFF)
#if HAKMEM_BENCH_MINIMAL
static inline int tiny_metadata_cache_enabled(void) {
return 0; // Fixed OFF in bench mode (default behavior)
}
#else
static inline int tiny_metadata_cache_enabled(void) {
static int g = -1;
static int g_probe_left = 64; // tolerate early getenv() instability (bench_profile putenv)
@ -54,5 +60,6 @@ static inline int tiny_metadata_cache_enabled(void) {
g = 0;
return 0;
}
#endif
#endif // HAK_TINY_METADATA_CACHE_ENV_BOX_H

View File

@ -87,8 +87,12 @@ static inline uint32_t tiny_self_u32_local(void) {
// ENV Control (cached, lazy init)
// ============================================================================
// Enable flag (default: 0, OFF)
// Enable flag (default: ON)
// Phase 39: BENCH_MINIMAL → 固定 1 (lazy-init 削除) — GO +1.98%
static inline int front_gate_unified_enabled(void) {
#if HAKMEM_BENCH_MINIMAL
return 1; // FAST v3: 定数化
#else
static int g_enable = -1;
if (__builtin_expect(g_enable == -1, 0)) {
const char* e = getenv("HAKMEM_FRONT_GATE_UNIFIED");
@ -101,6 +105,7 @@ static inline int front_gate_unified_enabled(void) {
#endif
}
return g_enable;
#endif
}
// ============================================================================
@ -140,7 +145,11 @@ static inline int front_gate_unified_enabled(void) {
//
// Phase ALLOC-TINY-FAST-DUALHOT-2: Probe window ENV gate (safe from early putenv)
// Phase 39: BENCH_MINIMAL → 固定 0 (lazy-init 削除) — GO +1.98%
static inline int alloc_dualhot_enabled(void) {
#if HAKMEM_BENCH_MINIMAL
return 0; // FAST v3: 定数化 (default OFF)
#else
static int g = -1;
static int g_probe_left = 64; // Probe window: tolerate early putenv before gate init
if (__builtin_expect(g == -1, 0)) {
@ -158,6 +167,7 @@ static inline int alloc_dualhot_enabled(void) {
}
}
return g;
#endif
}
// Phase 2 B3: tiny_alloc_route_cold() - Handle rare routes (V7, MID, ULTRA)

View File

@ -26,6 +26,17 @@
# endif
#endif
// ------------------------------------------------------------
// Phase 35-A: Benchmark Minimal Mode
// ------------------------------------------------------------
// HAKMEM_BENCH_MINIMAL removes gate-function overhead in benchmark binaries.
//   =1 : gate functions return compile-time constants (no lazy init check)
//   =0 : normal runtime gate behavior (default)
// Usage: build with -DHAKMEM_BENCH_MINIMAL=1 for benchmark-only binaries.
#if !defined(HAKMEM_BENCH_MINIMAL)
#  define HAKMEM_BENCH_MINIMAL 0
#endif
// ------------------------------------------------------------
// Instrumentation & counters (compile-time)
// ------------------------------------------------------------
@ -372,6 +383,35 @@
# define HAKMEM_TINY_FREE_TRACE_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 32: Tiny Free Calls Atomic Prune (Compile-out diagnostic counter)
// ------------------------------------------------------------
// Compile gate for the tiny free call counter.
//   =0 : counter is compiled out (default)
//   =1 : counter is compiled in, for research builds that need free path call counting
// Target: g_hak_tiny_free_calls atomic in core/hakmem_tiny_free.inc:335
// Impact: HOT path atomic (every free operation, unconditional)
// Expected improvement: +0.3% to +0.7% (diagnostic counter, less critical than Phase 25)
#if !defined(HAKMEM_TINY_FREE_CALLS_COMPILED)
#  define HAKMEM_TINY_FREE_CALLS_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 34: Batch Atomic Prune (Compile-out remaining WARM path atomics)
// ------------------------------------------------------------
// Both gates below default to 0, which compiles the counter out.

// Phase 34A: Splice Debug Counter (WARM path, refill)
// Target: g_splice_count in core/tiny_refill_opt.h:79
// Impact: WARM path atomic (every refill splice operation)
#if !defined(HAKMEM_SPLICE_DEBUG_COMPILED)
#  define HAKMEM_SPLICE_DEBUG_COMPILED 0
#endif

// Phase 34B: Alloc Gate Class Mismatch (ERROR path, rare)
// Target: g_alloc_gate_cls_mis in core/box/tiny_alloc_gate_box.h:95
// Impact: ERROR path atomic (class mismatch detection, rare)
#if !defined(HAKMEM_ALLOC_GATE_CLS_MIS_COMPILED)
#  define HAKMEM_ALLOC_GATE_CLS_MIS_COMPILED 0
#endif
// ------------------------------------------------------------
// Helper enum (for documentation / logging)
// ------------------------------------------------------------

View File

@ -332,8 +332,12 @@ void hak_tiny_free(void* ptr) {
(void)0; // No-op when trace compiled out
#endif
// Track total tiny free calls (diagnostics)
#if HAKMEM_TINY_FREE_CALLS_COMPILED
extern _Atomic uint64_t g_hak_tiny_free_calls;
atomic_fetch_add_explicit(&g_hak_tiny_free_calls, 1, memory_order_relaxed);
#else
(void)0; // No-op when diagnostic counter compiled out
#endif
if (!ptr || !g_tiny_initialized) return;
hak_tiny_stats_poll();

View File

@ -5,6 +5,7 @@
#include <string.h>
#include <stdio.h>
#include <time.h>
#include "hakmem_build_flags.h" // Phase 36: HAKMEM_BENCH_MINIMAL
#include "box/smallobject_learner_v2_box.h"
#include "box/smallobject_stats_mid_v3_box.h"
@ -245,10 +246,17 @@ uint32_t small_learner_v2_retire_efficiency_pct(uint32_t class_idx) {
// Configuration & Control
// ============================================================================
// Phase 36: under HAKMEM_BENCH_MINIMAL the learner is fixed OFF (bench
// profiles don't use the learner).
// Otherwise HAKMEM_SMALL_LEARNER_V7_ENABLED is looked up on every call (the
// result is not cached): unset, empty, or a value whose first character is
// '0' means disabled; anything else means enabled.
bool small_learner_v2_enabled(void) {
#if HAKMEM_BENCH_MINIMAL
    return false;  // Fixed OFF in bench mode
#else
    const char *value = getenv("HAKMEM_SMALL_LEARNER_V7_ENABLED");
    if (value == NULL || value[0] == '\0') {
        return false;
    }
    return value[0] != '0';
#endif
}
void small_learner_v2_set_c5_threshold_pct(uint32_t threshold) {
g_c5_threshold_pct = threshold;

View File

@ -3,8 +3,10 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "hakmem_build_flags.h" // Phase 36: HAKMEM_BENCH_MINIMAL
#include "box/smallobject_policy_v7_box.h"
#include "box/smallobject_learner_v7_box.h" // For Learner API
#include "box/small_policy_snapshot_tls_box.h" // Phase 37: TLS cache
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
@ -18,6 +20,12 @@
static SmallLearnerStatsV7 g_small_learner_stats_v7;
static int g_learner_v7_enabled = -1; // -1: uninit, 0: disabled, 1: enabled
// Phase 36: BENCH_MINIMAL mode - learner is disabled (bench profiles don't use learner)
#if HAKMEM_BENCH_MINIMAL
static inline int learner_v7_enabled(void) {
return 0; // Fixed OFF in bench mode
}
#else
static inline int learner_v7_enabled(void) {
if (unlikely(g_learner_v7_enabled < 0)) {
// Phase v10: Learner default ON (when v7 is enabled)
@ -33,6 +41,7 @@ static inline int learner_v7_enabled(void) {
}
return g_learner_v7_enabled;
}
#endif
// ============================================================================
// TLS Policy Snapshot (v7-7: version-based invalidation)
@ -48,7 +57,46 @@ void small_policy_v7_bump_version(void) {
}
const SmallPolicyV7* small_policy_v7_snapshot(void) {
// Check if TLS cache is stale (version mismatch or uninitialized)
#if HAKMEM_BENCH_MINIMAL
// Phase 36: BENCH_MINIMAL mode - skip version check, use init-once TLS cache
// Assumes: Learner disabled, policy doesn't change during benchmark
static __thread int s_initialized = 0;
if (unlikely(!s_initialized)) {
small_policy_v7_init_from_env(&g_small_policy_v7);
s_initialized = 1;
}
return &g_small_policy_v7;
#else
// Phase 37: TLS cache fast path (default OFF; enabled with HAKMEM_POLICY_SNAPSHOT_TLS=1)
if (policy_snapshot_tls_enabled()) {
SmallPolicySnapshotTLSCache* cache = small_policy_snapshot_tls_get();
uint32_t gver = g_policy_v7_version;
// Fast path: cache valid → return immediately (no global read beyond version)
if (small_policy_snapshot_tls_check(cache, gver)) {
return cache->cached_ptr;
}
// Slow path: refresh cache
small_policy_v7_init_from_env(&g_small_policy_v7);
// v7-7: Apply Learner-driven route updates
if (learner_v7_enabled() && g_small_learner_stats_v7.total_retires > 0) {
small_policy_v7_update_from_learner(&g_small_learner_stats_v7, &g_small_policy_v7);
}
// Initialize global version to 1 if uninitialized (0)
if (gver == 0) {
__sync_val_compare_and_swap(&g_policy_v7_version, 0, 1);
gver = 1;
}
// Update TLS cache
small_policy_snapshot_tls_update(cache, &g_small_policy_v7, gver);
return &g_small_policy_v7;
}
// Default: original version-check path (taken unless HAKMEM_POLICY_SNAPSHOT_TLS=1)
if (unlikely(g_small_policy_v7_version != g_policy_v7_version || g_policy_v7_version == 0)) {
small_policy_v7_init_from_env(&g_small_policy_v7);
@ -65,6 +113,7 @@ const SmallPolicyV7* small_policy_v7_snapshot(void) {
g_small_policy_v7_version = g_policy_v7_version;
}
return &g_small_policy_v7;
#endif
}
// ============================================================================

View File

@ -76,6 +76,8 @@ static inline void trc_splice_to_sll(int class_idx, TinyRefillChain* c,
}
// DEBUG: Validate chain is properly NULL-terminated BEFORE splicing
// Phase 34A: Compile-out splice debug counter (default OFF)
#if HAKMEM_SPLICE_DEBUG_COMPILED
static _Atomic uint64_t g_splice_count = 0;
uint64_t splice_num = atomic_fetch_add(&g_splice_count, 1);
if (splice_num > 40 && splice_num < 80 && class_idx == 0) {
@ -98,6 +100,9 @@ static inline void trc_splice_to_sll(int class_idx, TinyRefillChain* c,
}
fflush(stderr);
}
#else
(void)0; // No-op when compiled out
#endif
// 🐛 DEBUG: Log splice call BEFORE calling tls_sll_splice()
#if !HAKMEM_BUILD_RELEASE