Phase 5 E4-1: Free Wrapper ENV Snapshot (+3.51% GO, ADOPTED)

Target: Consolidate free wrapper TLS reads (2→1)
- free() is 25.26% self% (top hot spot)
- Strategy: Apply E1 success pattern (ENV snapshot) to free path

Implementation:
- ENV gate: HAKMEM_FREE_WRAPPER_ENV_SNAPSHOT=0/1 (default 0)
- core/box/free_wrapper_env_snapshot_box.{h,c}: New box
  - Consolidates 2 TLS reads → 1 TLS read (50% reduction)
  - Reduces 4 branches → 3 branches (25% reduction)
  - Lazy init with probe window (bench_profile putenv sync)
- core/box/hak_wrappers.inc.h: Integration in free() wrapper
- Makefile: Add free_wrapper_env_snapshot_box.o to all targets

A/B Test Results (Mixed, 10-run, 20M iters):
- Baseline (SNAPSHOT=0): 45.35M ops/s (mean), 45.31M ops/s (median)
- Optimized (SNAPSHOT=1): 46.94M ops/s (mean), 47.15M ops/s (median)
- Improvement: +3.51% mean, +4.07% median

Decision: GO (+3.51% >= +1.0% threshold)
- Exceeded conservative estimate (+1.5% → +3.51%)
- Similar efficiency to E1 (+3.92%)
- Health check: PASS (all profiles)
- Action: PROMOTED to MIXED_TINYV3_C7_SAFE preset

Phase 5 Cumulative:
- E1 (ENV Snapshot): +3.92%
- E4-1 (Free Wrapper Snapshot): +3.51%
- Total Phase 4-5: ~+7.5%

E3-4 Correction:
- Phase 4 E3-4 (ENV Constructor Init): NO-GO / FROZEN
- Initial A/B showed +4.75%, but investigation revealed:
  - Branch prediction hint mismatch (UNLIKELY with always-true)
  - Retest confirmed -1.78% regression
  - Root cause: __builtin_expect(..., 0) with ctor_mode==1
- Decision: Freeze as research box (default OFF)
- Learning: Branch hints need careful tuning, TLS consolidation safer

Deliverables:
- docs/analysis/PHASE5_E4_FREE_GATE_OPTIMIZATION_1_DESIGN.md
- docs/analysis/PHASE5_E4_1_FREE_WRAPPER_ENV_SNAPSHOT_NEXT_INSTRUCTIONS.md
- docs/analysis/PHASE5_E4_2_MALLOC_WRAPPER_ENV_SNAPSHOT_NEXT_INSTRUCTIONS.md (next)
- docs/analysis/PHASE5_POST_E1_NEXT_INSTRUCTIONS.md
- docs/analysis/ENV_PROFILE_PRESETS.md (E4-1 added, E3-4 corrected)
- CURRENT_TASK.md (E4-1 complete, E3-4 frozen)
- core/bench_profile.h (E4-1 promoted to default)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-14 04:24:34 +09:00
parent 21e2e4ac2b
commit 4a070d8a14
17 changed files with 1184 additions and 114 deletions

View File

@ -0,0 +1,45 @@
// free_wrapper_env_snapshot_box.c - Box: Free Wrapper ENV Snapshot Implementation
//
// Phase 5 E4-1: Free Gate Optimization
#include "free_wrapper_env_snapshot_box.h"
#include "wrapper_env_box.h"
#include "tiny_front_config_box.h"
#include "free_tiny_fast_hotcold_env_box.h"
#include "../front/malloc_tiny_fast.h"
#include <stdio.h>
// TLS storage (initialized to zero on thread creation)
__thread struct free_wrapper_env_snapshot g_free_wrapper_env = {0};
// Lazy init implementation: Called once per thread on first free() call
void free_wrapper_env_snapshot_init(void)
{
// Read wrapper env config (wrap_shape flag)
const wrapper_env_cfg_t* wcfg = wrapper_env_cfg();
g_free_wrapper_env.wrap_shape = wcfg->wrap_shape;
// Read front gate unified constant (compile-time macro)
g_free_wrapper_env.front_gate_unified = TINY_FRONT_UNIFIED_GATE_ENABLED;
// Read hotcold enabled flag (runtime ENV check)
g_free_wrapper_env.hotcold_enabled = hak_free_tiny_fast_hotcold_enabled();
// Mark as initialized (lazy init complete)
g_free_wrapper_env.initialized = 1;
#if !HAKMEM_BUILD_RELEASE
// Debug: Log snapshot initialization (first 5 threads only)
static _Atomic uint32_t g_init_log_count = 0;
uint32_t n = atomic_fetch_add_explicit(&g_init_log_count, 1, memory_order_relaxed);
if (n < 5) {
fprintf(stderr,
"[FREE_WRAPPER_ENV_SNAPSHOT_INIT] wrap_shape=%d front_gate=%d hotcold=%d\n",
g_free_wrapper_env.wrap_shape,
g_free_wrapper_env.front_gate_unified,
g_free_wrapper_env.hotcold_enabled);
fflush(stderr);
}
#endif
}

View File

@ -0,0 +1,71 @@
// free_wrapper_env_snapshot_box.h - Box: Free Wrapper ENV Snapshot
//
// Phase 5 E4-1: Free Gate Optimization
//
// Purpose:
// Consolidate multiple TLS reads in free() wrapper into a single snapshot
// to reduce overhead (25.26% self% -> target 24.0%)
//
// Strategy:
// - Reuse E1 success pattern (ENV snapshot consolidation, +3.92%)
// - Avoid E3-4 failure pattern (constructor init, -1.44%)
// - 2 TLS reads -> 1 TLS read (50% reduction)
// - 4 branches -> 3 branches (25% reduction)
//
// Box Boundary:
// - Input: None (thread-local initialization on first access)
// - Output: const struct free_wrapper_env_snapshot* (cached snapshot)
// - ENV gate: HAKMEM_FREE_WRAPPER_ENV_SNAPSHOT=0/1 (default: 0, research box)
//
// Safety:
// - TLS storage (thread-safe)
// - Lazy init (once per thread)
// - ENV-gated rollback (SNAPSHOT=0 disables)
#ifndef FREE_WRAPPER_ENV_SNAPSHOT_BOX_H
#define FREE_WRAPPER_ENV_SNAPSHOT_BOX_H
#include <stdint.h>
#include <stdlib.h>
#include "../hakmem_build_flags.h"
// Snapshot structure: Consolidates 3 ENV checks into 1 TLS read
// Size: 4 bytes (cache-friendly, fits in single cache line)
struct free_wrapper_env_snapshot {
uint8_t wrap_shape; // HAKMEM_WRAP_SHAPE (from wrapper_env_cfg)
uint8_t front_gate_unified; // TINY_FRONT_UNIFIED_GATE_ENABLED (compile-time constant)
uint8_t hotcold_enabled; // HAKMEM_FREE_TINY_FAST_HOTCOLD (from env)
uint8_t initialized; // Lazy init flag (0 = not initialized, 1 = initialized)
};
// Thread-local storage for snapshot (initialized on first access per thread)
extern __thread struct free_wrapper_env_snapshot g_free_wrapper_env;
// ENV gate: Enable/disable snapshot optimization (default: OFF, research box)
static inline int free_wrapper_env_snapshot_enabled(void)
{
static __thread int s_enabled = -1;
if (__builtin_expect(s_enabled == -1, 0)) {
const char* env = getenv("HAKMEM_FREE_WRAPPER_ENV_SNAPSHOT");
s_enabled = (env && *env == '1') ? 1 : 0;
}
return s_enabled;
}
// Lazy init: Initialize snapshot on first access (once per thread)
void free_wrapper_env_snapshot_init(void);
// Primary API: Get snapshot (1 TLS read, lazy init on first call)
static inline const struct free_wrapper_env_snapshot* free_wrapper_env_get(void)
{
// Fast path: Already initialized
if (__builtin_expect(g_free_wrapper_env.initialized, 1)) {
return &g_free_wrapper_env;
}
// Slow path: First access, initialize snapshot
free_wrapper_env_snapshot_init();
return &g_free_wrapper_env;
}
#endif // FREE_WRAPPER_ENV_SNAPSHOT_BOX_H

View File

@ -36,6 +36,7 @@ void* realloc(void* ptr, size_t size) {
#include "tiny_front_config_box.h" // Phase 4-Step3: Compile-time config for dead code elimination
#include "wrapper_env_box.h" // Wrapper env cache (step trace / LD safe / free trace)
#include "wrapper_env_cache_box.h" // Phase 3 D2: TLS cache for wrapper_env_cfg pointer
#include "free_wrapper_env_snapshot_box.h" // Phase 5 E4-1: Free wrapper ENV snapshot
#include "../hakmem_internal.h" // AllocHeader helpers for diagnostics
#include "../hakmem_super_registry.h" // Superslab lookup for diagnostics
#include "../superslab/superslab_inline.h" // slab_index_for, capacity
@ -462,7 +463,9 @@ static void free_cold(void* ptr, const wrapper_env_cfg_t* wcfg) {
#endif
}
// No valid hakmem header → external pointer (BenchMeta, libc allocation, etc.)
if (__builtin_expect(wcfg->wrap_diag, 0)) {
// Phase 5 E4-1: Get wcfg for wrap_diag check (may be snapshot path or legacy path)
const wrapper_env_cfg_t* wcfg_diag = wrapper_env_cfg_fast();
if (__builtin_expect(wcfg_diag->wrap_diag, 0)) {
SuperSlab* ss = hak_super_lookup(ptr);
int slab_idx = -1;
int meta_cls = -1;
@ -549,12 +552,66 @@ void free(void* ptr) {
// Fallback to normal path for non-Tiny or no-header mode
}
// Phase 3 D2: Use wrapper_env_cfg_fast() to reduce hot path overhead
const wrapper_env_cfg_t* wcfg = wrapper_env_cfg_fast();
// Phase 5 E4-1: Free Wrapper ENV Snapshot (optional, ENV-gated)
// Strategy: Consolidate 2 TLS reads -> 1 TLS read (50% reduction)
// Expected gain: +1.5-2.5% (from free() 25.26% self% reduction)
if (__builtin_expect(free_wrapper_env_snapshot_enabled(), 0)) {
// Optimized path: Single TLS snapshot (1 TLS read instead of 2)
const struct free_wrapper_env_snapshot* env = free_wrapper_env_get();
// Phase 2 B4: HAKMEM_WRAP_SHAPE dispatch (hot/cold split for free)
if (__builtin_expect(wcfg->wrap_shape, 0)) {
// B4 Optimized: Hot path handles simple cases, delegates to free_cold()
// Fast path: Front gate unified (LIKELY in current presets)
if (__builtin_expect(env->front_gate_unified, 1)) {
int freed;
if (__builtin_expect(env->hotcold_enabled, 0)) {
freed = free_tiny_fast_hot(ptr); // Hot/cold split version
} else {
freed = free_tiny_fast(ptr); // Legacy monolithic version
}
if (__builtin_expect(freed, 1)) {
return; // Success (pushed to Unified Cache)
}
}
// Slow path fallback: Wrap shape dispatch
if (__builtin_expect(env->wrap_shape, 0)) {
const wrapper_env_cfg_t* wcfg = wrapper_env_cfg_fast();
return free_cold(ptr, wcfg);
}
// Fall through to legacy classification path below
} else {
// Legacy path (SNAPSHOT=0, default): Original behavior preserved
// Phase 3 D2: Use wrapper_env_cfg_fast() to reduce hot path overhead
const wrapper_env_cfg_t* wcfg = wrapper_env_cfg_fast();
// Phase 2 B4: HAKMEM_WRAP_SHAPE dispatch (hot/cold split for free)
if (__builtin_expect(wcfg->wrap_shape, 0)) {
// B4 Optimized: Hot path handles simple cases, delegates to free_cold()
// Phase 26: Front Gate Unification (Tiny free fast path)
// Placed AFTER BenchFast check, BEFORE expensive classify_ptr()
// Bypasses: hak_free_at routing + wrapper overhead + classification
// Target: +10-15% performance (pairs with malloc_tiny_fast)
// ENV: HAKMEM_FRONT_GATE_UNIFIED=1 to enable (default: OFF)
// Phase 4-Step3: Use config macro for compile-time optimization
// Phase 7-Step1: Changed expect hint from 0→1 (unified path is now LIKELY)
if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 1)) {
// Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split dispatch
int freed;
if (__builtin_expect(hak_free_tiny_fast_hotcold_enabled(), 0)) {
freed = free_tiny_fast_hot(ptr); // NEW: Hot/Cold split version
} else {
freed = free_tiny_fast(ptr); // OLD: Legacy monolithic version
}
if (__builtin_expect(freed, 1)) {
return; // Success (pushed to Unified Cache)
}
// Unified Cache full OR invalid header → fallback to cold path
}
// All hot cases exhausted → delegate to free_cold() for classification and fallback
return free_cold(ptr, wcfg);
}
// Phase 2 B4: Legacy path (HAKMEM_WRAP_SHAPE=0, default)
// Phase 26: Front Gate Unification (Tiny free fast path)
// Placed AFTER BenchFast check, BEFORE expensive classify_ptr()
// Bypasses: hak_free_at routing + wrapper overhead + classification
@ -573,32 +630,8 @@ void free(void* ptr) {
if (__builtin_expect(freed, 1)) {
return; // Success (pushed to Unified Cache)
}
// Unified Cache full OR invalid header → fallback to cold path
// Unified Cache full OR invalid header → fallback to normal path
}
// All hot cases exhausted → delegate to free_cold() for classification and fallback
return free_cold(ptr, wcfg);
}
// Phase 2 B4: Legacy path (HAKMEM_WRAP_SHAPE=0, default)
// Phase 26: Front Gate Unification (Tiny free fast path)
// Placed AFTER BenchFast check, BEFORE expensive classify_ptr()
// Bypasses: hak_free_at routing + wrapper overhead + classification
// Target: +10-15% performance (pairs with malloc_tiny_fast)
// ENV: HAKMEM_FRONT_GATE_UNIFIED=1 to enable (default: OFF)
// Phase 4-Step3: Use config macro for compile-time optimization
// Phase 7-Step1: Changed expect hint from 0→1 (unified path is now LIKELY)
if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 1)) {
// Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split dispatch
int freed;
if (__builtin_expect(hak_free_tiny_fast_hotcold_enabled(), 0)) {
freed = free_tiny_fast_hot(ptr); // NEW: Hot/Cold split version
} else {
freed = free_tiny_fast(ptr); // OLD: Legacy monolithic version
}
if (__builtin_expect(freed, 1)) {
return; // Success (pushed to Unified Cache)
}
// Unified Cache full OR invalid header → fallback to normal path
}
do { static int on=-1; if (on==-1){ const char* e=getenv("HAKMEM_FREE_WRAP_TRACE"); on=(e&&*e&&*e!='0')?1:0;} if(on){ fprintf(stderr,"[WRAP_FREE_ENTER] ptr=%p depth=%d init=%d\n", ptr, g_hakmem_lock_depth, g_initializing); } } while(0);
@ -735,7 +768,9 @@ void free(void* ptr) {
#endif
}
// No valid hakmem header → external pointer (BenchMeta, libc allocation, etc.)
if (__builtin_expect(wcfg->wrap_diag, 0)) {
// Phase 5 E4-1: Get wcfg for wrap_diag check (may be snapshot path or legacy path)
const wrapper_env_cfg_t* wcfg_diag = wrapper_env_cfg_fast();
if (__builtin_expect(wcfg_diag->wrap_diag, 0)) {
SuperSlab* ss = hak_super_lookup(ptr);
int slab_idx = -1;
int meta_cls = -1;

View File

@ -60,9 +60,13 @@ extern int g_hakmem_env_snapshot_ctor_mode;
// ENV gate: default OFF (research box, set =1 to enable)
// E3-4: Dual-mode - constructor init (fast) or legacy lazy init (fallback)
static inline bool hakmem_env_snapshot_enabled(void) {
// E3-4 Fast path: constructor mode (no lazy check, just global read)
// Default is OFF, so ctor_mode==1 is UNLIKELY.
if (__builtin_expect(g_hakmem_env_snapshot_ctor_mode == 1, 0)) {
// E3-4 Fast path: constructor mode (no lazy check, just global read).
// Important: do not put a static LIKELY/UNLIKELY hint here.
// - Default runs want ctor_mode==0 to be "fast"
// - CTOR runs want ctor_mode==1 to be "fast"
// Any fixed hint will be wrong for one of the modes and can induce steady-state mispredicts.
int ctor_mode = g_hakmem_env_snapshot_ctor_mode;
if (ctor_mode == 1) {
return g_hakmem_env_snapshot_gate != 0;
}