hakmem/core/bench_profile.h

#pragma once
#include <dlfcn.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

#ifdef USE_HAKMEM
#include "box/wrapper_env_box.h"  // wrapper_env_refresh_from_env (Phase 2 B4)
#include "box/tiny_static_route_box.h"  // tiny_static_route_refresh_from_env (Phase 3 C3)
#include "box/hakmem_env_snapshot_box.h"  // hakmem_env_snapshot_refresh_from_env (Phase 4 E1)
#include "box/tiny_free_route_cache_env_box.h"  // tiny_free_static_route_refresh_from_env (Phase 8)
#include "box/tiny_c7_preserve_header_env_box.h"  // tiny_c7_preserve_header_env_refresh_from_env (Phase 13 v1)
#include "box/tiny_tcache_env_box.h"  // tiny_tcache_env_refresh_from_env (Phase 14 v1)
#include "box/tiny_unified_lifo_env_box.h"  // tiny_unified_lifo_env_refresh_from_env (Phase 15 v1)
#include "box/front_fastlane_alloc_legacy_direct_env_box.h"  // front_fastlane_alloc_legacy_direct_env_refresh_from_env (Phase 16 v1)
#include "box/fastlane_direct_env_box.h"  // fastlane_direct_env_refresh_from_env (Phase 19-1)
#endif

// Install a default "KEY=VAL" into the process environment, but only when KEY
// is not already set, and report the applied default on stderr (fd 2).
//
// Parameters:
//   key - NUL-terminated variable name; a NULL key is ignored.
//   val - NUL-terminated default value; a NULL val is ignored.
//
// Notes:
// - malloc/putenv are looked up once via dlsym(RTLD_NEXT, ...) and cached in
//   function-local statics; when the lookup fails the directly linked symbols
//   are used instead.
//   NOTE(review): presumably this keeps the environment string out of an
//   interposed allocator — confirm against the wrapper implementation.
// - The "KEY=VAL" buffer is handed to putenv(), which keeps the pointer, so it
//   is never freed here.
// - The log line is emitted only after putenv() reported success.
static inline void bench_setenv_default(const char* key, const char* val) {
  if (key == NULL || val == NULL) return;  // strlen/getenv on NULL would be UB
  if (getenv(key) != NULL) return;         // user-provided value wins

  static void* (*real_malloc)(size_t) = NULL;
  static int (*real_putenv)(char*) = NULL;
  if (!real_malloc) {
    real_malloc = (void* (*)(size_t))dlsym(RTLD_NEXT, "malloc");
    if (!real_malloc) real_malloc = malloc;
  }
  if (!real_putenv) {
    real_putenv = (int (*)(char*))dlsym(RTLD_NEXT, "putenv");
    if (!real_putenv) real_putenv = putenv;
  }

  // Build "KEY=VAL\0": klen + '=' + vlen + NUL.
  size_t klen = strlen(key);
  size_t vlen = strlen(val);
  char* buf = (char*)real_malloc(klen + vlen + 2);
  if (!buf) return;
  memcpy(buf, key, klen);
  buf[klen] = '=';
  memcpy(buf + klen + 1, val, vlen);
  buf[klen + 1 + vlen] = '\0';

  // On success putenv() owns buf; do not free. On failure buf is left
  // allocated as well: no matching deallocator is resolved in this function.
  if (real_putenv(buf) != 0) return;

  {
    char msg[256];
    int n = snprintf(msg, sizeof(msg), "[bench_profile] set %s=%s\n", key, val);
    if (n > 0) {
      size_t len = (size_t)n;
      if (len >= sizeof(msg)) {
        // Truncated: only sizeof(msg) - 1 characters were stored (plus NUL).
        // Never write the NUL byte, and keep the line newline-terminated.
        len = sizeof(msg) - 1;
        msg[len - 1] = '\n';
      }
      ssize_t w = write(2, msg, len);
      (void)w;  // best-effort diagnostic; a failed write is not an error here
    }
  }
}

// Bench-only: preset ENV defaults according to HAKMEM_PROFILE.
//
// Reads HAKMEM_PROFILE; when it is unset or empty nothing happens. Otherwise
// the matching preset table (if any) is applied entry by entry through
// bench_setenv_default(), so variables already present in the environment are
// left untouched. When built with USE_HAKMEM, the cached ENV gates are then
// refreshed so they observe the values installed here.
static inline void bench_apply_profile(void) {
  struct bench_env_kv {
    const char* key;
    const char* val;
  };
  struct bench_preset {
    const char* name;
    const struct bench_env_kv* defaults;  // terminated by a {NULL, NULL} entry
  };

  static const struct bench_env_kv mixed_tinyv3_c7_safe[] = {
    {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
    {"HAKMEM_TINY_C7_HOT", "1"},
    {"HAKMEM_TINY_HOTHEAP_V2", "0"},
    {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
    {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
    {"HAKMEM_SMALL_HEAP_V4_ENABLED", "0"},
    {"HAKMEM_SMALL_HEAP_V4_CLASSES", "0x0"},
    {"HAKMEM_TINY_PTR_FAST_CLASSIFY_V4_ENABLED", "0"},
    {"HAKMEM_SMALL_SEGMENT_V4_ENABLED", "0"},
    {"HAKMEM_POOL_V2_ENABLED", "0"},
    {"HAKMEM_TINY_FRONT_V3_ENABLED", "1"},
    {"HAKMEM_TINY_FRONT_V3_LUT_ENABLED", "1"},
    {"HAKMEM_TINY_PTR_FAST_CLASSIFY_ENABLED", "1"},
    // Phase FREE-TINY-FAST-DUALHOT-1: C0-C3 direct fast free (skip policy snapshot)
    {"HAKMEM_FREE_TINY_FAST_HOTCOLD", "1"},
    // Phase 2 B4: Wrapper hot/cold split (malloc/free wrapper shape)
    {"HAKMEM_WRAP_SHAPE", "1"},
    // Phase 4 E1: ENV Snapshot Consolidation (+3.92% proven on Mixed)
    {"HAKMEM_ENV_SNAPSHOT", "1"},
    // Phase 5 E4-1: Free wrapper ENV snapshot (+3.51% proven on Mixed, 10-run)
    {"HAKMEM_FREE_WRAPPER_ENV_SNAPSHOT", "1"},
    // Phase 5 E4-2: Malloc wrapper ENV snapshot (+21.83% proven on Mixed, 10-run)
    {"HAKMEM_MALLOC_WRAPPER_ENV_SNAPSHOT", "1"},
    // Phase 5 E5-1: Free Tiny Direct Path (+3.35% proven on Mixed, 10-run)
    {"HAKMEM_FREE_TINY_DIRECT", "1"},
    // Phase 6-1: Front FastLane (Layer Collapse) (+11.13% proven on Mixed, 10-run)
    {"HAKMEM_FRONT_FASTLANE", "1"},
    // Phase 6-2: Front FastLane Free DeDup (+5.18% proven on Mixed, 10-run)
    {"HAKMEM_FRONT_FASTLANE_FREE_DEDUP", "1"},
    // Phase 19-1b: FastLane Direct (wrapper layer bypass, +5.88% proven on Mixed, 10-run)
    {"HAKMEM_FASTLANE_DIRECT", "1"},
    // Phase 9: FREE-TINY-FAST MONO DUALHOT (+2.72% proven on Mixed, 10-run)
    {"HAKMEM_FREE_TINY_FAST_MONO_DUALHOT", "1"},
    // Phase 10: FREE-TINY-FAST MONO LEGACY DIRECT (+1.89% proven on Mixed, 10-run)
    {"HAKMEM_FREE_TINY_FAST_MONO_LEGACY_DIRECT", "1"},
    // Phase 4-4: C6 ULTRA free+alloc integration gate (kept OFF here, manual opt-in)
    {"HAKMEM_TINY_C6_ULTRA_FREE_ENABLED", "0"},
    // Phase MID-V3: Mid/Pool HotBox v3
    // On Mixed (16-1024B) MID_V3 (C6) is much slower, so it is pinned OFF by
    // default. Turning it ON is recommended only for the C6-heavy profile
    // (C6-heavy is the only optimization target).
    {"HAKMEM_MID_V3_ENABLED", "0"},
    {"HAKMEM_MID_V3_CLASSES", "0x0"},
    // Phase 2 B3: Routing branch shape optimization (LIKELY on LEGACY, cold helper for rare routes)
    {"HAKMEM_TINY_ALLOC_ROUTE_SHAPE", "1"},
    // Phase 3 C3: Static routing (policy_snapshot bypass, +2.2% proven)
    {"HAKMEM_TINY_STATIC_ROUTE", "1"},
    // Phase 3 D1: Free route cache (TLS cache for free path routing, +2.19% proven)
    {"HAKMEM_FREE_STATIC_ROUTE", "1"},
    {NULL, NULL},
  };

  static const struct bench_env_kv c6_heavy_legacy_poolv1[] = {
    {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
    {"HAKMEM_TINY_C6_HOT", "0"},
    {"HAKMEM_TINY_HOTHEAP_V2", "0"},
    {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
    {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
    {"HAKMEM_POOL_V2_ENABLED", "0"},
    {"HAKMEM_POOL_V1_FLATTEN_ENABLED", "0"},
    {"HAKMEM_MID_DESC_CACHE_ENABLED", "1"},
    // Phase 4-4: C6 ULTRA free+alloc integration gate (kept OFF here, manual opt-in)
    {"HAKMEM_TINY_C6_ULTRA_FREE_ENABLED", "0"},
    // Phase MID-V3: Mid/Pool HotBox v3 (257-768B, C6 only)
    {"HAKMEM_MID_V3_ENABLED", "1"},
    {"HAKMEM_MID_V3_CLASSES", "0x40"},
    // Phase 6-1: Front FastLane (Layer Collapse) (+11.13% proven on Mixed, 10-run)
    {"HAKMEM_FRONT_FASTLANE", "1"},
    // Phase 6-2: Front FastLane Free DeDup (+5.18% proven on Mixed, 10-run)
    {"HAKMEM_FRONT_FASTLANE_FREE_DEDUP", "1"},
    // Phase 19-1b: FastLane Direct (wrapper layer bypass)
    {"HAKMEM_FASTLANE_DIRECT", "1"},
    // Phase 2 B3: Routing branch shape optimization (LIKELY on LEGACY, cold helper for rare routes)
    {"HAKMEM_TINY_ALLOC_ROUTE_SHAPE", "1"},
    {NULL, NULL},
  };

  // Phase v7-1: C6-only v7 stub experiment (MID v3 fallback)
  static const struct bench_env_kv c6_v7_stub[] = {
    {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
    {"HAKMEM_TINY_C6_HOT", "0"},
    {"HAKMEM_TINY_HOTHEAP_V2", "0"},
    {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
    {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
    {"HAKMEM_POOL_V2_ENABLED", "0"},
    {"HAKMEM_MID_V3_ENABLED", "1"},
    {"HAKMEM_MID_V3_CLASSES", "0x40"},
    // v7 stub ON (C6-only)
    {"HAKMEM_SMALL_HEAP_V7_ENABLED", "1"},
    {"HAKMEM_SMALL_HEAP_V7_CLASSES", "0x40"},
    {NULL, NULL},
  };

  // LEGACY mid/smallmid bench only (not used with C7_SAFE)
  static const struct bench_env_kv c6_heavy_legacy_poolv1_flatten[] = {
    {"HAKMEM_TINY_HEAP_PROFILE", "LEGACY"},
    {"HAKMEM_TINY_C6_HOT", "0"},
    {"HAKMEM_TINY_HOTHEAP_V2", "0"},
    {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
    {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
    {"HAKMEM_POOL_V2_ENABLED", "0"},
    {"HAKMEM_POOL_V1_FLATTEN_ENABLED", "1"},
    {"HAKMEM_POOL_V1_FLATTEN_STATS", "1"},
    {"HAKMEM_POOL_ZERO_MODE", "header"},
    {NULL, NULL},
  };

  static const struct bench_env_kv debug_tiny_front_perf[] = {
    {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
    {"HAKMEM_TINY_C7_HOT", "1"},
    {"HAKMEM_TINY_HOTHEAP_V2", "0"},
    {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
    {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
    {"HAKMEM_POOL_V2_ENABLED", "0"},
    {"HAKMEM_TINY_FRONT_V3_ENABLED", "1"},
    {"HAKMEM_TINY_FRONT_V3_LUT_ENABLED", "1"},
    {"HAKMEM_TINY_PTR_FAST_CLASSIFY_ENABLED", "1"},
    {NULL, NULL},
  };

  // Research profile that places C6 on SmallObject v3 (not used by default)
  static const struct bench_env_kv c6_small_heap_v3_experiment[] = {
    {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
    {"HAKMEM_TINY_C6_HOT", "1"},
    {"HAKMEM_TINY_HOTHEAP_V2", "0"},
    {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
    {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x40"},  // C6 only
    {"HAKMEM_SMALL_HEAP_V4_ENABLED", "0"},
    {"HAKMEM_SMALL_HEAP_V4_CLASSES", "0x0"},
    {"HAKMEM_POOL_V2_ENABLED", "0"},
    {NULL, NULL},
  };

  // Research profile that places C6 on SmallObject v4 (not used by default)
  static const struct bench_env_kv c6_small_heap_v4_experiment[] = {
    {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
    {"HAKMEM_TINY_C6_HOT", "1"},
    {"HAKMEM_TINY_HOTHEAP_V2", "0"},
    {"HAKMEM_SMALL_HEAP_V3_ENABLED", "0"},
    {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x0"},
    {"HAKMEM_SMALL_HEAP_V4_ENABLED", "1"},
    {"HAKMEM_SMALL_HEAP_V4_CLASSES", "0x40"},  // C6 only
    {"HAKMEM_POOL_V2_ENABLED", "0"},
    {NULL, NULL},
  };

  // Profile names are matched exactly, in this order.
  static const struct bench_preset presets[] = {
    {"MIXED_TINYV3_C7_SAFE", mixed_tinyv3_c7_safe},
    {"C6_HEAVY_LEGACY_POOLV1", c6_heavy_legacy_poolv1},
    {"C6_V7_STUB", c6_v7_stub},
    {"C6_HEAVY_LEGACY_POOLV1_FLATTEN", c6_heavy_legacy_poolv1_flatten},
    {"DEBUG_TINY_FRONT_PERF", debug_tiny_front_perf},
    {"C6_SMALL_HEAP_V3_EXPERIMENT", c6_small_heap_v3_experiment},
    {"C6_SMALL_HEAP_V4_EXPERIMENT", c6_small_heap_v4_experiment},
  };

  const char* profile = getenv("HAKMEM_PROFILE");
  if (profile == NULL || profile[0] == '\0') return;

  // Apply the first (and only) preset whose name equals the requested profile;
  // an unknown profile name installs no defaults.
  for (size_t i = 0; i < sizeof(presets) / sizeof(presets[0]); i++) {
    if (strcmp(profile, presets[i].name) != 0) continue;
    for (const struct bench_env_kv* kv = presets[i].defaults; kv->key != NULL; kv++) {
      bench_setenv_default(kv->key, kv->val);
    }
    break;
  }

#ifdef USE_HAKMEM
  // Phase 3 C3 Step 0: Ensure policy snapshot reflects final ENV after putenv defaults.
  small_policy_v7_bump_version();
  // Phase 2 B4: Sync wrapper ENV cache after bench_profile putenv defaults.
  wrapper_env_refresh_from_env();
  // Phase 3 C3: Sync static route cache after bench_profile putenv defaults.
  tiny_static_route_refresh_from_env();
  // Phase 4 E1: Sync ENV snapshot cache after bench_profile putenv defaults.
  hakmem_env_snapshot_refresh_from_env();
  // Phase 8: Sync free static route ENV cache after bench_profile putenv defaults.
  tiny_free_static_route_refresh_from_env();
  // Phase 13 v1: Sync C7 preserve header ENV cache after bench_profile putenv defaults.
  tiny_c7_preserve_header_env_refresh_from_env();
  // Phase 14 v1: Sync tcache ENV cache after bench_profile putenv defaults.
  tiny_tcache_env_refresh_from_env();
  // Phase 15 v1: Sync LIFO ENV cache after bench_profile putenv defaults.
  tiny_unified_lifo_env_refresh_from_env();
  // Phase 16 v1: Sync LEGACY direct ENV cache after bench_profile putenv defaults.
  front_fastlane_alloc_legacy_direct_env_refresh_from_env();
  // Phase 19-1: Sync FastLane Direct ENV cache after bench_profile putenv defaults.
  fastlane_direct_env_refresh_from_env();
#endif
}
-												Phase 6: promote Front FastLane (default ON)

											
										
										
											2025-12-14 16:28:23 +09:00
+								#pragma once
 								#include <dlfcn.h>
 								#include <stdlib.h>
 								#include <string.h>
 								#include <stdio.h>
 								#include <unistd.h>
 								#ifdef USE_HAKMEM
 								#include "box/wrapper_env_box.h"  // wrapper_env_refresh_from_env (Phase 2 B4)
 								#include "box/tiny_static_route_box.h"  // tiny_static_route_refresh_from_env (Phase 3 C3)
 								#include "box/hakmem_env_snapshot_box.h"  // hakmem_env_snapshot_refresh_from_env (Phase 4 E1)
-												Phase 8: FREE-STATIC-ROUTE ENV Cache Hardening (GO +2.61%)

Results:
- A/B test: +2.61% on Mixed (10-run, clean env)
- Baseline: 49.26M ops/s
- Optimized: 50.55M ops/s
- Improvement: +1.29M ops/s (+2.61%)

Strategy:
- Fix ENV cache accident (main前キャッシュ事故の修正)
- Add refresh mechanism to sync with bench_profile putenv
- Ensure Phase 3 D1 optimization works reliably

Success factors:
1. Performance improvement: +2.61% (existing win-box now reliable)
2. ENV cache accident fixed: refresh mechanism works correctly
3. Standard deviation improved: 867K → 336K ops/s (61% reduction)
4. Baseline quality improved: existing optimization now guaranteed

Implementation:
- Patch 1: Make ENV gate refreshable (tiny_free_route_cache_env_box.{h,c})
  - Changed static int to extern _Atomic int
  - Added tiny_free_static_route_refresh_from_env()
- Patch 2: Integrate refresh into bench_profile.h
  - Call refresh after bench_setenv_default() group
- Patch 3: Update Makefile for new .c file

ENV cache fix verification:
- [FREE_STATIC_ROUTE] enabled appears twice (refresh working)
- bench_profile putenv now reliably reflected

Files modified:
- core/box/tiny_free_route_cache_env_box.h: extern + refresh API
- core/box/tiny_free_route_cache_env_box.c: NEW (global state + refresh)
- core/bench_profile.h: add refresh call
- Makefile: add new .o file

Health check: PASSED (all profiles)

Rollback: HAKMEM_FREE_STATIC_ROUTE=0 or revert Patch 1/2

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-14 18:49:08 +09:00
+								#include "box/tiny_free_route_cache_env_box.h"  // tiny_free_static_route_refresh_from_env (Phase 8)
-												Phase 13 v1 + E5-2 retest: Both NEUTRAL, freeze as research boxes

Phase 13 v1: Header Write Elimination (C7 preserve header)
- Verdict: NEUTRAL (+0.78%)
- Implementation: HAKMEM_TINY_C7_PRESERVE_HEADER ENV gate (default OFF)
- Makes C7 nextptr offset conditional (0→1 when enabled)
- 4-point matrix A/B test results:
  * Case A (baseline): 51.49M ops/s
  * Case B (WRITE_ONCE=1): 52.07M ops/s (+1.13%)
  * Case C (C7_PRESERVE=1): 51.36M ops/s (-0.26%)
  * Case D (both): 51.89M ops/s (+0.78% NEUTRAL)
- Action: Freeze as research box (default OFF, manual opt-in)

Phase 5 E5-2: Header Write-Once retest (promotion test)
- Verdict: NEUTRAL (+0.54%)
- Motivation: Phase 13 Case B showed +1.13%, re-tested with dedicated 20-run
- Results (20-run):
  * Case A (baseline): 51.10M ops/s
  * Case B (WRITE_ONCE=1): 51.37M ops/s (+0.54%)
- Previous test: +0.45% (consistent with NEUTRAL)
- Action: Keep as research box (default OFF, manual opt-in)

Key findings:
- Header write tax optimization shows consistent NEUTRAL results
- Neither Phase 13 v1 nor E5-2 reaches GO threshold (+1.0%)
- Both implemented as reversible ENV gates for future research

Files changed:
- New: core/box/tiny_c7_preserve_header_env_box.{c,h}
- Modified: core/box/tiny_layout_box.h (C7 offset conditional)
- Modified: core/tiny_nextptr.h, core/box/tiny_header_box.h (comments)
- Modified: core/bench_profile.h (refresh sync)
- Modified: Makefile (add new .o files)
- Modified: scripts/run_mixed_10_cleanenv.sh (add C7_PRESERVE ENV)
- Docs: PHASE13_*, PHASE5_E5_2_HEADER_WRITE_ONCE_* (design/results)

Next: Phase 14 (Pointer-chase reduction, tcache-style intrusive LIFO)

🤖 Generated with Claude Code

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 00:32:25 +09:00
+								#include "box/tiny_c7_preserve_header_env_box.h"  // tiny_c7_preserve_header_env_refresh_from_env (Phase 13 v1)
-												Phase 14 v1: Pointer-Chase Reduction (tcache) NEUTRAL (+0.20%)

Implementation:
- Intrusive LIFO tcache layer (L1) before UnifiedCache
- TLS per-class bins (head pointer + count)
- Intrusive next pointers (via tiny_next_store/load SSOT)
- Cap: 64 blocks per class (default)
- ENV: HAKMEM_TINY_TCACHE=0/1 (default: 0, OFF)

A/B Test Results (Mixed 10-run):
- Baseline (TCACHE=0): 51,083,379 ops/s
- Optimized (TCACHE=1): 51,186,838 ops/s
- Mean delta: +0.20% (below +1.0% GO threshold)
- Median delta: +0.59%

Verdict: NEUTRAL - Freeze as research box (default OFF)

Root Cause (v1 wiring incomplete):
- Free side pushes to tcache via unified_cache_push()
- Alloc hot path (tiny_hot_alloc_fast) doesn't consume tcache
- tcache becomes "sink" without alloc-side pop → ROI not measurable

Files:
- Created: core/box/tiny_tcache_{env_box,box}.h, tiny_tcache_env_box.c
- Modified: core/front/tiny_unified_cache.h (integration)
- Modified: core/bench_profile.h (refresh sync)
- Modified: Makefile (build integration)
- Results: docs/analysis/PHASE14_POINTER_CHASE_REDUCTION_1_AB_TEST_RESULTS.md
- v2 Instructions: docs/analysis/PHASE14_POINTER_CHASE_REDUCTION_2_NEXT_INSTRUCTIONS.md

Next: Phase 14 v2 (connect tcache to tiny_front_hot_box alloc/free hot path)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 01:28:50 +09:00
+								#include "box/tiny_tcache_env_box.h"  // tiny_tcache_env_refresh_from_env (Phase 14 v1)
-												Phase 15 v1: UnifiedCache FIFO→LIFO NEUTRAL (-0.70% Mixed, +0.42% C7)

Transform existing array-based UnifiedCache from FIFO ring to LIFO stack.

A/B Results:
- Mixed (16-1024B): -0.70% (52,965,966 → 52,593,948 ops/s)
- C7-only (1025-2048B): +0.42% (78,010,783 → 78,335,509 ops/s)

Verdict: NEUTRAL (both below +1.0% GO threshold) - freeze as research box

Implementation:
- L0 ENV gate: tiny_unified_lifo_env_box.{h,c} (HAKMEM_TINY_UNIFIED_LIFO=0/1)
- L1 LIFO ops: tiny_unified_lifo_box.h (unified_cache_try_pop/push_lifo)
- L2 integration: tiny_front_hot_box.h (mode check at entry)
- Reuses existing slots[] array (no intrusive pointers)

Root Causes:
1. Mode check overhead (tiny_unified_lifo_enabled() call)
2. Minimal LIFO vs FIFO locality delta in practice
3. Existing FIFO ring already well-optimized

Bonus Fix: LTO bug for tiny_c7_preserve_header_enabled() (Phase 13/14 latent issue)
- Converted static inline to extern + non-inline implementation
- Fixes undefined reference during LTO linking

Design: docs/analysis/PHASE15_UNIFIEDCACHE_LIFO_1_DESIGN.md
Results: docs/analysis/PHASE15_UNIFIEDCACHE_LIFO_1_AB_TEST_RESULTS.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 02:19:26 +09:00
+								#include "box/tiny_unified_lifo_env_box.h"  // tiny_unified_lifo_env_refresh_from_env (Phase 15 v1)
-												Phase 16 v1 NEUTRAL, Phase 17 Case B confirmed, Phase 18 design added

## Phase 16 v1: Front FastLane Alloc LEGACY Direct — NEUTRAL (+0.62%)

Target: Reduce alloc-side fixed costs by adding LEGACY direct path to
FastLane entry, mirroring Phase 9/10 free-side winning pattern.

Result: +0.62% on Mixed (below +1.0% GO threshold) → NEUTRAL, freeze as
research box (default OFF).

Critical issue: Initial impl crashed (segfault) for C4-C7. Root cause:
unified_cache_refill() incompatibility. Safety fix: Limited to C0-C3
only (matching existing dualhot pattern).

Files:
- core/box/front_fastlane_alloc_legacy_direct_env_box.{h,c} (new)
- core/box/front_fastlane_box.h (LEGACY direct path, lines 93-119)
- core/bench_profile.h (env refresh sync)
- Makefile (new obj)
- docs/analysis/PHASE16_*.md (design/results/instructions)

ENV: HAKMEM_FRONT_FASTLANE_ALLOC_LEGACY_DIRECT=0 (default OFF, opt-in)

Verdict: Research box frozen. Phase 14-16 plateau confirms dispatch/
routing optimization ROI is exhausted post-Phase-6 FastLane collapse.

---

## Phase 17: FORCE_LIBC Gap Validation — Case B Confirmed

Purpose: Validate the "system malloc is faster" observation using same-binary
A/B testing to separate allocator-logic differences from the binary layout penalty.

Method:
- Same-binary toggle: HAKMEM_FORCE_LIBC_ALLOC=0/1 (bench_random_mixed_hakmem)
- System binary: bench_random_mixed_system (21K separate binary)
- Perf stat: Hardware counter analysis (I-cache, cycles, instructions)

Result: **Case B confirmed** — the allocator difference is negligible; the layout penalty dominates.

Gap breakdown (Mixed, 20M iters, ws=400):
- hakmem (FORCE_LIBC=0): 48.12M ops/s
- libc (FORCE_LIBC=1, same binary): 48.31M ops/s → +0.39% (noise level)
- system binary (21K): 83.85M ops/s → +73.57% vs libc, +74.26% vs hakmem

Perf stat (200M iters):
- I-cache misses: 153K (hakmem) → 68K (system) = -55% (smoking gun)
- Cycles: 17.9B → 10.2B = -43%
- Instructions: 41.3B → 21.5B = -48%
- Binary size: 653K → 21K (30x difference)

Root cause: Binary size (30x) causes I-cache thrashing. Code bloat >>
algorithmic efficiency.

Conclusion: Phase 12's "system malloc 1.6x faster" was real, but
misattributed. Gap is layout/I-cache, NOT allocator algorithm.

Files:
- docs/analysis/PHASE17_*.md (results/instructions)
- scripts/run_mixed_10_cleanenv.sh (Phase 9/10 defaults aligned)

Next: Phase 18 Hot Text Isolation (layout optimization, not algorithm opt)

---

## Phase 18: Hot Text Isolation — Design Added

Purpose: Reduce I-cache misses + instruction footprint via layout control
(binary optimization, not allocator algorithm changes).

Strategy (v1 → v2 progression):

v1 (TU split + hot/cold attrs + optional gc-sections):
- Target: +2% throughput (GO threshold, realistic for layout tweaks)
- Secondary: I-cache -10%, instructions -5% (direction confirmation)
- Risk: Low (reversible via build knob)
- Expected: +0-2% (NEUTRAL likely, but validates approach)

v2 (BENCH_MINIMAL compile-out):
- Target: +10-20% throughput (本命)
- Method: Conditional compilation removes stats/ENV/debug from hot path
- Expected: Instruction count -30-40% → significant I-cache improvement

Files:
- docs/analysis/PHASE18_*.md (design/instructions)
- CURRENT_TASK.md (Phase 17 complete, Phase 18 v1/v2 plan)

Build gate: HOT_TEXT_ISOLATION=0/1 (Makefile knob)

Next: Implement Phase 18 v1 (TU split first, BENCH_MINIMAL if v1 NEUTRAL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 05:25:47 +09:00
+								#include "box/front_fastlane_alloc_legacy_direct_env_box.h"  // front_fastlane_alloc_legacy_direct_env_refresh_from_env (Phase 16 v1)
-												Phase 17 v2 (FORCE_LIBC fix) + Phase 19-1b (FastLane Direct) — GO (+5.88%)

## Phase 17 v2: FORCE_LIBC Gap Validation Fix

**Critical bug fix**: Phase 17 v1 の測定が壊れていた

**Problem**: HAKMEM_FORCE_LIBC_ALLOC=1 が FastLane より後でしか見えず、
same-binary A/B が実質 "hakmem vs hakmem" になっていた（+0.39% 誤測定）

**Fix**: core/box/hak_wrappers.inc.h:171 と :645 に g_force_libc_alloc==1 の
early bypass を追加、__libc_malloc/__libc_free に最初に直行

**Result**: 正しい同一バイナリ A/B 測定
- hakmem (FORCE_LIBC=0): 48.99M ops/s
- libc (FORCE_LIBC=1): 79.72M ops/s (+62.7%)
- system binary: 88.06M ops/s (+10.5% vs libc)

**Gap 分解**:
- Allocator 差: +62.7% (主戦場)
- Layout penalty: +10.5% (副次的)

**Conclusion**: Case A 確定 (allocator dominant, NOT layout)
Phase 17 v1 の Case B 判定は誤り。

Files:
- docs/analysis/PHASE17_FORCE_LIBC_GAP_VALIDATION_1_AB_TEST_RESULTS.md (v2)
- docs/analysis/PHASE17_FORCE_LIBC_GAP_VALIDATION_1_NEXT_INSTRUCTIONS.md (updated)

---

## Phase 19: FastLane Instruction Reduction Analysis

**Goal**: libc との instruction gap (-35% instructions, -56% branches) を削減

**perf stat 分析** (FORCE_LIBC=0 vs 1, 200M ops):
- hakmem: 209.09 instructions/op, 52.33 branches/op
- libc: 135.92 instructions/op, 22.93 branches/op
- Delta: +73.17 instructions/op (+53.8%), +29.40 branches/op (+128.2%)

**Hot path** (perf report):
- front_fastlane_try_free: 23.97% cycles
- malloc wrapper: 23.84% cycles
- free wrapper: 6.82% cycles
- **Wrapper overhead: ~55% of all cycles**

**Reduction candidates**:
- A: Wrapper layer 削除 (-17.5 inst/op, +10-15% 期待)
- B: ENV snapshot 統合 (-10.0 inst/op, +5-8%)
- C: Stats 削除 (-5.0 inst/op, +3-5%)
- D: Header inline (-4.0 inst/op, +2-3%)
- E: Route fast path (-3.5 inst/op, +2-3%)

Files:
- docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_1_DESIGN.md
- docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_2_NEXT_INSTRUCTIONS.md

---

## Phase 19-1b: FastLane Direct — GO (+5.88%)

**Strategy**: Wrapper layer を bypass し、core allocator を直接呼ぶ
- free() → free_tiny_fast() (not free_tiny_fast_hot)
- malloc() → malloc_tiny_fast()

**Phase 19-1 が NO-GO (-3.81%) だった原因**:
1. __builtin_expect(fastlane_direct_enabled(), 0) が逆効果（A/B 不公平）
2. free_tiny_fast_hot() が誤選択（free_tiny_fast() が勝ち筋）

**Phase 19-1b の修正**:
1. __builtin_expect() 削除
2. free_tiny_fast() を直接呼び出し

**Result** (Mixed, 10-run, 20M iters, ws=400):
- Baseline (FASTLANE_DIRECT=0): 49.17M ops/s
- Optimized (FASTLANE_DIRECT=1): 52.06M ops/s
- **Delta: +5.88%** (GO 基準 +5% クリア)

**perf stat** (200M iters):
- Instructions/op: 199.90 → 169.45 (-30.45, -15.23%)
- Branches/op: 51.49 → 41.52 (-9.97, -19.36%)
- Cycles/op: 88.88 → 84.37 (-4.51, -5.07%)
- I-cache miss: 111K → 98K (-11.79%)

**Trade-offs** (acceptable):
- iTLB miss: +41.46% (front-end cost)
- dTLB miss: +29.15% (backend cost)
- Overall gain (+5.88%) outweighs costs

**Implementation**:
1. **ENV gate**: core/box/fastlane_direct_env_box.{h,c}
   - HAKMEM_FASTLANE_DIRECT=0/1 (default: 0, opt-in)
   - Single _Atomic global (wrapper キャッシュ問題を解決)

2. **Wrapper 修正**: core/box/hak_wrappers.inc.h
   - malloc: direct call to malloc_tiny_fast() when FASTLANE_DIRECT=1
   - free: direct call to free_tiny_fast() when FASTLANE_DIRECT=1
   - Safety: !g_initialized では direct 使わない、fallback 維持

3. **Preset 昇格**: core/bench_profile.h:88
   - bench_setenv_default("HAKMEM_FASTLANE_DIRECT", "1")
   - Comment: +5.88% proven on Mixed, 10-run

4. **cleanenv 更新**: scripts/run_mixed_10_cleanenv.sh:22
   - HAKMEM_FASTLANE_DIRECT=${HAKMEM_FASTLANE_DIRECT:-1}
   - Phase 9/10 と同様に昇格

**Verdict**: GO — 本線採用、プリセット昇格完了

**Rollback**: HAKMEM_FASTLANE_DIRECT=0 で既存 FastLane path に戻る

Files:
- core/box/fastlane_direct_env_box.{h,c} (new)
- core/box/hak_wrappers.inc.h (modified)
- core/bench_profile.h (preset promotion)
- scripts/run_mixed_10_cleanenv.sh (ENV default aligned)
- Makefile (new obj)
- docs/analysis/PHASE19_1B_FASTLANE_DIRECT_REVISED_AB_TEST_RESULTS.md

---

## Cumulative Performance

- Baseline (all optimizations OFF): ~40M ops/s (estimated)
- Current (Phase 19-1b): 52.06M ops/s
- **Cumulative gain: ~+30% from baseline**

Remaining gap to libc (79.72M):
- Current: 52.06M ops/s
- Target: 79.72M ops/s
- **Gap: +53.2%** (was +62.7% before Phase 19-1b)

Next: Phase 19-2 (ENV snapshot consolidation, +5-8% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 11:28:40 +09:00
+								#include "box/fastlane_direct_env_box.h"  // fastlane_direct_env_refresh_from_env (Phase 19-1)
-												Phase 6: promote Front FastLane (default ON)

											
										
										
											2025-12-14 16:28:23 +09:00
+								#endif
 								// env が未設定のときだけ既定値を入れる
 								static inline void bench_setenv_default(const char* key, const char* val) {
 								  if (getenv(key) != NULL) return;
 								  static void* (*real_malloc)(size_t) = NULL;
 								  static int (*real_putenv)(char*) = NULL;
 								  if (!real_malloc) {
 								    real_malloc = (void* (*)(size_t))dlsym(RTLD_NEXT, "malloc");
 								    if (!real_malloc) real_malloc = malloc;
 								  }
 								  if (!real_putenv) {
 								    real_putenv = (int (*)(char*))dlsym(RTLD_NEXT, "putenv");
 								    if (!real_putenv) real_putenv = putenv;
 								  }
 								  size_t klen = strlen(key);
 								  size_t vlen = strlen(val);
 								  char* buf = (char*)real_malloc(klen + vlen + 2);
 								  if (!buf) return;
 								  memcpy(buf, key, klen);
 								  buf[klen] = '=';
 								  memcpy(buf + klen + 1, val, vlen);
 								  buf[klen + 1 + vlen] = '\0';
 								  {
 								    char msg[256];
 								    int n = snprintf(msg, sizeof(msg), "[bench_profile] set %s=%s\n", key, val);
 								    if (n > 0) {
 								      if (n > (int)sizeof(msg)) n = (int)sizeof(msg);
 								      ssize_t w = write(2, msg, (size_t)n);
 								      (void)w;
 								    }
 								  }
 								  real_putenv(buf);  // takes ownership; do not free
 								}
 								// ベンチ専用: HAKMEM_PROFILE に応じて ENV をプリセットする
 								static inline void bench_apply_profile(void) {
 								  const char* p = getenv("HAKMEM_PROFILE");
 								  if (!p || !*p) return;
 									if (strcmp(p, "MIXED_TINYV3_C7_SAFE") == 0) {
 								    bench_setenv_default("HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE");
 								    bench_setenv_default("HAKMEM_TINY_C7_HOT", "1");
 								    bench_setenv_default("HAKMEM_TINY_HOTHEAP_V2", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V4_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V4_CLASSES", "0x0");
 								    bench_setenv_default("HAKMEM_TINY_PTR_FAST_CLASSIFY_V4_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_SMALL_SEGMENT_V4_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_POOL_V2_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_TINY_FRONT_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_TINY_FRONT_V3_LUT_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_TINY_PTR_FAST_CLASSIFY_ENABLED", "1");
 									    // Phase FREE-TINY-FAST-DUALHOT-1: C0-C3 direct fast free (skip policy snapshot)
 									    bench_setenv_default("HAKMEM_FREE_TINY_FAST_HOTCOLD", "1");
 									    // Phase 2 B4: Wrapper hot/cold split (malloc/free wrapper shape)
 									    bench_setenv_default("HAKMEM_WRAP_SHAPE", "1");
 									    // Phase 4 E1: ENV Snapshot Consolidation (+3.92% proven on Mixed)
 									    bench_setenv_default("HAKMEM_ENV_SNAPSHOT", "1");
 									    // Phase 5 E4-1: Free wrapper ENV snapshot (+3.51% proven on Mixed, 10-run)
 									    bench_setenv_default("HAKMEM_FREE_WRAPPER_ENV_SNAPSHOT", "1");
 									    // Phase 5 E4-2: Malloc wrapper ENV snapshot (+21.83% proven on Mixed, 10-run)
 									    bench_setenv_default("HAKMEM_MALLOC_WRAPPER_ENV_SNAPSHOT", "1");
 									    // Phase 5 E5-1: Free Tiny Direct Path (+3.35% proven on Mixed, 10-run)
 									    bench_setenv_default("HAKMEM_FREE_TINY_DIRECT", "1");
-												Phase 6-2: Promote Front FastLane Free DeDup (default ON)

Results:
- A/B test: +5.18% on Mixed (10-run, clean env)
- Baseline: 46.68M ops/s
- Optimized: 49.10M ops/s
- Improvement: +2.42M ops/s (+5.18%)

Strategy:
- Eliminate duplicate header validation in front_fastlane_try_free()
- Direct call to free_tiny_fast() when dedup enabled
- Single validation path (no redundant checks)

Success factors:
1. Complete duplicate elimination (free path optimization)
2. Free path importance (50% of Mixed workload)
3. Improved execution stability (CV: 1.00% → 0.58%)

Phase 6 cumulative:
- Phase 6-1 FastLane: +11.13%
- Phase 6-2 Free DeDup: +5.18%
- Total: ~+16-17% from baseline (multiplicative effect)

Promotion:
- Default: HAKMEM_FRONT_FASTLANE_FREE_DEDUP=1 (opt-out)
- Added to MIXED_TINYV3_C7_SAFE preset
- Added to C6_HEAVY_LEGACY_POOLV1 preset
- Rollback: HAKMEM_FRONT_FASTLANE_FREE_DEDUP=0

Files modified:
- core/box/front_fastlane_env_box.h: default 0 → 1
- core/bench_profile.h: added to presets
- CURRENT_TASK.md: Phase 6-2 GO result

Health check: PASSED (all profiles)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-14 17:38:21 +09:00
+									    // Phase 6-1: Front FastLane (Layer Collapse) (+11.13% proven on Mixed, 10-run)
-												Phase 6: promote Front FastLane (default ON)

											
										
										
											2025-12-14 16:28:23 +09:00
+									    bench_setenv_default("HAKMEM_FRONT_FASTLANE", "1");
-												Phase 6-2: Promote Front FastLane Free DeDup (default ON)

Results:
- A/B test: +5.18% on Mixed (10-run, clean env)
- Baseline: 46.68M ops/s
- Optimized: 49.10M ops/s
- Improvement: +2.42M ops/s (+5.18%)

Strategy:
- Eliminate duplicate header validation in front_fastlane_try_free()
- Direct call to free_tiny_fast() when dedup enabled
- Single validation path (no redundant checks)

Success factors:
1. Complete duplicate elimination (free path optimization)
2. Free path importance (50% of Mixed workload)
3. Improved execution stability (CV: 1.00% → 0.58%)

Phase 6 cumulative:
- Phase 6-1 FastLane: +11.13%
- Phase 6-2 Free DeDup: +5.18%
- Total: ~+16-17% from baseline (multiplicative effect)

Promotion:
- Default: HAKMEM_FRONT_FASTLANE_FREE_DEDUP=1 (opt-out)
- Added to MIXED_TINYV3_C7_SAFE preset
- Added to C6_HEAVY_LEGACY_POOLV1 preset
- Rollback: HAKMEM_FRONT_FASTLANE_FREE_DEDUP=0

Files modified:
- core/box/front_fastlane_env_box.h: default 0 → 1
- core/bench_profile.h: added to presets
- CURRENT_TASK.md: Phase 6-2 GO result

Health check: PASSED (all profiles)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-14 17:38:21 +09:00
+									    // Phase 6-2: Front FastLane Free DeDup (+5.18% proven on Mixed, 10-run)
 									    bench_setenv_default("HAKMEM_FRONT_FASTLANE_FREE_DEDUP", "1");
-												Phase 17 v2 (FORCE_LIBC fix) + Phase 19-1b (FastLane Direct) — GO (+5.88%)

## Phase 17 v2: FORCE_LIBC Gap Validation Fix

**Critical bug fix**: Phase 17 v1 の測定が壊れていた

**Problem**: HAKMEM_FORCE_LIBC_ALLOC=1 が FastLane より後でしか見えず、
same-binary A/B が実質 "hakmem vs hakmem" になっていた（+0.39% 誤測定）

**Fix**: core/box/hak_wrappers.inc.h:171 と :645 に g_force_libc_alloc==1 の
early bypass を追加、__libc_malloc/__libc_free に最初に直行

**Result**: 正しい同一バイナリ A/B 測定
- hakmem (FORCE_LIBC=0): 48.99M ops/s
- libc (FORCE_LIBC=1): 79.72M ops/s (+62.7%)
- system binary: 88.06M ops/s (+10.5% vs libc)

**Gap 分解**:
- Allocator 差: +62.7% (主戦場)
- Layout penalty: +10.5% (副次的)

**Conclusion**: Case A 確定 (allocator dominant, NOT layout)
Phase 17 v1 の Case B 判定は誤り。

Files:
- docs/analysis/PHASE17_FORCE_LIBC_GAP_VALIDATION_1_AB_TEST_RESULTS.md (v2)
- docs/analysis/PHASE17_FORCE_LIBC_GAP_VALIDATION_1_NEXT_INSTRUCTIONS.md (updated)

---

## Phase 19: FastLane Instruction Reduction Analysis

**Goal**: libc との instruction gap (-35% instructions, -56% branches) を削減

**perf stat 分析** (FORCE_LIBC=0 vs 1, 200M ops):
- hakmem: 209.09 instructions/op, 52.33 branches/op
- libc: 135.92 instructions/op, 22.93 branches/op
- Delta: +73.17 instructions/op (+53.8%), +29.40 branches/op (+128.2%)

**Hot path** (perf report):
- front_fastlane_try_free: 23.97% cycles
- malloc wrapper: 23.84% cycles
- free wrapper: 6.82% cycles
- **Wrapper overhead: ~55% of all cycles**

**Reduction candidates**:
- A: Wrapper layer 削除 (-17.5 inst/op, +10-15% 期待)
- B: ENV snapshot 統合 (-10.0 inst/op, +5-8%)
- C: Stats 削除 (-5.0 inst/op, +3-5%)
- D: Header inline (-4.0 inst/op, +2-3%)
- E: Route fast path (-3.5 inst/op, +2-3%)

Files:
- docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_1_DESIGN.md
- docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_2_NEXT_INSTRUCTIONS.md

---

## Phase 19-1b: FastLane Direct — GO (+5.88%)

**Strategy**: Wrapper layer を bypass し、core allocator を直接呼ぶ
- free() → free_tiny_fast() (not free_tiny_fast_hot)
- malloc() → malloc_tiny_fast()

**Phase 19-1 が NO-GO (-3.81%) だった原因**:
1. __builtin_expect(fastlane_direct_enabled(), 0) が逆効果（A/B 不公平）
2. free_tiny_fast_hot() が誤選択（free_tiny_fast() が勝ち筋）

**Phase 19-1b の修正**:
1. __builtin_expect() 削除
2. free_tiny_fast() を直接呼び出し

**Result** (Mixed, 10-run, 20M iters, ws=400):
- Baseline (FASTLANE_DIRECT=0): 49.17M ops/s
- Optimized (FASTLANE_DIRECT=1): 52.06M ops/s
- **Delta: +5.88%** (GO 基準 +5% クリア)

**perf stat** (200M iters):
- Instructions/op: 199.90 → 169.45 (-30.45, -15.23%)
- Branches/op: 51.49 → 41.52 (-9.97, -19.36%)
- Cycles/op: 88.88 → 84.37 (-4.51, -5.07%)
- I-cache miss: 111K → 98K (-11.79%)

**Trade-offs** (acceptable):
- iTLB miss: +41.46% (front-end cost)
- dTLB miss: +29.15% (backend cost)
- Overall gain (+5.88%) outweighs costs

**Implementation**:
1. **ENV gate**: core/box/fastlane_direct_env_box.{h,c}
   - HAKMEM_FASTLANE_DIRECT=0/1 (default: 0, opt-in)
   - Single _Atomic global (wrapper キャッシュ問題を解決)

2. **Wrapper 修正**: core/box/hak_wrappers.inc.h
   - malloc: direct call to malloc_tiny_fast() when FASTLANE_DIRECT=1
   - free: direct call to free_tiny_fast() when FASTLANE_DIRECT=1
   - Safety: !g_initialized では direct 使わない、fallback 維持

3. **Preset 昇格**: core/bench_profile.h:88
   - bench_setenv_default("HAKMEM_FASTLANE_DIRECT", "1")
   - Comment: +5.88% proven on Mixed, 10-run

4. **cleanenv 更新**: scripts/run_mixed_10_cleanenv.sh:22
   - HAKMEM_FASTLANE_DIRECT=${HAKMEM_FASTLANE_DIRECT:-1}
   - Phase 9/10 と同様に昇格

**Verdict**: GO — 本線採用、プリセット昇格完了

**Rollback**: HAKMEM_FASTLANE_DIRECT=0 で既存 FastLane path に戻る

Files:
- core/box/fastlane_direct_env_box.{h,c} (new)
- core/box/hak_wrappers.inc.h (modified)
- core/bench_profile.h (preset promotion)
- scripts/run_mixed_10_cleanenv.sh (ENV default aligned)
- Makefile (new obj)
- docs/analysis/PHASE19_1B_FASTLANE_DIRECT_REVISED_AB_TEST_RESULTS.md

---

## Cumulative Performance

- Baseline (all optimizations OFF): ~40M ops/s (estimated)
- Current (Phase 19-1b): 52.06M ops/s
- **Cumulative gain: ~+30% from baseline**

Remaining gap to libc (79.72M):
- Current: 52.06M ops/s
- Target: 79.72M ops/s
- **Gap: +53.2%** (was +62.7% before Phase 19-1b)

Next: Phase 19-2 (ENV snapshot consolidation, +5-8% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 11:28:40 +09:00
+									    // Phase 19-1b: FastLane Direct (wrapper layer bypass, +5.88% proven on Mixed, 10-run)
 									    bench_setenv_default("HAKMEM_FASTLANE_DIRECT", "1");
-												Phase 9: FREE-TINY-FAST MONO DUALHOT (GO +2.72%)

Results:
- A/B test: +2.72% on Mixed (10-run, clean env)
- Baseline: 48.89M ops/s
- Optimized: 50.22M ops/s
- Improvement: +1.33M ops/s (+2.72%)
- Stability: Standard deviation reduced by 60.8% (2.44M → 955K ops/s)

Strategy:
- Transplant C0-C3 "second hot" path to monolithic free_tiny_fast()
- Early-exit within monolithic (no hot/cold split)
- FastLane free now benefits from C0-C3 direct path

Success factors:
1. Performance improvement: +2.72% (2.7x GO threshold)
2. Stability improvement: 2.6x more stable (stdev 60.8% reduction)
3. Learned from Phase 7 failure:
   - Phase 7: Function split (hot/cold) → NO-GO
   - Phase 9: Early-exit within monolithic → GO
4. FastLane free compatibility: C0-C3 direct path now works with FastLane
5. Policy snapshot overhead reduction: C0-C3 (48% of Mixed) skip route lookup

Implementation:
- Patch 1: ENV gate box (free_tiny_fast_mono_dualhot_env_box.h)
  - ENV: HAKMEM_FREE_TINY_FAST_MONO_DUALHOT=0/1 (default 0)
  - Probe window: 64 (avoid bench_profile putenv race)
- Patch 2: Early-exit in free_tiny_fast() (malloc_tiny_fast.h)
  - Conditions: class_idx <= 3, !LARSON_FIX, route==LEGACY
  - Direct call: tiny_legacy_fallback_free_base()
- Patch 3: Visibility (free_path_stats_box.h)
  - mono_dualhot_hit counter (compile-out in release)
- Patch 4: cleanenv extension (run_mixed_10_cleanenv.sh)
  - ENV leak protection

Files modified:
- core/bench_profile.h: add to MIXED_TINYV3_C7_SAFE preset
- core/front/malloc_tiny_fast.h: early-exit insertion
- core/box/free_path_stats_box.h: counter
- core/box/free_tiny_fast_mono_dualhot_env_box.h: NEW (ENV gate)
- scripts/run_mixed_10_cleanenv.sh: ENV leak protection

Health check: PASSED (all profiles)

Promotion: Added to MIXED_TINYV3_C7_SAFE preset (default ON, opt-out)

Rollback: HAKMEM_FREE_TINY_FAST_MONO_DUALHOT=0

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-14 19:16:49 +09:00
+									    // Phase 9: FREE-TINY-FAST MONO DUALHOT (+2.72% proven on Mixed, 10-run)
 									    bench_setenv_default("HAKMEM_FREE_TINY_FAST_MONO_DUALHOT", "1");
-												Phase 10: FREE-TINY-FAST MONO LEGACY DIRECT (GO +1.89%)

Results:
- A/B test: +1.89% on Mixed (10-run, clean env)
- Baseline: 51.96M ops/s
- Optimized: 52.94M ops/s
- Improvement: +984K ops/s (+1.89%)
- C6-heavy verification: +7.86% (nonlegacy_mask works correctly, no misfires)

Strategy:
- Extend Phase 9 (C0-C3 DUALHOT) to C4-C7 LEGACY DIRECT
- Fail-Fast principle: Never misclassify MID/ULTRA/V7 as LEGACY
- nonlegacy_mask: Cached at init, hot path uses single bit operation

Success factors:
1. Performance improvement: +1.89% (1.9x GO threshold)
2. Safety verified: nonlegacy_mask prevents MID v3 misfire in C6-heavy
3. Phase 9 coexistence: C0-C3 (Phase 9) + C4-C7 (Phase 10) = full LEGACY coverage
4. Minimal overhead: Single bit operation in hot path (mask & (1u<<class))

Implementation:
- Patch 1: ENV gate box (free_tiny_fast_mono_legacy_direct_env_box.h)
  - ENV: HAKMEM_FREE_TINY_FAST_MONO_LEGACY_DIRECT=0/1 (default 0)
  - nonlegacy_mask cached (reuses free_policy_fast_v2_nonlegacy_mask())
  - Probe window: 64 (avoid bench_profile putenv race)
- Patch 2: Early-exit in free_tiny_fast() (malloc_tiny_fast.h)
  - Conditions: !nonlegacy_mask, route==LEGACY, !LARSON_FIX, done==1
  - Direct call: tiny_legacy_fallback_free_base()
- Patch 3: Visibility (free_path_stats_box.h)
  - mono_legacy_direct_hit counter (compile-out in release)
- Patch 4: cleanenv extension (run_mixed_10_cleanenv.sh)
  - ENV leak protection

Safety verification (C6-heavy):
- OFF: 19.75M ops/s
- ON: 21.30M ops/s (+7.86%)
- nonlegacy_mask correctly excludes C6 (MID v3 active)
- Improvement from C0-C5, C7 direct path acceleration

Files modified:
- core/bench_profile.h: add to MIXED_TINYV3_C7_SAFE preset
- core/front/malloc_tiny_fast.h: early-exit insertion
- core/box/free_path_stats_box.h: counter
- core/box/free_tiny_fast_mono_legacy_direct_env_box.h: NEW (ENV gate + nonlegacy_mask)
- scripts/run_mixed_10_cleanenv.sh: ENV leak protection

Health check: PASSED (all profiles)

Promotion: Added to MIXED_TINYV3_C7_SAFE preset (default ON, opt-out)

Rollback: HAKMEM_FREE_TINY_FAST_MONO_LEGACY_DIRECT=0

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-14 20:09:40 +09:00
+									    // Phase 10: FREE-TINY-FAST MONO LEGACY DIRECT (+1.89% proven on Mixed, 10-run)
 									    bench_setenv_default("HAKMEM_FREE_TINY_FAST_MONO_LEGACY_DIRECT", "1");
-												Phase 6: promote Front FastLane (default ON)

											
										
										
											2025-12-14 16:28:23 +09:00
+									    // Phase 4-4: C6 ULTRA free+alloc 統合を有効化 (default OFF, manual opt-in)
 									    bench_setenv_default("HAKMEM_TINY_C6_ULTRA_FREE_ENABLED", "0");
 									    // Phase MID-V3: Mid/Pool HotBox v3
 									    // Mixed (16–1024B) では MID_V3(C6) が大きく遅くなるため、デフォルト OFF に固定。
 									    // C6-heavy プロファイル側でのみ ON を推奨する（C6-heavy のみ最適化対象）。
 									    bench_setenv_default("HAKMEM_MID_V3_ENABLED", "0");
 									    bench_setenv_default("HAKMEM_MID_V3_CLASSES", "0x0");
 										    // Phase 2 B3: Routing branch shape optimization (LIKELY on LEGACY, cold helper for rare routes)
 										    bench_setenv_default("HAKMEM_TINY_ALLOC_ROUTE_SHAPE", "1");
 										    // Phase 3 C3: Static routing (policy_snapshot bypass, +2.2% proven)
 										    bench_setenv_default("HAKMEM_TINY_STATIC_ROUTE", "1");
 										    // Phase 3 D1: Free route cache (TLS cache for free path routing, +2.19% proven)
 										    bench_setenv_default("HAKMEM_FREE_STATIC_ROUTE", "1");
 								  } else if (strcmp(p, "C6_HEAVY_LEGACY_POOLV1") == 0) {
 								    bench_setenv_default("HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE");
 								    bench_setenv_default("HAKMEM_TINY_C6_HOT", "0");
 								    bench_setenv_default("HAKMEM_TINY_HOTHEAP_V2", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80");
 								    bench_setenv_default("HAKMEM_POOL_V2_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_POOL_V1_FLATTEN_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_MID_DESC_CACHE_ENABLED", "1");
 								    // Phase 4-4: C6 ULTRA free+alloc 統合を有効化 (default OFF, manual opt-in)
 								    bench_setenv_default("HAKMEM_TINY_C6_ULTRA_FREE_ENABLED", "0");
 								    // Phase MID-V3: Mid/Pool HotBox v3 (257-768B, C6 only)
 								    bench_setenv_default("HAKMEM_MID_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_MID_V3_CLASSES", "0x40");
-												Phase 6-2: Promote Front FastLane Free DeDup (default ON)

Results:
- A/B test: +5.18% on Mixed (10-run, clean env)
- Baseline: 46.68M ops/s
- Optimized: 49.10M ops/s
- Improvement: +2.42M ops/s (+5.18%)

Strategy:
- Eliminate duplicate header validation in front_fastlane_try_free()
- Direct call to free_tiny_fast() when dedup enabled
- Single validation path (no redundant checks)

Success factors:
1. Complete duplicate elimination (free path optimization)
2. Free path importance (50% of Mixed workload)
3. Improved execution stability (CV: 1.00% → 0.58%)

Phase 6 cumulative:
- Phase 6-1 FastLane: +11.13%
- Phase 6-2 Free DeDup: +5.18%
- Total: ~+16-17% from baseline (multiplicative effect)

Promotion:
- Default: HAKMEM_FRONT_FASTLANE_FREE_DEDUP=1 (opt-out)
- Added to MIXED_TINYV3_C7_SAFE preset
- Added to C6_HEAVY_LEGACY_POOLV1 preset
- Rollback: HAKMEM_FRONT_FASTLANE_FREE_DEDUP=0

Files modified:
- core/box/front_fastlane_env_box.h: default 0 → 1
- core/bench_profile.h: added to presets
- CURRENT_TASK.md: Phase 6-2 GO result

Health check: PASSED (all profiles)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-14 17:38:21 +09:00
+									    // Phase 6-1: Front FastLane (Layer Collapse) (+11.13% proven on Mixed, 10-run)
-												Phase 6: promote Front FastLane (default ON)

											
										
										
											2025-12-14 16:28:23 +09:00
+									    bench_setenv_default("HAKMEM_FRONT_FASTLANE", "1");
-												Phase 6-2: Promote Front FastLane Free DeDup (default ON)

Results:
- A/B test: +5.18% on Mixed (10-run, clean env)
- Baseline: 46.68M ops/s
- Optimized: 49.10M ops/s
- Improvement: +2.42M ops/s (+5.18%)

Strategy:
- Eliminate duplicate header validation in front_fastlane_try_free()
- Direct call to free_tiny_fast() when dedup enabled
- Single validation path (no redundant checks)

Success factors:
1. Complete duplicate elimination (free path optimization)
2. Free path importance (50% of Mixed workload)
3. Improved execution stability (CV: 1.00% → 0.58%)

Phase 6 cumulative:
- Phase 6-1 FastLane: +11.13%
- Phase 6-2 Free DeDup: +5.18%
- Total: ~+16-17% from baseline (multiplicative effect)

Promotion:
- Default: HAKMEM_FRONT_FASTLANE_FREE_DEDUP=1 (opt-out)
- Added to MIXED_TINYV3_C7_SAFE preset
- Added to C6_HEAVY_LEGACY_POOLV1 preset
- Rollback: HAKMEM_FRONT_FASTLANE_FREE_DEDUP=0

Files modified:
- core/box/front_fastlane_env_box.h: default 0 → 1
- core/bench_profile.h: added to presets
- CURRENT_TASK.md: Phase 6-2 GO result

Health check: PASSED (all profiles)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-14 17:38:21 +09:00
+									    // Phase 6-2: Front FastLane Free DeDup (+5.18% proven on Mixed, 10-run)
 									    bench_setenv_default("HAKMEM_FRONT_FASTLANE_FREE_DEDUP", "1");
-												Phase 17 v2 (FORCE_LIBC fix) + Phase 19-1b (FastLane Direct) — GO (+5.88%)

## Phase 17 v2: FORCE_LIBC Gap Validation Fix

**Critical bug fix**: Phase 17 v1 の測定が壊れていた

**Problem**: HAKMEM_FORCE_LIBC_ALLOC=1 が FastLane より後でしか見えず、
same-binary A/B が実質 "hakmem vs hakmem" になっていた（+0.39% 誤測定）

**Fix**: core/box/hak_wrappers.inc.h:171 と :645 に g_force_libc_alloc==1 の
early bypass を追加、__libc_malloc/__libc_free に最初に直行

**Result**: 正しい同一バイナリ A/B 測定
- hakmem (FORCE_LIBC=0): 48.99M ops/s
- libc (FORCE_LIBC=1): 79.72M ops/s (+62.7%)
- system binary: 88.06M ops/s (+10.5% vs libc)

**Gap 分解**:
- Allocator 差: +62.7% (主戦場)
- Layout penalty: +10.5% (副次的)

**Conclusion**: Case A 確定 (allocator dominant, NOT layout)
Phase 17 v1 の Case B 判定は誤り。

Files:
- docs/analysis/PHASE17_FORCE_LIBC_GAP_VALIDATION_1_AB_TEST_RESULTS.md (v2)
- docs/analysis/PHASE17_FORCE_LIBC_GAP_VALIDATION_1_NEXT_INSTRUCTIONS.md (updated)

---

## Phase 19: FastLane Instruction Reduction Analysis

**Goal**: libc との instruction gap (-35% instructions, -56% branches) を削減

**perf stat 分析** (FORCE_LIBC=0 vs 1, 200M ops):
- hakmem: 209.09 instructions/op, 52.33 branches/op
- libc: 135.92 instructions/op, 22.93 branches/op
- Delta: +73.17 instructions/op (+53.8%), +29.40 branches/op (+128.2%)

**Hot path** (perf report):
- front_fastlane_try_free: 23.97% cycles
- malloc wrapper: 23.84% cycles
- free wrapper: 6.82% cycles
- **Wrapper overhead: ~55% of all cycles**

**Reduction candidates**:
- A: Wrapper layer 削除 (-17.5 inst/op, +10-15% 期待)
- B: ENV snapshot 統合 (-10.0 inst/op, +5-8%)
- C: Stats 削除 (-5.0 inst/op, +3-5%)
- D: Header inline (-4.0 inst/op, +2-3%)
- E: Route fast path (-3.5 inst/op, +2-3%)

Files:
- docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_1_DESIGN.md
- docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_2_NEXT_INSTRUCTIONS.md

---

## Phase 19-1b: FastLane Direct — GO (+5.88%)

**Strategy**: Wrapper layer を bypass し、core allocator を直接呼ぶ
- free() → free_tiny_fast() (not free_tiny_fast_hot)
- malloc() → malloc_tiny_fast()

**Phase 19-1 が NO-GO (-3.81%) だった原因**:
1. __builtin_expect(fastlane_direct_enabled(), 0) が逆効果（A/B 不公平）
2. free_tiny_fast_hot() が誤選択（free_tiny_fast() が勝ち筋）

**Phase 19-1b の修正**:
1. __builtin_expect() 削除
2. free_tiny_fast() を直接呼び出し

**Result** (Mixed, 10-run, 20M iters, ws=400):
- Baseline (FASTLANE_DIRECT=0): 49.17M ops/s
- Optimized (FASTLANE_DIRECT=1): 52.06M ops/s
- **Delta: +5.88%** (GO 基準 +5% クリア)

**perf stat** (200M iters):
- Instructions/op: 199.90 → 169.45 (-30.45, -15.23%)
- Branches/op: 51.49 → 41.52 (-9.97, -19.36%)
- Cycles/op: 88.88 → 84.37 (-4.51, -5.07%)
- I-cache miss: 111K → 98K (-11.79%)

**Trade-offs** (acceptable):
- iTLB miss: +41.46% (front-end cost)
- dTLB miss: +29.15% (backend cost)
- Overall gain (+5.88%) outweighs costs

**Implementation**:
1. **ENV gate**: core/box/fastlane_direct_env_box.{h,c}
   - HAKMEM_FASTLANE_DIRECT=0/1 (default: 0, opt-in)
   - Single _Atomic global (wrapper キャッシュ問題を解決)

2. **Wrapper 修正**: core/box/hak_wrappers.inc.h
   - malloc: direct call to malloc_tiny_fast() when FASTLANE_DIRECT=1
   - free: direct call to free_tiny_fast() when FASTLANE_DIRECT=1
   - Safety: !g_initialized では direct 使わない、fallback 維持

3. **Preset 昇格**: core/bench_profile.h:88
   - bench_setenv_default("HAKMEM_FASTLANE_DIRECT", "1")
   - Comment: +5.88% proven on Mixed, 10-run

4. **cleanenv 更新**: scripts/run_mixed_10_cleanenv.sh:22
   - HAKMEM_FASTLANE_DIRECT=${HAKMEM_FASTLANE_DIRECT:-1}
   - Phase 9/10 と同様に昇格

**Verdict**: GO — 本線採用、プリセット昇格完了

**Rollback**: HAKMEM_FASTLANE_DIRECT=0 で既存 FastLane path に戻る

Files:
- core/box/fastlane_direct_env_box.{h,c} (new)
- core/box/hak_wrappers.inc.h (modified)
- core/bench_profile.h (preset promotion)
- scripts/run_mixed_10_cleanenv.sh (ENV default aligned)
- Makefile (new obj)
- docs/analysis/PHASE19_1B_FASTLANE_DIRECT_REVISED_AB_TEST_RESULTS.md

---

## Cumulative Performance

- Baseline (all optimizations OFF): ~40M ops/s (estimated)
- Current (Phase 19-1b): 52.06M ops/s
- **Cumulative gain: ~+30% from baseline**

Remaining gap to libc (79.72M):
- Current: 52.06M ops/s
- Target: 79.72M ops/s
- **Gap: +53.2%** (was +62.7% before Phase 19-1b)

Next: Phase 19-2 (ENV snapshot consolidation, +5-8% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 11:28:40 +09:00
+									    // Phase 19-1b: FastLane Direct (wrapper layer bypass)
 									    bench_setenv_default("HAKMEM_FASTLANE_DIRECT", "1");
-												Phase 6: promote Front FastLane (default ON)

											
										
										
											2025-12-14 16:28:23 +09:00
+									    // Phase 2 B3: Routing branch shape optimization (LIKELY on LEGACY, cold helper for rare routes)
 									    bench_setenv_default("HAKMEM_TINY_ALLOC_ROUTE_SHAPE", "1");
 								  } else if (strcmp(p, "C6_V7_STUB") == 0) {
 								    // Phase v7-1: C6-only v7 stub 実験用（MID v3 fallback）
 								    bench_setenv_default("HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE");
 								    bench_setenv_default("HAKMEM_TINY_C6_HOT", "0");
 								    bench_setenv_default("HAKMEM_TINY_HOTHEAP_V2", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80");
 								    bench_setenv_default("HAKMEM_POOL_V2_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_MID_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_MID_V3_CLASSES", "0x40");
 								    // v7 stub ON (C6-only)
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V7_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V7_CLASSES", "0x40");
 								  } else if (strcmp(p, "C6_HEAVY_LEGACY_POOLV1_FLATTEN") == 0) {
 								    // LEGACY mid/smallmid ベンチ専用（C7_SAFE では使用しない）
 								    bench_setenv_default("HAKMEM_TINY_HEAP_PROFILE", "LEGACY");
 								    bench_setenv_default("HAKMEM_TINY_C6_HOT", "0");
 								    bench_setenv_default("HAKMEM_TINY_HOTHEAP_V2", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80");
 								    bench_setenv_default("HAKMEM_POOL_V2_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_POOL_V1_FLATTEN_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_POOL_V1_FLATTEN_STATS", "1");
 								    bench_setenv_default("HAKMEM_POOL_ZERO_MODE", "header");
 								  } else if (strcmp(p, "DEBUG_TINY_FRONT_PERF") == 0) {
 								    bench_setenv_default("HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE");
 								    bench_setenv_default("HAKMEM_TINY_C7_HOT", "1");
 								    bench_setenv_default("HAKMEM_TINY_HOTHEAP_V2", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80");
 								    bench_setenv_default("HAKMEM_POOL_V2_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_TINY_FRONT_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_TINY_FRONT_V3_LUT_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_TINY_PTR_FAST_CLASSIFY_ENABLED", "1");
 								  } else if (strcmp(p, "C6_SMALL_HEAP_V3_EXPERIMENT") == 0) {
 								    // C6 を SmallObject v3 に載せる研究用（標準では使用しない）
 								    bench_setenv_default("HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE");
 								    bench_setenv_default("HAKMEM_TINY_C6_HOT", "1");
 								    bench_setenv_default("HAKMEM_TINY_HOTHEAP_V2", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_CLASSES", "0x40"); // C6 only
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V4_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V4_CLASSES", "0x0");
 								    bench_setenv_default("HAKMEM_POOL_V2_ENABLED", "0");
 									  } else if (strcmp(p, "C6_SMALL_HEAP_V4_EXPERIMENT") == 0) {
 								    // C6 を SmallObject v4 に載せる研究用（標準では使用しない）
 								    bench_setenv_default("HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE");
 								    bench_setenv_default("HAKMEM_TINY_C6_HOT", "1");
 								    bench_setenv_default("HAKMEM_TINY_HOTHEAP_V2", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_ENABLED", "0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V3_CLASSES", "0x0");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V4_ENABLED", "1");
 								    bench_setenv_default("HAKMEM_SMALL_HEAP_V4_CLASSES", "0x40"); // C6 only
 									    bench_setenv_default("HAKMEM_POOL_V2_ENABLED", "0");
 									  }
 								#ifdef USE_HAKMEM
 									  // Phase 3 C3 Step 0: Ensure policy snapshot reflects final ENV after putenv defaults.
 									  small_policy_v7_bump_version();
 									  // Phase 2 B4: Sync wrapper ENV cache after bench_profile putenv defaults.
 									  wrapper_env_refresh_from_env();
 									  // Phase 3 C3: Sync static route cache after bench_profile putenv defaults.
 									  tiny_static_route_refresh_from_env();
 									  // Phase 4 E1: Sync ENV snapshot cache after bench_profile putenv defaults.
 									  hakmem_env_snapshot_refresh_from_env();
-												Phase 8: FREE-STATIC-ROUTE ENV Cache Hardening (GO +2.61%)

Results:
- A/B test: +2.61% on Mixed (10-run, clean env)
- Baseline: 49.26M ops/s
- Optimized: 50.55M ops/s
- Improvement: +1.29M ops/s (+2.61%)

Strategy:
- Fix the ENV cache accident (the gate value was being cached before main())
- Add refresh mechanism to sync with bench_profile putenv
- Ensure Phase 3 D1 optimization works reliably

Success factors:
1. Performance improvement: +2.61% (existing win-box now reliable)
2. ENV cache accident fixed: refresh mechanism works correctly
3. Standard deviation improved: 867K → 336K ops/s (61% reduction)
4. Baseline quality improved: existing optimization now guaranteed

Implementation:
- Patch 1: Make ENV gate refreshable (tiny_free_route_cache_env_box.{h,c})
  - Changed static int to extern _Atomic int
  - Added tiny_free_static_route_refresh_from_env()
- Patch 2: Integrate refresh into bench_profile.h
  - Call refresh after bench_setenv_default() group
- Patch 3: Update Makefile for new .c file

ENV cache fix verification:
- [FREE_STATIC_ROUTE] enabled appears twice (refresh working)
- bench_profile putenv now reliably reflected

Files modified:
- core/box/tiny_free_route_cache_env_box.h: extern + refresh API
- core/box/tiny_free_route_cache_env_box.c: NEW (global state + refresh)
- core/bench_profile.h: add refresh call
- Makefile: add new .o file

Health check: PASSED (all profiles)

Rollback: HAKMEM_FREE_STATIC_ROUTE=0 or revert Patch 1/2

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-14 18:49:08 +09:00
+									  // Phase 8: Sync free static route ENV cache after bench_profile putenv defaults.
 									  tiny_free_static_route_refresh_from_env();
-												Phase 13 v1 + E5-2 retest: Both NEUTRAL, freeze as research boxes

Phase 13 v1: Header Write Elimination (C7 preserve header)
- Verdict: NEUTRAL (+0.78%)
- Implementation: HAKMEM_TINY_C7_PRESERVE_HEADER ENV gate (default OFF)
- Makes C7 nextptr offset conditional (0→1 when enabled)
- 4-point matrix A/B test results:
  * Case A (baseline): 51.49M ops/s
  * Case B (WRITE_ONCE=1): 52.07M ops/s (+1.13%)
  * Case C (C7_PRESERVE=1): 51.36M ops/s (-0.26%)
  * Case D (both): 51.89M ops/s (+0.78% NEUTRAL)
- Action: Freeze as research box (default OFF, manual opt-in)

Phase 5 E5-2: Header Write-Once retest (promotion test)
- Verdict: NEUTRAL (+0.54%)
- Motivation: Phase 13 Case B showed +1.13%, re-tested with dedicated 20-run
- Results (20-run):
  * Case A (baseline): 51.10M ops/s
  * Case B (WRITE_ONCE=1): 51.37M ops/s (+0.54%)
- Previous test: +0.45% (consistent with NEUTRAL)
- Action: Keep as research box (default OFF, manual opt-in)

Key findings:
- Header write tax optimization shows consistent NEUTRAL results
- Neither Phase 13 v1 nor E5-2 reaches GO threshold (+1.0%)
- Both implemented as reversible ENV gates for future research

Files changed:
- New: core/box/tiny_c7_preserve_header_env_box.{c,h}
- Modified: core/box/tiny_layout_box.h (C7 offset conditional)
- Modified: core/tiny_nextptr.h, core/box/tiny_header_box.h (comments)
- Modified: core/bench_profile.h (refresh sync)
- Modified: Makefile (add new .o files)
- Modified: scripts/run_mixed_10_cleanenv.sh (add C7_PRESERVE ENV)
- Docs: PHASE13_*, PHASE5_E5_2_HEADER_WRITE_ONCE_* (design/results)

Next: Phase 14 (Pointer-chase reduction, tcache-style intrusive LIFO)

🤖 Generated with Claude Code

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 00:32:25 +09:00
+									  // Phase 13 v1: Sync C7 preserve header ENV cache after bench_profile putenv defaults.
 									  tiny_c7_preserve_header_env_refresh_from_env();
-												Phase 14 v1: Pointer-Chase Reduction (tcache) NEUTRAL (+0.20%)

Implementation:
- Intrusive LIFO tcache layer (L1) before UnifiedCache
- TLS per-class bins (head pointer + count)
- Intrusive next pointers (via tiny_next_store/load SSOT)
- Cap: 64 blocks per class (default)
- ENV: HAKMEM_TINY_TCACHE=0/1 (default: 0, OFF)

A/B Test Results (Mixed 10-run):
- Baseline (TCACHE=0): 51,083,379 ops/s
- Optimized (TCACHE=1): 51,186,838 ops/s
- Mean delta: +0.20% (below +1.0% GO threshold)
- Median delta: +0.59%

Verdict: NEUTRAL - Freeze as research box (default OFF)

Root Cause (v1 wiring incomplete):
- Free side pushes to tcache via unified_cache_push()
- Alloc hot path (tiny_hot_alloc_fast) doesn't consume tcache
- tcache becomes "sink" without alloc-side pop → ROI not measurable

Files:
- Created: core/box/tiny_tcache_{env_box,box}.h, tiny_tcache_env_box.c
- Modified: core/front/tiny_unified_cache.h (integration)
- Modified: core/bench_profile.h (refresh sync)
- Modified: Makefile (build integration)
- Results: docs/analysis/PHASE14_POINTER_CHASE_REDUCTION_1_AB_TEST_RESULTS.md
- v2 Instructions: docs/analysis/PHASE14_POINTER_CHASE_REDUCTION_2_NEXT_INSTRUCTIONS.md

Next: Phase 14 v2 (connect tcache to tiny_front_hot_box alloc/free hot path)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 01:28:50 +09:00
+									  // Phase 14 v1: Sync tcache ENV cache after bench_profile putenv defaults.
 									  tiny_tcache_env_refresh_from_env();
-												Phase 15 v1: UnifiedCache FIFO→LIFO NEUTRAL (-0.70% Mixed, +0.42% C7)

Transform existing array-based UnifiedCache from FIFO ring to LIFO stack.

A/B Results:
- Mixed (16-1024B): -0.70% (52,965,966 → 52,593,948 ops/s)
- C7-only (1025-2048B): +0.42% (78,010,783 → 78,335,509 ops/s)

Verdict: NEUTRAL (both below +1.0% GO threshold) - freeze as research box

Implementation:
- L0 ENV gate: tiny_unified_lifo_env_box.{h,c} (HAKMEM_TINY_UNIFIED_LIFO=0/1)
- L1 LIFO ops: tiny_unified_lifo_box.h (unified_cache_try_pop/push_lifo)
- L2 integration: tiny_front_hot_box.h (mode check at entry)
- Reuses existing slots[] array (no intrusive pointers)

Root Causes:
1. Mode check overhead (tiny_unified_lifo_enabled() call)
2. Minimal LIFO vs FIFO locality delta in practice
3. Existing FIFO ring already well-optimized

Bonus Fix: LTO bug for tiny_c7_preserve_header_enabled() (Phase 13/14 latent issue)
- Converted static inline to extern + non-inline implementation
- Fixes undefined reference during LTO linking

Design: docs/analysis/PHASE15_UNIFIEDCACHE_LIFO_1_DESIGN.md
Results: docs/analysis/PHASE15_UNIFIEDCACHE_LIFO_1_AB_TEST_RESULTS.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 02:19:26 +09:00
+									  // Phase 15 v1: Sync LIFO ENV cache after bench_profile putenv defaults.
 									  tiny_unified_lifo_env_refresh_from_env();
-												Phase 16 v1 NEUTRAL, Phase 17 Case B confirmed, Phase 18 design added

## Phase 16 v1: Front FastLane Alloc LEGACY Direct — NEUTRAL (+0.62%)

Target: Reduce alloc-side fixed costs by adding LEGACY direct path to
FastLane entry, mirroring Phase 9/10 free-side winning pattern.

Result: +0.62% on Mixed (below +1.0% GO threshold) → NEUTRAL, freeze as
research box (default OFF).

Critical issue: Initial impl crashed (segfault) for C4-C7. Root cause:
unified_cache_refill() incompatibility. Safety fix: Limited to C0-C3
only (matching existing dualhot pattern).

Files:
- core/box/front_fastlane_alloc_legacy_direct_env_box.{h,c} (new)
- core/box/front_fastlane_box.h (LEGACY direct path, lines 93-119)
- core/bench_profile.h (env refresh sync)
- Makefile (new obj)
- docs/analysis/PHASE16_*.md (design/results/instructions)

ENV: HAKMEM_FRONT_FASTLANE_ALLOC_LEGACY_DIRECT=0 (default OFF, opt-in)

Verdict: Research box frozen. Phase 14-16 plateau confirms dispatch/
routing optimization ROI is exhausted post-Phase-6 FastLane collapse.

---

## Phase 17: FORCE_LIBC Gap Validation — Case B Confirmed

Purpose: Validate "system malloc faster" observation using same-binary
A/B testing to isolate the allocator logic difference from the binary layout penalty.

Method:
- Same-binary toggle: HAKMEM_FORCE_LIBC_ALLOC=0/1 (bench_random_mixed_hakmem)
- System binary: bench_random_mixed_system (21K separate binary)
- Perf stat: Hardware counter analysis (I-cache, cycles, instructions)

Result: **Case B confirmed** — allocator difference is negligible; layout penalty dominates.

Gap breakdown (Mixed, 20M iters, ws=400):
- hakmem (FORCE_LIBC=0): 48.12M ops/s
- libc (FORCE_LIBC=1, same binary): 48.31M ops/s → +0.39% (noise level)
- system binary (21K): 83.85M ops/s → +73.57% vs libc, +74.26% vs hakmem

Perf stat (200M iters):
- I-cache misses: 153K (hakmem) → 68K (system) = -55% (smoking gun)
- Cycles: 17.9B → 10.2B = -43%
- Instructions: 41.3B → 21.5B = -48%
- Binary size: 653K → 21K (30x difference)

Root cause: Binary size (30x) causes I-cache thrashing. Code bloat >>
algorithmic efficiency.

Conclusion: Phase 12's "system malloc 1.6x faster" was real, but
misattributed. Gap is layout/I-cache, NOT allocator algorithm.

Files:
- docs/analysis/PHASE17_*.md (results/instructions)
- scripts/run_mixed_10_cleanenv.sh (Phase 9/10 defaults aligned)

Next: Phase 18 Hot Text Isolation (layout optimization, not algorithm opt)

---

## Phase 18: Hot Text Isolation — Design Added

Purpose: Reduce I-cache misses + instruction footprint via layout control
(binary optimization, not allocator algorithm changes).

Strategy (v1 → v2 progression):

v1 (TU split + hot/cold attrs + optional gc-sections):
- Target: +2% throughput (GO threshold, realistic for layout tweaks)
- Secondary: I-cache -10%, instructions -5% (direction confirmation)
- Risk: Low (reversible via build knob)
- Expected: +0-2% (NEUTRAL likely, but validates approach)

v2 (BENCH_MINIMAL compile-out):
- Target: +10-20% throughput (the main bet)
- Method: Conditional compilation removes stats/ENV/debug from hot path
- Expected: Instruction count -30-40% → significant I-cache improvement

Files:
- docs/analysis/PHASE18_*.md (design/instructions)
- CURRENT_TASK.md (Phase 17 complete, Phase 18 v1/v2 plan)

Build gate: HOT_TEXT_ISOLATION=0/1 (Makefile knob)

Next: Implement Phase 18 v1 (TU split first, BENCH_MINIMAL if v1 NEUTRAL)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 05:25:47 +09:00
+									  // Phase 16 v1: Sync LEGACY direct ENV cache after bench_profile putenv defaults.
 									  front_fastlane_alloc_legacy_direct_env_refresh_from_env();
-												Phase 17 v2 (FORCE_LIBC fix) + Phase 19-1b (FastLane Direct) — GO (+5.88%)

## Phase 17 v2: FORCE_LIBC Gap Validation Fix

**Critical bug fix**: Phase 17 v1 の測定が壊れていた

**Problem**: HAKMEM_FORCE_LIBC_ALLOC=1 が FastLane より後でしか見えず、
same-binary A/B が実質 "hakmem vs hakmem" になっていた（+0.39% 誤測定）

**Fix**: core/box/hak_wrappers.inc.h:171 と :645 に g_force_libc_alloc==1 の
early bypass を追加、__libc_malloc/__libc_free に最初に直行

**Result**: 正しい同一バイナリ A/B 測定
- hakmem (FORCE_LIBC=0): 48.99M ops/s
- libc (FORCE_LIBC=1): 79.72M ops/s (+62.7%)
- system binary: 88.06M ops/s (+10.5% vs libc)

**Gap 分解**:
- Allocator 差: +62.7% (主戦場)
- Layout penalty: +10.5% (副次的)

**Conclusion**: Case A 確定 (allocator dominant, NOT layout)
Phase 17 v1 の Case B 判定は誤り。

Files:
- docs/analysis/PHASE17_FORCE_LIBC_GAP_VALIDATION_1_AB_TEST_RESULTS.md (v2)
- docs/analysis/PHASE17_FORCE_LIBC_GAP_VALIDATION_1_NEXT_INSTRUCTIONS.md (updated)

---

## Phase 19: FastLane Instruction Reduction Analysis

**Goal**: libc との instruction gap (-35% instructions, -56% branches) を削減

**perf stat 分析** (FORCE_LIBC=0 vs 1, 200M ops):
- hakmem: 209.09 instructions/op, 52.33 branches/op
- libc: 135.92 instructions/op, 22.93 branches/op
- Delta: +73.17 instructions/op (+53.8%), +29.40 branches/op (+128.2%)

**Hot path** (perf report):
- front_fastlane_try_free: 23.97% cycles
- malloc wrapper: 23.84% cycles
- free wrapper: 6.82% cycles
- **Wrapper overhead: ~55% of all cycles**

**Reduction candidates**:
- A: Wrapper layer 削除 (-17.5 inst/op, +10-15% 期待)
- B: ENV snapshot 統合 (-10.0 inst/op, +5-8%)
- C: Stats 削除 (-5.0 inst/op, +3-5%)
- D: Header inline (-4.0 inst/op, +2-3%)
- E: Route fast path (-3.5 inst/op, +2-3%)

Files:
- docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_1_DESIGN.md
- docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_2_NEXT_INSTRUCTIONS.md

---

## Phase 19-1b: FastLane Direct — GO (+5.88%)

**Strategy**: Wrapper layer を bypass し、core allocator を直接呼ぶ
- free() → free_tiny_fast() (not free_tiny_fast_hot)
- malloc() → malloc_tiny_fast()

**Phase 19-1 が NO-GO (-3.81%) だった原因**:
1. __builtin_expect(fastlane_direct_enabled(), 0) が逆効果（A/B 不公平）
2. free_tiny_fast_hot() が誤選択（free_tiny_fast() が勝ち筋）

**Phase 19-1b の修正**:
1. __builtin_expect() 削除
2. free_tiny_fast() を直接呼び出し

**Result** (Mixed, 10-run, 20M iters, ws=400):
- Baseline (FASTLANE_DIRECT=0): 49.17M ops/s
- Optimized (FASTLANE_DIRECT=1): 52.06M ops/s
- **Delta: +5.88%** (GO 基準 +5% クリア)

**perf stat** (200M iters):
- Instructions/op: 199.90 → 169.45 (-30.45, -15.23%)
- Branches/op: 51.49 → 41.52 (-9.97, -19.36%)
- Cycles/op: 88.88 → 84.37 (-4.51, -5.07%)
- I-cache miss: 111K → 98K (-11.79%)

**Trade-offs** (acceptable):
- iTLB miss: +41.46% (front-end cost)
- dTLB miss: +29.15% (backend cost)
- Overall gain (+5.88%) outweighs costs

**Implementation**:
1. **ENV gate**: core/box/fastlane_direct_env_box.{h,c}
   - HAKMEM_FASTLANE_DIRECT=0/1 (default: 0, opt-in)
   - Single _Atomic global (wrapper キャッシュ問題を解決)

2. **Wrapper 修正**: core/box/hak_wrappers.inc.h
   - malloc: direct call to malloc_tiny_fast() when FASTLANE_DIRECT=1
   - free: direct call to free_tiny_fast() when FASTLANE_DIRECT=1
   - Safety: !g_initialized では direct 使わない、fallback 維持

3. **Preset 昇格**: core/bench_profile.h:88
   - bench_setenv_default("HAKMEM_FASTLANE_DIRECT", "1")
   - Comment: +5.88% proven on Mixed, 10-run

4. **cleanenv 更新**: scripts/run_mixed_10_cleanenv.sh:22
   - HAKMEM_FASTLANE_DIRECT=${HAKMEM_FASTLANE_DIRECT:-1}
   - Phase 9/10 と同様に昇格

**Verdict**: GO — 本線採用、プリセット昇格完了

**Rollback**: HAKMEM_FASTLANE_DIRECT=0 で既存 FastLane path に戻る

Files:
- core/box/fastlane_direct_env_box.{h,c} (new)
- core/box/hak_wrappers.inc.h (modified)
- core/bench_profile.h (preset promotion)
- scripts/run_mixed_10_cleanenv.sh (ENV default aligned)
- Makefile (new obj)
- docs/analysis/PHASE19_1B_FASTLANE_DIRECT_REVISED_AB_TEST_RESULTS.md

---

## Cumulative Performance

- Baseline (all optimizations OFF): ~40M ops/s (estimated)
- Current (Phase 19-1b): 52.06M ops/s
- **Cumulative gain: ~+30% from baseline**

Remaining gap to libc (79.72M):
- Current: 52.06M ops/s
- Target: 79.72M ops/s
- **Gap: +53.2%** (was +62.7% before Phase 19-1b)

Next: Phase 19-2 (ENV snapshot consolidation, +5-8% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

											
										
										
											2025-12-15 11:28:40 +09:00
+									  // Phase 19-1: Sync FastLane Direct ENV cache after bench_profile putenv defaults.
 									  fastlane_direct_env_refresh_from_env();
-												Phase 6: promote Front FastLane (default ON)

											
										
										
											2025-12-14 16:28:23 +09:00
+								#endif
 									}