Phase 74-1/74-2: UnifiedCache LOCALIZE optimization (P1 frozen, NEUTRAL -0.87%)

Phase 74-1 (ENV-gated LOCALIZE):
- Result: +0.50% (NEUTRAL)
- Runtime branch overhead caused instructions/branches to increase
- Diagnosis: the runtime branch tax outweighs the intended optimization

Phase 74-2 (compile-time LOCALIZE):
- Result: -0.87% (NEUTRAL, P1 frozen)
- Removed runtime branch → instructions -0.6%, branches -2.3% ✓
- But cache-misses +86% (register pressure/spill) → net loss
- Conclusion: LOCALIZE itself works, but is fragile to cache effects

Key finding:
- Dependency chain reduction (LOCALIZE) has low ROI due to cache-miss sensitivity
- P1 (LOCALIZE) frozen at default OFF
- Next: Phase 74-3 (P0: FASTAPI) - move branches outside hot loop

Files:
- core/hakmem_build_flags.h: HAKMEM_TINY_UC_LOCALIZE_COMPILED flag
- core/box/tiny_unified_cache_hitpath_env_box.h: ENV gate (frozen)
- core/front/tiny_unified_cache.h: compile-time #if blocks
- docs/analysis/PHASE74_*: Design, instructions, results
- CURRENT_TASK.md: P1 marked frozen; instructions for the next step (P0)

Also includes:
- Phase 69 refill tuning results (archived docs)
- PERFORMANCE_TARGETS_SCORECARD.md: Phase 69 baseline update
- PHASE70_REFILL_OBSERVABILITY_PREREQS_SSOT.md: Route banner docs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-18 07:47:44 +09:00
parent e4baa1894f
commit e9b97e9d8e
14 changed files with 840 additions and 210 deletions

View File

@ -31,6 +31,7 @@
#include "../box/ptr_type_box.h" // Phantom pointer types (BASE/USER)
#include "../box/tiny_front_config_box.h" // Phase 8-Step1: Config macros
#include "../box/tiny_tcache_box.h" // Phase 14 v1: Intrusive LIFO tcache
#include "../box/tiny_unified_cache_hitpath_env_box.h" // Phase 74: LOCALIZE ENV gate
// ============================================================================
// Phase 3 C2 Patch 3: Bounds Check Compile-out
@ -247,6 +248,30 @@ static inline int unified_cache_push(int class_idx, hak_base_ptr_t base) {
}
#endif
// Phase 74-2: LOCALIZE optimization (compile-time gate, no runtime branch)
#if HAKMEM_TINY_UC_LOCALIZE_COMPILED
// LOCALIZE: Load head/tail/mask once into locals to avoid reload dependency chains
uint16_t head = cache->head;
uint16_t tail = cache->tail;
uint16_t mask = cache->mask;
uint16_t next_tail = (tail + 1) & mask;
if (__builtin_expect(next_tail == head, 0)) {
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_full[class_idx]++;
#endif
return 0; // Full
}
cache->slots[tail] = base_raw;
cache->tail = next_tail;
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_push[class_idx]++;
#endif
return 1; // SUCCESS (LOCALIZE path)
#else
// Default path: Original implementation
uint16_t next_tail = (cache->tail + 1) & cache->mask;
// Full check (leave 1 slot empty to distinguish full/empty)
@ -266,6 +291,7 @@ static inline int unified_cache_push(int class_idx, hak_base_ptr_t base) {
#endif
return 1; // SUCCESS (2-3 cache misses total)
#endif // HAKMEM_TINY_UC_LOCALIZE_COMPILED
}
// ============================================================================
@ -316,6 +342,37 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
}
#endif
// Phase 74-2: LOCALIZE optimization (compile-time gate, no runtime branch)
#if HAKMEM_TINY_UC_LOCALIZE_COMPILED
// LOCALIZE: Load head/tail/mask once into locals to avoid reload dependency chains
uint16_t head = cache->head;
uint16_t tail = cache->tail;
uint16_t mask = cache->mask;
if (__builtin_expect(head != tail, 1)) {
void* base = cache->slots[head];
cache->head = (head + 1) & mask;
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_hit[class_idx]++;
#endif
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
if (__builtin_expect(unified_cache_measure_check(), 0)) {
atomic_fetch_add_explicit(&g_unified_cache_hits_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_hits_by_class[class_idx],
1, memory_order_relaxed);
}
#endif
return HAK_BASE_FROM_RAW(base); // Hit! (LOCALIZE path)
}
// Cache miss → Batch refill from SuperSlab
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_miss[class_idx]++;
#endif
return unified_cache_refill(class_idx);
#else
// Default path: Original implementation
// Tcache miss/disabled/compiled-out → try pop from array cache (fast path)
if (__builtin_expect(cache->head != cache->tail, 1)) {
void* base = cache->slots[cache->head]; // 1 cache miss (array access)
@ -341,6 +398,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
g_unified_cache_miss[class_idx]++;
#endif
return unified_cache_refill(class_idx); // Refill + return first block (BASE)
#endif // HAKMEM_TINY_UC_LOCALIZE_COMPILED
}
#endif // HAK_FRONT_TINY_UNIFIED_CACHE_H