Phase 83-1 + Allocator Comparison: Switch dispatch fixed (NO-GO +0.32%), PROFILE correction, SCORECARD update

Key changes:
- Phase 83-1: Switch dispatch fixed mode (tiny_inline_slots_switch_dispatch_fixed_box) - NO-GO (marginal +0.32% gain; branch reduction is negligible)
  Reason: the lazy-init pattern is already optimal, and the Phase 78-1 pattern shows diminishing returns

- Allocator comparison baseline update (10-run SSOT, WS=400, ITERS=20M):
  tcmalloc: 115.26M (92.33% of mimalloc)
  jemalloc: 97.39M (77.96% of mimalloc)
  system: 85.20M (68.24% of mimalloc)
  mimalloc: 124.82M (baseline)

- hakmem PROFILE correction in scripts/run_mixed_10_cleanenv.sh and run_allocator_quick_matrix.sh:
  PROFILE is now explicitly set to MIXED_TINYV3_C7_SAFE for hakmem measurements
  Result: the baseline stabilized at 55.53M (44.46% of mimalloc)
  The previous unstable measurement (35.57M) was caused by a profile leak

- Documentation:
  * PERFORMANCE_TARGETS_SCORECARD.md: Reference allocators + M1/M2 milestone status
  * PHASE83_1_SWITCH_DISPATCH_FIXED_RESULTS.md: Phase 83-1 analysis (NO-GO)
  * ALLOCATOR_COMPARISON_QUICK_RUNBOOK.md: Quick comparison procedure
  * ALLOCATOR_COMPARISON_SSOT.md: Detailed SSOT methodology

- M2 milestone status: 44.46% (target 55%, gap -10.54pp) - structural improvements needed

🤖 Generated with Claude Code
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-18 18:50:00 +09:00
parent d5c1113b4c
commit 89a9212700
50 changed files with 4428 additions and 58 deletions

View File

@ -35,6 +35,15 @@
#include "../front/tiny_c6_inline_slots.h" // Phase 75-1: C6 inline slots API
#include "tiny_c5_inline_slots_env_box.h" // Phase 75-2: C5 inline slots ENV gate
#include "../front/tiny_c5_inline_slots.h" // Phase 75-2: C5 inline slots API
#include "tiny_c4_inline_slots_env_box.h" // Phase 76-1: C4 inline slots ENV gate
#include "../front/tiny_c4_inline_slots.h" // Phase 76-1: C4 inline slots API
#include "tiny_c2_local_cache_env_box.h" // Phase 79-1: C2 local cache ENV gate
#include "../front/tiny_c2_local_cache.h" // Phase 79-1: C2 local cache API
#include "tiny_c3_inline_slots_env_box.h" // Phase 77-1: C3 inline slots ENV gate
#include "../front/tiny_c3_inline_slots.h" // Phase 77-1: C3 inline slots API
#include "tiny_inline_slots_fixed_mode_box.h" // Phase 78-1: Optional fixed-mode gating
#include "tiny_inline_slots_switch_dispatch_box.h" // Phase 80-1: Switch dispatch for C4/C5/C6
#include "tiny_inline_slots_switch_dispatch_fixed_box.h" // Phase 83-1: Switch dispatch fixed mode
// ============================================================================
// Branch Prediction Macros (Pointer Safety - Prediction Hints)
@ -114,9 +123,93 @@ __attribute__((always_inline))
static inline void* tiny_hot_alloc_fast(int class_idx) {
extern __thread TinyUnifiedCache g_unified_cache[];
// Phase 80-1: Switch dispatch for C4/C5/C6 (branch reduction optimization)
// Phase 83-1: Per-op branch removed via fixed-mode caching
// C2/C3 excluded (NO-GO from Phase 77-1/79-1)
if (tiny_inline_slots_switch_dispatch_enabled_fast()) {
// Switch mode: Direct jump to case (zero comparison overhead for C4/C5/C6)
switch (class_idx) {
case 4:
if (tiny_c4_inline_slots_enabled_fast()) {
void* base = c4_inline_pop(c4_inline_tls());
if (TINY_HOT_LIKELY(base != NULL)) {
TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
return tiny_header_finalize_alloc(base, class_idx);
#else
return base;
#endif
}
}
break;
case 5:
if (tiny_c5_inline_slots_enabled_fast()) {
void* base = c5_inline_pop(c5_inline_tls());
if (TINY_HOT_LIKELY(base != NULL)) {
TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
return tiny_header_finalize_alloc(base, class_idx);
#else
return base;
#endif
}
}
break;
case 6:
if (tiny_c6_inline_slots_enabled_fast()) {
void* base = c6_inline_pop(c6_inline_tls());
if (TINY_HOT_LIKELY(base != NULL)) {
TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
return tiny_header_finalize_alloc(base, class_idx);
#else
return base;
#endif
}
}
break;
default:
// C0-C3, C7: fall through to unified_cache
break;
}
// Switch mode: fall through to unified_cache after miss
} else {
// If-chain mode (Phase 80-1 baseline): C3/C4/C5/C6 sequential checks
// NOTE: C2 local cache (Phase 79-1 NO-GO) removed from hot path
// Phase 77-1: C3 Inline Slots early-exit (ENV gated)
// Try C3 inline slots FIRST (before C4/C5/C6/unified cache) for class 3
if (class_idx == 3 && tiny_c3_inline_slots_enabled_fast()) {
void* base = c3_inline_pop(c3_inline_tls());
if (TINY_HOT_LIKELY(base != NULL)) {
TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
return tiny_header_finalize_alloc(base, class_idx);
#else
return base;
#endif
}
// C3 inline miss → fall through to C4/C5/C6/unified cache
}
// Phase 76-1: C4 Inline Slots early-exit (ENV gated)
// Try C4 inline slots SECOND (before C5/C6/unified cache) for class 4
if (class_idx == 4 && tiny_c4_inline_slots_enabled_fast()) {
void* base = c4_inline_pop(c4_inline_tls());
if (TINY_HOT_LIKELY(base != NULL)) {
TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
return tiny_header_finalize_alloc(base, class_idx);
#else
return base;
#endif
}
// C4 inline miss → fall through to C5/C6/unified cache
}
// Phase 75-2: C5 Inline Slots early-exit (ENV gated)
// Try C5 inline slots FIRST (before C6 and unified cache) for class 5
if (class_idx == 5 && tiny_c5_inline_slots_enabled()) {
// Try C5 inline slots THIRD (after C3/C4, before C6 and unified cache) for class 5
if (class_idx == 5 && tiny_c5_inline_slots_enabled_fast()) {
void* base = c5_inline_pop(c5_inline_tls());
if (TINY_HOT_LIKELY(base != NULL)) {
TINY_HOT_METRICS_HIT(class_idx);
@ -129,20 +222,21 @@ static inline void* tiny_hot_alloc_fast(int class_idx) {
// C5 inline miss → fall through to C6/unified cache
}
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
// Try C6 inline slots SECOND (before unified cache) for class 6
if (class_idx == 6 && tiny_c6_inline_slots_enabled()) {
void* base = c6_inline_pop(c6_inline_tls());
if (TINY_HOT_LIKELY(base != NULL)) {
TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
return tiny_header_finalize_alloc(base, class_idx);
#else
return base;
#endif
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
// Try C6 inline slots FOURTH (after C3/C4/C5, before unified cache) for class 6
if (class_idx == 6 && tiny_c6_inline_slots_enabled_fast()) {
void* base = c6_inline_pop(c6_inline_tls());
if (TINY_HOT_LIKELY(base != NULL)) {
TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
return tiny_header_finalize_alloc(base, class_idx);
#else
return base;
#endif
}
// C6 inline miss → fall through to unified cache
}
// C6 inline miss → fall through to unified cache
}
} // End of if-chain mode
// TLS cache access (1 cache miss)
// NOTE: Range check removed - caller (hak_tiny_size_to_class) guarantees valid class_idx