2025-11-17 05:29:08 +09:00
|
|
|
|
// malloc_tiny_fast.h - Phase 26: Front Gate Unification (Tiny Fast Path)
|
|
|
|
|
|
//
|
|
|
|
|
|
// Goal: Eliminate 3-layer overhead (malloc → hak_alloc_at → wrapper → tiny_alloc_fast)
|
|
|
|
|
|
// Target: +10-15% performance (11.35M → 12.5-13.5M ops/s)
|
|
|
|
|
|
//
|
|
|
|
|
|
// Design (ChatGPT analysis):
|
|
|
|
|
|
// - Replace: malloc → hak_alloc_at (236 lines) → wrapper (diagnostics) → tiny_alloc_fast
|
|
|
|
|
|
// - With: malloc → malloc_tiny_fast (single-layer, direct to Unified Cache)
|
|
|
|
|
|
// - Preserves: Safety checks (lock depth, initializing, LD_SAFE, jemalloc block)
|
|
|
|
|
|
// - Leverages: Phase 23 Unified Cache (tcache-style, 2-3 cache misses)
|
|
|
|
|
|
//
|
|
|
|
|
|
// Performance:
|
|
|
|
|
|
// - Current overhead: malloc(8.97%) + routing + wrapper(3.63%) + tiny(5.37%) = 17.97%
|
|
|
|
|
|
// - BenchFast ceiling: 8-10 instructions (~1-2% overhead)
|
|
|
|
|
|
// - Gap: ~16%
|
|
|
|
|
|
// - Target: Close half the gap (+10-15% improvement)
|
|
|
|
|
|
//
|
|
|
|
|
|
// ENV Variables:
|
|
|
|
|
|
// HAKMEM_FRONT_GATE_UNIFIED=0  # Disable Front Gate Unification (default: 1, ON; only a leading '0' disables)
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef HAK_FRONT_MALLOC_TINY_FAST_H
|
|
|
|
|
|
#define HAK_FRONT_MALLOC_TINY_FAST_H
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
|
#include <stdio.h>
|
2025-12-08 21:30:21 +09:00
|
|
|
|
#include <stdatomic.h>
|
2025-11-27 11:52:11 +09:00
|
|
|
|
#include <pthread.h> // For pthread_self() in cross-thread check
|
2025-11-17 05:29:08 +09:00
|
|
|
|
#include "../hakmem_build_flags.h"
|
|
|
|
|
|
#include "../hakmem_tiny_config.h" // For TINY_NUM_CLASSES
|
2025-11-27 11:52:11 +09:00
|
|
|
|
#include "../hakmem_super_registry.h" // For cross-thread owner check
|
2025-11-27 12:47:10 +09:00
|
|
|
|
#include "../superslab/superslab_inline.h" // For ss_fast_lookup, slab_index_for (Phase 12)
|
2025-11-27 11:52:11 +09:00
|
|
|
|
#include "../box/ss_slab_meta_box.h" // For ss_slab_meta_owner_tid_low_get
|
|
|
|
|
|
#include "../box/free_remote_box.h" // For tiny_free_remote_box
|
2025-11-17 05:29:08 +09:00
|
|
|
|
#include "tiny_unified_cache.h" // For unified_cache_pop_or_refill
|
|
|
|
|
|
#include "../tiny_region_id.h" // For tiny_region_id_write_header
|
|
|
|
|
|
#include "../hakmem_tiny.h" // For hak_tiny_size_to_class
|
2025-12-13 18:46:11 +09:00
|
|
|
|
#include "../box/tiny_env_box.h" // For tiny_env_cfg() (ENV variables)
|
2025-11-29 11:58:37 +09:00
|
|
|
|
#include "../box/tiny_front_hot_box.h" // Phase 4-Step2: Hot Path Box
|
|
|
|
|
|
#include "../box/tiny_front_cold_box.h" // Phase 4-Step2: Cold Path Box
|
2025-12-07 22:49:28 +09:00
|
|
|
|
#include "../box/tiny_c7_hotbox.h" // Optional: C7 専用ホットボックス
|
|
|
|
|
|
#include "../box/tiny_heap_box.h" // TinyHeap 汎用 Box
|
2025-12-08 21:30:21 +09:00
|
|
|
|
#include "../box/tiny_hotheap_v2_box.h" // TinyHotHeap v2 (Phase31 A/B)
|
2025-12-09 21:50:15 +09:00
|
|
|
|
#include "../box/smallobject_hotbox_v3_box.h" // SmallObject HotHeap v3 skeleton
|
2025-12-10 17:58:42 +09:00
|
|
|
|
#include "../box/smallobject_hotbox_v4_box.h" // SmallObject HotHeap v4 (C7 stub)
|
2025-12-11 03:25:37 +09:00
|
|
|
|
#include "../box/smallobject_hotbox_v5_box.h" // SmallObject HotHeap v5 (C6-only route stub, Phase v5-1)
|
2025-12-11 23:51:48 +09:00
|
|
|
|
#include "../box/smallobject_core_v6_box.h" // SmallObject Core v6 (Phase V6-HDR-2)
|
|
|
|
|
|
#include "../box/smallobject_v6_env_box.h" // SmallObject v6 ENV control (Phase V6-HDR-2)
|
2025-12-12 03:12:28 +09:00
|
|
|
|
#include "../box/smallobject_hotbox_v7_box.h" // SmallObject HotBox v7 stub (Phase v7-1)
|
2025-12-12 03:50:58 +09:00
|
|
|
|
#include "../box/smallobject_policy_v7_box.h" // Phase v7-4: Policy Box
|
2025-12-13 18:46:11 +09:00
|
|
|
|
#include "../box/tiny_static_route_box.h" // Phase 3 C3: Static routing (policy snapshot bypass)
|
2025-12-12 06:52:14 +09:00
|
|
|
|
#include "../box/smallobject_mid_v35_box.h" // Phase v11a-3: MID v3.5 HotBox
|
2025-12-10 22:57:26 +09:00
|
|
|
|
#include "../box/tiny_c7_ultra_box.h" // C7 ULTRA stub (UF-1, delegates to v3)
|
2025-12-11 18:34:27 +09:00
|
|
|
|
#include "../box/tiny_c6_ultra_free_box.h" // Phase 4-2: C6 ULTRA-free (free-only, C6-only)
|
2025-12-11 19:26:51 +09:00
|
|
|
|
#include "../box/tiny_c5_ultra_free_box.h" // Phase 5-1/5-2: C5 ULTRA-free + alloc integration
|
2025-12-11 19:38:27 +09:00
|
|
|
|
#include "../box/tiny_c4_ultra_free_box.h" // Phase 6: C4 ULTRA-free + alloc integration (cap=64)
|
2025-12-12 16:26:42 +09:00
|
|
|
|
#include "../box/tiny_ultra_tls_box.h" // Phase TLS-UNIFY-1: Unified ULTRA TLS API
|
2025-12-11 19:38:27 +09:00
|
|
|
|
#include "../box/tiny_ultra_classes_box.h" // Phase REFACTOR-1: Named constants for C4-C7
|
2025-12-11 19:01:59 +09:00
|
|
|
|
#include "../box/tiny_legacy_fallback_box.h" // Phase REFACTOR-2: Legacy fallback logic unification
|
2025-12-11 19:02:49 +09:00
|
|
|
|
#include "../box/tiny_ptr_convert_box.h" // Phase REFACTOR-3: Inline pointer macro centralization
|
2025-12-09 21:50:15 +09:00
|
|
|
|
#include "../box/tiny_front_v3_env_box.h" // Tiny front v3 snapshot gate
|
2025-12-07 22:49:28 +09:00
|
|
|
|
#include "../box/tiny_heap_env_box.h" // ENV gate for TinyHeap front (A/B)
|
2025-12-08 21:30:21 +09:00
|
|
|
|
#include "../box/tiny_route_env_box.h" // Route snapshot (Heap vs Legacy)
|
|
|
|
|
|
#include "../box/tiny_front_stats_box.h" // Front class distribution counters
|
2025-12-11 18:04:14 +09:00
|
|
|
|
#include "../box/free_path_stats_box.h" // Phase FREE-LEGACY-BREAKDOWN-1: Free path stats
|
2025-12-11 21:32:40 +09:00
|
|
|
|
#include "../box/alloc_gate_stats_box.h" // Phase ALLOC-GATE-OPT-1: Alloc gate stats
|
2025-12-12 18:40:08 +09:00
|
|
|
|
#include "../box/free_policy_fast_v2_box.h" // Phase POLICY-FAST-PATH-V2: Policy snapshot bypass
|
/* NOTE(review): stray VCS commit-message text — fenced as a comment so the header compiles.
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
*/
2025-12-13 03:16:54 +09:00
|
|
|
|
#include "../box/free_tiny_fast_hotcold_env_box.h" // Phase FREE-TINY-FAST-HOTCOLD-OPT-1: ENV control
|
|
|
|
|
|
#include "../box/free_tiny_fast_hotcold_stats_box.h" // Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Stats
|
2025-12-13 19:19:42 +09:00
|
|
|
|
#include "../box/tiny_metadata_cache_hot_box.h" // Phase 3 C2: Policy hot cache (metadata cache optimization)
|
2025-12-13 21:44:00 +09:00
|
|
|
|
#include "../box/tiny_free_route_cache_env_box.h" // Phase 3 D1: Free path route cache
|
/* NOTE(review): stray VCS commit-message text — fenced as a comment so the header compiles.
Phase 4 E1: ENV Snapshot Consolidation - GO (+3.92% avg, +4.01% median)
Target: Consolidate 3 ENV gate TLS reads → 1 TLS read
- tiny_c7_ultra_enabled_env(): 1.28% self
- tiny_front_v3_enabled(): 1.01% self
- tiny_metadata_cache_enabled(): 0.97% self
- Total overhead: 3.26% self (perf profile analysis)
Implementation:
- core/box/hakmem_env_snapshot_box.h (new): ENV snapshot struct & API
- core/box/hakmem_env_snapshot_box.c (new): TLS snapshot implementation
- core/front/malloc_tiny_fast.h: Migrated 5 call sites to snapshot
- core/box/tiny_legacy_fallback_box.h: Migrated 2 call sites
- core/box/tiny_metadata_cache_hot_box.h: Migrated 1 call site
- core/bench_profile.h: Added hakmem_env_snapshot_refresh_from_env()
- Makefile: Added hakmem_env_snapshot_box.o to build
- ENV gate: HAKMEM_ENV_SNAPSHOT=0/1 (default: 0, research box)
A/B Test Results (Mixed, 10-run, 20M iters):
- Baseline (E1=0): 43,617,549 ops/s (avg), 43,562,895 ops/s (median)
- Optimized (E1=1): 45,327,239 ops/s (avg), 45,309,218 ops/s (median)
- Improvement: avg +3.92%, median +4.01%
Decision: GO (+3.92% >= +2.5% threshold)
- Action: Keep as research box (default OFF) for Phase 4
- Next: Consider promotion to default in MIXED_TINYV3_C7_SAFE preset
Design Rationale:
- Shape optimizations (B3, D3) reached saturation (+0.56% NEUTRAL)
- Shift to memory/TLS overhead optimization (new optimization frontier)
- Pattern: Similar to existing tiny_front_v3_snapshot (proven approach)
- Expected: +1-3% from 3.26% ENV overhead → Achieved: +3.92%
Technical Details:
- Consolidation: 3 TLS reads → 1 TLS read (66% reduction)
- Learner interlock: tiny_metadata_cache_eff pre-computed in snapshot
- Version sync: Refreshes on small_policy_v7_version_changed()
- Fallback safety: Existing ENV gates still available when E1=0
Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
*/
2025-12-14 00:59:12 +09:00
|
|
|
|
#include "../box/hakmem_env_snapshot_box.h" // Phase 4 E1: ENV snapshot consolidation
|
2025-12-14 06:44:04 +09:00
|
|
|
|
#include "../box/free_cold_shape_env_box.h" // Phase 5 E5-3a: Free cold path shape optimization
|
|
|
|
|
|
#include "../box/free_cold_shape_stats_box.h" // Phase 5 E5-3a: Free cold shape stats
|
2025-12-14 19:16:49 +09:00
|
|
|
|
#include "../box/free_tiny_fast_mono_dualhot_env_box.h" // Phase 9: MONO DUALHOT ENV gate
|
2025-12-14 20:09:40 +09:00
|
|
|
|
#include "../box/free_tiny_fast_mono_legacy_direct_env_box.h" // Phase 10: MONO LEGACY DIRECT ENV gate
|
/* NOTE(review): stray VCS commit-message text — fenced as a comment so the header compiles.
Phase 54-60: Memory-Lean mode, Balanced mode stabilization, M1 (50%) achievement
(duplicated commit log; see the identical fenced copy retained elsewhere in this file)
- Phase 54-56: Memory-Lean mode (LEAN+OFF prewarm suppression), Balanced mode promoted
  as production default, +1.2% throughput, MIXED_TINYV3_C7_BALANCED preset added
- Phase 57: 60-min soak finalization, RSS drift 0%, PRODUCTION-READY
- Phase 59: 50% recovery baseline rebase, 49.13% of mimalloc (M1 achieved), CV 1.31%
- Phase 60: Alloc pass-down SSOT (NO-GO, -0.46%) — SSOT not applicable where
  early-exit already optimized
Files: ss_mem_lean_env_box.h, ss_release_policy_box.{h,c}, alloc_passdown_ssot_env_box.h,
soak/analysis scripts, Phase 40-60 analysis documents
Design: profile separation (SAFE vs BALANCED), reversible ENV gates, no physical deletions
Lessons: SSOT effective only where redundancy exists; branch prediction extremely
effective; early-exit pattern valuable even when seemingly redundant
Generated with Claude Code
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
*/
2025-12-17 06:24:01 +09:00
|
|
|
|
#include "../box/alloc_passdown_ssot_env_box.h" // Phase 60: Alloc pass-down SSOT
|
2025-11-17 05:29:08 +09:00
|
|
|
|
|
2025-11-27 11:52:11 +09:00
|
|
|
|
// Helper: current thread id (low 32 bits) for owner check
|
|
|
|
|
|
#ifndef TINY_SELF_U32_LOCAL_DEFINED
|
|
|
|
|
|
#define TINY_SELF_U32_LOCAL_DEFINED
|
|
|
|
|
|
static inline uint32_t tiny_self_u32_local(void) {
    // Truncate the opaque pthread_self() handle to its low 32 bits.
    // Used only as a cheap per-thread owner tag for cross-thread checks;
    // the value is never converted back into a pthread_t or dereferenced.
    const uintptr_t raw_tid = (uintptr_t)pthread_self();
    return (uint32_t)raw_tid;
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
/* NOTE(review): stray VCS commit-message text — fenced as a comment so the header compiles.
Phase 54-60: Memory-Lean mode, Balanced mode stabilization, M1 (50%) achievement
(duplicated commit log; see the identical fenced copy retained elsewhere in this file)
- Phase 54-56: Memory-Lean mode (LEAN+OFF prewarm suppression), Balanced mode promoted
  as production default, +1.2% throughput, MIXED_TINYV3_C7_BALANCED preset added
- Phase 57: 60-min soak finalization, RSS drift 0%, PRODUCTION-READY
- Phase 59: 50% recovery baseline rebase, 49.13% of mimalloc (M1 achieved), CV 1.31%
- Phase 60: Alloc pass-down SSOT (NO-GO, -0.46%) — SSOT not applicable where
  early-exit already optimized
Files: ss_mem_lean_env_box.h, ss_release_policy_box.{h,c}, alloc_passdown_ssot_env_box.h,
soak/analysis scripts, Phase 40-60 analysis documents
Design: profile separation (SAFE vs BALANCED), reversible ENV gates, no physical deletions
Lessons: SSOT effective only where redundancy exists; branch prediction extremely
effective; early-exit pattern valuable even when seemingly redundant
Generated with Claude Code
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
*/
2025-12-17 06:24:01 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 60: Alloc Pass-Down Context (SSOT)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// Alloc context: computed exactly once at the allocation entry point and
// passed down to every downstream helper (Phase 60 single-source-of-truth).
// Populated by alloc_passdown_context_compute(); treated as read-only below.
typedef struct {
    const HakmemEnvSnapshot* env;   // ENV snapshot (NULL if snapshot disabled)
    SmallRouteKind route_kind;      // Route kind (LEGACY/ULTRA/MID/V7)
    bool c7_ultra_on;               // C7 ULTRA enabled
    bool alloc_dualhot_on;          // Alloc DUALHOT enabled (C0-C3 direct path)
} alloc_passdown_context_t;
|
|
|
|
|
|
|
|
|
|
|
|
// Phase ALLOC-TINY-FAST-DUALHOT-2: Probe window ENV gate (safe from early putenv)
// Phase 39: under BENCH_MINIMAL the gate is hard-coded to 0 (lazy-init removed) — GO +1.98%
//
// Reads HAKMEM_TINY_ALLOC_DUALHOT lazily. To tolerate a putenv() issued very
// early in process startup (before any allocation), the gate does NOT commit
// "OFF" immediately when the variable is unset: it keeps returning 0 while
// re-probing getenv() for up to 64 calls, and only then commits OFF.
// Returns 1 when DUALHOT is enabled, 0 otherwise.
static inline int alloc_dualhot_enabled(void) {
#if HAKMEM_BENCH_MINIMAL
    return 0; // FAST v3: compile-time constant (default OFF)
#else
    // Cached decision: -1 = undecided, 0 = OFF, 1 = ON.
    // NOTE(review): plain (non-atomic) statics — concurrent first calls race
    // on g/g_probe_left; presumably benign for a stable environment since all
    // racers converge to the same value, but confirm if strict init is needed.
    static int g = -1;
    static int g_probe_left = 64; // Probe window: tolerate early putenv before gate init
    if (__builtin_expect(g == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_ALLOC_DUALHOT");
        if (e && *e && *e != '0') {
            g = 1; // Explicit non-"0" value: commit ON
        } else if (g_probe_left > 0) {
            g_probe_left--;
            // Still probing: return "not yet set" without committing 0
            if (e == NULL) {
                return 0; // Env not set (yet), but keep probing
            }
            g = 0; // Explicitly set to "0"
        } else {
            g = 0; // Probe window expired, commit OFF
        }
#if !HAKMEM_BUILD_RELEASE
        // Debug-only: announce the committed ON decision once.
        if (g == 1) {
            fprintf(stderr, "[DUALHOT-INIT] alloc_dualhot_enabled() = %d (probe_left=%d)\n", g, g_probe_left);
            fflush(stderr);
        }
#endif
    }
    return g;
#endif
}
|
|
|
|
|
|
|
2025-11-17 05:29:08 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// ENV Control (cached, lazy init)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
2025-12-16 15:01:56 +09:00
|
|
|
|
// Enable flag (default: ON)
|
|
|
|
|
|
// Phase 39: BENCH_MINIMAL → 固定 1 (lazy-init 削除) — GO +1.98%
|
2025-11-17 05:29:08 +09:00
|
|
|
|
// Master ENV gate for the unified tiny front gate (HAKMEM_FRONT_GATE_UNIFIED).
// Semantics: only a value whose first character is '0' disables the gate;
// unset, empty, or any other value leaves it ON (default ON).
// Phase 39: under BENCH_MINIMAL the gate is a compile-time constant 1.
static inline int front_gate_unified_enabled(void) {
#if HAKMEM_BENCH_MINIMAL
    return 1; // FAST v3: constant-folded
#else
    static int cached = -1; // -1 = environment not read yet
    if (__builtin_expect(cached == -1, 0)) {
        const char* val = getenv("HAKMEM_FRONT_GATE_UNIFIED");
        // Disabled only by an explicit leading '0'; everything else is ON.
        cached = (val != NULL && val[0] == '0') ? 0 : 1;
#if !HAKMEM_BUILD_RELEASE
        if (cached) {
            fprintf(stderr, "[FrontGate-INIT] front_gate_unified_enabled() = %d\n", cached);
            fflush(stderr);
        }
#endif
    }
    return cached;
#endif
}
|
|
|
|
|
|
|
2025-12-11 18:34:27 +09:00
|
|
|
|
// ============================================================================
|
2025-12-11 19:01:59 +09:00
|
|
|
|
// Phase REFACTOR-2: Legacy free helper (unified in tiny_legacy_fallback_box.h)
|
2025-12-11 18:34:27 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
2025-12-11 19:01:59 +09:00
|
|
|
|
// Legacy free handling is encapsulated in tiny_legacy_fallback_box.h
|
|
|
|
|
|
// (Removed inline implementation to avoid duplication)
|
2025-12-11 18:34:27 +09:00
|
|
|
|
|
2025-11-17 05:29:08 +09:00
|
|
|
|
// ============================================================================
|
2025-11-29 11:58:37 +09:00
|
|
|
|
// Phase 4-Step2: malloc_tiny_fast() - Hot/Cold Path Box (ACTIVE)
|
2025-11-17 05:29:08 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
2025-11-29 11:58:37 +09:00
|
|
|
|
// Ultra-thin Tiny allocation using Hot/Cold Path Box (Phase 4-Step2)
|
|
|
|
|
|
//
|
|
|
|
|
|
// IMPROVEMENTS over Phase 26-A:
|
|
|
|
|
|
// - Branch reduction: Hot path has only 1 branch (cache empty check)
|
|
|
|
|
|
// - Branch hints: TINY_HOT_LIKELY/UNLIKELY for better CPU prediction
|
|
|
|
|
|
// - Hot/Cold separation: Keeps hot path small (better i-cache locality)
|
|
|
|
|
|
// - Explicit fallback: Clear hot→cold transition
|
|
|
|
|
|
//
|
|
|
|
|
|
// PERFORMANCE:
|
|
|
|
|
|
// - Baseline (Phase 26-A, no PGO): 53.3 M ops/s
|
|
|
|
|
|
// - Hot/Cold Box (no PGO): 57.2 M ops/s (+7.3%)
|
|
|
|
|
|
//
|
|
|
|
|
|
// DESIGN:
|
|
|
|
|
|
// 1. size → class_idx (same as Phase 26-A)
|
|
|
|
|
|
// 2. Hot path: tiny_hot_alloc_fast() - cache hit (1 branch)
|
|
|
|
|
|
// 3. Cold path: tiny_cold_refill_and_alloc() - cache miss (noinline, cold)
|
|
|
|
|
|
//
|
2025-11-17 05:29:08 +09:00
|
|
|
|
// Preconditions:
|
|
|
|
|
|
// - Called AFTER malloc() safety checks (lock depth, initializing, LD_SAFE)
|
|
|
|
|
|
// - size <= tiny_get_max_size() (caller verified)
|
|
|
|
|
|
// Returns:
|
|
|
|
|
|
// - USER pointer on success
|
2025-11-29 11:58:37 +09:00
|
|
|
|
// - NULL on failure (caller falls back to normal path)
|
|
|
|
|
|
//
|
2025-12-13 04:28:52 +09:00
|
|
|
|
|
/* NOTE(review): stray VCS commit-message text — fenced as a comment so the header compiles.
Phase 54-60: Memory-Lean mode, Balanced mode stabilization, M1 (50%) achievement
(duplicated commit log; see the identical fenced copy retained elsewhere in this file)
- Phase 54-56: Memory-Lean mode (LEAN+OFF prewarm suppression), Balanced mode promoted
  as production default, +1.2% throughput, MIXED_TINYV3_C7_BALANCED preset added
- Phase 57: 60-min soak finalization, RSS drift 0%, PRODUCTION-READY
- Phase 59: 50% recovery baseline rebase, 49.13% of mimalloc (M1 achieved), CV 1.31%
- Phase 60: Alloc pass-down SSOT (NO-GO, -0.46%) — SSOT not applicable where
  early-exit already optimized
Files: ss_mem_lean_env_box.h, ss_release_policy_box.{h,c}, alloc_passdown_ssot_env_box.h,
soak/analysis scripts, Phase 40-60 analysis documents
Design: profile separation (SAFE vs BALANCED), reversible ENV gates, no physical deletions
Lessons: SSOT effective only where redundancy exists; branch prediction extremely
effective; early-exit pattern valuable even when seemingly redundant
Generated with Claude Code
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
*/
2025-12-17 06:24:01 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 60: Alloc context SSOT helper (入口で 1回だけ計算)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 60: 入口で ENV snapshot, route kind, C7 ULTRA, DUALHOT を 1回だけ取得
|
|
|
|
|
|
// Phase 43 教訓: Branch は store より高い → この関数自体は追加 branch なし(always_inline)
|
|
|
|
|
|
__attribute__((always_inline))
|
|
|
|
|
|
static inline alloc_passdown_context_t alloc_passdown_context_compute(int class_idx) {
|
|
|
|
|
|
alloc_passdown_context_t ctx;
|
|
|
|
|
|
|
|
|
|
|
|
// 1. ENV snapshot (入口で 1回だけ)
|
|
|
|
|
|
ctx.env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL;
|
|
|
|
|
|
|
|
|
|
|
|
// 2. C7 ULTRA enabled (入口で 1回だけ)
|
|
|
|
|
|
ctx.c7_ultra_on = ctx.env ? ctx.env->tiny_c7_ultra_enabled : tiny_c7_ultra_enabled_env();
|
|
|
|
|
|
|
|
|
|
|
|
// 3. Alloc DUALHOT enabled (入口で 1回だけ)
|
|
|
|
|
|
ctx.alloc_dualhot_on = alloc_dualhot_enabled();
|
|
|
|
|
|
|
|
|
|
|
|
// 4. Route kind (入口で 1回だけ)
|
|
|
|
|
|
if (tiny_static_route_ready_fast()) {
|
|
|
|
|
|
ctx.route_kind = tiny_static_route_get_kind_fast(class_idx);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
ctx.route_kind = tiny_policy_hot_get_route_with_env((uint32_t)class_idx, ctx.env);
|
2025-12-13 04:28:52 +09:00
|
|
|
|
}
|
Phase 54-60: Memory-Lean mode, Balanced mode stabilization, M1 (50%) achievement
## Summary
Completed Phase 54-60 optimization work:
**Phase 54-56: Memory-Lean mode (LEAN+OFF prewarm suppression)**
- Implemented ss_mem_lean_env_box.h with ENV gates
- Balanced mode (LEAN+OFF) promoted as production default
- Result: +1.2% throughput, better stability, zero syscall overhead
- Added to bench_profile.h: MIXED_TINYV3_C7_BALANCED preset
**Phase 57: 60-min soak finalization**
- Balanced mode: 60-min soak, RSS drift 0%, CV 5.38%
- Speed-first mode: 60-min soak, RSS drift 0%, CV 1.58%
- Syscall budget: 1.25e-7/op (800× under target)
- Status: PRODUCTION-READY
**Phase 59: 50% recovery baseline rebase**
- hakmem FAST (Balanced): 59.184M ops/s, CV 1.31%
- mimalloc: 120.466M ops/s, CV 3.50%
- Ratio: 49.13% (M1 ACHIEVED within statistical noise)
- Superior stability: 2.68× better CV than mimalloc
**Phase 60: Alloc pass-down SSOT (NO-GO)**
- Implemented alloc_passdown_ssot_env_box.h
- Modified malloc_tiny_fast.h for SSOT pattern
- Result: -0.46% (NO-GO)
- Key lesson: SSOT not applicable where early-exit already optimized
## Key Metrics
- Performance: 49.13% of mimalloc (M1 effectively achieved)
- Stability: CV 1.31% (superior to mimalloc 3.50%)
- Syscall budget: 1.25e-7/op (excellent)
- RSS: 33MB stable, 0% drift over 60 minutes
## Files Added/Modified
New boxes:
- core/box/ss_mem_lean_env_box.h
- core/box/ss_release_policy_box.{h,c}
- core/box/alloc_passdown_ssot_env_box.h
Scripts:
- scripts/soak_mixed_single_process.sh
- scripts/analyze_epoch_tail_csv.py
- scripts/soak_mixed_rss.sh
- scripts/calculate_percentiles.py
- scripts/analyze_soak.py
Documentation: Phase 40-60 analysis documents
## Design Decisions
1. Profile separation (core/bench_profile.h):
- MIXED_TINYV3_C7_SAFE: Speed-first (no LEAN)
- MIXED_TINYV3_C7_BALANCED: Balanced mode (LEAN+OFF)
2. Box Theory compliance:
- All ENV gates reversible (HAKMEM_SS_MEM_LEAN, HAKMEM_ALLOC_PASSDOWN_SSOT)
- Single conversion points maintained
- No physical deletions (compile-out only)
3. Lessons learned:
- SSOT effective only where redundancy exists (Phase 60 showed limits)
- Branch prediction extremely effective (~0 cycles for well-predicted branches)
- Early-exit pattern valuable even when seemingly redundant
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-17 06:24:01 +09:00
|
|
|
|
|
|
|
|
|
|
return ctx;
|
2025-12-13 04:28:52 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-13 18:46:11 +09:00
|
|
|
|
// Phase 2 B3: tiny_alloc_route_cold() - Handle rare routes (V7, MID, ULTRA)
|
|
|
|
|
|
// NOTE: noinline to avoid code bloat in hot path, placed in cold section
|
|
|
|
|
|
__attribute__((noinline, cold))
|
|
|
|
|
|
static void* tiny_alloc_route_cold(SmallRouteKind route_kind, int class_idx, size_t size) {
|
|
|
|
|
|
switch (route_kind) {
|
|
|
|
|
|
case SMALL_ROUTE_ULTRA: {
|
|
|
|
|
|
// Phase TLS-UNIFY-1: Unified ULTRA TLS pop for C4-C6 (C7 handled above)
|
|
|
|
|
|
void* base = tiny_ultra_tls_pop((uint8_t)class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(base != NULL)) {
|
|
|
|
|
|
if (class_idx == 6) FREE_PATH_STAT_INC(c6_ultra_alloc_hit);
|
|
|
|
|
|
else if (class_idx == 5) FREE_PATH_STAT_INC(c5_ultra_alloc_hit);
|
|
|
|
|
|
else if (class_idx == 4) FREE_PATH_STAT_INC(c4_ultra_alloc_hit);
|
|
|
|
|
|
return tiny_base_to_user_inline(base);
|
|
|
|
|
|
}
|
|
|
|
|
|
// ULTRA miss → fallback to LEGACY
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_MID_V35: {
|
|
|
|
|
|
// Phase v11a-3: MID v3.5 allocation
|
|
|
|
|
|
void* v35p = small_mid_v35_alloc(class_idx, size);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(v35p != NULL)) {
|
|
|
|
|
|
return v35p;
|
|
|
|
|
|
}
|
|
|
|
|
|
// MID v3.5 miss → fallback to LEGACY
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_V7: {
|
|
|
|
|
|
// Phase v7: SmallObject v7 allocation (research box)
|
|
|
|
|
|
void* v7p = small_heap_alloc_fast_v7_stub(size, (uint8_t)class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(v7p != NULL)) {
|
|
|
|
|
|
return v7p;
|
|
|
|
|
|
}
|
|
|
|
|
|
// V7 miss → fallback to LEGACY
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_MID_V3: {
|
|
|
|
|
|
// Phase MID-V3: MID v3 allocation (257-768B, C5-C6)
|
|
|
|
|
|
// Note: MID v3 uses same segment infrastructure as MID v3.5
|
|
|
|
|
|
// For now, delegate to MID v3.5 which handles both
|
|
|
|
|
|
void* v3p = small_mid_v35_alloc(class_idx, size);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(v3p != NULL)) {
|
|
|
|
|
|
return v3p;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_LEGACY:
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Fallback: LEGACY unified cache hot/cold path
|
|
|
|
|
|
void* ptr = tiny_hot_alloc_fast(class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(ptr != NULL)) {
|
|
|
|
|
|
return ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
return tiny_cold_refill_and_alloc(class_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
/* NOTE(review): stray VCS commit-message text — fenced as a comment so the header compiles.
Phase 54-60: Memory-Lean mode, Balanced mode stabilization, M1 (50%) achievement
(duplicated commit log; see the identical fenced copy retained elsewhere in this file)
- Phase 54-56: Memory-Lean mode (LEAN+OFF prewarm suppression), Balanced mode promoted
  as production default, +1.2% throughput, MIXED_TINYV3_C7_BALANCED preset added
- Phase 57: 60-min soak finalization, RSS drift 0%, PRODUCTION-READY
- Phase 59: 50% recovery baseline rebase, 49.13% of mimalloc (M1 achieved), CV 1.31%
- Phase 60: Alloc pass-down SSOT (NO-GO, -0.46%) — SSOT not applicable where
  early-exit already optimized
Files: ss_mem_lean_env_box.h, ss_release_policy_box.{h,c}, alloc_passdown_ssot_env_box.h,
soak/analysis scripts, Phase 40-60 analysis documents
Design: profile separation (SAFE vs BALANCED), reversible ENV gates, no physical deletions
Lessons: SSOT effective only where redundancy exists; branch prediction extremely
effective; early-exit pattern valuable even when seemingly redundant
Generated with Claude Code
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
*/
2025-12-17 06:24:01 +09:00
|
|
|
|
// Phase 60: malloc_tiny_fast_for_class_ssot() - SSOT mode (context pre-computed)
|
|
|
|
|
|
__attribute__((always_inline))
|
|
|
|
|
|
static inline void* malloc_tiny_fast_for_class_ssot(size_t size, int class_idx, const alloc_passdown_context_t* ctx) {
|
|
|
|
|
|
// Stats (class_idx already validated by gate)
|
|
|
|
|
|
tiny_front_alloc_stat_inc(class_idx);
|
|
|
|
|
|
ALLOC_GATE_STAT_INC_CLASS(class_idx);
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 60: Use pre-computed context (避免重複計算)
|
|
|
|
|
|
// C7 ULTRA early-exit (skip policy snapshot for common case)
|
|
|
|
|
|
if (class_idx == 7 && ctx->c7_ultra_on) {
|
|
|
|
|
|
void* ultra_p = tiny_c7_ultra_alloc(size);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(ultra_p != NULL)) {
|
|
|
|
|
|
return ultra_p;
|
|
|
|
|
|
}
|
|
|
|
|
|
// C7 ULTRA miss → fall through to policy-based routing
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// C0-C3 direct path (second hot path)
|
|
|
|
|
|
if ((unsigned)class_idx <= 3u) {
|
|
|
|
|
|
if (ctx->alloc_dualhot_on) {
|
|
|
|
|
|
// Direct to LEGACY unified cache (no policy snapshot)
|
|
|
|
|
|
void* ptr = tiny_hot_alloc_fast(class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(ptr != NULL)) {
|
|
|
|
|
|
return ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
return tiny_cold_refill_and_alloc(class_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Routing dispatch: Use pre-computed route_kind from context
|
|
|
|
|
|
const tiny_env_cfg_t* env_cfg = tiny_env_cfg();
|
|
|
|
|
|
if (TINY_HOT_LIKELY(env_cfg->alloc_route_shape)) {
|
|
|
|
|
|
// B3 optimized: Prioritize LEGACY with LIKELY hint
|
|
|
|
|
|
if (TINY_HOT_LIKELY(ctx->route_kind == SMALL_ROUTE_LEGACY)) {
|
|
|
|
|
|
// Phase 3 C1: TLS cache prefetch (prefetch g_unified_cache[class_idx] to L1)
|
|
|
|
|
|
if (__builtin_expect(env_cfg->tiny_prefetch, 0)) {
|
|
|
|
|
|
__builtin_prefetch(&g_unified_cache[class_idx], 0, 3);
|
|
|
|
|
|
}
|
|
|
|
|
|
// LEGACY fast path: Unified Cache hot/cold
|
|
|
|
|
|
void* ptr = tiny_hot_alloc_fast(class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(ptr != NULL)) {
|
|
|
|
|
|
return ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
return tiny_cold_refill_and_alloc(class_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
// Rare routes: delegate to cold helper
|
|
|
|
|
|
return tiny_alloc_route_cold(ctx->route_kind, class_idx, size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Original dispatch (backward compatible, default)
|
|
|
|
|
|
switch (ctx->route_kind) {
|
|
|
|
|
|
case SMALL_ROUTE_ULTRA: {
|
|
|
|
|
|
// Phase TLS-UNIFY-1: Unified ULTRA TLS pop for C4-C6 (C7 handled above)
|
|
|
|
|
|
void* base = tiny_ultra_tls_pop((uint8_t)class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(base != NULL)) {
|
|
|
|
|
|
if (class_idx == 6) FREE_PATH_STAT_INC(c6_ultra_alloc_hit);
|
|
|
|
|
|
else if (class_idx == 5) FREE_PATH_STAT_INC(c5_ultra_alloc_hit);
|
|
|
|
|
|
else if (class_idx == 4) FREE_PATH_STAT_INC(c4_ultra_alloc_hit);
|
|
|
|
|
|
return tiny_base_to_user_inline(base);
|
|
|
|
|
|
}
|
|
|
|
|
|
// ULTRA miss → fallback to LEGACY
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_MID_V35: {
|
|
|
|
|
|
// Phase v11a-3: MID v3.5 allocation
|
|
|
|
|
|
void* v35p = small_mid_v35_alloc(class_idx, size);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(v35p != NULL)) {
|
|
|
|
|
|
return v35p;
|
|
|
|
|
|
}
|
|
|
|
|
|
// MID v3.5 miss → fallback to LEGACY
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_V7: {
|
|
|
|
|
|
// Phase v7: SmallObject v7 allocation (research box)
|
|
|
|
|
|
void* v7p = small_heap_alloc_fast_v7_stub(size, (uint8_t)class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(v7p != NULL)) {
|
|
|
|
|
|
return v7p;
|
|
|
|
|
|
}
|
|
|
|
|
|
// V7 miss → fallback to LEGACY
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_MID_V3: {
|
|
|
|
|
|
// Phase MID-V3: MID v3 allocation (257-768B, C5-C6)
|
|
|
|
|
|
void* v3p = small_mid_v35_alloc(class_idx, size);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(v3p != NULL)) {
|
|
|
|
|
|
return v3p;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_LEGACY:
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 3 C1: TLS cache prefetch (prefetch g_unified_cache[class_idx] to L1)
|
|
|
|
|
|
if (__builtin_expect(env_cfg->tiny_prefetch, 0)) {
|
|
|
|
|
|
__builtin_prefetch(&g_unified_cache[class_idx], 0, 3);
|
|
|
|
|
|
}
|
|
|
|
|
|
// LEGACY fallback: Unified Cache hot/cold path
|
|
|
|
|
|
void* ptr = tiny_hot_alloc_fast(class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(ptr != NULL)) {
|
|
|
|
|
|
return ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
return tiny_cold_refill_and_alloc(class_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-13 06:50:39 +09:00
|
|
|
|
// Phase ALLOC-GATE-SSOT-1: malloc_tiny_fast_for_class() - body (class_idx already known)
|
2025-11-17 05:29:08 +09:00
|
|
|
|
__attribute__((always_inline))
|
2025-12-13 06:50:39 +09:00
|
|
|
|
static inline void* malloc_tiny_fast_for_class(size_t size, int class_idx) {
|
Phase 54-60: Memory-Lean mode, Balanced mode stabilization, M1 (50%) achievement
## Summary
Completed Phase 54-60 optimization work:
**Phase 54-56: Memory-Lean mode (LEAN+OFF prewarm suppression)**
- Implemented ss_mem_lean_env_box.h with ENV gates
- Balanced mode (LEAN+OFF) promoted as production default
- Result: +1.2% throughput, better stability, zero syscall overhead
- Added to bench_profile.h: MIXED_TINYV3_C7_BALANCED preset
**Phase 57: 60-min soak finalization**
- Balanced mode: 60-min soak, RSS drift 0%, CV 5.38%
- Speed-first mode: 60-min soak, RSS drift 0%, CV 1.58%
- Syscall budget: 1.25e-7/op (800× under target)
- Status: PRODUCTION-READY
**Phase 59: 50% recovery baseline rebase**
- hakmem FAST (Balanced): 59.184M ops/s, CV 1.31%
- mimalloc: 120.466M ops/s, CV 3.50%
- Ratio: 49.13% (M1 ACHIEVED within statistical noise)
- Superior stability: 2.68× better CV than mimalloc
**Phase 60: Alloc pass-down SSOT (NO-GO)**
- Implemented alloc_passdown_ssot_env_box.h
- Modified malloc_tiny_fast.h for SSOT pattern
- Result: -0.46% (NO-GO)
- Key lesson: SSOT not applicable where early-exit already optimized
## Key Metrics
- Performance: 49.13% of mimalloc (M1 effectively achieved)
- Stability: CV 1.31% (superior to mimalloc 3.50%)
- Syscall budget: 1.25e-7/op (excellent)
- RSS: 33MB stable, 0% drift over 60 minutes
## Files Added/Modified
New boxes:
- core/box/ss_mem_lean_env_box.h
- core/box/ss_release_policy_box.{h,c}
- core/box/alloc_passdown_ssot_env_box.h
Scripts:
- scripts/soak_mixed_single_process.sh
- scripts/analyze_epoch_tail_csv.py
- scripts/soak_mixed_rss.sh
- scripts/calculate_percentiles.py
- scripts/analyze_soak.py
Documentation: Phase 40-60 analysis documents
## Design Decisions
1. Profile separation (core/bench_profile.h):
- MIXED_TINYV3_C7_SAFE: Speed-first (no LEAN)
- MIXED_TINYV3_C7_BALANCED: Balanced mode (LEAN+OFF)
2. Box Theory compliance:
- All ENV gates reversible (HAKMEM_SS_MEM_LEAN, HAKMEM_ALLOC_PASSDOWN_SSOT)
- Single conversion points maintained
- No physical deletions (compile-out only)
3. Lessons learned:
- SSOT effective only where redundancy exists (Phase 60 showed limits)
- Branch prediction extremely effective (~0 cycles for well-predicted branches)
- Early-exit pattern valuable even when seemingly redundant
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-17 06:24:01 +09:00
|
|
|
|
// Phase 60: SSOT mode (ENV gated)
|
|
|
|
|
|
if (alloc_passdown_ssot_enabled()) {
|
|
|
|
|
|
alloc_passdown_context_t ctx = alloc_passdown_context_compute(class_idx);
|
|
|
|
|
|
return malloc_tiny_fast_for_class_ssot(size, class_idx, &ctx);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Original path (backward compatible, default)
|
2025-12-13 06:50:39 +09:00
|
|
|
|
// Stats (class_idx already validated by gate)
|
2025-12-08 21:30:21 +09:00
|
|
|
|
tiny_front_alloc_stat_inc(class_idx);
|
2025-12-11 21:32:40 +09:00
|
|
|
|
ALLOC_GATE_STAT_INC_CLASS(class_idx);
|
|
|
|
|
|
|
2025-12-12 16:26:42 +09:00
|
|
|
|
// Phase v11a-5b: C7 ULTRA early-exit (skip policy snapshot for common case)
|
|
|
|
|
|
// This is the most common hot path - avoids TLS policy overhead
|
Phase 4 E1: ENV Snapshot Consolidation - GO (+3.92% avg, +4.01% median)
Target: Consolidate 3 ENV gate TLS reads → 1 TLS read
- tiny_c7_ultra_enabled_env(): 1.28% self
- tiny_front_v3_enabled(): 1.01% self
- tiny_metadata_cache_enabled(): 0.97% self
- Total overhead: 3.26% self (perf profile analysis)
Implementation:
- core/box/hakmem_env_snapshot_box.h (new): ENV snapshot struct & API
- core/box/hakmem_env_snapshot_box.c (new): TLS snapshot implementation
- core/front/malloc_tiny_fast.h: Migrated 5 call sites to snapshot
- core/box/tiny_legacy_fallback_box.h: Migrated 2 call sites
- core/box/tiny_metadata_cache_hot_box.h: Migrated 1 call site
- core/bench_profile.h: Added hakmem_env_snapshot_refresh_from_env()
- Makefile: Added hakmem_env_snapshot_box.o to build
- ENV gate: HAKMEM_ENV_SNAPSHOT=0/1 (default: 0, research box)
A/B Test Results (Mixed, 10-run, 20M iters):
- Baseline (E1=0): 43,617,549 ops/s (avg), 43,562,895 ops/s (median)
- Optimized (E1=1): 45,327,239 ops/s (avg), 45,309,218 ops/s (median)
- Improvement: avg +3.92%, median +4.01%
Decision: GO (+3.92% >= +2.5% threshold)
- Action: Keep as research box (default OFF) for Phase 4
- Next: Consider promotion to default in MIXED_TINYV3_C7_SAFE preset
Design Rationale:
- Shape optimizations (B3, D3) reached saturation (+0.56% NEUTRAL)
- Shift to memory/TLS overhead optimization (new optimization frontier)
- Pattern: Similar to existing tiny_front_v3_snapshot (proven approach)
- Expected: +1-3% from 3.26% ENV overhead → Achieved: +3.92%
Technical Details:
- Consolidation: 3 TLS reads → 1 TLS read (66% reduction)
- Learner interlock: tiny_metadata_cache_eff pre-computed in snapshot
- Version sync: Refreshes on small_policy_v7_version_changed()
- Fallback safety: Existing ENV gates still available when E1=0
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-14 00:59:12 +09:00
|
|
|
|
// Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1)
|
2025-12-15 12:29:27 +09:00
|
|
|
|
// Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards)
|
2025-12-15 12:50:16 +09:00
|
|
|
|
const HakmemEnvSnapshot* env = NULL;
|
Phase 4 E1: ENV Snapshot Consolidation - GO (+3.92% avg, +4.01% median)
Target: Consolidate 3 ENV gate TLS reads → 1 TLS read
- tiny_c7_ultra_enabled_env(): 1.28% self
- tiny_front_v3_enabled(): 1.01% self
- tiny_metadata_cache_enabled(): 0.97% self
- Total overhead: 3.26% self (perf profile analysis)
Implementation:
- core/box/hakmem_env_snapshot_box.h (new): ENV snapshot struct & API
- core/box/hakmem_env_snapshot_box.c (new): TLS snapshot implementation
- core/front/malloc_tiny_fast.h: Migrated 5 call sites to snapshot
- core/box/tiny_legacy_fallback_box.h: Migrated 2 call sites
- core/box/tiny_metadata_cache_hot_box.h: Migrated 1 call site
- core/bench_profile.h: Added hakmem_env_snapshot_refresh_from_env()
- Makefile: Added hakmem_env_snapshot_box.o to build
- ENV gate: HAKMEM_ENV_SNAPSHOT=0/1 (default: 0, research box)
A/B Test Results (Mixed, 10-run, 20M iters):
- Baseline (E1=0): 43,617,549 ops/s (avg), 43,562,895 ops/s (median)
- Optimized (E1=1): 45,327,239 ops/s (avg), 45,309,218 ops/s (median)
- Improvement: avg +3.92%, median +4.01%
Decision: GO (+3.92% >= +2.5% threshold)
- Action: Keep as research box (default OFF) for Phase 4
- Next: Consider promotion to default in MIXED_TINYV3_C7_SAFE preset
Design Rationale:
- Shape optimizations (B3, D3) reached saturation (+0.56% NEUTRAL)
- Shift to memory/TLS overhead optimization (new optimization frontier)
- Pattern: Similar to existing tiny_front_v3_snapshot (proven approach)
- Expected: +1-3% from 3.26% ENV overhead → Achieved: +3.92%
Technical Details:
- Consolidation: 3 TLS reads → 1 TLS read (66% reduction)
- Learner interlock: tiny_metadata_cache_eff pre-computed in snapshot
- Version sync: Refreshes on small_policy_v7_version_changed()
- Fallback safety: Existing ENV gates still available when E1=0
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-14 00:59:12 +09:00
|
|
|
|
bool c7_ultra_on;
|
2025-12-15 12:29:27 +09:00
|
|
|
|
if (hakmem_env_snapshot_enabled()) {
|
2025-12-15 12:50:16 +09:00
|
|
|
|
env = hakmem_env_snapshot();
|
Phase 4 E1: ENV Snapshot Consolidation - GO (+3.92% avg, +4.01% median)
Target: Consolidate 3 ENV gate TLS reads → 1 TLS read
- tiny_c7_ultra_enabled_env(): 1.28% self
- tiny_front_v3_enabled(): 1.01% self
- tiny_metadata_cache_enabled(): 0.97% self
- Total overhead: 3.26% self (perf profile analysis)
Implementation:
- core/box/hakmem_env_snapshot_box.h (new): ENV snapshot struct & API
- core/box/hakmem_env_snapshot_box.c (new): TLS snapshot implementation
- core/front/malloc_tiny_fast.h: Migrated 5 call sites to snapshot
- core/box/tiny_legacy_fallback_box.h: Migrated 2 call sites
- core/box/tiny_metadata_cache_hot_box.h: Migrated 1 call site
- core/bench_profile.h: Added hakmem_env_snapshot_refresh_from_env()
- Makefile: Added hakmem_env_snapshot_box.o to build
- ENV gate: HAKMEM_ENV_SNAPSHOT=0/1 (default: 0, research box)
A/B Test Results (Mixed, 10-run, 20M iters):
- Baseline (E1=0): 43,617,549 ops/s (avg), 43,562,895 ops/s (median)
- Optimized (E1=1): 45,327,239 ops/s (avg), 45,309,218 ops/s (median)
- Improvement: avg +3.92%, median +4.01%
Decision: GO (+3.92% >= +2.5% threshold)
- Action: Keep as research box (default OFF) for Phase 4
- Next: Consider promotion to default in MIXED_TINYV3_C7_SAFE preset
Design Rationale:
- Shape optimizations (B3, D3) reached saturation (+0.56% NEUTRAL)
- Shift to memory/TLS overhead optimization (new optimization frontier)
- Pattern: Similar to existing tiny_front_v3_snapshot (proven approach)
- Expected: +1-3% from 3.26% ENV overhead → Achieved: +3.92%
Technical Details:
- Consolidation: 3 TLS reads → 1 TLS read (66% reduction)
- Learner interlock: tiny_metadata_cache_eff pre-computed in snapshot
- Version sync: Refreshes on small_policy_v7_version_changed()
- Fallback safety: Existing ENV gates still available when E1=0
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-14 00:59:12 +09:00
|
|
|
|
c7_ultra_on = env->tiny_c7_ultra_enabled;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
c7_ultra_on = tiny_c7_ultra_enabled_env();
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (class_idx == 7 && c7_ultra_on) {
|
2025-12-10 22:57:26 +09:00
|
|
|
|
void* ultra_p = tiny_c7_ultra_alloc(size);
|
2025-12-11 20:39:46 +09:00
|
|
|
|
if (TINY_HOT_LIKELY(ultra_p != NULL)) {
|
2025-12-10 22:57:26 +09:00
|
|
|
|
return ultra_p;
|
|
|
|
|
|
}
|
2025-12-12 16:26:42 +09:00
|
|
|
|
// C7 ULTRA miss → fall through to policy-based routing
|
2025-12-11 19:38:27 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-13 06:50:39 +09:00
|
|
|
|
// Phase ALLOC-TINY-FAST-DUALHOT-2: C0-C3 direct path (second hot path)
|
2025-12-13 04:28:52 +09:00
|
|
|
|
// Skip expensive policy snapshot and route determination for C0-C3.
|
2025-12-13 06:50:39 +09:00
|
|
|
|
// NOTE: Branch only taken if class_idx <= 3 (rare when OFF, frequent when ON)
|
|
|
|
|
|
if ((unsigned)class_idx <= 3u) {
|
|
|
|
|
|
if (alloc_dualhot_enabled()) {
|
2025-12-13 05:10:45 +09:00
|
|
|
|
// Direct to LEGACY unified cache (no policy snapshot)
|
|
|
|
|
|
void* ptr = tiny_hot_alloc_fast(class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(ptr != NULL)) {
|
|
|
|
|
|
return ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
return tiny_cold_refill_and_alloc(class_idx);
|
2025-12-13 04:28:52 +09:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-13 19:19:42 +09:00
|
|
|
|
// 2. Route selection: Static route table (Phase 3 C3) or policy hot cache (Phase 3 C2) or policy snapshot (default)
|
2025-12-13 18:46:11 +09:00
|
|
|
|
SmallRouteKind route_kind;
|
|
|
|
|
|
if (tiny_static_route_ready_fast()) {
|
|
|
|
|
|
route_kind = tiny_static_route_get_kind_fast(class_idx);
|
|
|
|
|
|
} else {
|
2025-12-13 19:19:42 +09:00
|
|
|
|
// Phase 3 C2: Use policy hot cache if enabled (eliminates policy_snapshot() call)
|
2025-12-15 12:50:16 +09:00
|
|
|
|
route_kind = tiny_policy_hot_get_route_with_env((uint32_t)class_idx, env);
|
2025-12-13 18:46:11 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 2 B3: Routing dispatch (ENV gate HAKMEM_TINY_ALLOC_ROUTE_SHAPE)
|
|
|
|
|
|
// Optimized: LIKELY on LEGACY (common case), cold helper for rare routes
|
|
|
|
|
|
const tiny_env_cfg_t* env_cfg = tiny_env_cfg();
|
|
|
|
|
|
if (TINY_HOT_LIKELY(env_cfg->alloc_route_shape)) {
|
|
|
|
|
|
// B3 optimized: Prioritize LEGACY with LIKELY hint
|
|
|
|
|
|
if (TINY_HOT_LIKELY(route_kind == SMALL_ROUTE_LEGACY)) {
|
2025-12-13 19:01:57 +09:00
|
|
|
|
// Phase 3 C1: TLS cache prefetch (prefetch g_unified_cache[class_idx] to L1)
|
|
|
|
|
|
if (__builtin_expect(env_cfg->tiny_prefetch, 0)) {
|
|
|
|
|
|
__builtin_prefetch(&g_unified_cache[class_idx], 0, 3);
|
|
|
|
|
|
}
|
2025-12-13 18:46:11 +09:00
|
|
|
|
// LEGACY fast path: Unified Cache hot/cold
|
|
|
|
|
|
void* ptr = tiny_hot_alloc_fast(class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(ptr != NULL)) {
|
|
|
|
|
|
return ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
return tiny_cold_refill_and_alloc(class_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
// Rare routes: delegate to cold helper
|
|
|
|
|
|
return tiny_alloc_route_cold(route_kind, class_idx, size);
|
|
|
|
|
|
}
|
2025-12-12 16:26:42 +09:00
|
|
|
|
|
2025-12-13 18:46:11 +09:00
|
|
|
|
// Original dispatch (backward compatible, default)
|
2025-12-12 16:26:42 +09:00
|
|
|
|
// 3. Single switch on route_kind (all ENV checks moved to Policy init)
|
|
|
|
|
|
switch (route_kind) {
|
|
|
|
|
|
case SMALL_ROUTE_ULTRA: {
|
|
|
|
|
|
// Phase TLS-UNIFY-1: Unified ULTRA TLS pop for C4-C6 (C7 handled above)
|
|
|
|
|
|
void* base = tiny_ultra_tls_pop((uint8_t)class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(base != NULL)) {
|
|
|
|
|
|
if (class_idx == 6) FREE_PATH_STAT_INC(c6_ultra_alloc_hit);
|
|
|
|
|
|
else if (class_idx == 5) FREE_PATH_STAT_INC(c5_ultra_alloc_hit);
|
|
|
|
|
|
else if (class_idx == 4) FREE_PATH_STAT_INC(c4_ultra_alloc_hit);
|
|
|
|
|
|
return tiny_base_to_user_inline(base);
|
|
|
|
|
|
}
|
|
|
|
|
|
// ULTRA miss → fallback to LEGACY
|
|
|
|
|
|
break;
|
2025-12-12 06:52:14 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-12 16:26:42 +09:00
|
|
|
|
case SMALL_ROUTE_MID_V35: {
|
|
|
|
|
|
// Phase v11a-3: MID v3.5 allocation
|
|
|
|
|
|
void* v35p = small_mid_v35_alloc(class_idx, size);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(v35p != NULL)) {
|
|
|
|
|
|
return v35p;
|
|
|
|
|
|
}
|
|
|
|
|
|
// MID v3.5 miss → fallback to LEGACY
|
|
|
|
|
|
break;
|
2025-12-12 03:50:58 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-12 16:26:42 +09:00
|
|
|
|
case SMALL_ROUTE_V7: {
|
|
|
|
|
|
// Phase v7: SmallObject v7 allocation (research box)
|
2025-12-12 03:12:28 +09:00
|
|
|
|
void* v7p = small_heap_alloc_fast_v7_stub(size, (uint8_t)class_idx);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(v7p != NULL)) {
|
|
|
|
|
|
return v7p;
|
|
|
|
|
|
}
|
2025-12-12 16:26:42 +09:00
|
|
|
|
// V7 miss → fallback to LEGACY
|
2025-12-12 03:12:28 +09:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
2025-12-12 16:26:42 +09:00
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_MID_V3: {
|
|
|
|
|
|
// Phase MID-V3: MID v3 allocation (257-768B, C5-C6)
|
|
|
|
|
|
// Note: MID v3 uses same segment infrastructure as MID v3.5
|
|
|
|
|
|
// For now, delegate to MID v3.5 which handles both
|
|
|
|
|
|
void* v3p = small_mid_v35_alloc(class_idx, size);
|
|
|
|
|
|
if (TINY_HOT_LIKELY(v3p != NULL)) {
|
|
|
|
|
|
return v3p;
|
2025-12-08 21:30:21 +09:00
|
|
|
|
}
|
|
|
|
|
|
break;
|
2025-12-07 22:49:28 +09:00
|
|
|
|
}
|
2025-12-12 16:26:42 +09:00
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_LEGACY:
|
2025-12-08 21:30:21 +09:00
|
|
|
|
default:
|
|
|
|
|
|
break;
|
2025-12-07 03:12:27 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-13 19:01:57 +09:00
|
|
|
|
// Phase 3 C1: TLS cache prefetch (prefetch g_unified_cache[class_idx] to L1)
|
|
|
|
|
|
if (__builtin_expect(env_cfg->tiny_prefetch, 0)) {
|
|
|
|
|
|
__builtin_prefetch(&g_unified_cache[class_idx], 0, 3);
|
|
|
|
|
|
}
|
2025-12-12 16:26:42 +09:00
|
|
|
|
// LEGACY fallback: Unified Cache hot/cold path
|
|
|
|
|
|
void* ptr = tiny_hot_alloc_fast(class_idx);
|
2025-11-29 11:58:37 +09:00
|
|
|
|
if (TINY_HOT_LIKELY(ptr != NULL)) {
|
|
|
|
|
|
return ptr;
|
2025-11-17 05:29:08 +09:00
|
|
|
|
}
|
2025-11-29 11:58:37 +09:00
|
|
|
|
return tiny_cold_refill_and_alloc(class_idx);
|
2025-11-17 05:29:08 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-13 06:50:39 +09:00
|
|
|
|
// Wrapper: size → class_idx conversion (SSOT)
|
|
|
|
|
|
__attribute__((always_inline))
|
|
|
|
|
|
static inline void* malloc_tiny_fast(size_t size) {
|
|
|
|
|
|
// Phase ALLOC-GATE-OPT-1: カウンタ散布 (1. 関数入口)
|
|
|
|
|
|
ALLOC_GATE_STAT_INC(total_calls);
|
|
|
|
|
|
|
|
|
|
|
|
// Phase ALLOC-GATE-SSOT-1: Single size→class conversion (SSOT)
|
|
|
|
|
|
ALLOC_GATE_STAT_INC(size_to_class_calls);
|
|
|
|
|
|
int class_idx = hak_tiny_size_to_class(size);
|
|
|
|
|
|
if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Delegate to *_for_class (stats tracked inside)
|
|
|
|
|
|
return malloc_tiny_fast_for_class(size, class_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split helpers
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
2025-12-15 21:36:30 +09:00
|
|
|
|
// Phase 19-6C Helper: Compute tiny_route and use_tiny_heap once, reuse in hot/cold/legacy paths
|
|
|
|
|
|
static inline void free_tiny_fast_compute_route_and_heap(
|
|
|
|
|
|
int class_idx,
|
|
|
|
|
|
tiny_route_kind_t* out_route,
|
|
|
|
|
|
int* out_use_tiny_heap)
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
{
|
2025-12-13 21:44:00 +09:00
|
|
|
|
// Phase 3 D1: Free path route cache (eliminate tiny_route_for_class overhead)
|
|
|
|
|
|
tiny_route_kind_t route;
|
|
|
|
|
|
if (__builtin_expect(tiny_free_static_route_enabled(), 0)) {
|
|
|
|
|
|
// Use cached route (bypasses tiny_route_for_class())
|
|
|
|
|
|
route = g_tiny_route_class[(unsigned)class_idx & 7u];
|
|
|
|
|
|
if (__builtin_expect(route == TINY_ROUTE_LEGACY && !g_tiny_route_snapshot_done, 0)) {
|
|
|
|
|
|
// Fallback if uninitialized
|
|
|
|
|
|
route = tiny_route_for_class((uint8_t)class_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// Standard path
|
|
|
|
|
|
route = tiny_route_for_class((uint8_t)class_idx);
|
|
|
|
|
|
}
|
2025-12-15 21:36:30 +09:00
|
|
|
|
*out_route = route;
|
|
|
|
|
|
*out_use_tiny_heap = tiny_route_is_heap_kind(route);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// Cold path: Cross-thread free, TinyHeap routes, and legacy fallback
|
|
|
|
|
|
// (noinline,cold to keep hot path small and I-cache clean)
|
|
|
|
|
|
// Phase 19-6C: Accept pre-computed route + use_tiny_heap to eliminate redundant computation
|
|
|
|
|
|
__attribute__((noinline,cold))
|
|
|
|
|
|
static int free_tiny_fast_cold(void* ptr, void* base, int class_idx, const HakmemEnvSnapshot* env,
|
|
|
|
|
|
tiny_route_kind_t route, int use_tiny_heap)
|
|
|
|
|
|
{
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_hit);
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
|
|
|
|
|
|
// TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast.
|
|
|
|
|
|
// In Release builds, we trust header magic (0xA0) as sufficient validation.
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
// Superslab 登録確認(誤分類防止)
|
|
|
|
|
|
SuperSlab* ss_guard = hak_super_lookup(ptr);
|
|
|
|
|
|
if (__builtin_expect(!(ss_guard && ss_guard->magic == SUPERSLAB_MAGIC), 0)) {
|
|
|
|
|
|
return 0; // hakmem 管理外 → 通常 free 経路へ
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif // !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
|
2025-12-14 06:44:04 +09:00
|
|
|
|
// Phase 5 E5-3a: Optimized cold path shape
|
|
|
|
|
|
// Strategy: Handle common LEGACY path first (use_tiny_heap==0 in Mixed ~90%+)
|
|
|
|
|
|
// Defer expensive LARSON/cross-thread checks to only when heap routing needed
|
|
|
|
|
|
static __thread int g_cold_shape = -1;
|
|
|
|
|
|
if (__builtin_expect(g_cold_shape == -1, 0)) {
|
|
|
|
|
|
g_cold_shape = free_cold_shape_enabled() ? 1 : 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (g_cold_shape == 1) {
|
|
|
|
|
|
// Optimized shape: Check use_tiny_heap FIRST
|
|
|
|
|
|
if (__builtin_expect(!use_tiny_heap, 1)) {
|
|
|
|
|
|
// Most common case in Mixed: LEGACY path, no heap routing
|
|
|
|
|
|
// Skip LARSON/cross-thread check entirely (not needed for legacy)
|
|
|
|
|
|
FREE_COLD_SHAPE_STAT_INC(legacy_fast);
|
|
|
|
|
|
FREE_COLD_SHAPE_STAT_INC(enabled_count);
|
|
|
|
|
|
goto legacy_fallback;
|
|
|
|
|
|
}
|
|
|
|
|
|
// Rare: heap routing needed, do full validation
|
|
|
|
|
|
FREE_COLD_SHAPE_STAT_INC(heap_path);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Baseline shape: LARSON check first (current behavior)
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
// Cross-thread free detection (Larson MT crash fix, ENV gated) + TinyHeap free path
|
|
|
|
|
|
{
|
|
|
|
|
|
static __thread int g_larson_fix = -1;
|
|
|
|
|
|
if (__builtin_expect(g_larson_fix == -1, 0)) {
|
|
|
|
|
|
const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
|
|
|
|
|
|
g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
#endif
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (__builtin_expect(g_larson_fix || use_tiny_heap, 0)) {
|
|
|
|
|
|
// Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
|
|
|
|
|
|
SuperSlab* ss = ss_fast_lookup(base);
|
|
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (5. super_lookup 呼び出し)
|
|
|
|
|
|
FREE_PATH_STAT_INC(super_lookup_called);
|
|
|
|
|
|
if (ss) {
|
|
|
|
|
|
int slab_idx = slab_index_for(ss, base);
|
|
|
|
|
|
if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
|
|
|
|
|
|
uint32_t self_tid = tiny_self_u32_local();
|
|
|
|
|
|
uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
|
|
|
|
|
|
TinySlabMeta* meta = &ss->slabs[slab_idx];
|
|
|
|
|
|
// LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes)
|
|
|
|
|
|
uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
static _Atomic uint64_t g_owner_check_count = 0;
|
|
|
|
|
|
uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
|
|
|
|
|
|
if (oc < 10) {
|
|
|
|
|
|
fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
|
|
|
|
|
|
ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
|
|
|
|
|
|
// Cross-thread free → route to remote queue instead of poisoning TLS cache
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_cross_thread);
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
static _Atomic uint64_t g_cross_thread_count = 0;
|
|
|
|
|
|
uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
|
|
|
|
|
|
if (ct < 20) {
|
|
|
|
|
|
fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
|
|
|
|
|
|
ptr, owner_tid_low, self_tid_cmp, self_tid);
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
|
|
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (6. cross-thread free)
|
|
|
|
|
|
FREE_PATH_STAT_INC(remote_free);
|
|
|
|
|
|
return 1; // handled via remote queue
|
|
|
|
|
|
}
|
|
|
|
|
|
return 0; // remote push failed; fall back to normal path
|
|
|
|
|
|
}
|
2025-12-14 06:44:04 +09:00
|
|
|
|
// Same-thread + TinyHeap route → route-based free
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
if (__builtin_expect(use_tiny_heap, 0)) {
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_tinyheap);
|
|
|
|
|
|
switch (route) {
|
|
|
|
|
|
case TINY_ROUTE_SMALL_HEAP_V7: {
|
|
|
|
|
|
// Phase v7-1: C6-only v7 stub (MID v3 fallback)
|
|
|
|
|
|
if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
break; // fallthrough to legacy
|
|
|
|
|
|
}
|
|
|
|
|
|
case TINY_ROUTE_SMALL_HEAP_V6: {
|
|
|
|
|
|
// Phase V6-HDR-2: Headerless free (ENV gated)
|
|
|
|
|
|
if (small_v6_headerless_route_enabled((uint8_t)class_idx)) {
|
|
|
|
|
|
SmallHeapCtxV6* ctx_v6 = small_heap_ctx_v6();
|
|
|
|
|
|
if (small_v6_headerless_free(ctx_v6, ptr, (uint8_t)class_idx)) {
|
|
|
|
|
|
return 1; // Handled by v6
|
|
|
|
|
|
}
|
|
|
|
|
|
// v6 returned false -> fallback to legacy
|
|
|
|
|
|
}
|
|
|
|
|
|
break; // fallthrough to legacy
|
|
|
|
|
|
}
|
|
|
|
|
|
// Phase v10: v3/v4/v5 removed - routes now handled as LEGACY
|
|
|
|
|
|
case TINY_ROUTE_HOTHEAP_V2:
|
|
|
|
|
|
tiny_hotheap_v2_free((uint8_t)class_idx, base, meta);
|
|
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (v2 は tiny_heap_v1 にカウント)
|
|
|
|
|
|
FREE_PATH_STAT_INC(tiny_heap_v1_fast);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
case TINY_ROUTE_HEAP: {
|
|
|
|
|
|
tiny_heap_ctx_t* ctx = tiny_heap_ctx_for_thread();
|
|
|
|
|
|
if (class_idx == 7) {
|
|
|
|
|
|
tiny_c7_free_fast_with_meta(ss, slab_idx, base);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
tiny_heap_free_class_fast_with_meta(ctx, class_idx, ss, slab_idx, base);
|
|
|
|
|
|
}
|
|
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (9. TinyHeap v1 route)
|
|
|
|
|
|
FREE_PATH_STAT_INC(tiny_heap_v1_fast);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (use_tiny_heap) {
|
|
|
|
|
|
// fallback: lookup failed but TinyHeap front is ON → use generic TinyHeap free
|
|
|
|
|
|
if (route == TINY_ROUTE_HOTHEAP_V2) {
|
|
|
|
|
|
tiny_hotheap_v2_record_free_fallback((uint8_t)class_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
// Phase v10: v3/v4 removed - no special fallback
|
|
|
|
|
|
tiny_heap_free_class_fast(tiny_heap_ctx_for_thread(), class_idx, ptr);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Debug: Log free operations (first 5000, all classes)
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
{
|
|
|
|
|
|
extern _Atomic uint64_t g_debug_op_count;
|
|
|
|
|
|
extern __thread TinyTLSSLL g_tls_sll[];
|
|
|
|
|
|
uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
|
|
|
|
|
|
// Note: Shares g_debug_op_count with alloc logging, so bump the window.
|
|
|
|
|
|
if (op < 5000) {
|
|
|
|
|
|
fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p from=free_tiny_fast_cold tls_count_before=%u\n",
|
|
|
|
|
|
(unsigned long)op, class_idx, ptr, base,
|
|
|
|
|
|
g_tls_sll[class_idx].count);
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
// Phase REFACTOR-2: Legacy fallback (use unified helper)
|
2025-12-14 06:44:04 +09:00
|
|
|
|
legacy_fallback:
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_legacy_fallback);
|
2025-12-15 12:50:16 +09:00
|
|
|
|
tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env);
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Hot path: Fast-path validation + ULTRA/MID/V7 routes
|
|
|
|
|
|
// (always_inline to minimize overhead on critical path)
|
|
|
|
|
|
__attribute__((always_inline))
|
|
|
|
|
|
static inline int free_tiny_fast_hot(void* ptr) {
|
|
|
|
|
|
if (__builtin_expect(!ptr, 0)) {
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(ret0_null_ptr);
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_TINY_HEADER_CLASSIDX
|
|
|
|
|
|
// 1. ページ境界ガード:
|
|
|
|
|
|
// ptr がページ先頭 (offset==0) の場合、ptr-1 は別ページか未マップ領域になる可能性がある。
|
|
|
|
|
|
// その場合はヘッダ読みを行わず、通常 free 経路にフォールバックする。
|
|
|
|
|
|
uintptr_t off = (uintptr_t)ptr & 0xFFFu;
|
|
|
|
|
|
if (__builtin_expect(off == 0, 0)) {
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(ret0_page_boundary);
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 2. Fast header magic validation (必須)
|
|
|
|
|
|
// Release ビルドでは tiny_region_id_read_header() が magic を省略するため、
|
|
|
|
|
|
// ここで自前に Tiny 専用ヘッダ (0xA0) を検証しておく。
|
|
|
|
|
|
uint8_t* header_ptr = (uint8_t*)ptr - 1;
|
|
|
|
|
|
uint8_t header = *header_ptr;
|
|
|
|
|
|
uint8_t magic = header & 0xF0u;
|
|
|
|
|
|
if (__builtin_expect(magic != HEADER_MAGIC, 0)) {
|
|
|
|
|
|
// Tiny ヘッダではない → Mid/Large/外部ポインタなので通常 free 経路へ
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(ret0_bad_magic);
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 3. class_idx 抽出(下位4bit)
|
|
|
|
|
|
int class_idx = (int)(header & HEADER_CLASS_MASK);
|
|
|
|
|
|
if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(ret0_bad_class);
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 4. BASE を計算して Unified Cache に push
|
|
|
|
|
|
void* base = tiny_user_to_base_inline(ptr);
|
|
|
|
|
|
tiny_front_free_stat_inc(class_idx);
|
|
|
|
|
|
|
|
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (1. 関数入口)
|
|
|
|
|
|
FREE_PATH_STAT_INC(total_calls);
|
|
|
|
|
|
|
|
|
|
|
|
// Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for most common case)
|
Phase 4 E1: ENV Snapshot Consolidation - GO (+3.92% avg, +4.01% median)
Target: Consolidate 3 ENV gate TLS reads → 1 TLS read
- tiny_c7_ultra_enabled_env(): 1.28% self
- tiny_front_v3_enabled(): 1.01% self
- tiny_metadata_cache_enabled(): 0.97% self
- Total overhead: 3.26% self (perf profile analysis)
Implementation:
- core/box/hakmem_env_snapshot_box.h (new): ENV snapshot struct & API
- core/box/hakmem_env_snapshot_box.c (new): TLS snapshot implementation
- core/front/malloc_tiny_fast.h: Migrated 5 call sites to snapshot
- core/box/tiny_legacy_fallback_box.h: Migrated 2 call sites
- core/box/tiny_metadata_cache_hot_box.h: Migrated 1 call site
- core/bench_profile.h: Added hakmem_env_snapshot_refresh_from_env()
- Makefile: Added hakmem_env_snapshot_box.o to build
- ENV gate: HAKMEM_ENV_SNAPSHOT=0/1 (default: 0, research box)
A/B Test Results (Mixed, 10-run, 20M iters):
- Baseline (E1=0): 43,617,549 ops/s (avg), 43,562,895 ops/s (median)
- Optimized (E1=1): 45,327,239 ops/s (avg), 45,309,218 ops/s (median)
- Improvement: avg +3.92%, median +4.01%
Decision: GO (+3.92% >= +2.5% threshold)
- Action: Keep as research box (default OFF) for Phase 4
- Next: Consider promotion to default in MIXED_TINYV3_C7_SAFE preset
Design Rationale:
- Shape optimizations (B3, D3) reached saturation (+0.56% NEUTRAL)
- Shift to memory/TLS overhead optimization (new optimization frontier)
- Pattern: Similar to existing tiny_front_v3_snapshot (proven approach)
- Expected: +1-3% from 3.26% ENV overhead → Achieved: +3.92%
Technical Details:
- Consolidation: 3 TLS reads → 1 TLS read (66% reduction)
- Learner interlock: tiny_metadata_cache_eff pre-computed in snapshot
- Version sync: Refreshes on small_policy_v7_version_changed()
- Fallback safety: Existing ENV gates still available when E1=0
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-14 00:59:12 +09:00
|
|
|
|
// Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1)
|
2025-12-15 12:29:27 +09:00
|
|
|
|
// Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards)
|
2025-12-15 12:50:16 +09:00
|
|
|
|
const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL;
|
|
|
|
|
|
const bool c7_ultra_free = env ? env->tiny_c7_ultra_enabled : tiny_c7_ultra_enabled_env();
|
Phase 4 E1: ENV Snapshot Consolidation - GO (+3.92% avg, +4.01% median)
Target: Consolidate 3 ENV gate TLS reads → 1 TLS read
- tiny_c7_ultra_enabled_env(): 1.28% self
- tiny_front_v3_enabled(): 1.01% self
- tiny_metadata_cache_enabled(): 0.97% self
- Total overhead: 3.26% self (perf profile analysis)
Implementation:
- core/box/hakmem_env_snapshot_box.h (new): ENV snapshot struct & API
- core/box/hakmem_env_snapshot_box.c (new): TLS snapshot implementation
- core/front/malloc_tiny_fast.h: Migrated 5 call sites to snapshot
- core/box/tiny_legacy_fallback_box.h: Migrated 2 call sites
- core/box/tiny_metadata_cache_hot_box.h: Migrated 1 call site
- core/bench_profile.h: Added hakmem_env_snapshot_refresh_from_env()
- Makefile: Added hakmem_env_snapshot_box.o to build
- ENV gate: HAKMEM_ENV_SNAPSHOT=0/1 (default: 0, research box)
A/B Test Results (Mixed, 10-run, 20M iters):
- Baseline (E1=0): 43,617,549 ops/s (avg), 43,562,895 ops/s (median)
- Optimized (E1=1): 45,327,239 ops/s (avg), 45,309,218 ops/s (median)
- Improvement: avg +3.92%, median +4.01%
Decision: GO (+3.92% >= +2.5% threshold)
- Action: Keep as research box (default OFF) for Phase 4
- Next: Consider promotion to default in MIXED_TINYV3_C7_SAFE preset
Design Rationale:
- Shape optimizations (B3, D3) reached saturation (+0.56% NEUTRAL)
- Shift to memory/TLS overhead optimization (new optimization frontier)
- Pattern: Similar to existing tiny_front_v3_snapshot (proven approach)
- Expected: +1-3% from 3.26% ENV overhead → Achieved: +3.92%
Technical Details:
- Consolidation: 3 TLS reads → 1 TLS read (66% reduction)
- Learner interlock: tiny_metadata_cache_eff pre-computed in snapshot
- Version sync: Refreshes on small_policy_v7_version_changed()
- Fallback safety: Existing ENV gates still available when E1=0
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-14 00:59:12 +09:00
|
|
|
|
|
|
|
|
|
|
if (class_idx == 7 && c7_ultra_free) {
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_c7_ultra);
|
|
|
|
|
|
tiny_c7_ultra_free(ptr);
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-13 03:46:36 +09:00
|
|
|
|
// Phase FREE-TINY-FAST-DUALHOT-1: C0-C3 direct path (48% of calls)
|
|
|
|
|
|
// Skip expensive policy snapshot and route determination, direct to legacy fallback.
|
|
|
|
|
|
// Safety: Check Larson mode (cross-thread free handling requires full validation path)
|
|
|
|
|
|
{
|
|
|
|
|
|
static __thread int g_larson_fix = -1;
|
|
|
|
|
|
if (__builtin_expect(g_larson_fix == -1, 0)) {
|
|
|
|
|
|
const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
|
|
|
|
|
|
g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (__builtin_expect(class_idx <= 3 && !g_larson_fix, 1)) {
|
|
|
|
|
|
// C0-C3 + Larson mode OFF → Direct to legacy (no policy snapshot overhead)
|
2025-12-15 12:50:16 +09:00
|
|
|
|
tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env);
|
2025-12-13 03:46:36 +09:00
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
// Phase POLICY-FAST-PATH-V2: Skip policy snapshot for known-legacy classes
|
|
|
|
|
|
if (free_policy_fast_v2_can_skip((uint8_t)class_idx)) {
|
|
|
|
|
|
FREE_PATH_STAT_INC(policy_fast_v2_skip);
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_policy_fast_skip);
|
|
|
|
|
|
goto cold_path; // Delegate to cold path for legacy handling
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Phase v11b-1: Policy-based single switch (replaces serial ULTRA checks)
|
|
|
|
|
|
const SmallPolicyV7* policy_free = small_policy_v7_snapshot();
|
|
|
|
|
|
SmallRouteKind route_kind_free = policy_free->route_kind[class_idx];
|
|
|
|
|
|
|
|
|
|
|
|
switch (route_kind_free) {
|
|
|
|
|
|
case SMALL_ROUTE_ULTRA: {
|
|
|
|
|
|
// Phase TLS-UNIFY-1: Unified ULTRA TLS push for C4-C6 (C7 handled above)
|
|
|
|
|
|
if (class_idx >= 4 && class_idx <= 6) {
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_ultra_tls);
|
|
|
|
|
|
tiny_ultra_tls_push((uint8_t)class_idx, base);
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
// ULTRA for other classes → fallback to cold path
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_MID_V35: {
|
|
|
|
|
|
// Phase v11a-3: MID v3.5 free
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_mid_v35);
|
|
|
|
|
|
small_mid_v35_free(ptr, class_idx);
|
|
|
|
|
|
FREE_PATH_STAT_INC(smallheap_v7_fast);
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_V7: {
|
|
|
|
|
|
// Phase v7: SmallObject v7 free (research box)
|
|
|
|
|
|
if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_v7);
|
|
|
|
|
|
FREE_PATH_STAT_INC(smallheap_v7_fast);
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
// V7 miss → fallback to cold path
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_MID_V3: {
|
|
|
|
|
|
// Phase MID-V3: delegate to MID v3.5
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_mid_v35);
|
|
|
|
|
|
small_mid_v35_free(ptr, class_idx);
|
|
|
|
|
|
FREE_PATH_STAT_INC(smallheap_v7_fast);
|
|
|
|
|
|
FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
case SMALL_ROUTE_LEGACY:
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
cold_path:
|
|
|
|
|
|
// Delegate to cold path for cross-thread, TinyHeap, and legacy handling
|
2025-12-15 21:36:30 +09:00
|
|
|
|
// Phase 19-6C: Compute route once, pass to cold path to avoid redundant lookup
|
|
|
|
|
|
tiny_route_kind_t route_for_cold;
|
|
|
|
|
|
int use_tiny_heap_for_cold;
|
|
|
|
|
|
free_tiny_fast_compute_route_and_heap(class_idx, &route_for_cold, &use_tiny_heap_for_cold);
|
|
|
|
|
|
return free_tiny_fast_cold(ptr, base, class_idx, env, route_for_cold, use_tiny_heap_for_cold);
|
Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split for free_tiny_fast [RESEARCH BOX - FREEZE]
Split free_tiny_fast() into hot and cold paths to reduce I-cache pressure:
- free_tiny_fast_hot(): always_inline, fast-path validation + ULTRA/MID/V7
- free_tiny_fast_cold(): noinline,cold, cross-thread + TinyHeap + legacy
ENV: HAKMEM_FREE_TINY_FAST_HOTCOLD=0/1 (default 0)
Stats: HAKMEM_FREE_TINY_FAST_HOTCOLD_STATS=0/1 (TLS only, exit dump)
## Benchmark Results (random mixed, 100M ops)
HOTCOLD=0 (legacy): 49.35M, 50.18M, 50.25M ops/s (median: 50.18M)
HOTCOLD=1 (split): 43.54M, 43.59M, 43.62M ops/s (median: 43.59M)
**Regression: -13.1%** (NO-GO)
## Stats Analysis (10M ops, HOTCOLD_STATS=1)
Hot path: 50.11% (C7 ULTRA early-exit)
Cold path: 48.43% (legacy fallback)
## Root Cause
Design assumption FAILED: "Cold path is rare"
Reality: Cold path is 48% (almost as common as hot path)
The split introduces:
1. Extra dispatch overhead in hot path
2. Function call overhead to cold for ~48% of frees
3. "Cold" is NOT rare - it's the legacy fallback for non-ULTRA classes
## Conclusion
**FREEZE as research box (default OFF)**
Box Theory value:
- Validated hot/cold distribution via TLS stats
- Confirmed that legacy fallback is NOT rare (48%)
- Demonstrated that naive hot/cold split hurts when "cold" is common
Alternative approaches for future work:
1. Inline the legacy fallback in hot path (no split)
2. Route-specific specialization (C7 vs non-C7 separate paths)
3. Policy-based early routing (before header validation)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 03:16:54 +09:00
|
|
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
// No header mode - fall back to normal free
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
#endif
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-17 05:29:08 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 26-B: free_tiny_fast() - Ultra-thin Tiny deallocation
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// Single-layer Tiny deallocation (bypasses hak_free_at + wrapper + diagnostics)
|
|
|
|
|
|
// Preconditions:
|
|
|
|
|
|
// - ptr is from malloc_tiny_fast() (has valid header)
|
|
|
|
|
|
// - Front Gate Unified is enabled
|
|
|
|
|
|
// Returns:
|
|
|
|
|
|
// - 1 on success (pushed to Unified Cache)
|
|
|
|
|
|
// - 0 on failure (caller falls back to normal free path)
|
|
|
|
|
|
__attribute__((always_inline))
|
|
|
|
|
|
static inline int free_tiny_fast(void* ptr) {
|
|
|
|
|
|
if (__builtin_expect(!ptr, 0)) return 0;
|
|
|
|
|
|
|
2025-12-08 21:30:21 +09:00
|
|
|
|
#if HAKMEM_TINY_HEADER_CLASSIDX
|
2025-11-17 05:29:08 +09:00
|
|
|
|
// 1. ページ境界ガード:
|
|
|
|
|
|
// ptr がページ先頭 (offset==0) の場合、ptr-1 は別ページか未マップ領域になる可能性がある。
|
|
|
|
|
|
// その場合はヘッダ読みを行わず、通常 free 経路にフォールバックする。
|
|
|
|
|
|
uintptr_t off = (uintptr_t)ptr & 0xFFFu;
|
|
|
|
|
|
if (__builtin_expect(off == 0, 0)) {
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 2. Fast header magic validation (必須)
|
|
|
|
|
|
// Release ビルドでは tiny_region_id_read_header() が magic を省略するため、
|
|
|
|
|
|
// ここで自前に Tiny 専用ヘッダ (0xA0) を検証しておく。
|
|
|
|
|
|
uint8_t* header_ptr = (uint8_t*)ptr - 1;
|
|
|
|
|
|
uint8_t header = *header_ptr;
|
|
|
|
|
|
uint8_t magic = header & 0xF0u;
|
|
|
|
|
|
if (__builtin_expect(magic != HEADER_MAGIC, 0)) {
|
|
|
|
|
|
// Tiny ヘッダではない → Mid/Large/外部ポインタなので通常 free 経路へ
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 3. class_idx 抽出(下位4bit)
|
|
|
|
|
|
int class_idx = (int)(header & HEADER_CLASS_MASK);
|
|
|
|
|
|
if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 4. BASE を計算して Unified Cache に push
|
2025-12-11 19:02:49 +09:00
|
|
|
|
void* base = tiny_user_to_base_inline(ptr);
|
2025-12-08 21:30:21 +09:00
|
|
|
|
tiny_front_free_stat_inc(class_idx);
|
2025-12-10 09:08:18 +09:00
|
|
|
|
|
2025-12-11 18:04:14 +09:00
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (1. 関数入口)
|
|
|
|
|
|
FREE_PATH_STAT_INC(total_calls);
|
|
|
|
|
|
|
2025-12-15 12:50:16 +09:00
|
|
|
|
// Phase 19-3b: Consolidate ENV snapshot reads (capture once per free_tiny_fast call).
|
|
|
|
|
|
const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL;
|
|
|
|
|
|
|
2025-12-14 19:16:49 +09:00
|
|
|
|
// Phase 9: MONO DUALHOT early-exit for C0-C3 (skip policy snapshot, direct to legacy)
|
|
|
|
|
|
// Conditions:
|
|
|
|
|
|
// - ENV: HAKMEM_FREE_TINY_FAST_MONO_DUALHOT=1
|
|
|
|
|
|
// - class_idx <= 3 (C0-C3)
|
|
|
|
|
|
// - !HAKMEM_TINY_LARSON_FIX (cross-thread handling requires full validation)
|
|
|
|
|
|
// - g_tiny_route_snapshot_done == 1 && route == TINY_ROUTE_LEGACY (断定できないときは既存経路)
|
|
|
|
|
|
if ((unsigned)class_idx <= 3u) {
|
|
|
|
|
|
if (free_tiny_fast_mono_dualhot_enabled()) {
|
|
|
|
|
|
static __thread int g_larson_fix = -1;
|
|
|
|
|
|
if (__builtin_expect(g_larson_fix == -1, 0)) {
|
|
|
|
|
|
const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
|
|
|
|
|
|
g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!g_larson_fix &&
|
|
|
|
|
|
g_tiny_route_snapshot_done == 1 &&
|
|
|
|
|
|
g_tiny_route_class[class_idx] == TINY_ROUTE_LEGACY) {
|
|
|
|
|
|
// Direct path: Skip policy snapshot, go straight to legacy fallback
|
|
|
|
|
|
FREE_PATH_STAT_INC(mono_dualhot_hit);
|
2025-12-15 12:50:16 +09:00
|
|
|
|
tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env);
|
2025-12-14 19:16:49 +09:00
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-14 20:09:40 +09:00
|
|
|
|
// Phase 10: MONO LEGACY DIRECT early-exit for C4-C7 (skip policy snapshot, direct to legacy)
|
|
|
|
|
|
// Conditions:
|
|
|
|
|
|
// - ENV: HAKMEM_FREE_TINY_FAST_MONO_LEGACY_DIRECT=1
|
|
|
|
|
|
// - cached nonlegacy_mask: class is NOT in non-legacy mask (= ULTRA/MID/V7 not active)
|
|
|
|
|
|
// - g_tiny_route_snapshot_done == 1 && route == TINY_ROUTE_LEGACY (断定できないときは既存経路)
|
|
|
|
|
|
// - !HAKMEM_TINY_LARSON_FIX (cross-thread handling requires full validation)
|
|
|
|
|
|
if (free_tiny_fast_mono_legacy_direct_enabled()) {
|
|
|
|
|
|
// 1. Check nonlegacy mask (computed once at init)
|
|
|
|
|
|
uint8_t nonlegacy_mask = free_tiny_fast_mono_legacy_direct_nonlegacy_mask();
|
|
|
|
|
|
if ((nonlegacy_mask & (1u << class_idx)) == 0) {
|
|
|
|
|
|
// 2. Check route snapshot
|
|
|
|
|
|
if (g_tiny_route_snapshot_done == 1 && g_tiny_route_class[class_idx] == TINY_ROUTE_LEGACY) {
|
|
|
|
|
|
// 3. Check Larson fix
|
|
|
|
|
|
static __thread int g_larson_fix = -1;
|
|
|
|
|
|
if (__builtin_expect(g_larson_fix == -1, 0)) {
|
|
|
|
|
|
const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
|
|
|
|
|
|
g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!g_larson_fix) {
|
|
|
|
|
|
// Direct path: Skip policy snapshot, go straight to legacy fallback
|
|
|
|
|
|
FREE_PATH_STAT_INC(mono_legacy_direct_hit);
|
2025-12-15 12:50:16 +09:00
|
|
|
|
tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env);
|
2025-12-14 20:09:40 +09:00
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-12 16:26:42 +09:00
|
|
|
|
// Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for most common case)
|
/* VCS commit note (accidentally pasted into source; preserved as a comment):
 * Phase 4 E1: ENV Snapshot Consolidation - GO (+3.92% avg, +4.01% median)
 * Target: Consolidate 3 ENV gate TLS reads → 1 TLS read
 * - tiny_c7_ultra_enabled_env(): 1.28% self
 * - tiny_front_v3_enabled(): 1.01% self
 * - tiny_metadata_cache_enabled(): 0.97% self
 * - Total overhead: 3.26% self (perf profile analysis)
 * Implementation:
 * - core/box/hakmem_env_snapshot_box.h (new): ENV snapshot struct & API
 * - core/box/hakmem_env_snapshot_box.c (new): TLS snapshot implementation
 * - core/front/malloc_tiny_fast.h: Migrated 5 call sites to snapshot
 * - core/box/tiny_legacy_fallback_box.h: Migrated 2 call sites
 * - core/box/tiny_metadata_cache_hot_box.h: Migrated 1 call site
 * - core/bench_profile.h: Added hakmem_env_snapshot_refresh_from_env()
 * - Makefile: Added hakmem_env_snapshot_box.o to build
 * - ENV gate: HAKMEM_ENV_SNAPSHOT=0/1 (default: 0, research box)
 * A/B Test Results (Mixed, 10-run, 20M iters):
 * - Baseline (E1=0): 43,617,549 ops/s (avg), 43,562,895 ops/s (median)
 * - Optimized (E1=1): 45,327,239 ops/s (avg), 45,309,218 ops/s (median)
 * - Improvement: avg +3.92%, median +4.01%
 * Decision: GO (+3.92% >= +2.5% threshold)
 * - Action: Keep as research box (default OFF) for Phase 4
 * - Next: Consider promotion to default in MIXED_TINYV3_C7_SAFE preset
 * Design Rationale:
 * - Shape optimizations (B3, D3) reached saturation (+0.56% NEUTRAL)
 * - Shift to memory/TLS overhead optimization (new optimization frontier)
 * - Pattern: Similar to existing tiny_front_v3_snapshot (proven approach)
 * - Expected: +1-3% from 3.26% ENV overhead → Achieved: +3.92%
 * Technical Details:
 * - Consolidation: 3 TLS reads → 1 TLS read (66% reduction)
 * - Learner interlock: tiny_metadata_cache_eff pre-computed in snapshot
 * - Version sync: Refreshes on small_policy_v7_version_changed()
 * - Fallback safety: Existing ENV gates still available when E1=0
 * 🤖 Generated with Claude Code
 * Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
 * 2025-12-14 00:59:12 +09:00
 */
|
|
|
|
// Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1)
|
2025-12-15 12:29:27 +09:00
|
|
|
|
// Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards)
|
2025-12-15 12:50:16 +09:00
|
|
|
|
const bool c7_ultra_free = env ? env->tiny_c7_ultra_enabled : tiny_c7_ultra_enabled_env();
|
/* VCS commit note (duplicate paste of the Phase 4 E1 message; preserved as a comment):
 * Phase 4 E1: ENV Snapshot Consolidation - GO (+3.92% avg, +4.01% median)
 * Target: Consolidate 3 ENV gate TLS reads → 1 TLS read
 * - tiny_c7_ultra_enabled_env(): 1.28% self
 * - tiny_front_v3_enabled(): 1.01% self
 * - tiny_metadata_cache_enabled(): 0.97% self
 * - Total overhead: 3.26% self (perf profile analysis)
 * Implementation:
 * - core/box/hakmem_env_snapshot_box.h (new): ENV snapshot struct & API
 * - core/box/hakmem_env_snapshot_box.c (new): TLS snapshot implementation
 * - core/front/malloc_tiny_fast.h: Migrated 5 call sites to snapshot
 * - core/box/tiny_legacy_fallback_box.h: Migrated 2 call sites
 * - core/box/tiny_metadata_cache_hot_box.h: Migrated 1 call site
 * - core/bench_profile.h: Added hakmem_env_snapshot_refresh_from_env()
 * - Makefile: Added hakmem_env_snapshot_box.o to build
 * - ENV gate: HAKMEM_ENV_SNAPSHOT=0/1 (default: 0, research box)
 * A/B Test Results (Mixed, 10-run, 20M iters):
 * - Baseline (E1=0): 43,617,549 ops/s (avg), 43,562,895 ops/s (median)
 * - Optimized (E1=1): 45,327,239 ops/s (avg), 45,309,218 ops/s (median)
 * - Improvement: avg +3.92%, median +4.01%
 * Decision: GO (+3.92% >= +2.5% threshold)
 * - Action: Keep as research box (default OFF) for Phase 4
 * - Next: Consider promotion to default in MIXED_TINYV3_C7_SAFE preset
 * Design Rationale:
 * - Shape optimizations (B3, D3) reached saturation (+0.56% NEUTRAL)
 * - Shift to memory/TLS overhead optimization (new optimization frontier)
 * - Pattern: Similar to existing tiny_front_v3_snapshot (proven approach)
 * - Expected: +1-3% from 3.26% ENV overhead → Achieved: +3.92%
 * Technical Details:
 * - Consolidation: 3 TLS reads → 1 TLS read (66% reduction)
 * - Learner interlock: tiny_metadata_cache_eff pre-computed in snapshot
 * - Version sync: Refreshes on small_policy_v7_version_changed()
 * - Fallback safety: Existing ENV gates still available when E1=0
 * 🤖 Generated with Claude Code
 * Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
 * 2025-12-14 00:59:12 +09:00
 */
|
|
|
|
|
|
|
|
|
|
if (class_idx == 7 && c7_ultra_free) {
|
2025-12-11 20:39:46 +09:00
|
|
|
|
tiny_c7_ultra_free(ptr);
|
2025-12-10 22:57:26 +09:00
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-12 18:40:08 +09:00
|
|
|
|
// Phase POLICY-FAST-PATH-V2: Skip policy snapshot for known-legacy classes
|
|
|
|
|
|
if (free_policy_fast_v2_can_skip((uint8_t)class_idx)) {
|
|
|
|
|
|
FREE_PATH_STAT_INC(policy_fast_v2_skip);
|
|
|
|
|
|
goto legacy_fallback;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-12 16:26:42 +09:00
|
|
|
|
// Phase v11b-1: Policy-based single switch (replaces serial ULTRA checks)
|
2025-12-12 06:52:14 +09:00
|
|
|
|
const SmallPolicyV7* policy_free = small_policy_v7_snapshot();
|
2025-12-12 16:26:42 +09:00
|
|
|
|
SmallRouteKind route_kind_free = policy_free->route_kind[class_idx];
|
|
|
|
|
|
|
|
|
|
|
|
switch (route_kind_free) {
|
|
|
|
|
|
case SMALL_ROUTE_ULTRA: {
|
|
|
|
|
|
// Phase TLS-UNIFY-1: Unified ULTRA TLS push for C4-C6 (C7 handled above)
|
|
|
|
|
|
if (class_idx >= 4 && class_idx <= 6) {
|
|
|
|
|
|
tiny_ultra_tls_push((uint8_t)class_idx, base);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
// ULTRA for other classes → fallback to LEGACY
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2025-12-12 06:52:14 +09:00
|
|
|
|
|
2025-12-12 16:26:42 +09:00
|
|
|
|
case SMALL_ROUTE_MID_V35: {
|
|
|
|
|
|
// Phase v11a-3: MID v3.5 free
|
|
|
|
|
|
small_mid_v35_free(ptr, class_idx);
|
2025-12-12 03:50:58 +09:00
|
|
|
|
FREE_PATH_STAT_INC(smallheap_v7_fast);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-12 16:26:42 +09:00
|
|
|
|
case SMALL_ROUTE_V7: {
|
|
|
|
|
|
// Phase v7: SmallObject v7 free (research box)
|
|
|
|
|
|
if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
|
|
|
|
|
|
FREE_PATH_STAT_INC(smallheap_v7_fast);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
// V7 miss → fallback to LEGACY
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2025-12-10 19:14:38 +09:00
|
|
|
|
|
2025-12-12 16:26:42 +09:00
|
|
|
|
case SMALL_ROUTE_MID_V3: {
|
|
|
|
|
|
// Phase MID-V3: delegate to MID v3.5
|
|
|
|
|
|
small_mid_v35_free(ptr, class_idx);
|
2025-12-12 03:12:28 +09:00
|
|
|
|
FREE_PATH_STAT_INC(smallheap_v7_fast);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-12 16:26:42 +09:00
|
|
|
|
case SMALL_ROUTE_LEGACY:
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
2025-12-10 19:14:38 +09:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-12 18:40:08 +09:00
|
|
|
|
legacy_fallback:
|
2025-12-12 16:26:42 +09:00
|
|
|
|
// LEGACY fallback path
|
2025-12-15 21:36:30 +09:00
|
|
|
|
// Phase 19-6C: Compute route once using helper (avoid redundant tiny_route_for_class)
|
2025-12-13 21:44:00 +09:00
|
|
|
|
tiny_route_kind_t route;
|
2025-12-15 21:36:30 +09:00
|
|
|
|
int use_tiny_heap;
|
|
|
|
|
|
free_tiny_fast_compute_route_and_heap(class_idx, &route, &use_tiny_heap);
|
2025-11-27 11:52:11 +09:00
|
|
|
|
|
2025-12-04 18:53:04 +09:00
|
|
|
|
// TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast.
|
|
|
|
|
|
// In Release builds, we trust header magic (0xA0) as sufficient validation.
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-12-01 22:06:10 +09:00
|
|
|
|
// 5. Superslab 登録確認(誤分類防止)
|
|
|
|
|
|
SuperSlab* ss_guard = hak_super_lookup(ptr);
|
|
|
|
|
|
if (__builtin_expect(!(ss_guard && ss_guard->magic == SUPERSLAB_MAGIC), 0)) {
|
|
|
|
|
|
return 0; // hakmem 管理外 → 通常 free 経路へ
|
|
|
|
|
|
}
|
2025-12-04 18:53:04 +09:00
|
|
|
|
#endif // !HAKMEM_BUILD_RELEASE
|
2025-12-01 22:06:10 +09:00
|
|
|
|
|
2025-12-07 22:49:28 +09:00
|
|
|
|
// Cross-thread free detection (Larson MT crash fix, ENV gated) + TinyHeap free path
|
2025-11-27 11:52:11 +09:00
|
|
|
|
{
|
|
|
|
|
|
static __thread int g_larson_fix = -1;
|
|
|
|
|
|
if (__builtin_expect(g_larson_fix == -1, 0)) {
|
|
|
|
|
|
const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
|
|
|
|
|
|
g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
#endif
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-07 22:49:28 +09:00
|
|
|
|
if (__builtin_expect(g_larson_fix || use_tiny_heap, 0)) {
|
2025-11-27 12:47:10 +09:00
|
|
|
|
// Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
|
|
|
|
|
|
SuperSlab* ss = ss_fast_lookup(base);
|
2025-12-11 18:04:14 +09:00
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (5. super_lookup 呼び出し)
|
|
|
|
|
|
FREE_PATH_STAT_INC(super_lookup_called);
|
2025-11-27 12:47:10 +09:00
|
|
|
|
if (ss) {
|
2025-11-27 11:52:11 +09:00
|
|
|
|
int slab_idx = slab_index_for(ss, base);
|
|
|
|
|
|
if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
|
|
|
|
|
|
uint32_t self_tid = tiny_self_u32_local();
|
|
|
|
|
|
uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
|
2025-12-08 21:30:21 +09:00
|
|
|
|
TinySlabMeta* meta = &ss->slabs[slab_idx];
|
2025-11-27 11:52:11 +09:00
|
|
|
|
// LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes)
|
|
|
|
|
|
uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
static _Atomic uint64_t g_owner_check_count = 0;
|
|
|
|
|
|
uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
|
|
|
|
|
|
if (oc < 10) {
|
|
|
|
|
|
fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
|
|
|
|
|
|
ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
|
|
|
|
|
|
// Cross-thread free → route to remote queue instead of poisoning TLS cache
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
static _Atomic uint64_t g_cross_thread_count = 0;
|
|
|
|
|
|
uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
|
|
|
|
|
|
if (ct < 20) {
|
|
|
|
|
|
fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
|
|
|
|
|
|
ptr, owner_tid_low, self_tid_cmp, self_tid);
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
|
2025-12-11 18:04:14 +09:00
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (6. cross-thread free)
|
|
|
|
|
|
FREE_PATH_STAT_INC(remote_free);
|
2025-11-27 11:52:11 +09:00
|
|
|
|
return 1; // handled via remote queue
|
|
|
|
|
|
}
|
|
|
|
|
|
return 0; // remote push failed; fall back to normal path
|
2025-12-08 21:30:21 +09:00
|
|
|
|
}
|
|
|
|
|
|
// Same-thread + TinyHeap route → route-based free
|
|
|
|
|
|
if (__builtin_expect(use_tiny_heap, 0)) {
|
|
|
|
|
|
switch (route) {
|
2025-12-12 03:12:28 +09:00
|
|
|
|
case TINY_ROUTE_SMALL_HEAP_V7: {
|
|
|
|
|
|
// Phase v7-1: C6-only v7 stub (MID v3 fallback)
|
|
|
|
|
|
if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
break; // fallthrough to legacy
|
|
|
|
|
|
}
|
/* VCS commit note (accidentally pasted into source; preserved as a comment):
 * Phase v6-1/2/3/4: SmallObject Core v6 - C6-only implementation + refactor
 * Phase v6-1: C6-only route stub (v1/pool fallback)
 * Phase v6-2: Segment v6 + ColdIface v6 + Core v6 HotPath implementation
 * - 2MiB segment / 64KiB page allocation
 * - O(1) ptr→page_meta lookup with segment masking
 * - C6-heavy A/B: SEGV-free but -44% performance (15.3M ops/s)
 * Phase v6-3: Thin-layer optimization (TLS ownership check + batch header + refill batching)
 * - TLS ownership fast-path skip page_meta for 90%+ of frees
 * - Batch header writes during refill (32 allocs = 1 header write)
 * - TLS batch refill (1/32 refill frequency)
 * - C6-heavy A/B: v6-2 15.3M → v6-3 27.1M ops/s (±0% vs baseline) ✅
 * Phase v6-4: Mixed hang fix (segment metadata lookup correction)
 * - Root cause: metadata lookup was reading mmap region instead of TLS slot
 * - Fix: use TLS slot descriptor with in_use validation
 * - Mixed health: 5M iterations SEGV-free, 35.8M ops/s ✅
 * Phase v6-refactor: Code quality improvements (macro unification + inline + docs)
 * - Add SMALL_V6_* prefix macros (header, pointer conversion, page index)
 * - Extract inline validation functions (small_page_v6_valid, small_ptr_in_segment_v6)
 * - Doxygen-style comments for all public functions
 * - Result: 0 compiler warnings, maintained +1.2% performance
 * Files:
 * - core/box/smallobject_core_v6_box.h (new, type & API definitions)
 * - core/box/smallobject_cold_iface_v6.h (new, cold iface API)
 * - core/box/smallsegment_v6_box.h (new, segment type definitions)
 * - core/smallobject_core_v6.c (new, C6 alloc/free implementation)
 * - core/smallobject_cold_iface_v6.c (new, refill/retire logic)
 * - core/smallsegment_v6.c (new, segment allocator)
 * - docs/analysis/SMALLOBJECT_CORE_V6_DESIGN.md (new, design document)
 * - core/box/tiny_route_env_box.h (modified, v6 route added)
 * - core/front/malloc_tiny_fast.h (modified, v6 case in route switch)
 * - Makefile (modified, v6 objects added)
 * - CURRENT_TASK.md (modified, v6 status added)
 * Status:
 * - C6-heavy: v6 OFF 27.1M → v6-3 ON 27.1M ops/s (±0%) ✅
 * - Mixed: v6 ON 35.8M ops/s (C6-only, other classes via v1) ✅
 * - Build: 0 warnings, fully documented ✅
 * 🤖 Generated with Claude Code
 * Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
 * 2025-12-11 15:29:59 +09:00
 */
|
|
|
|
case TINY_ROUTE_SMALL_HEAP_V6: {
|
2025-12-11 23:51:48 +09:00
|
|
|
|
// Phase V6-HDR-2: Headerless free (ENV gated)
|
|
|
|
|
|
if (small_v6_headerless_route_enabled((uint8_t)class_idx)) {
|
|
|
|
|
|
SmallHeapCtxV6* ctx_v6 = small_heap_ctx_v6();
|
|
|
|
|
|
if (small_v6_headerless_free(ctx_v6, ptr, (uint8_t)class_idx)) {
|
|
|
|
|
|
return 1; // Handled by v6
|
|
|
|
|
|
}
|
|
|
|
|
|
// v6 returned false -> fallback to legacy
|
|
|
|
|
|
}
|
|
|
|
|
|
break; // fallthrough to legacy
|
/* VCS commit note (duplicate paste of the SmallObject Core v6 message; preserved as a comment):
 * Phase v6-1/2/3/4: SmallObject Core v6 - C6-only implementation + refactor
 * Phase v6-1: C6-only route stub (v1/pool fallback)
 * Phase v6-2: Segment v6 + ColdIface v6 + Core v6 HotPath implementation
 * - 2MiB segment / 64KiB page allocation
 * - O(1) ptr→page_meta lookup with segment masking
 * - C6-heavy A/B: SEGV-free but -44% performance (15.3M ops/s)
 * Phase v6-3: Thin-layer optimization (TLS ownership check + batch header + refill batching)
 * - TLS ownership fast-path skip page_meta for 90%+ of frees
 * - Batch header writes during refill (32 allocs = 1 header write)
 * - TLS batch refill (1/32 refill frequency)
 * - C6-heavy A/B: v6-2 15.3M → v6-3 27.1M ops/s (±0% vs baseline) ✅
 * Phase v6-4: Mixed hang fix (segment metadata lookup correction)
 * - Root cause: metadata lookup was reading mmap region instead of TLS slot
 * - Fix: use TLS slot descriptor with in_use validation
 * - Mixed health: 5M iterations SEGV-free, 35.8M ops/s ✅
 * Phase v6-refactor: Code quality improvements (macro unification + inline + docs)
 * - Add SMALL_V6_* prefix macros (header, pointer conversion, page index)
 * - Extract inline validation functions (small_page_v6_valid, small_ptr_in_segment_v6)
 * - Doxygen-style comments for all public functions
 * - Result: 0 compiler warnings, maintained +1.2% performance
 * Files:
 * - core/box/smallobject_core_v6_box.h (new, type & API definitions)
 * - core/box/smallobject_cold_iface_v6.h (new, cold iface API)
 * - core/box/smallsegment_v6_box.h (new, segment type definitions)
 * - core/smallobject_core_v6.c (new, C6 alloc/free implementation)
 * - core/smallobject_cold_iface_v6.c (new, refill/retire logic)
 * - core/smallsegment_v6.c (new, segment allocator)
 * - docs/analysis/SMALLOBJECT_CORE_V6_DESIGN.md (new, design document)
 * - core/box/tiny_route_env_box.h (modified, v6 route added)
 * - core/front/malloc_tiny_fast.h (modified, v6 case in route switch)
 * - Makefile (modified, v6 objects added)
 * - CURRENT_TASK.md (modified, v6 status added)
 * Status:
 * - C6-heavy: v6 OFF 27.1M → v6-3 ON 27.1M ops/s (±0%) ✅
 * - Mixed: v6 ON 35.8M ops/s (C6-only, other classes via v1) ✅
 * - Build: 0 warnings, fully documented ✅
 * 🤖 Generated with Claude Code
 * Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
 * 2025-12-11 15:29:59 +09:00
 */
|
|
|
|
}
|
2025-12-12 06:09:12 +09:00
|
|
|
|
// Phase v10: v3/v4/v5 removed - routes now handled as LEGACY
|
2025-12-08 21:30:21 +09:00
|
|
|
|
case TINY_ROUTE_HOTHEAP_V2:
|
|
|
|
|
|
tiny_hotheap_v2_free((uint8_t)class_idx, base, meta);
|
2025-12-11 18:04:14 +09:00
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (v2 は tiny_heap_v1 にカウント)
|
|
|
|
|
|
FREE_PATH_STAT_INC(tiny_heap_v1_fast);
|
2025-12-08 21:30:21 +09:00
|
|
|
|
return 1;
|
|
|
|
|
|
case TINY_ROUTE_HEAP: {
|
|
|
|
|
|
tiny_heap_ctx_t* ctx = tiny_heap_ctx_for_thread();
|
|
|
|
|
|
if (class_idx == 7) {
|
|
|
|
|
|
tiny_c7_free_fast_with_meta(ss, slab_idx, base);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
tiny_heap_free_class_fast_with_meta(ctx, class_idx, ss, slab_idx, base);
|
|
|
|
|
|
}
|
2025-12-11 18:04:14 +09:00
|
|
|
|
// Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (9. TinyHeap v1 route)
|
|
|
|
|
|
FREE_PATH_STAT_INC(tiny_heap_v1_fast);
|
2025-12-08 21:30:21 +09:00
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
2025-12-07 22:49:28 +09:00
|
|
|
|
}
|
2025-11-27 11:52:11 +09:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-12-07 22:49:28 +09:00
|
|
|
|
if (use_tiny_heap) {
|
|
|
|
|
|
// fallback: lookup failed but TinyHeap front is ON → use generic TinyHeap free
|
2025-12-08 21:30:21 +09:00
|
|
|
|
if (route == TINY_ROUTE_HOTHEAP_V2) {
|
2025-12-09 21:50:15 +09:00
|
|
|
|
tiny_hotheap_v2_record_free_fallback((uint8_t)class_idx);
|
2025-12-08 21:30:21 +09:00
|
|
|
|
}
|
2025-12-12 06:09:12 +09:00
|
|
|
|
// Phase v10: v3/v4 removed - no special fallback
|
2025-12-07 22:49:28 +09:00
|
|
|
|
tiny_heap_free_class_fast(tiny_heap_ctx_for_thread(), class_idx, ptr);
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
2025-11-27 11:52:11 +09:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Debug: Log free operations (first 5000, all classes)
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
{
|
|
|
|
|
|
extern _Atomic uint64_t g_debug_op_count;
|
|
|
|
|
|
extern __thread TinyTLSSLL g_tls_sll[];
|
|
|
|
|
|
uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
|
|
|
|
|
|
// Note: Shares g_debug_op_count with alloc logging, so bump the window.
|
|
|
|
|
|
if (op < 5000) {
|
|
|
|
|
|
fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p from=free_tiny_fast tls_count_before=%u\n",
|
|
|
|
|
|
(unsigned long)op, class_idx, ptr, base,
|
|
|
|
|
|
g_tls_sll[class_idx].count);
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
2025-12-11 19:01:59 +09:00
|
|
|
|
// Phase REFACTOR-2: Legacy fallback (use unified helper)
|
2025-12-15 12:50:16 +09:00
|
|
|
|
tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env);
|
2025-12-11 18:34:27 +09:00
|
|
|
|
return 1;
|
2025-11-17 05:29:08 +09:00
|
|
|
|
#else
|
|
|
|
|
|
// No header mode - fall back to normal free
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
#endif
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#endif // HAK_FRONT_MALLOC_TINY_FAST_H
|