2025-12-14 16:28:23 +09:00
|
|
|
#ifndef HAK_FRONT_FASTLANE_BOX_H
|
|
|
|
|
#define HAK_FRONT_FASTLANE_BOX_H
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Phase 6: Front FastLane Box (Hot Inline / Try API)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
//
|
|
|
|
|
// Purpose: Single-box entry point for malloc/free hot paths
|
|
|
|
|
// Collapses wrapper→gate→policy→route layers into one
|
|
|
|
|
//
|
|
|
|
|
// API:
|
|
|
|
|
// void* front_fastlane_try_malloc(size_t size)
|
|
|
|
|
// - Returns non-NULL on success (handled by FastLane)
|
|
|
|
|
// - Returns NULL on failure (fallback to existing wrapper path)
|
|
|
|
|
//
|
|
|
|
|
// bool front_fastlane_try_free(void* ptr)
|
|
|
|
|
// - Returns true if handled (success)
|
|
|
|
|
// - Returns false if not handled (fallback to existing wrapper path)
|
|
|
|
|
//
|
|
|
|
|
// Box Theory:
|
|
|
|
|
// - L0: ENV gate (front_fastlane_env_box.h)
|
|
|
|
|
// - L1: This file (hot inline handlers)
|
|
|
|
|
// - L2: Stats (front_fastlane_stats_box.h, cold helpers in .c)
|
|
|
|
|
//
|
|
|
|
|
// Strategy:
|
|
|
|
|
// - Read existing "winning boxes" only once
|
|
|
|
|
// - Call existing hot handlers (malloc_tiny_fast_for_class, free_tiny_fast)
|
|
|
|
|
// - No duplicate checks (deduplicate existing wrapper logic)
|
|
|
|
|
// - Fail-fast: Any uncertainty → return not-handled
|
|
|
|
|
//
|
|
|
|
|
// Safety:
|
2025-12-14 16:30:32 +09:00
|
|
|
// - ENV-gated (default ON, opt-out)
|
2025-12-14 16:28:23 +09:00
|
|
|
// - Single fallback boundary (FastLane → ColdFallback)
|
|
|
|
|
// - Reversible (ENV toggle)
|
|
|
|
|
//
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
#include <stddef.h>
|
|
|
|
|
#include <stdbool.h>
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
#include "front_fastlane_env_box.h"
|
|
|
|
|
#include "front_fastlane_stats_box.h"
|
|
|
|
|
#include "../hakmem_tiny.h" // hak_tiny_size_to_class, tiny_get_max_size
|
|
|
|
|
#include "../front/malloc_tiny_fast.h" // malloc_tiny_fast_for_class
|
/* NOTE(review): The changelog below is a git commit message (Phases 16-18)
 * that was accidentally pasted into this header as raw prose, which breaks
 * compilation. It is preserved here as a block comment; consider relocating
 * it to docs/analysis/ entirely.
 *
 * Phase 16 v1: Front FastLane Alloc LEGACY Direct — NEUTRAL (+0.62%)
 *   Target: reduce alloc-side fixed costs by adding a LEGACY direct path to
 *   the FastLane entry, mirroring the Phase 9/10 free-side winning pattern.
 *   Result: +0.62% on Mixed (below the +1.0% GO threshold) → NEUTRAL; frozen
 *   as a research box (default OFF via
 *   HAKMEM_FRONT_FASTLANE_ALLOC_LEGACY_DIRECT=0, opt-in).
 *   Critical issue: the initial implementation segfaulted for C4-C7; root
 *   cause was unified_cache_refill() incompatibility. Safety fix: limited to
 *   C0-C3 only (matching the existing dualhot pattern).
 *
 * Phase 17: FORCE_LIBC Gap Validation — Case B confirmed
 *   Same-binary A/B (HAKMEM_FORCE_LIBC_ALLOC=0/1) showed the allocator-logic
 *   difference is noise-level (+0.39%), while a 21K system binary was ~74%
 *   faster — I-cache misses -55%, cycles -43%, instructions -48%. Conclusion:
 *   the gap is binary layout / I-cache thrashing (653K vs 21K), not the
 *   allocator algorithm.
 *
 * Phase 18: Hot Text Isolation — design added
 *   v1: TU split + hot/cold attributes + optional gc-sections (target +2%).
 *   v2: BENCH_MINIMAL compile-out of stats/ENV/debug from the hot path
 *   (target +10-20%). Build gate: HOT_TEXT_ISOLATION=0/1 (Makefile knob).
 */
2025-12-15 05:25:47 +09:00
|
|
|
#include "front_fastlane_alloc_legacy_direct_env_box.h" // Phase 16 v1: LEGACY direct
|
|
|
|
|
#include "tiny_static_route_box.h" // tiny_static_route_ready_fast, tiny_static_route_get_kind_fast
|
|
|
|
|
#include "tiny_front_hot_box.h" // tiny_hot_alloc_fast
|
|
|
|
|
#include "tiny_front_cold_box.h" // tiny_cold_refill_and_alloc
|
|
|
|
|
#include "smallobject_policy_v7_box.h" // SMALL_ROUTE_LEGACY
|
2025-12-14 16:28:23 +09:00
|
|
|
|
|
|
|
|
// FastLane is only safe after global init completes.
|
|
|
|
|
// Before init, wrappers must handle recursion guards + syscall init.
|
|
|
|
|
extern int g_initialized;
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Hot Inline: try_malloc
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
// Patch 4: Actual Tiny routing implementation
|
|
|
|
|
// Strategy: Read existing winning boxes only once, call existing hot handlers
|
|
|
|
|
// No duplicate checks (deduplicate existing wrapper logic)
|
|
|
|
|
static inline void* front_fastlane_try_malloc(size_t size) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(malloc_total);
|
|
|
|
|
|
|
|
|
|
// Fail-fast: do not enter FastLane before init completes.
|
|
|
|
|
if (__builtin_expect(!g_initialized, 0)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(malloc_fallback_other);
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Fast path: Size check (Tiny range only)
|
|
|
|
|
// Use cached max size (typically 256 or 1024)
|
|
|
|
|
size_t max_size = tiny_get_max_size();
|
|
|
|
|
if (__builtin_expect(size > max_size, 0)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(malloc_fallback_size);
|
|
|
|
|
return NULL; // Not Tiny → fallback
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Class calculation (single LUT lookup, no branches)
|
|
|
|
|
int class_idx = hak_tiny_size_to_class(size);
|
|
|
|
|
if (__builtin_expect(class_idx < 0 || class_idx >= 8, 0)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(malloc_fallback_class);
|
|
|
|
|
return NULL; // Invalid class → fallback
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Class mask check (gradual rollout support)
|
|
|
|
|
uint8_t mask = front_fastlane_class_mask();
|
|
|
|
|
if (__builtin_expect(((mask >> class_idx) & 1) == 0, 0)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(malloc_fallback_other);
|
|
|
|
|
return NULL; // Class not enabled → fallback
|
|
|
|
|
}
|
|
|
|
|
|
Phase 16 v1 NEUTRAL, Phase 17 Case B confirmed, Phase 18 design added
## Phase 16 v1: Front FastLane Alloc LEGACY Direct — NEUTRAL (+0.62%)
Target: Reduce alloc-side fixed costs by adding LEGACY direct path to
FastLane entry, mirroring Phase 9/10 free-side winning pattern.
Result: +0.62% on Mixed (below +1.0% GO threshold) → NEUTRAL, freeze as
research box (default OFF).
Critical issue: Initial impl crashed (segfault) for C4-C7. Root cause:
unified_cache_refill() incompatibility. Safety fix: Limited to C0-C3
only (matching existing dualhot pattern).
Files:
- core/box/front_fastlane_alloc_legacy_direct_env_box.{h,c} (new)
- core/box/front_fastlane_box.h (LEGACY direct path, lines 93-119)
- core/bench_profile.h (env refresh sync)
- Makefile (new obj)
- docs/analysis/PHASE16_*.md (design/results/instructions)
ENV: HAKMEM_FRONT_FASTLANE_ALLOC_LEGACY_DIRECT=0 (default OFF, opt-in)
Verdict: Research box frozen. Phase 14-16 plateau confirms dispatch/
routing optimization ROI is exhausted post-Phase-6 FastLane collapse.
---
## Phase 17: FORCE_LIBC Gap Validation — Case B Confirmed
Purpose: Validate "system malloc faster" observation using same-binary
A/B testing to isolate allocator logic差 vs binary layout penalty.
Method:
- Same-binary toggle: HAKMEM_FORCE_LIBC_ALLOC=0/1 (bench_random_mixed_hakmem)
- System binary: bench_random_mixed_system (21K separate binary)
- Perf stat: Hardware counter analysis (I-cache, cycles, instructions)
Result: **Case B confirmed** — Allocator差 negligible, layout penalty dominates.
Gap breakdown (Mixed, 20M iters, ws=400):
- hakmem (FORCE_LIBC=0): 48.12M ops/s
- libc (FORCE_LIBC=1, same binary): 48.31M ops/s → +0.39% (noise level)
- system binary (21K): 83.85M ops/s → +73.57% vs libc, +74.26% vs hakmem
Perf stat (200M iters):
- I-cache misses: 153K (hakmem) → 68K (system) = -55% (smoking gun)
- Cycles: 17.9B → 10.2B = -43%
- Instructions: 41.3B → 21.5B = -48%
- Binary size: 653K → 21K (30x difference)
Root cause: Binary size (30x) causes I-cache thrashing. Code bloat >>
algorithmic efficiency.
Conclusion: Phase 12's "system malloc 1.6x faster" was real, but
misattributed. Gap is layout/I-cache, NOT allocator algorithm.
Files:
- docs/analysis/PHASE17_*.md (results/instructions)
- scripts/run_mixed_10_cleanenv.sh (Phase 9/10 defaults aligned)
Next: Phase 18 Hot Text Isolation (layout optimization, not algorithm opt)
---
## Phase 18: Hot Text Isolation — Design Added
Purpose: Reduce I-cache misses + instruction footprint via layout control
(binary optimization, not allocator algorithm changes).
Strategy (v1 → v2 progression):
v1 (TU split + hot/cold attrs + optional gc-sections):
- Target: +2% throughput (GO threshold, realistic for layout tweaks)
- Secondary: I-cache -10%, instructions -5% (direction confirmation)
- Risk: Low (reversible via build knob)
- Expected: +0-2% (NEUTRAL likely, but validates approach)
v2 (BENCH_MINIMAL compile-out):
- Target: +10-20% throughput (本命)
- Method: Conditional compilation removes stats/ENV/debug from hot path
- Expected: Instruction count -30-40% → significant I-cache improvement
Files:
- docs/analysis/PHASE18_*.md (design/instructions)
- CURRENT_TASK.md (Phase 17 complete, Phase 18 v1/v2 plan)
Build gate: HOT_TEXT_ISOLATION=0/1 (Makefile knob)
Next: Implement Phase 18 v1 (TU split first, BENCH_MINIMAL if v1 NEUTRAL)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-15 05:25:47 +09:00
|
|
|
// Phase 16 v1: LEGACY direct path (early-exit optimization)
|
|
|
|
|
// Try direct allocation for LEGACY routes only (skip route/policy overhead)
|
|
|
|
|
// TEMPORARY SAFETY: Limit to C0-C3 (match dualhot pattern) until refill issue debugged
|
|
|
|
|
if (__builtin_expect(front_fastlane_alloc_legacy_direct_enabled() && (unsigned)class_idx <= 3u, 0)) {
|
|
|
|
|
// Condition 1: Static route must be ready (Learner interlock check)
|
|
|
|
|
// Condition 2: Route must be LEGACY (断定可能な場合のみ)
|
|
|
|
|
if (tiny_static_route_ready_fast() &&
|
|
|
|
|
tiny_static_route_get_kind_fast(class_idx) == SMALL_ROUTE_LEGACY) {
|
|
|
|
|
|
|
|
|
|
// Hot path: Try UnifiedCache first
|
|
|
|
|
void* ptr = tiny_hot_alloc_fast(class_idx);
|
|
|
|
|
if (__builtin_expect(ptr != NULL, 1)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(malloc_hit);
|
|
|
|
|
return ptr; // Success (cache hit)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Cold path: Refill UnifiedCache and retry
|
|
|
|
|
ptr = tiny_cold_refill_and_alloc(class_idx);
|
|
|
|
|
if (__builtin_expect(ptr != NULL, 1)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(malloc_hit);
|
|
|
|
|
return ptr; // Success (after refill)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Fallback: Direct path failed → use existing route (safety)
|
|
|
|
|
// This handles edge cases (Learner transition, policy changes, etc.)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-14 16:28:23 +09:00
|
|
|
// Call existing hot handler (no duplication)
|
|
|
|
|
// This is the winning path from E5-4 / Phase 4 E2
|
|
|
|
|
void* ptr = malloc_tiny_fast_for_class(size, class_idx);
|
|
|
|
|
if (__builtin_expect(ptr != NULL, 1)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(malloc_hit);
|
|
|
|
|
return ptr; // Success
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Allocation failed (refill needed, TLS exhausted, etc.)
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(malloc_fallback_alloc);
|
|
|
|
|
return NULL; // Fallback to cold path
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Hot Inline: try_free
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
2025-12-14 17:38:21 +09:00
|
|
|
// Phase 6-2: Free DeDup optimization
|
|
|
|
|
// Strategy:
|
|
|
|
|
// - When dedup=1 and class_mask=0xFF: Direct call to free_tiny_fast() (no duplicate header check)
|
|
|
|
|
// - Otherwise: Existing header validation path (backward compatible)
|
2025-12-14 16:28:23 +09:00
|
|
|
static inline bool front_fastlane_try_free(void* ptr) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_total);
|
|
|
|
|
|
|
|
|
|
// Fail-fast: do not enter FastLane before init completes.
|
|
|
|
|
if (__builtin_expect(!g_initialized, 0)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_fallback_other);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_TINY_HEADER_CLASSIDX
|
2025-12-14 17:38:21 +09:00
|
|
|
// Phase 6-2: DeDup path (eliminate duplicate header validation)
|
|
|
|
|
// Conditions:
|
|
|
|
|
// 1. Free DeDup enabled (ENV=1)
|
|
|
|
|
// 2. All classes enabled (mask=0xFF, no gradual rollout)
|
|
|
|
|
if (__builtin_expect(front_fastlane_free_dedup_enabled() && front_fastlane_class_mask() == 0xFF, 1)) {
|
|
|
|
|
// Direct call to free_tiny_fast() (handles all validation internally)
|
|
|
|
|
// free_tiny_fast() is static inline in malloc_tiny_fast.h, no extern needed
|
|
|
|
|
int result = free_tiny_fast(ptr);
|
|
|
|
|
if (__builtin_expect(result, 1)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_hit);
|
|
|
|
|
return true; // Handled
|
|
|
|
|
}
|
|
|
|
|
// Not handled → fallback
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_fallback_failure);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Traditional path (backward compatible, for class mask filtering or dedup=0)
|
2025-12-14 16:28:23 +09:00
|
|
|
// Page boundary guard: ptr must not be page-aligned
|
|
|
|
|
// (Accessing ptr-1 when ptr is page-aligned could segfault)
|
|
|
|
|
uintptr_t off = (uintptr_t)ptr & 0xFFFu;
|
|
|
|
|
if (__builtin_expect(off == 0, 0)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_fallback_aligned);
|
|
|
|
|
return false; // Page-aligned → fallback (unsafe to read header)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Fast header validation (1 load, 1 compare)
|
|
|
|
|
uint8_t header = *((uint8_t*)ptr - 1);
|
|
|
|
|
uint8_t magic = header & 0xF0u;
|
|
|
|
|
|
|
|
|
|
if (__builtin_expect(magic != 0xA0u, 0)) {
|
|
|
|
|
// Not Tiny header (could be Mid/Pool/Large or external allocation)
|
|
|
|
|
if (magic != 0) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_fallback_header);
|
|
|
|
|
}
|
|
|
|
|
return false; // Not Tiny → fallback
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Extract class index from header (lower 4 bits)
|
|
|
|
|
int class_idx = (int)(header & 0x0Fu);
|
|
|
|
|
if (__builtin_expect(class_idx >= 8, 0)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_fallback_class);
|
|
|
|
|
return false; // Invalid class → fallback
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Class mask check (gradual rollout support)
|
|
|
|
|
uint8_t mask = front_fastlane_class_mask();
|
|
|
|
|
if (__builtin_expect(((mask >> class_idx) & 1) == 0, 0)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_fallback_other);
|
|
|
|
|
return false; // Class not enabled → fallback
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Call existing hot handler (no duplication)
|
|
|
|
|
// This is the winning path from E5-1 (free_tiny_fast returns 1 on success)
|
2025-12-14 17:38:21 +09:00
|
|
|
// free_tiny_fast() is static inline in malloc_tiny_fast.h, no extern needed
|
2025-12-14 16:28:23 +09:00
|
|
|
if (__builtin_expect(free_tiny_fast(ptr), 1)) {
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_hit);
|
|
|
|
|
return true; // Success
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Free failed (cold path needed - refill, full TLS, etc.)
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_fallback_failure);
|
|
|
|
|
return false; // Fallback to cold path
|
|
|
|
|
#else
|
|
|
|
|
// No header support → always fallback
|
|
|
|
|
FRONT_FASTLANE_STAT_INC(free_fallback_other);
|
|
|
|
|
return false;
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#endif // HAK_FRONT_FASTLANE_BOX_H
|