Files
hakmem/core/front/malloc_tiny_fast.h

425 lines
18 KiB
C
Raw Normal View History

2025-11-17 05:29:08 +09:00
// malloc_tiny_fast.h - Phase 26: Front Gate Unification (Tiny Fast Path)
//
// Goal: Eliminate 3-layer overhead (malloc → hak_alloc_at → wrapper → tiny_alloc_fast)
// Target: +10-15% performance (11.35M → 12.5-13.5M ops/s)
//
// Design (ChatGPT analysis):
// - Replace: malloc → hak_alloc_at (236 lines) → wrapper (diagnostics) → tiny_alloc_fast
// - With: malloc → malloc_tiny_fast (single-layer, direct to Unified Cache)
// - Preserves: Safety checks (lock depth, initializing, LD_SAFE, jemalloc block)
// - Leverages: Phase 23 Unified Cache (tcache-style, 2-3 cache misses)
//
// Performance:
// - Current overhead: malloc(8.97%) + routing + wrapper(3.63%) + tiny(5.37%) = 17.97%
// - BenchFast ceiling: 8-10 instructions (~1-2% overhead)
// - Gap: ~16%
// - Target: Close half the gap (+10-15% improvement)
//
// ENV Variables:
// HAKMEM_FRONT_GATE_UNIFIED=1 # Enable Front Gate Unification (default: 0, OFF)
#ifndef HAK_FRONT_MALLOC_TINY_FAST_H
#define HAK_FRONT_MALLOC_TINY_FAST_H
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdatomic.h>
#include <pthread.h> // For pthread_self() in cross-thread check
2025-11-17 05:29:08 +09:00
#include "../hakmem_build_flags.h"
#include "../hakmem_tiny_config.h" // For TINY_NUM_CLASSES
#include "../hakmem_super_registry.h" // For cross-thread owner check
#include "../superslab/superslab_inline.h" // For ss_fast_lookup, slab_index_for (Phase 12)
#include "../box/ss_slab_meta_box.h" // For ss_slab_meta_owner_tid_low_get
#include "../box/free_remote_box.h" // For tiny_free_remote_box
2025-11-17 05:29:08 +09:00
#include "tiny_unified_cache.h" // For unified_cache_pop_or_refill
#include "../tiny_region_id.h" // For tiny_region_id_write_header
#include "../hakmem_tiny.h" // For hak_tiny_size_to_class
Phase 4-Step2: Add Hot/Cold Path Box (+7.3% performance) Implemented Hot/Cold Path separation using Box pattern for Tiny allocations: Performance Improvement (without PGO): - Baseline (Phase 26-A): 53.3 M ops/s - Hot/Cold Box (Phase 4-Step2): 57.2 M ops/s - Gain: +7.3% (+3.9 M ops/s) Implementation: 1. core/box/tiny_front_hot_box.h - Ultra-fast hot path (1 branch) - Removed range check (caller guarantees valid class_idx) - Inline cache hit path with branch prediction hints - Debug metrics with zero overhead in Release builds 2. core/box/tiny_front_cold_box.h - Slow cold path (noinline, cold) - Refill logic (batch allocation from SuperSlab) - Drain logic (batch free to SuperSlab) - Error reporting and diagnostics 3. core/front/malloc_tiny_fast.h - Updated to use Hot/Cold Boxes - Hot path: tiny_hot_alloc_fast() (1 branch: cache empty check) - Cold path: tiny_cold_refill_and_alloc() (noinline, cold attribute) - Clear separation improves i-cache locality Branch Analysis: - Baseline: 4-5 branches in hot path (range check + cache check + refill logic mixed) - Hot/Cold Box: 1 branch in hot path (cache empty check only) - Reduction: 3-4 branches eliminated from hot path Design Principles (Box Pattern): ✅ Single Responsibility: Hot path = cache hit only, Cold path = refill/errors ✅ Clear Contract: Hot returns NULL on miss, Cold handles miss ✅ Observable: Debug metrics (TINY_HOT_METRICS_*) gated by NDEBUG ✅ Safe: Branch prediction hints (TINY_HOT_LIKELY/UNLIKELY) ✅ Testable: Isolated hot/cold paths, easy A/B testing PGO Status: - Temporarily disabled (build issues with __gcov_merge_time_profile) - Will re-enable PGO in future commit after resolving gcc/lto issues - Current benchmarks are without PGO (fair A/B comparison) Other Changes: - .gitignore: Added *.d files (dependency files, auto-generated) - Makefile: PGO targets temporarily disabled (show informational message) - build_pgo.sh: Temporarily disabled (show "PGO paused" message) Next: Phase 4-Step3 (Front Config Box, target +5-8%) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 11:58:37 +09:00
#include "../box/tiny_front_hot_box.h" // Phase 4-Step2: Hot Path Box
#include "../box/tiny_front_cold_box.h" // Phase 4-Step2: Cold Path Box
#include "../box/tiny_c7_hotbox.h" // Optional: C7 専用ホットボックス
#include "../box/tiny_heap_box.h" // TinyHeap 汎用 Box
#include "../box/tiny_hotheap_v2_box.h" // TinyHotHeap v2 (Phase31 A/B)
#include "../box/smallobject_hotbox_v3_box.h" // SmallObject HotHeap v3 skeleton
#include "../box/smallobject_hotbox_v4_box.h" // SmallObject HotHeap v4 (C7 stub)
#include "../box/tiny_c7_ultra_box.h" // C7 ULTRA stub (UF-1, delegates to v3)
#include "../box/tiny_front_v3_env_box.h" // Tiny front v3 snapshot gate
#include "../box/tiny_heap_env_box.h" // ENV gate for TinyHeap front (A/B)
#include "../box/tiny_route_env_box.h" // Route snapshot (Heap vs Legacy)
#include "../box/tiny_front_stats_box.h" // Front class distribution counters
2025-11-17 05:29:08 +09:00
// Helper: current thread id (low 32 bits) for owner check
#ifndef TINY_SELF_U32_LOCAL_DEFINED
#define TINY_SELF_U32_LOCAL_DEFINED
static inline uint32_t tiny_self_u32_local(void) {
return (uint32_t)(uintptr_t)pthread_self();
}
#endif
2025-11-17 05:29:08 +09:00
// ============================================================================
// ENV Control (cached, lazy init)
// ============================================================================
// Enable flag (default: 0, OFF)
static inline int front_gate_unified_enabled(void) {
static int g_enable = -1;
if (__builtin_expect(g_enable == -1, 0)) {
const char* e = getenv("HAKMEM_FRONT_GATE_UNIFIED");
Enable performance optimizations by default (+557% improvement) ## Performance Impact **Before** (optimizations OFF): - Random Mixed 256B: 9.4M ops/s - System malloc ratio: 10.6% (9.5x slower) **After** (optimizations ON): - Random Mixed 256B: 61.8M ops/s (+557%) - System malloc ratio: 70.0% (1.43x slower) ✅ - 3-run average: 60.1M - 62.8M ops/s (±2.2% variance) ## Changes Enabled 3 critical optimizations by default: ### 1. HAKMEM_SS_EMPTY_REUSE (hakmem_shared_pool.c:810) ```c // BEFORE: default OFF empty_reuse_enabled = (e && *e && *e != '0') ? 1 : 0; // AFTER: default ON empty_reuse_enabled = (e && *e && *e == '0') ? 0 : 1; ``` **Impact**: Reuse empty slabs before mmap, reduces syscall overhead ### 2. HAKMEM_TINY_UNIFIED_CACHE (tiny_unified_cache.h:69) ```c // BEFORE: default OFF g_enable = (e && *e && *e != '0') ? 1 : 0; // AFTER: default ON g_enable = (e && *e && *e == '0') ? 0 : 1; ``` **Impact**: Unified TLS cache improves hit rate ### 3. HAKMEM_FRONT_GATE_UNIFIED (malloc_tiny_fast.h:42) ```c // BEFORE: default OFF g_enable = (e && *e && *e != '0') ? 1 : 0; // AFTER: default ON g_enable = (e && *e && *e == '0') ? 0 : 1; ``` **Impact**: Unified front gate reduces dispatch overhead ## ENV Override Users can still disable optimizations if needed: ```bash export HAKMEM_SS_EMPTY_REUSE=0 # Disable empty slab reuse export HAKMEM_TINY_UNIFIED_CACHE=0 # Disable unified cache export HAKMEM_FRONT_GATE_UNIFIED=0 # Disable unified front gate ``` ## Comparison to Competitors ``` mimalloc: 113.34M ops/s (1.83x faster than HAKMEM) System malloc: 88.20M ops/s (1.43x faster than HAKMEM) HAKMEM: 61.80M ops/s ✅ Competitive performance ``` ## Files Modified - core/hakmem_shared_pool.c - EMPTY_REUSE default ON - core/front/tiny_unified_cache.h - UNIFIED_CACHE default ON - core/front/malloc_tiny_fast.h - FRONT_GATE_UNIFIED default ON 🚀 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-22 01:29:05 +09:00
g_enable = (e && *e && *e == '0') ? 0 : 1; // default ON
2025-11-17 05:29:08 +09:00
#if !HAKMEM_BUILD_RELEASE
if (g_enable) {
fprintf(stderr, "[FrontGate-INIT] front_gate_unified_enabled() = %d\n", g_enable);
fflush(stderr);
}
#endif
}
return g_enable;
}
// ============================================================================
Phase 4-Step2: Add Hot/Cold Path Box (+7.3% performance) Implemented Hot/Cold Path separation using Box pattern for Tiny allocations: Performance Improvement (without PGO): - Baseline (Phase 26-A): 53.3 M ops/s - Hot/Cold Box (Phase 4-Step2): 57.2 M ops/s - Gain: +7.3% (+3.9 M ops/s) Implementation: 1. core/box/tiny_front_hot_box.h - Ultra-fast hot path (1 branch) - Removed range check (caller guarantees valid class_idx) - Inline cache hit path with branch prediction hints - Debug metrics with zero overhead in Release builds 2. core/box/tiny_front_cold_box.h - Slow cold path (noinline, cold) - Refill logic (batch allocation from SuperSlab) - Drain logic (batch free to SuperSlab) - Error reporting and diagnostics 3. core/front/malloc_tiny_fast.h - Updated to use Hot/Cold Boxes - Hot path: tiny_hot_alloc_fast() (1 branch: cache empty check) - Cold path: tiny_cold_refill_and_alloc() (noinline, cold attribute) - Clear separation improves i-cache locality Branch Analysis: - Baseline: 4-5 branches in hot path (range check + cache check + refill logic mixed) - Hot/Cold Box: 1 branch in hot path (cache empty check only) - Reduction: 3-4 branches eliminated from hot path Design Principles (Box Pattern): ✅ Single Responsibility: Hot path = cache hit only, Cold path = refill/errors ✅ Clear Contract: Hot returns NULL on miss, Cold handles miss ✅ Observable: Debug metrics (TINY_HOT_METRICS_*) gated by NDEBUG ✅ Safe: Branch prediction hints (TINY_HOT_LIKELY/UNLIKELY) ✅ Testable: Isolated hot/cold paths, easy A/B testing PGO Status: - Temporarily disabled (build issues with __gcov_merge_time_profile) - Will re-enable PGO in future commit after resolving gcc/lto issues - Current benchmarks are without PGO (fair A/B comparison) Other Changes: - .gitignore: Added *.d files (dependency files, auto-generated) - Makefile: PGO targets temporarily disabled (show informational message) - build_pgo.sh: Temporarily disabled (show "PGO paused" message) Next: Phase 4-Step3 (Front Config Box, target +5-8%) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 11:58:37 +09:00
// Phase 4-Step2: malloc_tiny_fast() - Hot/Cold Path Box (ACTIVE)
2025-11-17 05:29:08 +09:00
// ============================================================================
Phase 4-Step2: Add Hot/Cold Path Box (+7.3% performance) Implemented Hot/Cold Path separation using Box pattern for Tiny allocations: Performance Improvement (without PGO): - Baseline (Phase 26-A): 53.3 M ops/s - Hot/Cold Box (Phase 4-Step2): 57.2 M ops/s - Gain: +7.3% (+3.9 M ops/s) Implementation: 1. core/box/tiny_front_hot_box.h - Ultra-fast hot path (1 branch) - Removed range check (caller guarantees valid class_idx) - Inline cache hit path with branch prediction hints - Debug metrics with zero overhead in Release builds 2. core/box/tiny_front_cold_box.h - Slow cold path (noinline, cold) - Refill logic (batch allocation from SuperSlab) - Drain logic (batch free to SuperSlab) - Error reporting and diagnostics 3. core/front/malloc_tiny_fast.h - Updated to use Hot/Cold Boxes - Hot path: tiny_hot_alloc_fast() (1 branch: cache empty check) - Cold path: tiny_cold_refill_and_alloc() (noinline, cold attribute) - Clear separation improves i-cache locality Branch Analysis: - Baseline: 4-5 branches in hot path (range check + cache check + refill logic mixed) - Hot/Cold Box: 1 branch in hot path (cache empty check only) - Reduction: 3-4 branches eliminated from hot path Design Principles (Box Pattern): ✅ Single Responsibility: Hot path = cache hit only, Cold path = refill/errors ✅ Clear Contract: Hot returns NULL on miss, Cold handles miss ✅ Observable: Debug metrics (TINY_HOT_METRICS_*) gated by NDEBUG ✅ Safe: Branch prediction hints (TINY_HOT_LIKELY/UNLIKELY) ✅ Testable: Isolated hot/cold paths, easy A/B testing PGO Status: - Temporarily disabled (build issues with __gcov_merge_time_profile) - Will re-enable PGO in future commit after resolving gcc/lto issues - Current benchmarks are without PGO (fair A/B comparison) Other Changes: - .gitignore: Added *.d files (dependency files, auto-generated) - Makefile: PGO targets temporarily disabled (show informational message) - build_pgo.sh: Temporarily disabled (show "PGO paused" message) Next: Phase 4-Step3 (Front Config Box, target +5-8%) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 11:58:37 +09:00
// Ultra-thin Tiny allocation using Hot/Cold Path Box (Phase 4-Step2)
//
// IMPROVEMENTS over Phase 26-A:
// - Branch reduction: Hot path has only 1 branch (cache empty check)
// - Branch hints: TINY_HOT_LIKELY/UNLIKELY for better CPU prediction
// - Hot/Cold separation: Keeps hot path small (better i-cache locality)
// - Explicit fallback: Clear hot→cold transition
//
// PERFORMANCE:
// - Baseline (Phase 26-A, no PGO): 53.3 M ops/s
// - Hot/Cold Box (no PGO): 57.2 M ops/s (+7.3%)
//
// DESIGN:
// 1. size → class_idx (same as Phase 26-A)
// 2. Hot path: tiny_hot_alloc_fast() - cache hit (1 branch)
// 3. Cold path: tiny_cold_refill_and_alloc() - cache miss (noinline, cold)
//
2025-11-17 05:29:08 +09:00
// Preconditions:
// - Called AFTER malloc() safety checks (lock depth, initializing, LD_SAFE)
// - size <= tiny_get_max_size() (caller verified)
// Returns:
// - USER pointer on success
Phase 4-Step2: Add Hot/Cold Path Box (+7.3% performance) Implemented Hot/Cold Path separation using Box pattern for Tiny allocations: Performance Improvement (without PGO): - Baseline (Phase 26-A): 53.3 M ops/s - Hot/Cold Box (Phase 4-Step2): 57.2 M ops/s - Gain: +7.3% (+3.9 M ops/s) Implementation: 1. core/box/tiny_front_hot_box.h - Ultra-fast hot path (1 branch) - Removed range check (caller guarantees valid class_idx) - Inline cache hit path with branch prediction hints - Debug metrics with zero overhead in Release builds 2. core/box/tiny_front_cold_box.h - Slow cold path (noinline, cold) - Refill logic (batch allocation from SuperSlab) - Drain logic (batch free to SuperSlab) - Error reporting and diagnostics 3. core/front/malloc_tiny_fast.h - Updated to use Hot/Cold Boxes - Hot path: tiny_hot_alloc_fast() (1 branch: cache empty check) - Cold path: tiny_cold_refill_and_alloc() (noinline, cold attribute) - Clear separation improves i-cache locality Branch Analysis: - Baseline: 4-5 branches in hot path (range check + cache check + refill logic mixed) - Hot/Cold Box: 1 branch in hot path (cache empty check only) - Reduction: 3-4 branches eliminated from hot path Design Principles (Box Pattern): ✅ Single Responsibility: Hot path = cache hit only, Cold path = refill/errors ✅ Clear Contract: Hot returns NULL on miss, Cold handles miss ✅ Observable: Debug metrics (TINY_HOT_METRICS_*) gated by NDEBUG ✅ Safe: Branch prediction hints (TINY_HOT_LIKELY/UNLIKELY) ✅ Testable: Isolated hot/cold paths, easy A/B testing PGO Status: - Temporarily disabled (build issues with __gcov_merge_time_profile) - Will re-enable PGO in future commit after resolving gcc/lto issues - Current benchmarks are without PGO (fair A/B comparison) Other Changes: - .gitignore: Added *.d files (dependency files, auto-generated) - Makefile: PGO targets temporarily disabled (show informational message) - build_pgo.sh: Temporarily disabled (show "PGO paused" message) Next: Phase 4-Step3 (Front Config Box, target +5-8%) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 11:58:37 +09:00
// - NULL on failure (caller falls back to normal path)
//
2025-11-17 05:29:08 +09:00
__attribute__((always_inline))
static inline void* malloc_tiny_fast(size_t size) {
const int front_v3_on = tiny_front_v3_enabled();
const TinyFrontV3Snapshot* front_snap =
__builtin_expect(front_v3_on, 0) ? tiny_front_v3_snapshot_get() : NULL;
const bool route_fast_on = front_v3_on && tiny_front_v3_lut_enabled() &&
tiny_front_v3_route_fast_enabled();
int class_idx = -1;
tiny_route_kind_t route = TINY_ROUTE_LEGACY;
bool route_trusted = false;
if (front_v3_on && tiny_front_v3_lut_enabled()) {
const TinyFrontV3SizeClassEntry* e = tiny_front_v3_lut_lookup(size);
if (e && e->class_idx != TINY_FRONT_V3_INVALID_CLASS) {
class_idx = (int)e->class_idx;
route = (tiny_route_kind_t)e->route_kind;
route_trusted = route_fast_on;
}
}
if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
class_idx = hak_tiny_size_to_class(size);
if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
return NULL;
}
route = tiny_route_for_class((uint8_t)class_idx);
route_trusted = false;
} else if (!route_trusted &&
route != TINY_ROUTE_LEGACY && route != TINY_ROUTE_HEAP &&
route != TINY_ROUTE_HOTHEAP_V2 && route != TINY_ROUTE_SMALL_HEAP_V3 &&
route != TINY_ROUTE_SMALL_HEAP_V4) {
route = tiny_route_for_class((uint8_t)class_idx);
}
tiny_front_alloc_stat_inc(class_idx);
2025-11-17 05:29:08 +09:00
// C7 ULTRA stub (UF-1): delegates to v3, ENV gated
if (class_idx == 7 &&
tiny_front_v3_enabled() &&
tiny_front_v3_c7_ultra_enabled() &&
small_heap_v3_c7_enabled()) {
void* ultra_p = tiny_c7_ultra_alloc(size);
if (ultra_p) {
return ultra_p;
}
// fallback to existing route on miss
}
switch (route) {
case TINY_ROUTE_SMALL_HEAP_V3: {
void* v3p = so_alloc((uint32_t)class_idx);
if (TINY_HOT_LIKELY(v3p != NULL)) {
return v3p;
}
so_v3_record_alloc_fallback((uint8_t)class_idx);
// fallthrough to v2/v1
__attribute__((fallthrough));
}
case TINY_ROUTE_SMALL_HEAP_V4: {
void* v4p = small_heap_alloc_fast_v4(small_heap_ctx_v4_get(), class_idx);
if (TINY_HOT_LIKELY(v4p != NULL)) {
return v4p;
}
so_v3_record_alloc_fallback((uint8_t)class_idx);
// fallthrough to v2/v1
__attribute__((fallthrough));
}
case TINY_ROUTE_HOTHEAP_V2: {
void* v2p = tiny_hotheap_v2_alloc((uint8_t)class_idx);
if (TINY_HOT_LIKELY(v2p != NULL)) {
return v2p;
}
tiny_hotheap_v2_record_route_fallback((uint8_t)class_idx);
// fallthrough to TinyHeap v1
__attribute__((fallthrough));
}
case TINY_ROUTE_HEAP: {
void* heap_ptr = NULL;
if (class_idx == 7) {
heap_ptr = tiny_c7_alloc_fast(size);
} else {
heap_ptr = tiny_heap_alloc_class_fast(tiny_heap_ctx_for_thread(), class_idx, size);
}
if (heap_ptr) {
return heap_ptr;
}
break;
}
case TINY_ROUTE_LEGACY:
default:
break;
}
// Legacy Tiny front
void* ptr = NULL;
if (!front_snap || front_snap->unified_cache_on) {
ptr = tiny_hot_alloc_fast(class_idx);
}
Phase 4-Step2: Add Hot/Cold Path Box (+7.3% performance) Implemented Hot/Cold Path separation using Box pattern for Tiny allocations: Performance Improvement (without PGO): - Baseline (Phase 26-A): 53.3 M ops/s - Hot/Cold Box (Phase 4-Step2): 57.2 M ops/s - Gain: +7.3% (+3.9 M ops/s) Implementation: 1. core/box/tiny_front_hot_box.h - Ultra-fast hot path (1 branch) - Removed range check (caller guarantees valid class_idx) - Inline cache hit path with branch prediction hints - Debug metrics with zero overhead in Release builds 2. core/box/tiny_front_cold_box.h - Slow cold path (noinline, cold) - Refill logic (batch allocation from SuperSlab) - Drain logic (batch free to SuperSlab) - Error reporting and diagnostics 3. core/front/malloc_tiny_fast.h - Updated to use Hot/Cold Boxes - Hot path: tiny_hot_alloc_fast() (1 branch: cache empty check) - Cold path: tiny_cold_refill_and_alloc() (noinline, cold attribute) - Clear separation improves i-cache locality Branch Analysis: - Baseline: 4-5 branches in hot path (range check + cache check + refill logic mixed) - Hot/Cold Box: 1 branch in hot path (cache empty check only) - Reduction: 3-4 branches eliminated from hot path Design Principles (Box Pattern): ✅ Single Responsibility: Hot path = cache hit only, Cold path = refill/errors ✅ Clear Contract: Hot returns NULL on miss, Cold handles miss ✅ Observable: Debug metrics (TINY_HOT_METRICS_*) gated by NDEBUG ✅ Safe: Branch prediction hints (TINY_HOT_LIKELY/UNLIKELY) ✅ Testable: Isolated hot/cold paths, easy A/B testing PGO Status: - Temporarily disabled (build issues with __gcov_merge_time_profile) - Will re-enable PGO in future commit after resolving gcc/lto issues - Current benchmarks are without PGO (fair A/B comparison) Other Changes: - .gitignore: Added *.d files (dependency files, auto-generated) - Makefile: PGO targets temporarily disabled (show informational message) - build_pgo.sh: Temporarily disabled (show "PGO paused" message) Next: Phase 4-Step3 (Front Config Box, target +5-8%) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 11:58:37 +09:00
if (TINY_HOT_LIKELY(ptr != NULL)) {
return ptr;
2025-11-17 05:29:08 +09:00
}
Phase 4-Step2: Add Hot/Cold Path Box (+7.3% performance) Implemented Hot/Cold Path separation using Box pattern for Tiny allocations: Performance Improvement (without PGO): - Baseline (Phase 26-A): 53.3 M ops/s - Hot/Cold Box (Phase 4-Step2): 57.2 M ops/s - Gain: +7.3% (+3.9 M ops/s) Implementation: 1. core/box/tiny_front_hot_box.h - Ultra-fast hot path (1 branch) - Removed range check (caller guarantees valid class_idx) - Inline cache hit path with branch prediction hints - Debug metrics with zero overhead in Release builds 2. core/box/tiny_front_cold_box.h - Slow cold path (noinline, cold) - Refill logic (batch allocation from SuperSlab) - Drain logic (batch free to SuperSlab) - Error reporting and diagnostics 3. core/front/malloc_tiny_fast.h - Updated to use Hot/Cold Boxes - Hot path: tiny_hot_alloc_fast() (1 branch: cache empty check) - Cold path: tiny_cold_refill_and_alloc() (noinline, cold attribute) - Clear separation improves i-cache locality Branch Analysis: - Baseline: 4-5 branches in hot path (range check + cache check + refill logic mixed) - Hot/Cold Box: 1 branch in hot path (cache empty check only) - Reduction: 3-4 branches eliminated from hot path Design Principles (Box Pattern): ✅ Single Responsibility: Hot path = cache hit only, Cold path = refill/errors ✅ Clear Contract: Hot returns NULL on miss, Cold handles miss ✅ Observable: Debug metrics (TINY_HOT_METRICS_*) gated by NDEBUG ✅ Safe: Branch prediction hints (TINY_HOT_LIKELY/UNLIKELY) ✅ Testable: Isolated hot/cold paths, easy A/B testing PGO Status: - Temporarily disabled (build issues with __gcov_merge_time_profile) - Will re-enable PGO in future commit after resolving gcc/lto issues - Current benchmarks are without PGO (fair A/B comparison) Other Changes: - .gitignore: Added *.d files (dependency files, auto-generated) - Makefile: PGO targets temporarily disabled (show informational message) - build_pgo.sh: Temporarily disabled (show "PGO paused" message) Next: Phase 4-Step3 (Front Config Box, target +5-8%) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 11:58:37 +09:00
return tiny_cold_refill_and_alloc(class_idx);
2025-11-17 05:29:08 +09:00
}
// ============================================================================
// Phase 26-B: free_tiny_fast() - Ultra-thin Tiny deallocation
// ============================================================================
// Single-layer Tiny deallocation (bypasses hak_free_at + wrapper + diagnostics)
// Preconditions:
// - ptr is from malloc_tiny_fast() (has valid header)
// - Front Gate Unified is enabled
// Returns:
// - 1 on success (pushed to Unified Cache)
// - 0 on failure (caller falls back to normal free path)
__attribute__((always_inline))
static inline int free_tiny_fast(void* ptr) {
if (__builtin_expect(!ptr, 0)) return 0;
#if HAKMEM_TINY_HEADER_CLASSIDX
2025-11-17 05:29:08 +09:00
// 1. ページ境界ガード:
// ptr がページ先頭 (offset==0) の場合、ptr-1 は別ページか未マップ領域になる可能性がある。
// その場合はヘッダ読みを行わず、通常 free 経路にフォールバックする。
uintptr_t off = (uintptr_t)ptr & 0xFFFu;
if (__builtin_expect(off == 0, 0)) {
return 0;
}
// 2. Fast header magic validation (必須)
// Release ビルドでは tiny_region_id_read_header() が magic を省略するため、
// ここで自前に Tiny 専用ヘッダ (0xA0) を検証しておく。
uint8_t* header_ptr = (uint8_t*)ptr - 1;
uint8_t header = *header_ptr;
uint8_t magic = header & 0xF0u;
if (__builtin_expect(magic != HEADER_MAGIC, 0)) {
// Tiny ヘッダではない → Mid/Large/外部ポインタなので通常 free 経路へ
return 0;
}
// 3. class_idx 抽出下位4bit
int class_idx = (int)(header & HEADER_CLASS_MASK);
if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
return 0;
}
// 4. BASE を計算して Unified Cache に push
void* base = (void*)((char*)ptr - 1);
tiny_front_free_stat_inc(class_idx);
// C7 ULTRA stub (UF-1): delegates to v3, ENV gated
if (class_idx == 7 &&
tiny_front_v3_enabled() &&
tiny_front_v3_c7_ultra_enabled() &&
small_heap_v3_c7_enabled()) {
tiny_c7_ultra_free(base);
return 1;
}
// C7 v3 fast classify: bypass classify_ptr/ss_map_lookup for clear hits
if (class_idx == 7 &&
tiny_front_v3_enabled() &&
tiny_ptr_fast_classify_enabled() &&
small_heap_v3_c7_enabled() &&
smallobject_hotbox_v3_can_own_c7(base)) {
so_free(7, base);
return 1;
}
tiny_route_kind_t route = tiny_route_for_class((uint8_t)class_idx);
if ((class_idx == 7 || class_idx == 6) &&
route == TINY_ROUTE_SMALL_HEAP_V4 &&
tiny_ptr_fast_classify_v4_enabled() &&
smallobject_hotbox_v4_can_own(class_idx, base)) {
small_heap_free_fast_v4(small_heap_ctx_v4_get(), class_idx, base);
return 1;
}
const int use_tiny_heap = tiny_route_is_heap_kind(route);
const TinyFrontV3Snapshot* front_snap =
__builtin_expect(tiny_front_v3_enabled(), 0) ? tiny_front_v3_snapshot_get() : NULL;
// TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast.
// In Release builds, we trust header magic (0xA0) as sufficient validation.
#if !HAKMEM_BUILD_RELEASE
// 5. Superslab 登録確認(誤分類防止)
SuperSlab* ss_guard = hak_super_lookup(ptr);
if (__builtin_expect(!(ss_guard && ss_guard->magic == SUPERSLAB_MAGIC), 0)) {
return 0; // hakmem 管理外 → 通常 free 経路へ
}
#endif // !HAKMEM_BUILD_RELEASE
// Cross-thread free detection (Larson MT crash fix, ENV gated) + TinyHeap free path
{
static __thread int g_larson_fix = -1;
if (__builtin_expect(g_larson_fix == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
fflush(stderr);
#endif
}
if (__builtin_expect(g_larson_fix || use_tiny_heap, 0)) {
// Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
SuperSlab* ss = ss_fast_lookup(base);
if (ss) {
int slab_idx = slab_index_for(ss, base);
if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
uint32_t self_tid = tiny_self_u32_local();
uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
TinySlabMeta* meta = &ss->slabs[slab_idx];
// LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes)
uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
#if !HAKMEM_BUILD_RELEASE
static _Atomic uint64_t g_owner_check_count = 0;
uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
if (oc < 10) {
fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
fflush(stderr);
}
#endif
if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
// Cross-thread free → route to remote queue instead of poisoning TLS cache
#if !HAKMEM_BUILD_RELEASE
static _Atomic uint64_t g_cross_thread_count = 0;
uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
if (ct < 20) {
fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
ptr, owner_tid_low, self_tid_cmp, self_tid);
fflush(stderr);
}
#endif
if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
return 1; // handled via remote queue
}
return 0; // remote push failed; fall back to normal path
}
// Same-thread + TinyHeap route → route-based free
if (__builtin_expect(use_tiny_heap, 0)) {
switch (route) {
case TINY_ROUTE_SMALL_HEAP_V4:
if (class_idx == 7 || class_idx == 6 || class_idx == 5) {
small_heap_free_fast_v4(small_heap_ctx_v4_get(), class_idx, base);
return 1;
}
__attribute__((fallthrough));
case TINY_ROUTE_SMALL_HEAP_V3:
so_free((uint32_t)class_idx, base);
return 1;
case TINY_ROUTE_HOTHEAP_V2:
tiny_hotheap_v2_free((uint8_t)class_idx, base, meta);
return 1;
case TINY_ROUTE_HEAP: {
tiny_heap_ctx_t* ctx = tiny_heap_ctx_for_thread();
if (class_idx == 7) {
tiny_c7_free_fast_with_meta(ss, slab_idx, base);
} else {
tiny_heap_free_class_fast_with_meta(ctx, class_idx, ss, slab_idx, base);
}
return 1;
}
default:
break;
}
}
}
}
if (use_tiny_heap) {
// fallback: lookup failed but TinyHeap front is ON → use generic TinyHeap free
if (route == TINY_ROUTE_HOTHEAP_V2) {
tiny_hotheap_v2_record_free_fallback((uint8_t)class_idx);
} else if (route == TINY_ROUTE_SMALL_HEAP_V3 || route == TINY_ROUTE_SMALL_HEAP_V4) {
so_v3_record_free_fallback((uint8_t)class_idx);
}
tiny_heap_free_class_fast(tiny_heap_ctx_for_thread(), class_idx, ptr);
return 1;
}
}
}
// Debug: Log free operations (first 5000, all classes)
#if !HAKMEM_BUILD_RELEASE
{
extern _Atomic uint64_t g_debug_op_count;
extern __thread TinyTLSSLL g_tls_sll[];
uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
// Note: Shares g_debug_op_count with alloc logging, so bump the window.
if (op < 5000) {
fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p from=free_tiny_fast tls_count_before=%u\n",
(unsigned long)op, class_idx, ptr, base,
g_tls_sll[class_idx].count);
fflush(stderr);
}
}
#endif
int pushed = 0;
if (!front_snap || front_snap->unified_cache_on) {
pushed = unified_cache_push(class_idx, HAK_BASE_FROM_RAW(base));
}
2025-11-17 05:29:08 +09:00
if (__builtin_expect(pushed, 1)) {
return 1; // Success
}
// Unified Cache full → 通常 free 経路へ
return 0;
#else
// No header mode - fall back to normal free
return 0;
#endif
}
#endif // HAK_FRONT_MALLOC_TINY_FAST_H