hakmem/core/tiny_free_fast_v2.inc.h
Moe Charm (CI) 8052e8b320 Phase 24-26: Hot path atomic telemetry prune (+2.00% cumulative)
Summary:
- Phase 24 (alloc stats): +0.93% GO
- Phase 25 (free stats): +1.07% GO
- Phase 26 (diagnostics): -0.33% NEUTRAL (code cleanliness)
- Total: 11 atomics compiled-out, +2.00% improvement

Phase 24: OBSERVE tax prune (tiny_class_stats_box.h)
- Added HAKMEM_TINY_CLASS_STATS_COMPILED (default: 0)
- Wrapped 5 stats functions: uc_miss, warm_hit, shared_lock, tls_carve_*
- Result: +0.93% (baseline 56.675M vs compiled-in 56.151M ops/s)

Phase 25: Tiny free stats prune (tiny_superslab_free.inc.h)
- Added HAKMEM_TINY_FREE_STATS_COMPILED (default: 0)
- Wrapped g_free_ss_enter atomic in free hot path
- Result: +1.07% (baseline 57.017M vs compiled-in 56.415M ops/s)

Phase 26: Hot path diagnostic atomics prune
- Added 5 compile gates for low-frequency error counters:
  - HAKMEM_TINY_C7_FREE_COUNT_COMPILED
  - HAKMEM_TINY_HDR_MISMATCH_LOG_COMPILED
  - HAKMEM_TINY_HDR_META_MISMATCH_COMPILED
  - HAKMEM_TINY_METRIC_BAD_CLASS_COMPILED
  - HAKMEM_TINY_HDR_META_FAST_COMPILED
- Result: -0.33% NEUTRAL (within noise, kept for cleanliness)

Alignment with mimalloc principles:
- "No atomics on hot path" - telemetry moved to compile-time opt-in
- Fixed per-op tax elimination
- Production builds: maximum performance (atomics compiled-out)
- Research builds: full diagnostics (COMPILED=1)
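The pattern shared by all three phases is a compile-time opt-in gate. A minimal
sketch, assuming the Phase 25 gate and counter names from above (the
TINY_FREE_STAT_ENTER macro name is illustrative, not from the patch):

  #include <stdatomic.h>
  #include <stdint.h>

  #ifndef HAKMEM_TINY_FREE_STATS_COMPILED
  #define HAKMEM_TINY_FREE_STATS_COMPILED 0   /* default: compiled out */
  #endif

  #if HAKMEM_TINY_FREE_STATS_COMPILED
  static _Atomic uint64_t g_free_ss_enter;
  /* Research builds: one relaxed atomic increment per free */
  #define TINY_FREE_STAT_ENTER() \
      atomic_fetch_add_explicit(&g_free_ss_enter, 1, memory_order_relaxed)
  #else
  /* Production builds: expands to nothing - zero hot-path cost */
  #define TINY_FREE_STAT_ENTER() ((void)0)
  #endif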

Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-16 05:35:11 +09:00


// tiny_free_fast_v2.inc.h - Phase 7: Ultra-Fast Free Path (Header-based)
// Purpose: Eliminate SuperSlab lookup bottleneck (52.63% CPU → <5%)
// Design: Read class_idx from inline header (O(1), 2-3 cycles)
// Performance: 1.2M → 40-60M ops/s (30-50x improvement)
//
// Key Innovation: Smart Headers
// - 1-byte header before each block stores class_idx
// - Slab[0]: 0% overhead (reuses 960B wasted padding)
// - Other slabs: ~1.5% overhead (1 byte per block)
// - Total: <2% memory overhead for 30-50x speed gain
//
// Flow (3-5 instructions, 5-10 cycles):
// 1. Read class_idx from header (ptr-1) [1 instruction, 2-3 cycles]
// 2. Push to TLS freelist [2-3 instructions, 3-5 cycles]
// 3. Done! (No lookup, no validation, no atomic)
#pragma once
#include <stdlib.h>    // For getenv() in cross-thread check ENV gate
#include <pthread.h>   // For pthread_self() in cross-thread check
#include <stdio.h>     // For fprintf()/fileno() in diagnostics
#include <stdint.h>    // For uint8_t/uint32_t/uint64_t
#include <stdatomic.h> // For _Atomic counters and atomic_* helpers
#include <assert.h>    // For assert() on out-of-bounds class_idx
#include <execinfo.h>  // For backtrace()/backtrace_symbols_fd() in debug paths
#include "tiny_region_id.h"
#include "hakmem_build_flags.h"
#include "hakmem_tiny_config.h" // For TINY_TLS_MAG_CAP, TINY_NUM_CLASSES
#include "box/tls_sll_box.h" // Box TLS-SLL API
#include "box/tls_sll_drain_box.h" // Box TLS-SLL Drain (Option B)
#include "hakmem_tiny_integrity.h" // PRIORITY 1-4: Corruption detection
#include "hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
// Ring Cache and Unified Cache removed (A/B test: OFF is faster)
#include "hakmem_super_registry.h" // For hak_super_lookup (cross-thread check)
#include "superslab/superslab_inline.h" // For slab_index_for (cross-thread check)
#include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary
#include "box/free_remote_box.h" // For tiny_free_remote_box (cross-thread routing)
#include "box/ptr_conversion_box.h" // Phase 10: Correct pointer arithmetic
// Phase 7: Header-based ultra-fast free
#if HAKMEM_TINY_HEADER_CLASSIDX
// External TLS variables (defined in hakmem_tiny.c)
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
extern int g_tls_sll_enable; // Honored for fast free: when 0, fall back to slow path
// External functions
extern void hak_tiny_free(void* ptr); // Fallback for non-header allocations
// Inline helper: Get current thread ID (lower 32 bits)
#ifndef TINY_SELF_U32_LOCAL_DEFINED
#define TINY_SELF_U32_LOCAL_DEFINED
static inline uint32_t tiny_self_u32_local(void) {
    return (uint32_t)(uintptr_t)pthread_self();
}
#endif
// ========== Ultra-Fast Free (Header-based) ==========
// Ultra-fast free for header-based allocations
// Returns: 1 if handled, 0 if needs slow path
//
// Performance: 3-5 instructions, 5-10 cycles
// vs. current slow path: 330+ lines, 500+ cycles (this path is ~100x faster)
//
// Assembly (x86-64, release build):
// movzbl -0x1(%rdi),%eax // Read header (class_idx)
// mov g_tls_sll_head(,%rax,8),%rdx // Load head
// mov %rdx,(%rdi) // ptr->next = head
// mov %rdi,g_tls_sll_head(,%rax,8) // head = ptr
// addl $0x1,g_tls_sll_count(,%rax,4) // count++
// ret
//
// Expected: 3-5 instructions, 5-10 cycles (L1 hit)
static inline int hak_tiny_free_fast_v2(void* ptr) {
    if (__builtin_expect(!ptr, 0)) return 0;
    // Respect global SLL toggle: when disabled, do not use the TLS SLL fast path.
    if (__builtin_expect(!g_tls_sll_enable, 0)) {
        return 0; // Force slow path
    }
    // Phase E3-1: Remove registry lookup (50-100 cycles overhead)
    // Reason: Phase E1 added headers to C7, making this check redundant
    // Header magic validation (2-3 cycles) is now sufficient for all classes
    // Expected: 9M → 30-50M ops/s recovery (+226-443%)
    // CRITICAL: Check that the header is accessible before reading it.
    // FIX: Use ptr directly, not ptr-1, for validation if possible, or trust lookup
    // void* header_addr = (char*)ptr - 1; // <-- Dangerous for C0
#if !HAKMEM_BUILD_RELEASE
    // Debug: Validate header accessibility (metadata-based check)
    // Phase 9: mincore() REMOVED - no syscall overhead (0 cycles)
    // Strategy: Trust internal metadata (registry ensures memory is valid)
    // Benefit: Catch invalid pointers via header magic validation below
    extern int hak_is_memory_readable(void* addr);
    if (!hak_is_memory_readable(ptr)) { // Check ptr, not header_addr
        return 0; // Header not accessible - not a Tiny allocation
    }
#else
    // Release: Phase 9 optimization - mincore() completely removed
    // OLD: Page boundary check + mincore() syscall (~634 cycles)
    // NEW: No check needed - trust internal metadata (0 cycles)
    // Safety: Header magic validation below catches invalid pointers
    // Performance: 841 syscalls → 0 (100% elimination)
    // (Page boundary check removed - adds 1-2 cycles without benefit)
#endif
    // 1. Read class_idx from header (2-3 cycles, L1 hit)
    // Note: In release mode, tiny_region_id_read_header() skips magic validation (saves 2-3 cycles)
#if HAKMEM_DEBUG_VERBOSE
    static _Atomic int debug_calls = 0;
    if (atomic_fetch_add(&debug_calls, 1) < 5) {
        fprintf(stderr, "[TINY_FREE_V2] Before read_header, ptr=%p\n", ptr);
    }
#endif
    // P2.1: Use class_map instead of Header to avoid Header/Next contention
    // ENV: HAKMEM_TINY_NO_CLASS_MAP=1 to disable (default: ON - class_map is preferred)
    // Priority-2: Use cached ENV (eliminate lazy-init TLS syscall overhead)
    int class_idx = -1;
    {
        // P2.1: Default is ON (use class_map); HAK_ENV returns the inverted flag
        int g_use_class_map = !HAK_ENV_TINY_NO_CLASS_MAP();
        if (__builtin_expect(g_use_class_map, 1)) {
            // P1.2: class_map path - avoid Header read
            // FIX: Use ptr (USER) for lookup, NOT ptr-1
            SuperSlab* ss = ss_fast_lookup(ptr);
            if (ss && ss->magic == SUPERSLAB_MAGIC) {
                // FIX: Use ptr (USER) for slab index
                int slab_idx = slab_index_for(ss, ptr);
                if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
                    int map_class = tiny_get_class_from_ss(ss, slab_idx);
                    if (map_class < TINY_NUM_CLASSES) {
                        class_idx = map_class;
#if HAKMEM_DEBUG_VERBOSE
                        if (atomic_load(&debug_calls) <= 5) {
                            fprintf(stderr, "[TINY_FREE_V2] class_map lookup: class_idx=%d\n", class_idx);
                        }
#endif
                    }
                }
            }
            // Fallback to Header if class_map lookup failed
            if (class_idx < 0) {
                class_idx = tiny_region_id_read_header(ptr);
#if HAKMEM_DEBUG_VERBOSE
                if (atomic_load(&debug_calls) <= 5) {
                    fprintf(stderr, "[TINY_FREE_V2] class_map failed, Header fallback: class_idx=%d\n", class_idx);
                }
#endif
            }
        } else {
            // P2.1: Fallback to Header read (class_map disabled)
            class_idx = tiny_region_id_read_header(ptr);
#if HAKMEM_DEBUG_VERBOSE
            if (atomic_load(&debug_calls) <= 5) {
                fprintf(stderr, "[TINY_FREE_V2] Header read: class_idx=%d\n", class_idx);
            }
#endif
        }
    }
#if HAKMEM_DEBUG_VERBOSE
    if (atomic_load(&debug_calls) <= 5) {
        fprintf(stderr, "[TINY_FREE_V2] After read_header, class_idx=%d\n", class_idx);
    }
#endif
    // TWO-SPEED: Header/meta cross-check is DEBUG-ONLY to keep the HOT PATH fast.
    // In Release builds, we trust the header-based classification.
#if !HAKMEM_BUILD_RELEASE
    // Cross-check header class vs meta class (if available from fast lookup)
    do {
        // Try fast owner slab lookup to get meta->class_idx for comparison
        // FIX: Use ptr (USER)
        SuperSlab* ss = hak_super_lookup(ptr);
        if (ss && ss->magic == SUPERSLAB_MAGIC) {
            // FIX: Use ptr (USER)
            int sidx = slab_index_for(ss, ptr);
            if (sidx >= 0 && sidx < ss_slabs_capacity(ss)) {
                TinySlabMeta* m = &ss->slabs[sidx];
                uint8_t meta_cls = m->class_idx;
                if (meta_cls < TINY_NUM_CLASSES && meta_cls != (uint8_t)class_idx) {
                    // Phase 26E: Compile-out g_hdr_meta_fast atomic (default OFF)
#if HAKMEM_HDR_META_FAST_COMPILED
                    static _Atomic uint32_t g_hdr_meta_fast = 0;
                    uint32_t n = atomic_fetch_add_explicit(&g_hdr_meta_fast, 1, memory_order_relaxed);
#else
                    uint32_t n = 0; // No-op when compiled out
#endif
                    if (n < 16) {
                        fprintf(stderr,
                                "[FREE_FAST_HDR_META_MISMATCH] hdr_cls=%d meta_cls=%u ptr=%p slab_idx=%d ss=%p\n",
                                class_idx, (unsigned)meta_cls, ptr, sidx, (void*)ss);
                        if (n < 4) {
                            void* bt[8];
                            int frames = backtrace(bt, 8);
                            backtrace_symbols_fd(bt, frames, fileno(stderr));
                        }
                        fflush(stderr);
                    }
                }
            }
        }
    } while (0);
#endif // !HAKMEM_BUILD_RELEASE
    // Check whether the header read failed (negative class_idx; invalid magic in debug builds)
    if (__builtin_expect(class_idx < 0, 0)) {
        // Invalid header - route to slow path (non-header allocation or corrupted header)
        return 0;
    }
    // PRIORITY 1: Bounds check on class_idx from header
    if (__builtin_expect(class_idx >= TINY_NUM_CLASSES, 0)) {
        fprintf(stderr, "[TINY_FREE_V2] FATAL: class_idx=%d out of bounds (from header at %p)\n",
                class_idx, ptr);
        fflush(stderr);
        assert(0 && "class_idx from header out of bounds");
        return 0;
    }
#if !HAKMEM_BUILD_RELEASE
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
#endif
    // 2. Check TLS freelist capacity (defense in depth - ALWAYS ENABLED)
    // CRITICAL: Enabled in both debug and release builds to prevent corruption accumulation
    // Reason: If C7 slips through magic validation, the capacity limit prevents unbounded growth
    // Cost: 1 comparison (~1 cycle, predict-not-taken)
    // Benefit: Fail-safe against TLS SLL pollution from false positives
    uint32_t cap = (uint32_t)TINY_TLS_MAG_CAP;
    if (__builtin_expect(g_tls_sll[class_idx].count >= cap, 0)) {
        return 0; // Route to slow path for spill (Front Gate will catch corruption)
    }
    // 3. Push base to TLS freelist (4 instructions, 5-7 cycles)
    // Must push base (block start), not the user pointer!
    // Phase E1: ALL classes (C0-C7) have a 1-byte header → base = ptr-1
    // FIX: Use ptr_user_to_base(ptr, class_idx) logic
    void* base = HAK_BASE_TO_RAW(ptr_user_to_base(HAK_USER_FROM_RAW(ptr), class_idx));
    // Phase 14-C: UltraHot does not intercept blocks on free (Borrowing design)
    // → Keeps the canonical TLS SLL inventory accurate
    // → UltraHot refill borrows from the TLS SLL on the alloc side instead
    // LARSON FIX (2025-11-16): Cross-thread free detection - ENV GATED
    // Problem: Larson MT crash - TLS SLL poison (0xbada55...) from cross-thread free
    // Root cause: Block allocated by Thread A, freed by Thread B → pushed to B's TLS SLL
    //             → B allocates the block → metadata still points to A's SuperSlab → corruption
    // Solution: Check owner_tid_low, route cross-thread frees to the remote queue
    // Status: ENV-gated for performance (HAKMEM_TINY_LARSON_FIX=1 to enable)
    // Performance: OFF=5-10 cycles/free, ON=110-520 cycles/free (registry lookup overhead)
    {
        // Priority-2: Use cached ENV (eliminate lazy-init TLS syscall overhead)
        if (__builtin_expect(HAK_ENV_TINY_LARSON_FIX(), 0)) {
            // Cross-thread check enabled - MT safe mode
            // Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
            SuperSlab* ss = ss_fast_lookup(base);
            if (__builtin_expect(ss != NULL, 1)) {
                // FIX: slab_index_for on BASE (since base is correct now)
                int slab_idx = slab_index_for(ss, base);
                if (__builtin_expect(slab_idx >= 0, 1)) {
                    uint32_t self_tid = tiny_self_u32_local();
                    uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
                    // Cross-thread check: compare bits 8-15 (the low 8 bits are 0 on glibc)
                    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
                    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
                        // Cross-thread free → remote queue routing
                        TinySlabMeta* meta = &ss->slabs[slab_idx];
                        if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
                            // Successfully queued to remote, done
                            return 1;
                        }
                        // Remote push failed → fall through to slow path
                        return 0;
                    }
                    // Same-thread free → continue to TLS SLL fast path below
                }
            }
            // SuperSlab lookup failed → fall through to TLS SLL (may be headerless C7)
        }
    }
    // REVERT E3-2: Use Box TLS-SLL for all builds (testing hypothesis)
    // Hypothesis: Box TLS-SLL acts as a verification layer, masking underlying bugs
#if !HAKMEM_BUILD_RELEASE
    // Address watcher: Check whether the watched address is being freed
    {
        extern uintptr_t get_watch_addr(void);
        uintptr_t watch = get_watch_addr();
        if (watch != 0 && (uintptr_t)base == watch) {
            extern _Atomic uint64_t g_debug_op_count;
            extern __thread TinyTLSSLL g_tls_sll[];
            uint64_t op = atomic_load(&g_debug_op_count);
            fprintf(stderr, "\n");
            fprintf(stderr, "========================================\n");
            fprintf(stderr, "[WATCH_FREE_HIT] Address %p freed!\n", base);
            fprintf(stderr, "========================================\n");
            fprintf(stderr, "  Operation: #%lu\n", (unsigned long)op);
            fprintf(stderr, "  Class: %d\n", class_idx);
            fprintf(stderr, "  User ptr: %p\n", ptr);
            fprintf(stderr, "  Base ptr: %p\n", base);
            fprintf(stderr, "  TLS count: %u (before free)\n", g_tls_sll[class_idx].count);
            fprintf(stderr, "  TLS head: %p\n", g_tls_sll[class_idx].head);
            fprintf(stderr, "========================================\n");
            fprintf(stderr, "\n");
            fflush(stderr);
            // Print backtrace
            void* bt[16];
            int frames = backtrace(bt, 16);
            fprintf(stderr, "[WATCH_FREE_BACKTRACE] %d frames:\n", frames);
            backtrace_symbols_fd(bt, frames, fileno(stderr));
            fprintf(stderr, "\n");
            fflush(stderr);
            // Abort to preserve state
            fprintf(stderr, "[WATCH_ABORT] Aborting on watched free...\n");
            fflush(stderr);
            abort();
        }
    }
    // Debug: Log free operations (first 2000, ALL classes)
    {
        extern _Atomic uint64_t g_debug_op_count;
        extern __thread TinyTLSSLL g_tls_sll[];
        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
        if (op < 2000) { // ALL classes, not just class 1
            fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p tls_count_before=%u\n",
                    (unsigned long)op, class_idx, ptr, base,
                    g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif
    if (!tls_sll_push(class_idx, base, UINT32_MAX)) {
        // C7 rejected or capacity exceeded - route to slow path
        return 0;
    }
    // P1.3/P2.2: Track active/tls_cached when a block is freed (user gives it back)
    // ENV gate: HAKMEM_TINY_ACTIVE_TRACK=1 to enable (default: 0 for performance)
    // Flow: User → TLS SLL means active--, tls_cached++
    // Priority-2: Use cached ENV (eliminate lazy-init TLS syscall overhead)
    {
        if (__builtin_expect(HAK_ENV_TINY_ACTIVE_TRACK(), 0)) {
            // Look up the actual slab meta for this block
            SuperSlab* ss = ss_fast_lookup(base);
            if (ss && ss->magic == SUPERSLAB_MAGIC) {
                int slab_idx = slab_index_for(ss, base);
                if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
                    TinySlabMeta* meta = &ss->slabs[slab_idx];
                    atomic_fetch_sub_explicit(&meta->active, 1, memory_order_relaxed);
                    atomic_fetch_add_explicit(&meta->tls_cached, 1, memory_order_relaxed); // P2.2
                }
            }
        }
    }
    // Option B: Periodic TLS SLL Drain (restore slab accounting consistency)
    // Purpose: Every N frees (default: 1024), drain TLS SLL → slab freelist
    // Impact: Enables empty detection → SuperSlabs freed → LRU cache functional
    // Cost: 2-3 cycles (counter increment + comparison, predict-not-taken)
    // Benefit: +1,300-1,700% throughput (563K → 8-10M ops/s expected)
    tiny_tls_sll_try_drain(class_idx);
    return 1; // Success - handled in fast path
}
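// Illustrative sketch (not compiled): the shape of a periodic drain gate like
// the one tiny_tls_sll_try_drain() implements in box/tls_sll_drain_box.h.
// The counter name and the drain helper below are assumptions for illustration.
#if 0
static __thread uint32_t example_free_tick;
static inline void example_try_drain(int class_idx) {
    // Every 1024 frees: return TLS SLL blocks to slab freelists so that
    // empty-slab detection and SuperSlab reclamation can make progress.
    if (__builtin_expect((++example_free_tick & 1023u) == 0, 0)) {
        example_drain_to_slabs(class_idx);  // hypothetical drain helper
    }
}
#endif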
// ========== Free Entry Point ==========
// Entry point for free() - tries fast path first, falls back to slow path
//
// Flow:
// 1. Try ultra-fast free (header-based) → 95-99% hit rate
// 2. Miss → Fallback to slow path → 1-5% (non-header, cache full)
//
// Performance:
// - Fast path: 5-10 cycles (header read + TLS push)
// - Slow path: 500+ cycles (SuperSlab lookup + validation)
// - Weighted average: ~10-30 cycles (vs 500+ current)
static inline void hak_free_fast_v2_entry(void* ptr) {
    // Try ultra-fast free (header-based)
    if (__builtin_expect(hak_tiny_free_fast_v2(ptr), 1)) {
        return; // Success - done in 5-10 cycles!
    }
    // Slow path: Non-header allocation or TLS cache full
    hak_tiny_free(ptr);
}
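// Worked example of the weighted-average figure above (assumed midpoints):
//   fast path ≈ 7.5 cycles at a 97% hit rate, slow path ≈ 500 cycles at 3%
//   0.97 * 7.5 + 0.03 * 500 ≈ 7.3 + 15.0 ≈ 22 cycles → inside the ~10-30 range
// Illustrative sketch (not compiled): how a malloc shim might route free()
// through this entry point. The shim name is an assumption; only
// hak_free_fast_v2_entry() is defined in this file.
#if 0
void example_free_shim(void* ptr) {
    // Fast header path first; the entry point falls back to hak_tiny_free()
    // internally when the fast path declines the pointer.
    hak_free_fast_v2_entry(ptr);
}
#endif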
// ========== Performance Counters (Debug) ==========
#if !HAKMEM_BUILD_RELEASE
// Performance counters (TLS, lightweight)
static __thread uint64_t g_free_v2_fast_hits = 0;
static __thread uint64_t g_free_v2_slow_hits = 0;
// Track fast path hit rate
static inline void hak_free_v2_track_fast(void) {
    g_free_v2_fast_hits++;
}
static inline void hak_free_v2_track_slow(void) {
    g_free_v2_slow_hits++;
}
// Print stats at exit
static void hak_free_v2_print_stats(void) __attribute__((destructor));
static void hak_free_v2_print_stats(void) {
    uint64_t total = g_free_v2_fast_hits + g_free_v2_slow_hits;
    if (total == 0) return;
    double hit_rate = (double)g_free_v2_fast_hits / total * 100.0;
    fprintf(stderr, "[FREE_V2] Fast hits: %lu, Slow hits: %lu, Hit rate: %.2f%%\n",
            (unsigned long)g_free_v2_fast_hits, (unsigned long)g_free_v2_slow_hits, hit_rate);
}
#else
// Release: No tracking overhead
static inline void hak_free_v2_track_fast(void) {}
static inline void hak_free_v2_track_slow(void) {}
#endif
// ========== Benchmark Comparison ==========
//
// Current (hak_tiny_free_superslab):
// - 2x SuperSlab lookup: 200+ cycles
// - Safety checks (O(n) duplicate scan): 100+ cycles
// - Validation, atomics, diagnostics: 200+ cycles
// - Total: 500+ cycles
// - Throughput: 1.2M ops/s
//
// Phase 7 (hak_tiny_free_fast_v2):
// - Header read: 2-3 cycles
// - TLS push: 3-5 cycles
// - Total: 5-10 cycles (100x faster!)
// - Throughput: 40-60M ops/s (30-50x improvement)
//
// vs System malloc tcache:
// - System: 10-15 cycles (3-4 instructions)
// - HAKMEM: 5-10 cycles (3-5 instructions)
// - Result: 70-110% of System speed (on par, sometimes faster!)
#endif // HAKMEM_TINY_HEADER_CLASSIDX