Add active field to TinySlabMeta to track blocks currently held by users
(not in TLS SLL or freelist caches). This enables accurate empty slab
detection that accounts for TLS SLL cached blocks.

Changes:
- superslab_types.h: Add _Atomic uint16_t active field
- ss_allocation_box.c, hakmem_tiny_superslab.c: Initialize active=0
- tiny_free_fast_v2.inc.h: Decrement active on TLS SLL push
- tiny_alloc_fast.inc.h: Add tiny_active_track_alloc() helper, increment
  active on TLS SLL pop (all code paths)
- ss_hot_cold_box.h: ss_is_slab_empty() uses active when enabled

All tracking is ENV-gated: HAKMEM_TINY_ACTIVE_TRACK=1 to enable. Default is
off for zero performance impact.

Invariant: active = used - tls_cached (active <= used)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
// tiny_free_fast_v2.inc.h - Phase 7: Ultra-Fast Free Path (Header-based)
// Purpose: Eliminate SuperSlab lookup bottleneck (52.63% CPU → <5%)
// Design: Read class_idx from inline header (O(1), 2-3 cycles)
// Performance: 1.2M → 40-60M ops/s (30-50x improvement)
//
// Key Innovation: Smart Headers
// - 1-byte header before each block stores class_idx
// - Slab[0]: 0% overhead (reuses 960B wasted padding)
// - Other slabs: ~1.5% overhead (1 byte per block)
// - Total: <2% memory overhead for 30-50x speed gain
//
// Flow (3-5 instructions, 5-10 cycles):
//   1. Read class_idx from header (ptr-1)  [1 instruction, 2-3 cycles]
//   2. Push to TLS freelist                [2-3 instructions, 3-5 cycles]
//   3. Done! (No lookup, no validation, no atomic)
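//
// Block layout (per the design above; the user pointer is base+1):
//
//   base         base+1
//   +-----------+------------------------------+
//   | class_idx | user data ...                |
//   | (1 byte)  | (pointer returned to caller) |
//   +-----------+------------------------------+
//
// free(ptr) recovers the class with a single byte load at ptr-1 and pushes
// base = ptr-1 onto the TLS singly-linked freelist for that class.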
#pragma once

#include <stdlib.h>    // For getenv() in ENV-gated checks, abort()
#include <stdio.h>     // For fprintf()/fflush()/fileno() diagnostics
#include <stdint.h>    // For uint8_t/uint32_t/uint64_t/uintptr_t
#include <stdatomic.h> // For _Atomic counters and atomic_* operations
#include <assert.h>    // For assert() in bounds checks
#include <execinfo.h>  // For backtrace()/backtrace_symbols_fd() diagnostics
#include <pthread.h>   // For pthread_self() in cross-thread check

#include "tiny_region_id.h"
#include "hakmem_build_flags.h"
#include "hakmem_tiny_config.h"         // For TINY_TLS_MAG_CAP, TINY_NUM_CLASSES
#include "box/tls_sll_box.h"            // Box TLS-SLL API
#include "box/tls_sll_drain_box.h"      // Box TLS-SLL Drain (Option B)
#include "hakmem_tiny_integrity.h"      // PRIORITY 1-4: Corruption detection
// Ring Cache and Unified Cache removed (A/B test: OFF is faster)
#include "hakmem_super_registry.h"      // For hak_super_lookup (cross-thread check)
#include "superslab/superslab_inline.h" // For slab_index_for (cross-thread check)
#include "box/ss_slab_meta_box.h"       // Phase 3d-A: SlabMeta Box boundary
#include "box/free_remote_box.h"        // For tiny_free_remote_box (cross-thread routing)

// Phase 7: Header-based ultra-fast free
#if HAKMEM_TINY_HEADER_CLASSIDX

// External TLS variables (defined in hakmem_tiny.c)
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
extern int g_tls_sll_enable; // Honored for fast free: when 0, fall back to slow path

// External functions
extern void hak_tiny_free(void* ptr); // Fallback for non-header allocations

// Inline helper: Get current thread ID (lower 32 bits)
#ifndef TINY_SELF_U32_LOCAL_DEFINED
#define TINY_SELF_U32_LOCAL_DEFINED
static inline uint32_t tiny_self_u32_local(void) {
    return (uint32_t)(uintptr_t)pthread_self();
}
#endif

// ========== Ultra-Fast Free (Header-based) ==========

// Ultra-fast free for header-based allocations
// Returns: 1 if handled, 0 if needs slow path
//
// Performance: 3-5 instructions, 5-10 cycles
// vs Current: 330+ lines, 500+ cycles (100x faster!)
//
// Assembly (x86-64, release build):
//   movzbl -0x1(%rdi),%eax                # Read header (class_idx)
//   mov    g_tls_sll_head(,%rax,8),%rdx   # Load head
//   mov    %rdx,(%rdi)                    # ptr->next = head
//   mov    %rdi,g_tls_sll_head(,%rax,8)   # head = ptr
//   addl   $0x1,g_tls_sll_count(,%rax,4)  # count++
//   ret
//
// Expected: 3-5 instructions, 5-10 cycles (L1 hit)
static inline int hak_tiny_free_fast_v2(void* ptr) {
    if (__builtin_expect(!ptr, 0)) return 0;

    // Respect global SLL toggle: when disabled, do not use the TLS SLL fast path.
    if (__builtin_expect(!g_tls_sll_enable, 0)) {
        return 0; // Force slow path
    }

    // Phase E3-1: Registry lookup removed (was 50-100 cycles of overhead).
    // Reason: Phase E1 added headers to C7, making this check redundant.
    // Header magic validation (2-3 cycles) is now sufficient for all classes.
    // Expected: 9M → 30-50M ops/s recovery (+226-443%)

    // CRITICAL: Check that the header is accessible before reading it.
    void* header_addr = (char*)ptr - 1;

#if !HAKMEM_BUILD_RELEASE
    // Debug: Validate header accessibility (metadata-based check)
    // Phase 9: mincore() REMOVED - no syscall overhead (0 cycles)
    // Strategy: Trust internal metadata (registry ensures memory is valid)
    // Benefit: Invalid pointers are caught by the header magic validation below
    extern int hak_is_memory_readable(void* addr);
    if (!hak_is_memory_readable(header_addr)) {
        return 0; // Header not accessible - not a Tiny allocation
    }
#else
    // Release: Phase 9 optimization - mincore() completely removed
    // OLD: Page boundary check + mincore() syscall (~634 cycles)
    // NEW: No check needed - trust internal metadata (0 cycles)
    // Safety: Header magic validation below catches invalid pointers
    // Performance: 841 syscalls → 0 (100% elimination)
    // (Page boundary check removed - adds 1-2 cycles without benefit)
    (void)header_addr; // Only consumed by the debug-path check above
#endif

    // 1. Read class_idx from header (2-3 cycles, L1 hit)
    // Note: In release mode, tiny_region_id_read_header() skips magic validation (saves 2-3 cycles)
#if HAKMEM_DEBUG_VERBOSE
    static _Atomic int debug_calls = 0;
    if (atomic_fetch_add(&debug_calls, 1) < 5) {
        fprintf(stderr, "[TINY_FREE_V2] Before read_header, ptr=%p\n", ptr);
    }
#endif

    // P1.2: Use class_map instead of Header to avoid Header/Next contention
    // ENV: HAKMEM_TINY_USE_CLASS_MAP=1 to enable (default: 0 for compatibility)
    int class_idx = -1;
    {
        static __thread int g_use_class_map = -1;
        if (__builtin_expect(g_use_class_map == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_USE_CLASS_MAP");
            g_use_class_map = (e && *e && *e != '0') ? 1 : 0;
        }

        if (__builtin_expect(g_use_class_map, 0)) {
            // P1.2: class_map path - avoids the header read entirely
            SuperSlab* ss = ss_fast_lookup((uint8_t*)ptr - 1);
            if (ss && ss->magic == SUPERSLAB_MAGIC) {
                int slab_idx = slab_index_for(ss, (uint8_t*)ptr - 1);
                if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
                    int map_class = tiny_get_class_from_ss(ss, slab_idx);
                    if (map_class < TINY_NUM_CLASSES) {
                        class_idx = map_class;
#if HAKMEM_DEBUG_VERBOSE
                        if (atomic_load(&debug_calls) <= 5) {
                            fprintf(stderr, "[TINY_FREE_V2] class_map lookup: class_idx=%d\n", class_idx);
                        }
#endif
                    }
                }
            }
            // Fall back to the header if the class_map lookup failed
            if (class_idx < 0) {
                class_idx = tiny_region_id_read_header(ptr);
#if HAKMEM_DEBUG_VERBOSE
                if (atomic_load(&debug_calls) <= 5) {
                    fprintf(stderr, "[TINY_FREE_V2] class_map failed, Header fallback: class_idx=%d\n", class_idx);
                }
#endif
            }
        } else {
            // Default: Header read (existing behavior)
            class_idx = tiny_region_id_read_header(ptr);
#if HAKMEM_DEBUG_VERBOSE
            if (atomic_load(&debug_calls) <= 5) {
                fprintf(stderr, "[TINY_FREE_V2] Header read: class_idx=%d\n", class_idx);
            }
#endif
        }
    }

#if HAKMEM_DEBUG_VERBOSE
    if (atomic_load(&debug_calls) <= 5) {
        fprintf(stderr, "[TINY_FREE_V2] After read_header, class_idx=%d\n", class_idx);
    }
#endif
    // Cross-check the header class against the meta class (when the owner slab is found)
    do {
        // Try the fast owner slab lookup to get meta->class_idx for comparison
        SuperSlab* ss = hak_super_lookup((uint8_t*)ptr - 1);
        if (ss && ss->magic == SUPERSLAB_MAGIC) {
            int sidx = slab_index_for(ss, (uint8_t*)ptr - 1);
            if (sidx >= 0 && sidx < ss_slabs_capacity(ss)) {
                TinySlabMeta* m = &ss->slabs[sidx];
                uint8_t meta_cls = m->class_idx;
                if (meta_cls < TINY_NUM_CLASSES && meta_cls != (uint8_t)class_idx) {
                    static _Atomic uint32_t g_hdr_meta_fast = 0;
                    uint32_t n = atomic_fetch_add_explicit(&g_hdr_meta_fast, 1, memory_order_relaxed);
                    if (n < 16) {
                        fprintf(stderr,
                                "[FREE_FAST_HDR_META_MISMATCH] hdr_cls=%d meta_cls=%u ptr=%p slab_idx=%d ss=%p\n",
                                class_idx, (unsigned)meta_cls, ptr, sidx, (void*)ss);
                        if (n < 4) {
                            void* bt[8];
                            int frames = backtrace(bt, 8);
                            backtrace_symbols_fd(bt, frames, fileno(stderr));
                        }
                        fflush(stderr);
                    }
                }
            }
        }
    } while (0);

    // Check whether the header read failed (invalid magic in debug, or out-of-bounds class_idx)
    if (__builtin_expect(class_idx < 0, 0)) {
        // Invalid header - route to slow path (non-header allocation or corrupted header)
        return 0;
    }

    // PRIORITY 1: Bounds check on class_idx from header
    if (__builtin_expect(class_idx >= TINY_NUM_CLASSES, 0)) {
        fprintf(stderr, "[TINY_FREE_V2] FATAL: class_idx=%d out of bounds (from header at %p)\n",
                class_idx, ptr);
        fflush(stderr);
        assert(0 && "class_idx from header out of bounds");
        return 0;
    }
#if !HAKMEM_BUILD_RELEASE
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
#endif

    // 2. Check TLS freelist capacity (defense in depth - ALWAYS ENABLED)
    // CRITICAL: Enabled in both debug and release to prevent corruption accumulation.
    // Reason: If C7 slips through magic validation, the capacity limit prevents unbounded growth.
    // Cost: 1 comparison (~1 cycle, predict-not-taken)
    // Benefit: Fail-safe against TLS SLL pollution from false positives
    uint32_t cap = (uint32_t)TINY_TLS_MAG_CAP;
    if (__builtin_expect(g_tls_sll[class_idx].count >= cap, 0)) {
        return 0; // Route to slow path for spill (Front Gate will catch corruption)
    }

    // 3. Push base to TLS freelist (4 instructions, 5-7 cycles)
    // Must push base (block start), not the user pointer!
    // Phase E1: ALL classes (C0-C7) have a 1-byte header → base = ptr-1
    void* base = (char*)ptr - 1;

    // Phase 14-C: UltraHot does not intercept blocks at free time (Borrowing design)
    // → Keeps the canonical inventory (the TLS SLL) accurate
    // → UltraHot refills by borrowing from the TLS SLL on the alloc side

    // LARSON FIX (2025-11-16): Cross-thread free detection - ENV GATED
    // Problem: Larson MT crash - TLS SLL poison (0xbada55...) from cross-thread free
    // Root cause: Block allocated by Thread A, freed by Thread B → pushed to B's TLS SLL
    //             → B allocates the block → metadata still points to A's SuperSlab → corruption
    // Solution: Check owner_tid_low, route cross-thread frees to the remote queue
    // Status: ENV-gated for performance (HAKMEM_TINY_LARSON_FIX=1 to enable)
    // Performance: OFF=5-10 cycles/free, ON=110-520 cycles/free (registry lookup overhead)
    {
        // TLS-cached ENV check (initialized once per thread)
        static __thread int g_larson_fix = -1;
        if (__builtin_expect(g_larson_fix == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
            g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
        }

        if (__builtin_expect(g_larson_fix, 0)) {
            // Cross-thread check enabled - MT safe mode
            // Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
            SuperSlab* ss = ss_fast_lookup(base);
            if (__builtin_expect(ss != NULL, 1)) {
                int slab_idx = slab_index_for(ss, base);
                if (__builtin_expect(slab_idx >= 0, 1)) {
                    uint32_t self_tid = tiny_self_u32_local();
                    uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);

                    // Cross-thread check (compare bits 8-15; the low 8 bits are 0 on glibc)
                    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
                    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
                        // Cross-thread free → remote queue routing
                        TinySlabMeta* meta = &ss->slabs[slab_idx];
                        if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
                            // Successfully queued to the remote owner - done
                            return 1;
                        }
                        // Remote push failed → route to slow path
                        return 0;
                    }
                    // Same-thread free → continue to the TLS SLL fast path below
                }
            }
            // SuperSlab lookup failed → fall through to TLS SLL (may be headerless C7)
        }
    }
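
    // Worked example for the comparison above (illustrative values only):
    //   On glibc, pthread_self() returns the TCB address, which is highly
    //   aligned, so the low 8 bits are always 0. For self_tid = 0x2c5ff700:
    //     low byte:           0x00 (no entropy)
    //     (tid >> 8) & 0xFF:  0xF7 (the byte actually compared)
    //   ss_slab_meta_owner_tid_low_get() is expected to return the same
    //   bits 8-15 that were recorded for the owning thread at allocation.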

    // REVERT E3-2: Use Box TLS-SLL for all builds (testing hypothesis)
    // Hypothesis: Box TLS-SLL acts as a verification layer, masking underlying bugs

#if !HAKMEM_BUILD_RELEASE
    // Address watcher: Check if this is the watched address being freed
    {
        extern uintptr_t get_watch_addr(void);
        uintptr_t watch = get_watch_addr();
        if (watch != 0 && (uintptr_t)base == watch) {
            extern _Atomic uint64_t g_debug_op_count;
            extern __thread TinyTLSSLL g_tls_sll[];
            uint64_t op = atomic_load(&g_debug_op_count);

            fprintf(stderr, "\n");
            fprintf(stderr, "========================================\n");
            fprintf(stderr, "[WATCH_FREE_HIT] Address %p freed!\n", base);
            fprintf(stderr, "========================================\n");
            fprintf(stderr, "  Operation: #%lu\n", (unsigned long)op);
            fprintf(stderr, "  Class:     %d\n", class_idx);
            fprintf(stderr, "  User ptr:  %p\n", ptr);
            fprintf(stderr, "  Base ptr:  %p\n", base);
            fprintf(stderr, "  TLS count: %u (before free)\n", g_tls_sll[class_idx].count);
            fprintf(stderr, "  TLS head:  %p\n", g_tls_sll[class_idx].head);
            fprintf(stderr, "========================================\n");
            fprintf(stderr, "\n");
            fflush(stderr);

            // Print backtrace
            void* bt[16];
            int frames = backtrace(bt, 16);
            fprintf(stderr, "[WATCH_FREE_BACKTRACE] %d frames:\n", frames);
            backtrace_symbols_fd(bt, frames, fileno(stderr));
            fprintf(stderr, "\n");
            fflush(stderr);

            // Abort to preserve state
            fprintf(stderr, "[WATCH_ABORT] Aborting on watched free...\n");
            fflush(stderr);
            abort();
        }
    }

    // Debug: Log free operations (first 2000, ALL classes)
    {
        extern _Atomic uint64_t g_debug_op_count;
        extern __thread TinyTLSSLL g_tls_sll[];
        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
        if (op < 2000) { // ALL classes, not just class 1
            fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p tls_count_before=%u\n",
                    (unsigned long)op, class_idx, ptr, base,
                    g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif

    if (!tls_sll_push(class_idx, base, UINT32_MAX)) {
        // C7 rejected or capacity exceeded - route to slow path
        return 0;
    }

    // P1.3: Decrement meta->active when a block is freed (the user gives it back)
    // ENV gate: HAKMEM_TINY_ACTIVE_TRACK=1 to enable (default: 0 for performance)
    {
        static __thread int g_active_track = -1;
        if (__builtin_expect(g_active_track == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_ACTIVE_TRACK");
            g_active_track = (e && *e && *e != '0') ? 1 : 0;
        }
        if (__builtin_expect(g_active_track, 0)) {
            // Look up the actual slab meta for this block
            SuperSlab* ss = ss_fast_lookup(base);
            if (ss && ss->magic == SUPERSLAB_MAGIC) {
                int slab_idx = slab_index_for(ss, base);
                if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
                    TinySlabMeta* meta = &ss->slabs[slab_idx];
                    atomic_fetch_sub_explicit(&meta->active, 1, memory_order_relaxed);
                }
            }
        }
    }
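
    // Invariant maintained by this ENV-gated tracking (see commit message):
    //   active = used - tls_cached, hence active <= used at all times.
    // Worked example: a slab with used=10 whose owner thread has 3 of those
    // blocks parked in its TLS SLL has active=7; active reaches 0 only once
    // every block is back in a freelist or cache - the true "empty" condition.
    //
    // Sketch of the consumer side, assuming the shape of ss_is_slab_empty()
    // in ss_hot_cold_box.h (illustrative only; field names other than
    // `active` are assumptions):
    //
    //   static inline int ss_is_slab_empty_sketch(TinySlabMeta* meta,
    //                                             int active_track) {
    //       if (active_track) // HAKMEM_TINY_ACTIVE_TRACK=1
    //           return atomic_load_explicit(&meta->active,
    //                                       memory_order_relaxed) == 0;
    //       return meta->used == 0; // legacy: TLS-cached blocks look "in use"
    //   }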

    // Option B: Periodic TLS SLL Drain (restore slab accounting consistency)
    // Purpose: Every N frees (default: 1024), drain TLS SLL → slab freelist
    // Impact: Enables empty detection → SuperSlabs freed → LRU cache functional
    // Cost: 2-3 cycles (counter increment + comparison, predict-not-taken)
    // Benefit: +1,300-1,700% throughput (563K → 8-10M ops/s expected)
    tiny_tls_sll_try_drain(class_idx);

    return 1; // Success - handled in fast path
}

// ========== Free Entry Point ==========

// Entry point for free() - tries the fast path first, falls back to the slow path
//
// Flow:
//   1. Try ultra-fast free (header-based) → 95-99% hit rate
//   2. Miss → fall back to slow path      → 1-5% (non-header, cache full)
//
// Performance:
//   - Fast path: 5-10 cycles (header read + TLS push)
//   - Slow path: 500+ cycles (SuperSlab lookup + validation)
//   - Weighted average: ~10-30 cycles (vs 500+ before)
static inline void hak_free_fast_v2_entry(void* ptr) {
    // Try ultra-fast free (header-based)
    if (__builtin_expect(hak_tiny_free_fast_v2(ptr), 1)) {
        return; // Success - done in 5-10 cycles!
    }

    // Slow path: Non-header allocation or TLS cache full
    hak_tiny_free(ptr);
}
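
// Illustrative wiring (hypothetical wrapper; the real malloc/free
// interposition lives elsewhere in the allocator). A NULL pointer falls
// through to hak_tiny_free(), since the fast path rejects NULL as
// "needs slow path":
//
//   void my_free(void* p) {          // hypothetical name
//       hak_free_fast_v2_entry(p);   // fast path, then slow-path fallback
//   }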

// ========== Performance Counters (Debug) ==========

#if !HAKMEM_BUILD_RELEASE
// Performance counters (TLS, lightweight)
static __thread uint64_t g_free_v2_fast_hits = 0;
static __thread uint64_t g_free_v2_slow_hits = 0;

// Track fast path hit rate
static inline void hak_free_v2_track_fast(void) {
    g_free_v2_fast_hits++;
}

static inline void hak_free_v2_track_slow(void) {
    g_free_v2_slow_hits++;
}

// Print stats at exit (counters are TLS, so the destructor reports the
// counts of whichever thread runs it - typically the main thread)
static void hak_free_v2_print_stats(void) __attribute__((destructor));
static void hak_free_v2_print_stats(void) {
    uint64_t total = g_free_v2_fast_hits + g_free_v2_slow_hits;
    if (total == 0) return;

    double hit_rate = (double)g_free_v2_fast_hits / (double)total * 100.0;
    fprintf(stderr, "[FREE_V2] Fast hits: %llu, Slow hits: %llu, Hit rate: %.2f%%\n",
            (unsigned long long)g_free_v2_fast_hits,
            (unsigned long long)g_free_v2_slow_hits, hit_rate);
}
#else
// Release: No tracking overhead
static inline void hak_free_v2_track_fast(void) {}
static inline void hak_free_v2_track_slow(void) {}
#endif
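
// Illustrative use of the counters above (hypothetical variant of the entry
// point; nothing in this file wires the track helpers in by itself):
//
//   static inline void hak_free_fast_v2_entry_counted(void* ptr) {
//       if (__builtin_expect(hak_tiny_free_fast_v2(ptr), 1)) {
//           hak_free_v2_track_fast();  // no-op in release builds
//           return;
//       }
//       hak_free_v2_track_slow();      // no-op in release builds
//       hak_tiny_free(ptr);
//   }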

// ========== Benchmark Comparison ==========
//
// Before (hak_tiny_free_superslab):
//   - 2x SuperSlab lookup:                  200+ cycles
//   - Safety checks (O(n) duplicate scan):  100+ cycles
//   - Validation, atomics, diagnostics:     200+ cycles
//   - Total:                                500+ cycles
//   - Throughput: 1.2M ops/s
//
// Phase 7 (hak_tiny_free_fast_v2):
//   - Header read: 2-3 cycles
//   - TLS push:    3-5 cycles
//   - Total:       5-10 cycles (100x faster!)
//   - Throughput:  40-60M ops/s (30-50x improvement)
//
// vs System malloc tcache:
//   - System: 10-15 cycles (3-4 instructions)
//   - HAKMEM: 5-10 cycles (3-5 instructions)
//   - Result: 70-110% of System speed (on par with, or ahead of, system malloc!)

#endif // HAKMEM_TINY_HEADER_CLASSIDX