Files
hakmem/core/tiny_free_fast.inc.h
Moe Charm (CI) 9b0d746407 Phase 3d-B: TLS Cache Merge - Unified g_tls_sll[] structure (+12-18% expected)
Merge separate g_tls_sll_head[] and g_tls_sll_count[] arrays into unified
TinyTLSSLL struct to improve L1D cache locality. Expected performance gain:
+12-18% from reducing cache line splits (2 loads → 1 load per operation).

Changes:
- core/hakmem_tiny.h: Add TinyTLSSLL type (16B aligned, head+count+pad)
- core/hakmem_tiny.c: Replace separate arrays with g_tls_sll[8]
- core/box/tls_sll_box.h: Update Box API (13 sites) for unified access
- Updated 32+ files: All g_tls_sll_head[i] → g_tls_sll[i].head
- Updated 32+ files: All g_tls_sll_count[i] → g_tls_sll[i].count
- core/hakmem_tiny_integrity.h: Unified canary guards
- core/box/integrity_box.c: Simplified canary validation
- Makefile: Added core/box/tiny_sizeclass_hist_box.o to link

Build:  PASS (10K ops sanity test)
Warnings: Only pre-existing LTO type mismatches (unrelated)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-20 07:32:30 +09:00

334 lines
13 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// tiny_free_fast.inc.h - Box 6: Free Fast Path (same-thread free only)
// Purpose: Ultra-fast TLS freelist push (ownership check + 2-3 instructions)
// Invariant: owner_tid == my_tid → push to TLS, else → delegate to remote path
// Design: Clear boundary between same-thread (fast) and cross-thread (remote)
#pragma once
#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include "slab_handle.h"
#include "tiny_alloc_fast_sfc.inc.h" // For sfc_free_push
// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Free pipeline counters (defined in hakmem_tiny.c)
extern unsigned long long g_free_via_ss_local[];
extern unsigned long long g_free_via_ss_remote[];
#endif
// ========== Box 6: Free Fast Path ==========
// 箱理論の Fast Free 層。Same-thread free のみ処理2-3命令 + ownership check
// 不変条件:
// - owner_tid == my_tid → TLS freelist に push (no lock, no sync)
// - owner_tid != my_tid → Box 2 (Remote Queue) に委譲
// - Cross-thread free は絶対に TLS freelist に入れないA213 エラー防止)
// External functions (Backend)
extern void hak_tiny_free(void* ptr);
extern void hak_tiny_free_with_slab(void* ptr, TinySlab* slab);
// hak_free_at signature: (void* ptr, size_t hint_sz, hak_callsite_t site)
// where hak_callsite_t is const void*
extern void hak_free_at(void* ptr, size_t hint_sz, const void* site);
extern SuperSlab* hak_super_lookup(void* ptr);
extern TinySlab* hak_tiny_owner_slab(void* ptr);
extern int g_use_superslab;
// External helpers
extern uint32_t tiny_self_u32(void);
extern pthread_t tiny_self_pt(void);
// External TLS variables (from Box 5)
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL structure
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
// Hot-class toggle: class5 (256B) dedicated TLS fast path
extern int g_tiny_hotpath_class5;
extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
// Box 5 helper (TLS push)
extern void tiny_alloc_fast_push(int class_idx, void* ptr);
// ========== Ownership Check ==========
// Check if ptr belongs to current thread (SuperSlab path)
// Returns: 1 if same-thread, 0 if cross-thread
//
// Box Boundary: This is the critical check that prevents TOCTOU races
// - owner_tid == my_tid → Safe to push to TLS freelist
// - owner_tid != my_tid → MUST delegate to remote path
//
// Invariant: This check MUST be atomic (no TOCTOU between check and push)
static inline int tiny_free_is_same_thread_ss(SuperSlab* ss, int slab_idx, uint32_t my_tid) {
TinySlabMeta* meta = &ss->slabs[slab_idx];
uint8_t my_tid_low = (uint8_t)my_tid;
uint8_t owner = tiny_atomic_load_u8_relaxed(&meta->owner_tid_low);
return (owner == my_tid_low && owner != 0);
}
// Check if ptr belongs to current thread (Legacy TinySlab path)
// Returns: 1 if same-thread, 0 if cross-thread
static inline int tiny_free_is_same_thread_legacy(TinySlab* slab) {
pthread_t my_tid = tiny_self_pt();
return pthread_equal(slab->owner_tid, my_tid);
}
// ========== Fast Path: Same-Thread Free (2-3 instructions) ==========
// Free fast path for SuperSlab-backed allocation
// Returns: 1 on success (same-thread, pushed to TLS), 0 on failure (cross-thread)
//
// Assembly (x86-64, optimized):
// mov eax, DWORD PTR [meta->owner_tid] ; Load owner_tid
// cmp eax, my_tid ; Compare with my_tid
// jne .cross_thread ; If not equal, cross-thread
// mov rax, QWORD PTR g_tls_sll_head[cls] ; Load head
// mov QWORD PTR [ptr], rax ; ptr->next = head
// mov QWORD PTR g_tls_sll_head[cls], ptr ; head = ptr
// ret ; Done
// .cross_thread:
// ; Delegate to remote path
//
// Expected: 2-3 instructions on same-thread path (1 cmp, 1 load, 1 store)
//
// ⚠️ CRITICAL: ptr parameter must be BASE pointer (already converted from USER via ptr-1)
static inline int tiny_free_fast_ss(SuperSlab* ss, int slab_idx, void* base, uint32_t my_tid) {
// BUGFIX: Validate slab_idx before array access (prevents buffer overflow at ss->slabs[-1])
int cap = ss_slabs_capacity(ss);
if (__builtin_expect(slab_idx < 0 || slab_idx >= cap, 0)) {
return 0; // Invalid index, reject
}
TinySlabMeta* meta = &ss->slabs[slab_idx];
// Debug: Track tiny_free_fast_ss calls
#if !HAKMEM_BUILD_RELEASE
static __thread int free_ss_debug_enabled = -1;
static __thread int free_ss_debug_count = 0;
if (__builtin_expect(free_ss_debug_enabled == -1, 0)) {
free_ss_debug_enabled = getenv("HAKMEM_SFC_DEBUG") ? 1 : 0;
}
if (free_ss_debug_enabled && free_ss_debug_count < 20) {
free_ss_debug_count++;
int is_same = tiny_free_is_same_thread_ss(ss, slab_idx, my_tid);
extern int g_sfc_enabled;
fprintf(stderr, "[FREE_SS] base=%p, cls=%u, same_thread=%d, sfc_enabled=%d\n",
base,
meta->class_idx,
is_same,
g_sfc_enabled);
}
#endif
// Box 6 Boundary: Ownership check (TOCTOU-safe)
if (__builtin_expect(!tiny_free_is_same_thread_ss(ss, slab_idx, my_tid), 0)) {
#if HAKMEM_DEBUG_COUNTERS
// Track cross-thread frees (compile-time gated)
g_free_via_ss_remote[meta->class_idx]++;
#endif
return 0; // Cross-thread → caller should delegate to remote path
}
// Fast path: Same-thread free (2-3 instructions)
int class_idx = meta->class_idx;
// Phase E1-CORRECT: base pointer already converted by caller (no double conversion!)
#if HAKMEM_DEBUG_COUNTERS
// Track same-thread frees (compile-time gated)
g_free_via_ss_local[class_idx]++;
#endif
// Box 5 integration: class5 can use dedicated TLS List hotpath
extern int g_sfc_enabled;
if (__builtin_expect(g_tiny_hotpath_class5 && class_idx == 5, 0)) {
TinyTLSList* tls5 = &g_tls_lists[5];
// Use guarded push for class5 to avoid sentinel/next poisoning during triage
tls_list_push(tls5, base, 5);
} else if (g_sfc_enabled) {
// Box 5-NEW: Try SFC (128-256 slots)
if (!sfc_free_push(class_idx, base)) {
// SFC full → skip caching, use slow path (return 0)
// Do NOT fall back to SLL - it has no capacity check and would grow unbounded!
return 0;
}
} else {
// Box 5-OLD: Use SLL (16 slots)
tiny_alloc_fast_push(class_idx, base);
}
// Active accounting (Box 3: SuperSlab)
// This is relatively cheap (atomic decrement) and necessary for memory management
ss_active_dec_one(ss);
return 1; // Success
}
// Free fast path for Legacy TinySlab-backed allocation
// Returns: 1 on success (same-thread), 0 on failure (cross-thread)
//
// ⚠️ CRITICAL: ptr parameter must be BASE pointer (already converted from USER via ptr-1)
static inline int tiny_free_fast_legacy(TinySlab* slab, void* base) {
// Box 6 Boundary: Ownership check
if (__builtin_expect(!tiny_free_is_same_thread_legacy(slab), 0)) {
return 0; // Cross-thread → caller should delegate to precise path
}
// Fast path: Same-thread free
int class_idx = slab->class_idx;
// Phase E1-CORRECT: base pointer already converted by caller (no double conversion!)
// Box 5-NEW/5-OLD integration: Push to TLS freelist (SFC or SLL)
extern int g_sfc_enabled;
if (g_sfc_enabled) {
// Box 5-NEW: Try SFC (128 slots)
if (!sfc_free_push(class_idx, base)) {
// SFC full → skip caching, use slow path (return 0)
// Do NOT fall back to SLL - it has no capacity check and would grow unbounded!
return 0;
}
} else {
// Box 5-OLD: Use SLL (16 slots)
tiny_alloc_fast_push(class_idx, base);
}
return 1; // Success
}
// ========== Combined Fast Free (Lookup + Ownership + Push) ==========
// Complete fast free path (inline for zero-cost)
// Returns: none (delegates to backend on cross-thread or non-tiny)
//
// Flow:
// 1. Lookup ptr → SuperSlab or TinySlab
// 2. Ownership check (owner_tid == my_tid)
// 3. Same-thread → TLS freelist push (2-3 instructions)
// 4. Cross-thread → Delegate to Box 2 (Remote Queue)
// 5. Not Tiny → Delegate to backend (Mid/Large)
//
// Example usage:
// tiny_free_fast(ptr); // Always succeeds (delegates on failure)
static inline void tiny_free_fast(void* ptr) {
// Optional runtime gate to disable fast free and route to slow path
// Env: HAKMEM_TINY_FREE_FAST (default: 1). Additionally, if
// HAKMEM_TINY_FREE_TO_SS=1 is set, prefer SS path by disabling fast free.
static int s_free_fast_en = -1;
if (__builtin_expect(s_free_fast_en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_FREE_FAST");
int v = (e && *e && *e != '0') ? 1 : 1; // default ON
const char* to_ss = getenv("HAKMEM_TINY_FREE_TO_SS");
if (to_ss && *to_ss && *to_ss != '0') v = 0; // FREE_TO_SS implies slow path
s_free_fast_en = v;
}
if (!s_free_fast_en) {
// Delegate to precise slow path (handles same/remote + publish)
hak_tiny_free(ptr);
return;
}
// 1. SuperSlab-backed tiny pointer?
if (__builtin_expect(g_use_superslab != 0, 1)) {
SuperSlab* ss = hak_super_lookup(ptr);
if (__builtin_expect(ss != NULL && ss->magic == SUPERSLAB_MAGIC, 0)) {
// ✅ FIX: Phase E1-CORRECT - Convert USER → BASE before slab index calculation
void* base = (void*)((uint8_t*)ptr - 1);
int slab_idx = slab_index_for(ss, base);
uint32_t self_tid = tiny_self_u32();
// Box 6 Boundary: Try same-thread fast path
// CRITICAL: Pass BASE pointer (already converted above)
if (tiny_free_fast_ss(ss, slab_idx, base, self_tid)) {
return; // Success: same-thread, pushed to TLS
}
// Cross-thread free → Box 2 (Remote Queue)
// Delegate to full tiny free (handles remote push)
hak_tiny_free(ptr);
return;
}
}
// 2. Legacy TinySlab-backed pointer?
TinySlab* slab = hak_tiny_owner_slab(ptr);
if (__builtin_expect(slab != NULL, 0)) {
// Convert USER → BASE (for Legacy path)
void* base_legacy = (void*)((uint8_t*)ptr - 1);
// Box 6 Boundary: Try same-thread fast path
// CRITICAL: Pass BASE pointer (already converted above)
if (tiny_free_fast_legacy(slab, base_legacy)) {
return; // Success: same-thread, pushed to TLS
}
// Cross-thread free → precise path with known slab
hak_tiny_free_with_slab(ptr, slab);
return;
}
// 3. Not a tiny allocation → Delegate to backend (Mid/Large/Mmap)
hak_free_at(ptr, 0, 0);
}
// ========== Guard/Debug Variants ==========
// Free with additional safety checks (for debugging/testing)
// This variant includes:
// - Sentinel checks (0xBADA55)
// - Double-free detection
// - Ownership validation
//
// Usage: Enable with HAKMEM_SAFE_FREE=1 environment variable
static inline void tiny_free_fast_guarded(void* ptr) {
// TODO: Implement guard checks if needed
// For now, delegate to standard fast path
tiny_free_fast(ptr);
}
// ========== Statistics & Diagnostics ==========
// Free fast path stats (for profiling)
typedef struct {
uint64_t same_thread_count; // Same-thread frees (TLS push)
uint64_t cross_thread_count; // Cross-thread frees (remote queue)
uint64_t non_tiny_count; // Non-tiny frees (backend)
} TinyFreeFastStats;
// Get free fast path stats (TLS-local)
static __thread TinyFreeFastStats g_tiny_free_fast_stats = {0};
static inline TinyFreeFastStats tiny_free_fast_stats_get(void) {
return g_tiny_free_fast_stats;
}
// Reset free fast path stats (for testing/benchmarking)
static inline void tiny_free_fast_stats_reset(void) {
g_tiny_free_fast_stats.same_thread_count = 0;
g_tiny_free_fast_stats.cross_thread_count = 0;
g_tiny_free_fast_stats.non_tiny_count = 0;
}
// ========== Performance Notes ==========
//
// Expected metrics:
// - Same-thread hit rate: 80-90% (workload dependent)
// - Same-thread latency: 2-3 instructions (ownership check + push)
// - Cross-thread penalty: ~50-100 instructions (remote queue push)
// - Throughput improvement: +10-20% vs current multi-layer design
//
// Key optimizations:
// 1. Ownership check first (fail-fast on cross-thread)
// 2. `__builtin_expect` for branch prediction (same-thread is common)
// 3. `static inline` for zero-cost abstraction
// 4. TLS variables (no atomic ops in same-thread path)
//
// TOCTOU Race Prevention (Box 4 Boundary):
// - Ownership check is atomic (tiny_atomic_load_u32_relaxed)
// - No time window between check and push (single function)
// - Cross-thread frees are immediately delegated (no TLS touch)
//
// Comparison with current design:
// - Current: 38.43% free overhead (free.part.0 + mid_lookup + locks)
// - New: 2-3 instructions (ownership check + TLS push)
// - Reduction: -90% instructions in same-thread path
//
// Inspired by:
// - System tcache (glibc malloc) - fast same-thread free
// - Box Theory - Clear ownership boundaries
// - TOCTOU fix (Box 4) - Atomic ownership check