Documentation: - Created docs/DEFENSIVE_LAYERS_MAPPING.md documenting all 5 defensive layers - Maps which symptoms each layer suppresses - Defines safe removal order after root cause fix - Includes test methods for each layer removal Diagnostic Logging Enhancements (ChatGPT work): - TLS_SLL_HEAD_SET log with count and backtrace for NORMALIZE_USERPTR - tiny_next_store_log with filtering capability - Environment variables for log filtering: - HAKMEM_TINY_SLL_NEXTCLS: class filter for next store (-1 disables) - HAKMEM_TINY_SLL_NEXTTAG: tag filter (substring match) - HAKMEM_TINY_SLL_HEADCLS: class filter for head trace Current Investigation Status: - sh8bench 60/120s: crash-free, zero NEXT_INVALID/HDR_RESET/SANITIZE - BUT: shot limit (256) exhausted by class3 tls_push before class1/drain - Need: Add tags to pop/clear paths, or increase shot limit for class1 Purpose of this commit: - Document defensive layers for safe removal later - Enable targeted diagnostic logging - Prepare for final root cause identification Next Steps: 1. Add tags to tls_sll_pop tiny_next_write (e.g., "tls_pop_clear") 2. Re-run with HAKMEM_TINY_SLL_NEXTTAG=tls_pop 3. Capture class1 writes that lead to corruption 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
328 lines · 13 KiB · C
// tiny_free_fast.inc.h - Box 6: Free Fast Path (same-thread free only)
|
||
// Purpose: Ultra-fast TLS freelist push (ownership check + 2-3 instructions)
|
||
// Invariant: owner_tid == my_tid → push to TLS, else → delegate to remote path
|
||
// Design: Clear boundary between same-thread (fast) and cross-thread (remote)
|
||
#pragma once
|
||
#include "tiny_atomic.h"
|
||
#include "hakmem_tiny.h"
|
||
#include "hakmem_tiny_superslab.h"
|
||
#include "slab_handle.h"
|
||
#include "tiny_alloc_fast_sfc.inc.h" // For sfc_free_push
|
||
|
||
// ========== Debug Counters (compile-time gated) ==========
|
||
#if HAKMEM_DEBUG_COUNTERS
|
||
// Free pipeline counters (defined in hakmem_tiny.c)
|
||
extern unsigned long long g_free_via_ss_local[];
|
||
extern unsigned long long g_free_via_ss_remote[];
|
||
#endif
|
||
|
||
// ========== Box 6: Free Fast Path ==========
|
||
// 箱理論の Fast Free 層。Same-thread free のみ処理(2-3命令 + ownership check)。
|
||
// 不変条件:
|
||
// - owner_tid == my_tid → TLS freelist に push (no lock, no sync)
|
||
// - owner_tid != my_tid → Box 2 (Remote Queue) に委譲
|
||
// - Cross-thread free は絶対に TLS freelist に入れない(A213 エラー防止)
|
||
|
||
// External functions (Backend)
|
||
extern void hak_tiny_free(void* ptr);
|
||
extern void hak_tiny_free_with_slab(void* ptr, TinySlab* slab);
|
||
// hak_free_at signature: (void* ptr, size_t hint_sz, hak_callsite_t site)
|
||
// where hak_callsite_t is const void*
|
||
extern void hak_free_at(void* ptr, size_t hint_sz, const void* site);
|
||
// Note: hak_super_lookup() is defined in hakmem_super_registry.h (included transitively)
|
||
extern TinySlab* hak_tiny_owner_slab(void* ptr);
|
||
extern int g_use_superslab;
|
||
|
||
// External helpers
|
||
extern uint32_t tiny_self_u32(void);
|
||
extern pthread_t tiny_self_pt(void);
|
||
|
||
// External TLS variables (from Box 5)
|
||
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL structure
|
||
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
|
||
extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
|
||
|
||
// Box 5 helper (TLS push)
|
||
extern void tiny_alloc_fast_push(int class_idx, void* ptr);
|
||
|
||
// ========== Ownership Check ==========
|
||
|
||
// Check if ptr belongs to current thread (SuperSlab path)
|
||
// Returns: 1 if same-thread, 0 if cross-thread
|
||
//
|
||
// Box Boundary: This is the critical check that prevents TOCTOU races
|
||
// - owner_tid == my_tid → Safe to push to TLS freelist
|
||
// - owner_tid != my_tid → MUST delegate to remote path
|
||
//
|
||
// Invariant: This check MUST be atomic (no TOCTOU between check and push)
|
||
// Check whether the slab at ss->slabs[slab_idx] is owned by the calling thread.
// Returns: 1 if same-thread, 0 if cross-thread (or unowned).
//
// Box Boundary: this is the critical gate that prevents TOCTOU races —
// only an owner match permits a lock-free push to the TLS freelist;
// any mismatch MUST be delegated to the remote path.
//
// Invariant: the ownership load is a single atomic read (no window between
// check and push — the caller performs both in one inline function).
static inline int tiny_free_is_same_thread_ss(SuperSlab* ss, int slab_idx, uint32_t my_tid) {
    TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
    // Compare the recorded owner byte against bits 8..15 of our tid.
    // NOTE(review): the ">> 8" must mirror how owner_tid_low was stored;
    // the "_low" naming hints at byte 0 — confirm against the store side.
    uint8_t recorded = tiny_atomic_load_u8_relaxed(&slab_meta->owner_tid_low);
    if (recorded == 0) {
        return 0; // 0 is the "unowned" sentinel, never a valid owner.
    }
    return recorded == (uint8_t)((my_tid >> 8) & 0xFFu);
}
|
||
|
||
// Check if ptr belongs to current thread (Legacy TinySlab path)
|
||
// Returns: 1 if same-thread, 0 if cross-thread
|
||
// Check whether a legacy TinySlab is owned by the calling thread.
// Returns: nonzero if same-thread (pthread_equal result), 0 if cross-thread.
static inline int tiny_free_is_same_thread_legacy(TinySlab* slab) {
    // pthread_t must be compared with pthread_equal(), never ==.
    return pthread_equal(slab->owner_tid, tiny_self_pt());
}
|
||
|
||
// ========== Fast Path: Same-Thread Free (2-3 instructions) ==========
|
||
|
||
// Free fast path for SuperSlab-backed allocation
|
||
// Returns: 1 on success (same-thread, pushed to TLS), 0 on failure (cross-thread)
|
||
//
|
||
// Assembly (x86-64, optimized):
|
||
// movzx eax, BYTE PTR [meta->owner_tid_low] ; Load owner byte (check loads u8, not a DWORD)
|
||
// cmp eax, my_tid ; Compare with my_tid
|
||
// jne .cross_thread ; If not equal, cross-thread
|
||
// mov rax, QWORD PTR g_tls_sll_head[cls] ; Load head
|
||
// mov QWORD PTR [ptr], rax ; ptr->next = head
|
||
// mov QWORD PTR g_tls_sll_head[cls], ptr ; head = ptr
|
||
// ret ; Done
|
||
// .cross_thread:
|
||
// ; Delegate to remote path
|
||
//
|
||
// Expected: 2-3 instructions on same-thread path (1 cmp, 1 load, 1 store)
|
||
//
|
||
// ⚠️ CRITICAL: ptr parameter must be BASE pointer (already converted from USER via ptr-1)
|
||
// Free fast path for a SuperSlab-backed allocation.
// Returns: 1 on success (same-thread, pushed to TLS cache),
//          0 on failure (invalid index, cross-thread, or SFC full) —
//          the caller must then take the slow/remote path.
//
// ⚠️ CRITICAL: `base` must be the BASE pointer (caller already did USER→BASE).
static inline int tiny_free_fast_ss(SuperSlab* ss, int slab_idx, void* base, uint32_t my_tid) {
    // Reject out-of-range indices up front (prevents the ss->slabs[-1] overflow).
    if (__builtin_expect(slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss), 0)) {
        return 0;
    }
    TinySlabMeta* slab_meta = &ss->slabs[slab_idx];

#if !HAKMEM_BUILD_RELEASE
    // Debug tracing: dump the first 20 calls per thread when g_sfc_debug is set.
    extern int g_sfc_debug;
    extern int g_sfc_enabled;
    static __thread int free_ss_debug_count = 0;
    if (__builtin_expect(g_sfc_debug, 0) && free_ss_debug_count < 20) {
        free_ss_debug_count++;
        int is_same = tiny_free_is_same_thread_ss(ss, slab_idx, my_tid);
        fprintf(stderr, "[FREE_SS] base=%p, cls=%u, same_thread=%d, sfc_enabled=%d\n",
                base, slab_meta->class_idx, is_same, g_sfc_enabled);
    }
#endif

    // Box 6 Boundary: ownership gate (single atomic load inside the helper,
    // so there is no TOCTOU window between check and push).
    if (__builtin_expect(!tiny_free_is_same_thread_ss(ss, slab_idx, my_tid), 0)) {
#if HAKMEM_DEBUG_COUNTERS
        g_free_via_ss_remote[slab_meta->class_idx]++; // cross-thread free
#endif
        return 0; // Caller delegates to the remote path.
    }

    // Same-thread fast path.
    int cls = slab_meta->class_idx;
#if HAKMEM_DEBUG_COUNTERS
    g_free_via_ss_local[cls]++; // same-thread free
#endif

    // Box 5 integration: SFC (128-256 slots) when enabled, legacy SLL otherwise.
    extern int g_sfc_enabled;
    if (g_sfc_enabled) {
        if (!sfc_free_push(cls, base)) {
            // SFC full → skip caching and report failure so the caller uses
            // the slow path. Do NOT fall back to the SLL: it has no capacity
            // check and would grow unbounded.
            return 0;
        }
        return 1;
    }
    tiny_alloc_fast_push(cls, base); // Box 5-OLD: SLL (16 slots)
    return 1;
}
|
||
|
||
// Free fast path for Legacy TinySlab-backed allocation
|
||
// Returns: 1 on success (same-thread), 0 on failure (cross-thread)
|
||
//
|
||
// ⚠️ CRITICAL: ptr parameter must be BASE pointer (already converted from USER via ptr-1)
|
||
// Free fast path for a legacy TinySlab-backed allocation.
// Returns: 1 on success (same-thread, pushed to TLS cache),
//          0 on failure (cross-thread or SFC full) — caller takes the slow path.
//
// ⚠️ CRITICAL: `base` must be the BASE pointer (caller already did USER→BASE).
static inline int tiny_free_fast_legacy(TinySlab* slab, void* base) {
    // Box 6 Boundary: only the owning thread may touch its TLS freelist.
    if (__builtin_expect(!tiny_free_is_same_thread_legacy(slab), 0)) {
        return 0; // Cross-thread → caller delegates to the precise path.
    }

    // Same-thread free: push to the TLS cache (SFC or SLL).
    int cls = slab->class_idx;
    extern int g_sfc_enabled;
    if (!g_sfc_enabled) {
        tiny_alloc_fast_push(cls, base); // Box 5-OLD: SLL (16 slots)
        return 1;
    }
    // Box 5-NEW: SFC (128 slots). On a full cache we report failure rather
    // than falling back to the SLL — the SLL has no capacity check and
    // would grow unbounded.
    return sfc_free_push(cls, base) ? 1 : 0;
}
|
||
|
||
// ========== Combined Fast Free (Lookup + Ownership + Push) ==========
|
||
|
||
// Complete fast free path (inline for zero-cost)
|
||
// Returns: none (delegates to backend on cross-thread or non-tiny)
|
||
//
|
||
// Flow:
|
||
// 1. Lookup ptr → SuperSlab or TinySlab
|
||
// 2. Ownership check (owner_tid == my_tid)
|
||
// 3. Same-thread → TLS freelist push (2-3 instructions)
|
||
// 4. Cross-thread → Delegate to Box 2 (Remote Queue)
|
||
// 5. Not Tiny → Delegate to backend (Mid/Large)
|
||
//
|
||
// Example usage:
|
||
// tiny_free_fast(ptr); // Always succeeds (delegates on failure)
|
||
static inline void tiny_free_fast(void* ptr) {
|
||
// Optional runtime gate to disable fast free and route to slow path
|
||
// Env: HAKMEM_TINY_FREE_FAST (default: 1). Additionally, if
|
||
// HAKMEM_TINY_FREE_TO_SS=1 is set, prefer SS path by disabling fast free.
|
||
static int s_free_fast_en = -1;
|
||
if (__builtin_expect(s_free_fast_en == -1, 0)) {
|
||
const char* e = getenv("HAKMEM_TINY_FREE_FAST");
|
||
int v = (e && *e && *e != '0') ? 1 : 1; // default ON
|
||
const char* to_ss = getenv("HAKMEM_TINY_FREE_TO_SS");
|
||
if (to_ss && *to_ss && *to_ss != '0') v = 0; // FREE_TO_SS implies slow path
|
||
s_free_fast_en = v;
|
||
}
|
||
if (!s_free_fast_en) {
|
||
// Delegate to precise slow path (handles same/remote + publish)
|
||
hak_tiny_free(ptr);
|
||
return;
|
||
}
|
||
// 1. SuperSlab-backed tiny pointer?
|
||
if (__builtin_expect(g_use_superslab != 0, 1)) {
|
||
// NOTE: Use safe hak_super_lookup here (not ss_fast_lookup) because
|
||
// tiny_free_fast() can receive arbitrary pointers without prior validation.
|
||
// ss_fast_lookup masks to 1MB boundary and reads magic - would crash on unmapped memory.
|
||
SuperSlab* ss = hak_super_lookup(ptr);
|
||
if (__builtin_expect(ss != NULL && ss->magic == SUPERSLAB_MAGIC, 0)) {
|
||
// ROOT CAUSE FIX: slab_index_for() can handle USER pointers directly
|
||
// Avoid per-class offset conversion errors (C0/C7 have offset=0, C1-6 have offset=1)
|
||
int slab_idx = slab_index_for(ss, ptr);
|
||
|
||
// Convert USER → BASE for tiny_free_fast_ss (needed for next pointer operations)
|
||
int class_idx = tiny_get_class_from_ss(ss, slab_idx);
|
||
void* base = (void*)((uint8_t*)ptr - tiny_user_offset(class_idx));
|
||
uint32_t self_tid = tiny_self_u32();
|
||
|
||
// Box 6 Boundary: Try same-thread fast path
|
||
// CRITICAL: Pass BASE pointer (already converted above)
|
||
if (tiny_free_fast_ss(ss, slab_idx, base, self_tid)) {
|
||
return; // Success: same-thread, pushed to TLS
|
||
}
|
||
|
||
// Cross-thread free → Box 2 (Remote Queue)
|
||
// Delegate to full tiny free (handles remote push)
|
||
hak_tiny_free(ptr);
|
||
return;
|
||
}
|
||
}
|
||
|
||
// 2. Legacy TinySlab-backed pointer?
|
||
TinySlab* slab = hak_tiny_owner_slab(ptr);
|
||
if (__builtin_expect(slab != NULL, 0)) {
|
||
// Convert USER → BASE (for Legacy path)
|
||
void* base_legacy = (void*)((uint8_t*)ptr - 1);
|
||
|
||
// Box 6 Boundary: Try same-thread fast path
|
||
// CRITICAL: Pass BASE pointer (already converted above)
|
||
if (tiny_free_fast_legacy(slab, base_legacy)) {
|
||
return; // Success: same-thread, pushed to TLS
|
||
}
|
||
|
||
// Cross-thread free → precise path with known slab
|
||
hak_tiny_free_with_slab(ptr, slab);
|
||
return;
|
||
}
|
||
|
||
// 3. Not a tiny allocation → Delegate to backend (Mid/Large/Mmap)
|
||
hak_free_at(ptr, 0, 0);
|
||
}
|
||
|
||
// ========== Guard/Debug Variants ==========
|
||
|
||
// Free with additional safety checks (for debugging/testing)
|
||
// This variant includes:
|
||
// - Sentinel checks (0xBADA55)
|
||
// - Double-free detection
|
||
// - Ownership validation
|
||
//
|
||
// Usage: Enable with HAKMEM_SAFE_FREE=1 environment variable
|
||
// Free with additional safety checks (for debugging/testing).
// Planned checks: sentinel validation (0xBADA55), double-free detection,
// ownership validation. Intended to be enabled via HAKMEM_SAFE_FREE=1.
static inline void tiny_free_fast_guarded(void* ptr) {
    // Guard checks are not implemented yet — currently identical to the
    // standard fast path.
    tiny_free_fast(ptr);
}
|
||
|
||
// ========== Statistics & Diagnostics ==========
|
||
|
||
// Free fast path stats (for profiling)
|
||
// Free fast path stats (for profiling). One instance lives in TLS
// (g_tiny_free_fast_stats), so counts are per-thread.
// NOTE(review): nothing in this file increments these fields yet — the
// counters are plumbing for future instrumentation.
typedef struct {
    uint64_t same_thread_count;  // Same-thread frees (TLS push)
    uint64_t cross_thread_count; // Cross-thread frees (remote queue)
    uint64_t non_tiny_count;     // Non-tiny frees (backend)
} TinyFreeFastStats;
|
||
|
||
// Get free fast path stats (TLS-local)
|
||
static __thread TinyFreeFastStats g_tiny_free_fast_stats = {0};
|
||
|
||
static inline TinyFreeFastStats tiny_free_fast_stats_get(void) {
|
||
return g_tiny_free_fast_stats;
|
||
}
|
||
|
||
// Reset free fast path stats (for testing/benchmarking)
|
||
static inline void tiny_free_fast_stats_reset(void) {
|
||
g_tiny_free_fast_stats.same_thread_count = 0;
|
||
g_tiny_free_fast_stats.cross_thread_count = 0;
|
||
g_tiny_free_fast_stats.non_tiny_count = 0;
|
||
}
|
||
|
||
// ========== Performance Notes ==========
|
||
//
|
||
// Expected metrics:
|
||
// - Same-thread hit rate: 80-90% (workload dependent)
|
||
// - Same-thread latency: 2-3 instructions (ownership check + push)
|
||
// - Cross-thread penalty: ~50-100 instructions (remote queue push)
|
||
// - Throughput improvement: +10-20% vs current multi-layer design
|
||
//
|
||
// Key optimizations:
|
||
// 1. Ownership check first (fail-fast on cross-thread)
|
||
// 2. `__builtin_expect` for branch prediction (same-thread is common)
|
||
// 3. `static inline` for zero-cost abstraction
|
||
// 4. TLS variables (no atomic ops in same-thread path)
|
||
//
|
||
// TOCTOU Race Prevention (Box 4 Boundary):
|
||
// - Ownership check is atomic (tiny_atomic_load_u8_relaxed on owner_tid_low)
|
||
// - No time window between check and push (single function)
|
||
// - Cross-thread frees are immediately delegated (no TLS touch)
|
||
//
|
||
// Comparison with current design:
|
||
// - Current: 38.43% free overhead (free.part.0 + mid_lookup + locks)
|
||
// - New: 2-3 instructions (ownership check + TLS push)
|
||
// - Reduction: -90% instructions in same-thread path
|
||
//
|
||
// Inspired by:
|
||
// - System tcache (glibc malloc) - fast same-thread free
|
||
// - Box Theory - Clear ownership boundaries
|
||
// - TOCTOU fix (Box 4) - Atomic ownership check
|