// tiny_free_fast.inc.h - Box 6: Free Fast Path (same-thread free only)
// Purpose: Ultra-fast TLS freelist push (ownership check + 2-3 instructions)
// Invariant: owner_tid == my_tid → push to TLS, else → delegate to remote path
// Design: Clear boundary between same-thread (fast) and cross-thread (remote)

#pragma once

#include <stdint.h>   // uint32_t, uint64_t, uint8_t
#include <stdio.h>    // fprintf (debug tracing)
#include <stdlib.h>   // getenv (runtime gates)
#include <pthread.h>  // pthread_t, pthread_equal

#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include "slab_handle.h"
#include "tiny_alloc_fast_sfc.inc.h"  // For sfc_free_push

// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Free pipeline counters (defined in hakmem_tiny.c)
extern unsigned long long g_free_via_ss_local[];
extern unsigned long long g_free_via_ss_remote[];
#endif
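
// Hedged example (function name is illustrative, not part of the existing
// API): dump the per-class local/remote free counters declared above. Only
// meaningful in builds with HAKMEM_DEBUG_COUNTERS=1; assumes the arrays
// defined in hakmem_tiny.c hold TINY_NUM_CLASSES entries, as the counter
// updates later in this file already assume.
#if HAKMEM_DEBUG_COUNTERS
static inline void tiny_free_fast_dump_counters(void) {
    for (int c = 0; c < TINY_NUM_CLASSES; c++) {
        unsigned long long l = g_free_via_ss_local[c];
        unsigned long long r = g_free_via_ss_remote[c];
        if (l + r)
            fprintf(stderr, "cls %d: local=%llu remote=%llu (%.1f%% local)\n",
                    c, l, r, 100.0 * (double)l / (double)(l + r));
    }
}
#endif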

// ========== Box 6: Free Fast Path ==========
// Fast Free layer of the Box Theory design. Handles same-thread frees only
// (2-3 instructions + ownership check).
// Invariants:
// - owner_tid == my_tid → push to TLS freelist (no lock, no sync)
// - owner_tid != my_tid → delegate to Box 2 (Remote Queue)
// - Cross-thread frees must never enter the TLS freelist (prevents A213-class errors)

// External functions (Backend)
extern void hak_tiny_free(void* ptr);
extern void hak_tiny_free_with_slab(void* ptr, TinySlab* slab);
// hak_free_at signature: (void* ptr, size_t hint_sz, hak_callsite_t site)
// where hak_callsite_t is const void*
extern void hak_free_at(void* ptr, size_t hint_sz, const void* site);
extern SuperSlab* hak_super_lookup(void* ptr);
extern TinySlab* hak_tiny_owner_slab(void* ptr);
extern int g_use_superslab;

// External helpers
extern uint32_t tiny_self_u32(void);
extern pthread_t tiny_self_pt(void);

// External TLS variables (from Box 5)
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];

// Hot-class toggle: class5 (256B) dedicated TLS fast path
extern int g_tiny_hotpath_class5;
extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];

// Box 5 helper (TLS push)
extern void tiny_alloc_fast_push(int class_idx, void* ptr);

// ========== Ownership Check ==========

// Check if ptr belongs to current thread (SuperSlab path)
// Returns: 1 if same-thread, 0 if cross-thread
//
// Box Boundary: This is the critical check that prevents TOCTOU races
// - owner_tid == my_tid → Safe to push to TLS freelist
// - owner_tid != my_tid → MUST delegate to remote path
//
// Invariant: This check MUST be atomic (no TOCTOU between check and push)
static inline int tiny_free_is_same_thread_ss(SuperSlab* ss, int slab_idx, uint32_t my_tid) {
    TinySlabMeta* meta = &ss->slabs[slab_idx];
    // Box 3 (Ownership): Load owner_tid atomically
    uint32_t owner = tiny_atomic_load_u32_relaxed(&meta->owner_tid);
    // Same thread check
    return (owner == my_tid);
}
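
// Illustrative caller pattern for this check (variable names hypothetical;
// the real caller is tiny_free_fast_ss below). The check and the push must
// stay inside one function so there is no caller-visible window between them:
//
//   uint32_t tid = tiny_self_u32();
//   if (tiny_free_is_same_thread_ss(ss, slab_idx, tid)) {
//       tiny_alloc_fast_push(class_idx, base);  // same-thread: TLS push is safe
//   } else {
//       hak_tiny_free(user_ptr);                // cross-thread: delegate, never touch TLS
//   }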

// Check if ptr belongs to current thread (Legacy TinySlab path)
// Returns: 1 if same-thread, 0 if cross-thread
static inline int tiny_free_is_same_thread_legacy(TinySlab* slab) {
    pthread_t my_tid = tiny_self_pt();
    return pthread_equal(slab->owner_tid, my_tid);
}

// ========== Fast Path: Same-Thread Free (2-3 instructions) ==========

// Free fast path for SuperSlab-backed allocation
// Returns: 1 on success (same-thread, pushed to TLS), 0 on failure (cross-thread)
//
// Assembly (x86-64, optimized):
//   mov eax, DWORD PTR [meta->owner_tid]    ; Load owner_tid
//   cmp eax, my_tid                         ; Compare with my_tid
//   jne .cross_thread                       ; If not equal, cross-thread
//   mov rax, QWORD PTR g_tls_sll_head[cls]  ; Load head
//   mov QWORD PTR [ptr], rax                ; ptr->next = head
//   mov QWORD PTR g_tls_sll_head[cls], ptr  ; head = ptr
//   ret                                     ; Done
// .cross_thread:
//   ; Delegate to remote path
//
// Expected: 2-3 instructions on same-thread path (1 cmp, 1 load, 1 store)
//
// ⚠️ CRITICAL: base must be the BASE pointer (caller already converted USER → BASE via ptr-1)
static inline int tiny_free_fast_ss(SuperSlab* ss, int slab_idx, void* base, uint32_t my_tid) {
    // BUGFIX: Validate slab_idx before array access (prevents buffer overflow at ss->slabs[-1])
    int cap = ss_slabs_capacity(ss);
    if (__builtin_expect(slab_idx < 0 || slab_idx >= cap, 0)) {
        return 0;  // Invalid index, reject
    }

    // Debug: Track tiny_free_fast_ss calls
    static __thread int free_ss_debug_count = 0;
    if (getenv("HAKMEM_SFC_DEBUG") && free_ss_debug_count < 20) {
        free_ss_debug_count++;
        int is_same = tiny_free_is_same_thread_ss(ss, slab_idx, my_tid);
        extern int g_sfc_enabled;
        fprintf(stderr, "[FREE_SS] base=%p, cls=%d, same_thread=%d, sfc_enabled=%d\n",
                base, ss->size_class, is_same, g_sfc_enabled);
    }

    // Box 6 Boundary: Ownership check (TOCTOU-safe)
    if (__builtin_expect(!tiny_free_is_same_thread_ss(ss, slab_idx, my_tid), 0)) {
#if HAKMEM_DEBUG_COUNTERS
        // Track cross-thread frees (compile-time gated)
        g_free_via_ss_remote[ss->size_class]++;
#endif
        return 0;  // Cross-thread → caller should delegate to remote path
    }

    // Fast path: Same-thread free (2-3 instructions)
    int class_idx = ss->size_class;
    // Phase E1-CORRECT: base pointer already converted by caller (no double conversion!)

#if HAKMEM_DEBUG_COUNTERS
    // Track same-thread frees (compile-time gated)
    g_free_via_ss_local[class_idx]++;
#endif

    // Box 5 integration: class5 can use dedicated TLS List hotpath
    extern int g_sfc_enabled;
    if (__builtin_expect(g_tiny_hotpath_class5 && class_idx == 5, 0)) {
        TinyTLSList* tls5 = &g_tls_lists[5];
        tls_list_push_fast(tls5, base, 5);
    } else if (g_sfc_enabled) {
        // Box 5-NEW: Try SFC (128-256 slots)
        if (!sfc_free_push(class_idx, base)) {
            // SFC full → skip caching, use slow path (return 0)
            // Do NOT fall back to SLL - it has no capacity check and would grow unbounded!
            return 0;
        }
    } else {
        // Box 5-OLD: Use SLL (16 slots)
        tiny_alloc_fast_push(class_idx, base);
    }

    // Active accounting (Box 3: SuperSlab)
    // This is relatively cheap (atomic decrement) and necessary for memory management
    ss_active_dec_one(ss);

    return 1;  // Success
}

// Free fast path for Legacy TinySlab-backed allocation
// Returns: 1 on success (same-thread), 0 on failure (cross-thread)
//
// ⚠️ CRITICAL: base must be the BASE pointer (caller already converted USER → BASE via ptr-1)
static inline int tiny_free_fast_legacy(TinySlab* slab, void* base) {
    // Box 6 Boundary: Ownership check
    if (__builtin_expect(!tiny_free_is_same_thread_legacy(slab), 0)) {
        return 0;  // Cross-thread → caller should delegate to precise path
    }

    // Fast path: Same-thread free
    int class_idx = slab->class_idx;
    // Phase E1-CORRECT: base pointer already converted by caller (no double conversion!)

    // Box 5-NEW/5-OLD integration: Push to TLS freelist (SFC or SLL)
    extern int g_sfc_enabled;
    if (g_sfc_enabled) {
        // Box 5-NEW: Try SFC (128 slots)
        if (!sfc_free_push(class_idx, base)) {
            // SFC full → skip caching, use slow path (return 0)
            // Do NOT fall back to SLL - it has no capacity check and would grow unbounded!
            return 0;
        }
    } else {
        // Box 5-OLD: Use SLL (16 slots)
        tiny_alloc_fast_push(class_idx, base);
    }

    return 1;  // Success
}
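
// Reference sketch of the USER↔BASE contract assumed by both fast paths above
// (Phase E1: every tiny class carries a 1-byte header, so BASE = USER - 1).
// The canonical conversions live in Box 3 (PTR_USER_TO_BASE / PTR_BASE_TO_USER
// in core/box/ptr_conversion_box.h); the helpers below are illustrative only,
// not the project API.
static inline void* tiny_user_to_base_sketch(void* user) {
    return user ? (void*)((uint8_t*)user - 1) : NULL;  // strip the 1-byte header
}
static inline void* tiny_base_to_user_sketch(void* base) {
    return base ? (void*)((uint8_t*)base + 1) : NULL;  // step past the header
}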

// ========== Combined Fast Free (Lookup + Ownership + Push) ==========

// Complete fast free path (inline for zero-cost)
// Returns: none (delegates to backend on cross-thread or non-tiny)
//
// Flow:
//   1. Lookup ptr → SuperSlab or TinySlab
//   2. Ownership check (owner_tid == my_tid)
//   3. Same-thread → TLS freelist push (2-3 instructions)
//   4. Cross-thread → Delegate to Box 2 (Remote Queue)
//   5. Not Tiny → Delegate to backend (Mid/Large)
//
// Example usage:
//   tiny_free_fast(ptr);  // Always succeeds (delegates on failure)
static inline void tiny_free_fast(void* ptr) {
    // Optional runtime gate to disable fast free and route to slow path.
    // Env: HAKMEM_TINY_FREE_FAST (default: 1). Additionally, if
    // HAKMEM_TINY_FREE_TO_SS=1 is set, prefer the SS path by disabling fast free.
    static int s_free_fast_en = -1;
    if (__builtin_expect(s_free_fast_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_FREE_FAST");
        int v = (e && *e) ? (*e != '0') : 1;  // default ON; "0" disables
        const char* to_ss = getenv("HAKMEM_TINY_FREE_TO_SS");
        if (to_ss && *to_ss && *to_ss != '0') v = 0;  // FREE_TO_SS implies slow path
        s_free_fast_en = v;
    }
    if (!s_free_fast_en) {
        // Delegate to precise slow path (handles same/remote + publish)
        hak_tiny_free(ptr);
        return;
    }

    // 1. SuperSlab-backed tiny pointer?
    if (__builtin_expect(g_use_superslab != 0, 1)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (__builtin_expect(ss != NULL && ss->magic == SUPERSLAB_MAGIC, 0)) {
            // ✅ FIX: Phase E1-CORRECT - Convert USER → BASE before slab index calculation
            void* base = (void*)((uint8_t*)ptr - 1);
            int slab_idx = slab_index_for(ss, base);
            uint32_t self_tid = tiny_self_u32();

            // Box 6 Boundary: Try same-thread fast path
            // CRITICAL: Pass BASE pointer (already converted above)
            if (tiny_free_fast_ss(ss, slab_idx, base, self_tid)) {
                return;  // Success: same-thread, pushed to TLS
            }

            // Cross-thread free → Box 2 (Remote Queue)
            // Delegate to full tiny free (handles remote push)
            hak_tiny_free(ptr);
            return;
        }
    }

    // 2. Legacy TinySlab-backed pointer?
    TinySlab* slab = hak_tiny_owner_slab(ptr);
    if (__builtin_expect(slab != NULL, 0)) {
        // Convert USER → BASE (for Legacy path)
        void* base_legacy = (void*)((uint8_t*)ptr - 1);

        // Box 6 Boundary: Try same-thread fast path
        // CRITICAL: Pass BASE pointer (already converted above)
        if (tiny_free_fast_legacy(slab, base_legacy)) {
            return;  // Success: same-thread, pushed to TLS
        }

        // Cross-thread free → precise path with known slab
        hak_tiny_free_with_slab(ptr, slab);
        return;
    }

    // 3. Not a tiny allocation → Delegate to backend (Mid/Large/Mmap)
    hak_free_at(ptr, 0, NULL);
}
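
// Usage sketch (hak_free_tiny_front is hypothetical; tiny_free_fast itself
// never fails visibly because every miss is delegated internally):
//
//   void hak_free_tiny_front(void* user_ptr) {
//       if (!user_ptr) return;     // free(NULL) is a no-op
//       tiny_free_fast(user_ptr);  // same-thread → TLS push; else delegated
//   }
//
// Runtime gates (read once per process in the static init above):
//   HAKMEM_TINY_FREE_FAST=0  → always route to the precise slow path
//   HAKMEM_TINY_FREE_TO_SS=1 → prefer the SS path (disables fast free)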

// ========== Guard/Debug Variants ==========

// Free with additional safety checks (for debugging/testing)
// This variant includes:
// - Sentinel checks (0xBADA55)
// - Double-free detection
// - Ownership validation
//
// Usage: Enable with HAKMEM_SAFE_FREE=1 environment variable
static inline void tiny_free_fast_guarded(void* ptr) {
    // TODO: Implement guard checks if needed
    // For now, delegate to standard fast path
    tiny_free_fast(ptr);
}

// ========== Statistics & Diagnostics ==========

// Free fast path stats (for profiling)
typedef struct {
    uint64_t same_thread_count;   // Same-thread frees (TLS push)
    uint64_t cross_thread_count;  // Cross-thread frees (remote queue)
    uint64_t non_tiny_count;      // Non-tiny frees (backend)
} TinyFreeFastStats;

// Free fast path stats storage (TLS-local)
static __thread TinyFreeFastStats g_tiny_free_fast_stats = {0};

// Get free fast path stats (TLS-local snapshot)
static inline TinyFreeFastStats tiny_free_fast_stats_get(void) {
    return g_tiny_free_fast_stats;
}

// Reset free fast path stats (for testing/benchmarking)
static inline void tiny_free_fast_stats_reset(void) {
    g_tiny_free_fast_stats.same_thread_count = 0;
    g_tiny_free_fast_stats.cross_thread_count = 0;
    g_tiny_free_fast_stats.non_tiny_count = 0;
}
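
// Hedged example: sample the TLS-local stats around a workload to estimate
// the same-thread hit rate on the calling thread (run_workload is
// hypothetical). Note: nothing in this file increments these counters yet;
// call sites are expected to bump them where the three outcomes are decided.
//
//   tiny_free_fast_stats_reset();
//   run_workload();
//   TinyFreeFastStats s = tiny_free_fast_stats_get();
//   uint64_t total = s.same_thread_count + s.cross_thread_count;
//   double hit_rate = total ? (double)s.same_thread_count / (double)total : 0.0;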

// ========== Performance Notes ==========
//
// Expected metrics:
// - Same-thread hit rate: 80-90% (workload dependent)
// - Same-thread latency: 2-3 instructions (ownership check + push)
// - Cross-thread penalty: ~50-100 instructions (remote queue push)
// - Throughput improvement: +10-20% vs current multi-layer design
//
// Key optimizations:
// 1. Ownership check first (fail-fast on cross-thread)
// 2. `__builtin_expect` for branch prediction (same-thread is common)
// 3. `static inline` for zero-cost abstraction
// 4. TLS variables (no atomic ops in same-thread path)
//
// TOCTOU Race Prevention (Box 4 Boundary):
// - Ownership check is atomic (tiny_atomic_load_u32_relaxed)
// - No time window between check and push (single function)
// - Cross-thread frees are immediately delegated (no TLS touch)
//
// Comparison with current design:
// - Current: 38.43% free overhead (free.part.0 + mid_lookup + locks)
// - New: 2-3 instructions (ownership check + TLS push)
// - Reduction: -90% instructions in same-thread path
//
// Inspired by:
// - System tcache (glibc malloc) - fast same-thread free
// - Box Theory - Clear ownership boundaries
// - TOCTOU fix (Box 4) - Atomic ownership check