## Root Cause Analysis (GPT5)

**Physical Layout Constraints**:
- Class 0: 8B = [1B header][7B payload] → offset 1 = 9B needed = ❌ IMPOSSIBLE
- Class 1-6: >=16B = [1B header][15B+ payload] → offset 1 = ✅ POSSIBLE
- Class 7: 1KB → offset 0 (compatibility)

**Correct Specification**:
- HAKMEM_TINY_HEADER_CLASSIDX != 0:
  - Class 0, 7: next at offset 0 (overwrites header when on freelist)
  - Class 1-6: next at offset 1 (after header)
- HAKMEM_TINY_HEADER_CLASSIDX == 0:
  - All classes: next at offset 0

**Previous Bug**:
- Attempted "ALL classes offset 1" unification
- Class 0 with offset 1 caused immediate SEGV (9B > 8B block size)
- Mixed 2-arg/3-arg API caused confusion

## Fixes Applied

### 1. Restored 3-Argument Box API (core/box/tiny_next_ptr_box.h)

```c
// Correct signatures
void tiny_next_write(int class_idx, void* base, void* next_value)
void* tiny_next_read(int class_idx, const void* base)

// Correct offset calculation
size_t offset = (class_idx == 0 || class_idx == 7) ? 0 : 1;
```

### 2. Updated 123+ Call Sites Across 34 Files

- hakmem_tiny_hot_pop_v4.inc.h (4 locations)
- hakmem_tiny_fastcache.inc.h (3 locations)
- hakmem_tiny_tls_list.h (12 locations)
- superslab_inline.h (5 locations)
- tiny_fastcache.h (3 locations)
- ptr_trace.h (macro definitions)
- tls_sll_box.h (2 locations)
- + 27 additional files

Pattern: `tiny_next_read(base)` → `tiny_next_read(class_idx, base)`
Pattern: `tiny_next_write(base, next)` → `tiny_next_write(class_idx, base, next)`

### 3. Added Sentinel Detection Guards

- tiny_fast_push(): Block nodes with sentinel in ptr or ptr->next
- tls_list_push(): Block nodes with sentinel in ptr or ptr->next
- Defense-in-depth against remote free sentinel leakage

## Verification (GPT5 Report)

**Test Command**: `./out/release/bench_random_mixed_hakmem --iterations=70000`

**Results**:
- ✅ Main loop completed successfully
- ✅ Drain phase completed successfully
- ✅ NO SEGV (previous crash at iteration 66151 is FIXED)
- ℹ️ Final log: "tiny_alloc(1024) failed" is normal fallback to Mid/ACE layers

**Analysis**:
- Class 0 immediate SEGV: ✅ RESOLVED (correct offset 0 now used)
- 66K iteration crash: ✅ RESOLVED (offset consistency fixed)
- Box API conflicts: ✅ RESOLVED (unified 3-arg API)

## Technical Details

### Offset Logic Justification

```
Class 0: 8B block    → next pointer (8B) fits ONLY at offset 0
Class 1: 16B block   → next pointer (8B) fits at offset 1 (after 1B header)
Class 2: 32B block   → next pointer (8B) fits at offset 1
...
Class 6: 512B block  → next pointer (8B) fits at offset 1
Class 7: 1024B block → offset 0 for legacy compatibility
```

### Files Modified (Summary)

- Core API: `box/tiny_next_ptr_box.h`
- Hot paths: `hakmem_tiny_hot_pop*.inc.h`, `tiny_fastcache.h`
- TLS layers: `hakmem_tiny_tls_list.h`, `hakmem_tiny_tls_ops.h`
- SuperSlab: `superslab_inline.h`, `tiny_superslab_*.inc.h`
- Refill: `hakmem_tiny_refill.inc.h`, `tiny_refill_opt.h`
- Free paths: `tiny_free_magazine.inc.h`, `tiny_superslab_free.inc.h`
- Documentation: Multiple Phase E3 reports

## Remaining Work

None for Box API offset bugs - all structural issues resolved.

Future enhancements (non-critical):
- Periodic `grep -R '*(void**)' core/` to detect direct pointer access violations
- Enforce Box API usage via static analysis
- Document offset rationale in architecture docs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
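For reference, here is a minimal sketch of the 3-argument Box API and offset rule described in the report above. It is illustrative only: the `memcpy`-based access, the `tiny_next_offset` helper, and the `HAKMEM_TINY_HEADER_CLASSIDX` default are assumptions invented for this sketch, not the contents of `core/box/tiny_next_ptr_box.h`.

```c
// Illustrative sketch of the 3-argument Box API and the offset rule above.
// ASSUMPTIONS: the memcpy-based access, the tiny_next_offset() helper, and the
// HAKMEM_TINY_HEADER_CLASSIDX default are invented for illustration; the real
// core/box/tiny_next_ptr_box.h may differ.
#include <stddef.h>
#include <string.h>

#ifndef HAKMEM_TINY_HEADER_CLASSIDX
#define HAKMEM_TINY_HEADER_CLASSIDX 1   /* assumption: headers enabled by default */
#endif

// Offset of the freelist "next" pointer inside a free block.
static inline size_t tiny_next_offset(int class_idx) {
#if HAKMEM_TINY_HEADER_CLASSIDX != 0
    // Class 0 (8B) cannot hold a 1B header plus an 8B pointer, and class 7 (1KB)
    // keeps offset 0 for compatibility; classes 1-6 store "next" after the header.
    return (class_idx == 0 || class_idx == 7) ? 0 : 1;
#else
    (void)class_idx;
    return 0;   // No header: "next" always overlays the block start.
#endif
}

static inline void tiny_next_write(int class_idx, void* base, void* next_value) {
    // memcpy tolerates the misaligned offset-1 store that classes 1-6 require.
    memcpy((char*)base + tiny_next_offset(class_idx), &next_value, sizeof(void*));
}

static inline void* tiny_next_read(int class_idx, const void* base) {
    void* next;
    memcpy(&next, (const char*)base + tiny_next_offset(class_idx), sizeof(void*));
    return next;
}
```

A `memcpy`-style accessor also avoids the direct `*(void**)` pattern that the `grep` check under "Remaining Work" is meant to flag, since a raw pointer store at offset 1 would be misaligned for classes 1-6.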
// tiny_fastcache.h - Ultra-Simple Tiny Fast Path (System tcache style)
// Phase 6-3: Bypass Magazine/SuperSlab for Tiny allocations (<=128B)
// Goal: 3-4 instruction fast path, 70-80% of System tcache performance
#pragma once

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdlib.h>                 // For getenv()
#include "box/tiny_next_ptr_box.h"  // Box API: Next pointer read/write

// ========== Configuration ==========

// Enable Tiny Fast Path (default: ON for Phase 6-3)
#ifndef HAKMEM_TINY_FAST_PATH
#define HAKMEM_TINY_FAST_PATH 1
#endif

// Tiny class count (sizes: 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128)
#define TINY_FAST_CLASS_COUNT 16

// Fast cache capacity per class (default: 64 slots, like System tcache)
#ifndef TINY_FAST_CACHE_CAP
#define TINY_FAST_CACHE_CAP 64
#endif

// Tiny size threshold (<=128B goes to fast path)
#define TINY_FAST_THRESHOLD 128

// ========== TLS Cache (System tcache style) ==========

// Per-thread fast cache: array of freelist heads (defined in tiny_fastcache.c)
extern __thread void* g_tiny_fast_cache[TINY_FAST_CLASS_COUNT];

// Per-thread cache counts (for capacity management)
extern __thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];

// Initialized flag
extern __thread int g_tiny_fast_initialized;

// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
// Separate free staging area to reduce cache line bouncing

extern __thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
extern __thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];

// ========== RDTSC Profiling (Phase 6-8) ==========
// Extern declarations for inline functions to access profiling counters

extern __thread uint64_t g_tiny_malloc_count;
extern __thread uint64_t g_tiny_malloc_cycles;
extern __thread uint64_t g_tiny_free_count;
extern __thread uint64_t g_tiny_free_cycles;
extern __thread uint64_t g_tiny_refill_cycles;
extern __thread uint64_t g_tiny_migration_count;
extern __thread uint64_t g_tiny_migration_cycles;

#ifdef __x86_64__
static inline uint64_t tiny_fast_rdtsc(void) {
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}
#else
static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
#endif

extern int g_profile_enabled;

static inline int tiny_fast_profile_enabled(void) {
    if (__builtin_expect(g_profile_enabled == -1, 0)) {
        const char* env = getenv("HAKMEM_TINY_PROFILE");
        g_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
    }
    return g_profile_enabled;
}

// ========== Size to Class Mapping ==========
// Inline size-to-class for fast path (O(1) lookup table)

static inline int tiny_fast_size_to_class(size_t size) {
    // Optimized: Lookup table for O(1) mapping (vs 11-branch linear search)
    // Class mapping: 0:16B, 1:24B, 2:32B, 3:40B, 4:48B, 5:56B, 6:64B,
    //                7:80B, 8:96B, 9:112B, 10:128B
    //
    // The table is indexed by the request rounded UP to the next 8B boundary,
    // idx = (size + 7) >> 3, so every size in a bucket fits its class.
    // (Indexing by size >> 3 would under-allocate, e.g. size 20 -> 16B block.)
    static const int8_t size_to_class_lut[17] = {
        0,   // idx 0:  size 0        → 16B  (class 0)
        0,   // idx 1:  sizes 1-8     → 16B  (class 0)
        0,   // idx 2:  sizes 9-16    → 16B  (class 0)
        1,   // idx 3:  sizes 17-24   → 24B  (class 1)
        2,   // idx 4:  sizes 25-32   → 32B  (class 2)
        3,   // idx 5:  sizes 33-40   → 40B  (class 3)
        4,   // idx 6:  sizes 41-48   → 48B  (class 4)
        5,   // idx 7:  sizes 49-56   → 56B  (class 5)
        6,   // idx 8:  sizes 57-64   → 64B  (class 6)
        7,   // idx 9:  sizes 65-72   → 80B  (class 7)
        7,   // idx 10: sizes 73-80   → 80B  (class 7)
        8,   // idx 11: sizes 81-88   → 96B  (class 8)
        8,   // idx 12: sizes 89-96   → 96B  (class 8)
        9,   // idx 13: sizes 97-104  → 112B (class 9)
        9,   // idx 14: sizes 105-112 → 112B (class 9)
        10,  // idx 15: sizes 113-120 → 128B (class 10)
        10   // idx 16: sizes 121-128 → 128B (class 10)
    };

    if (__builtin_expect(size > TINY_FAST_THRESHOLD, 0)) return -1;  // Not tiny

    // Fast path: round up to 8B, then a single table load
    // Example: size 20 → idx (20+7)>>3 = 3 → class 1 (24B block)
    unsigned int idx = (unsigned int)((size + 7) >> 3);  // <= 16 since size <= 128
    return size_to_class_lut[idx];
}

// ========== Forward Declarations ==========
// Slow path: refill from Magazine/SuperSlab (implemented in tiny_fastcache.c)
void* tiny_fast_refill(int class_idx);
void tiny_fast_drain(int class_idx);

// ========== Fast Path: Alloc (3-4 instructions!) ==========

static inline void* tiny_fast_alloc(size_t size) {
    uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;

    // Step 1: Size to class (1-2 instructions, branch predictor friendly)
    int cls = tiny_fast_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) return NULL;  // Not tiny (rare)

    // Step 2: Pop from alloc_head (hot allocation path)
    void* ptr = g_tiny_fast_cache[cls];
    if (__builtin_expect(ptr != NULL, 1)) {
        // Fast path: Pop head, decrement count
        g_tiny_fast_cache[cls] = tiny_next_read(cls, ptr);
        g_tiny_fast_count[cls]--;

        if (start) {
            g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
            g_tiny_malloc_count++;
        }
        return ptr;
    }

    // ========================================================================
    // Phase 6-7: Step 2.5: Lazy Migration from free_head (Phase 2)
    // If alloc_head is empty but free_head has blocks, migrate with a pointer swap
    // This is mimalloc's key optimization: batched migration, zero overhead
    // ========================================================================
    if (__builtin_expect(g_tiny_fast_free_head[cls] != NULL, 0)) {
        uint64_t mig_start = start ? tiny_fast_rdtsc() : 0;

        // Migrate entire free_head → alloc_head (pointer swap, instant!)
        g_tiny_fast_cache[cls] = g_tiny_fast_free_head[cls];
        g_tiny_fast_count[cls] = g_tiny_fast_free_count[cls];
        g_tiny_fast_free_head[cls] = NULL;
        g_tiny_fast_free_count[cls] = 0;

        // Now pop one from the newly migrated list
        ptr = g_tiny_fast_cache[cls];
        g_tiny_fast_cache[cls] = tiny_next_read(cls, ptr);
        g_tiny_fast_count[cls]--;

        if (mig_start) {
            g_tiny_migration_cycles += (tiny_fast_rdtsc() - mig_start);
            g_tiny_migration_count++;
        }

        if (start) {
            g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
            g_tiny_malloc_count++;
        }
        return ptr;
    }

    // Step 3: Slow path - refill from Magazine/SuperSlab
    ptr = tiny_fast_refill(cls);

    if (start) {
        g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
        g_tiny_malloc_count++;
    }
    return ptr;
}

// ========== Fast Path: Free (2-3 instructions!) ==========

static inline void tiny_fast_free(void* ptr, size_t size) {
    uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;

    // Step 1: Size to class
    int cls = tiny_fast_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) return;  // Not tiny (error)

    // ========================================================================
    // Phase 6-7: Push to free_head (Phase 2)
    // Separate free staging area reduces cache line contention with alloc_head
    // mimalloc's key insight: alloc/free touch different cache lines
    // ========================================================================

    // Step 2: Check free_head capacity
    if (__builtin_expect(g_tiny_fast_free_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
        // Free cache full - drain to Magazine/SuperSlab
        tiny_fast_drain(cls);
    }

    // Step 3: Push to free_head (separate cache line from alloc_head!)
    tiny_next_write(cls, ptr, g_tiny_fast_free_head[cls]);
    g_tiny_fast_free_head[cls] = ptr;
    g_tiny_fast_free_count[cls]++;

    if (start) {
        g_tiny_free_cycles += (tiny_fast_rdtsc() - start);
        g_tiny_free_count++;
    }
}

// ========== Initialization ==========

static inline void tiny_fast_init(void) {
    if (g_tiny_fast_initialized) return;

    memset(g_tiny_fast_cache, 0, sizeof(g_tiny_fast_cache));
    memset(g_tiny_fast_count, 0, sizeof(g_tiny_fast_count));

    // Phase 6-7: Initialize dual free lists (Phase 2)
    memset(g_tiny_fast_free_head, 0, sizeof(g_tiny_fast_free_head));
    memset(g_tiny_fast_free_count, 0, sizeof(g_tiny_fast_free_count));

    g_tiny_fast_initialized = 1;
}
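The header above is self-contained but relies on a front end to route requests into it. The following is a minimal usage sketch under stated assumptions: `my_alloc`, `my_free_sized`, and the `hak_fallback_*` hooks are hypothetical names invented for illustration and are not part of the HAKMEM API; linking against `tiny_fastcache.c` is assumed for the TLS definitions and refill/drain paths.

```c
// Hypothetical front-end wiring for tiny_fastcache.h (illustration only).
// hak_fallback_alloc/hak_fallback_free stand in for the slow-path entry points;
// the real HAKMEM dispatch (Magazine/SuperSlab/Mid/ACE) is more involved.
#include <stddef.h>
#include "tiny_fastcache.h"

void* hak_fallback_alloc(size_t size);            // assumed slow-path allocator
void  hak_fallback_free(void* ptr, size_t size);  // assumed slow-path free

static void* my_alloc(size_t size) {
    tiny_fast_init();                     // idempotent per-thread TLS init
    if (size <= TINY_FAST_THRESHOLD) {
        void* p = tiny_fast_alloc(size);  // pop TLS freelist / migrate / refill
        if (p) return p;                  // NULL: refill failed, fall through
    }
    return hak_fallback_alloc(size);
}

// Sized free: assumes every block of size <= TINY_FAST_THRESHOLD was handed
// out by the fast path, so it may be pushed back onto the TLS free_head.
static void my_free_sized(void* ptr, size_t size) {
    if (ptr == NULL) return;
    if (size <= TINY_FAST_THRESHOLD) {
        tiny_fast_free(ptr, size);        // push to free_head, drain if full
        return;
    }
    hak_fallback_free(ptr, size);
}
```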