Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
  - Hotpath works (+24% vs baseline) after POOL_TLS fix
  - Still 9.2x slower than system malloc due to:
    * Heavy initialization (23.85% of cycles)
    * Syscall overhead (2,382 syscalls per 100K ops)
    * Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
    * 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@ -79,6 +79,23 @@ extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
|
||||
extern int hak_tiny_size_to_class(size_t size);
|
||||
extern int tiny_refill_failfast_level(void);
|
||||
extern const size_t g_tiny_class_sizes[];
|
||||
// Hot-class toggle: class5 (256B) dedicated TLS fast path
|
||||
extern int g_tiny_hotpath_class5;
|
||||
|
||||
// Minimal class5 refill helper: fixed, branch-light refill into TLS List, then take one
|
||||
// Preconditions: class_idx==5 and g_tiny_hotpath_class5==1
|
||||
static inline void* tiny_class5_minirefill_take(void) {
|
||||
extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
|
||||
TinyTLSList* tls5 = &g_tls_lists[5];
|
||||
// Fast pop if available
|
||||
void* base = tls_list_pop_fast(tls5, 5);
|
||||
if (base) {
|
||||
// CRITICAL FIX: Convert base -> user pointer for class 5
|
||||
return (void*)((uint8_t*)base + 1);
|
||||
}
|
||||
// Robust refill via generic helper(header対応・境界検証済み)
|
||||
return tiny_fast_refill_and_take(5, tls5);
|
||||
}
|
||||
|
||||
// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
|
||||
extern int g_refill_count_global;
|
||||
@ -212,8 +229,8 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
|
||||
}
|
||||
|
||||
if (__builtin_expect(sfc_is_enabled, 1)) {
|
||||
void* ptr = sfc_alloc(class_idx);
|
||||
if (__builtin_expect(ptr != NULL, 1)) {
|
||||
void* base = sfc_alloc(class_idx);
|
||||
if (__builtin_expect(base != NULL, 1)) {
|
||||
// Front Gate: SFC hit
|
||||
extern unsigned long long g_front_sfc_hit[];
|
||||
g_front_sfc_hit[class_idx]++;
|
||||
@ -224,7 +241,9 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
|
||||
g_tiny_alloc_hits++;
|
||||
}
|
||||
#endif
|
||||
return ptr;
|
||||
// CRITICAL FIX: Convert base -> user pointer for classes 0-6
|
||||
void* user_ptr = (class_idx == 7) ? base : (void*)((uint8_t*)base + 1);
|
||||
return user_ptr;
|
||||
}
|
||||
// SFC miss → try SLL (Layer 1)
|
||||
}
|
||||
@ -235,8 +254,8 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
|
||||
// Use Box TLS-SLL API (C7-safe pop)
|
||||
// CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!)
|
||||
// Reading head before pop causes stale read → rbp=0xa0 SEGV
|
||||
void* head = NULL;
|
||||
if (tls_sll_pop(class_idx, &head)) {
|
||||
void* base = NULL;
|
||||
if (tls_sll_pop(class_idx, &base)) {
|
||||
// Front Gate: SLL hit (fast path 3 instructions)
|
||||
extern unsigned long long g_front_sll_hit[];
|
||||
g_front_sll_hit[class_idx]++;
|
||||
@ -253,7 +272,9 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
|
||||
g_tiny_alloc_hits++;
|
||||
}
|
||||
#endif
|
||||
return head;
|
||||
// CRITICAL FIX: Convert base -> user pointer for classes 0-6
|
||||
void* user_ptr = (class_idx == 7) ? base : (void*)((uint8_t*)base + 1);
|
||||
return user_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -272,11 +293,28 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
|
||||
// - No circular dependency: one-way only
|
||||
// - Boundary clear: SLL pop → SFC push
|
||||
// - Fallback safe: if SFC full, stop (no overflow)
|
||||
// Env-driven cascade percentage (0-100), default 50%.
//
// Reads HAKMEM_SFC_CASCADE_PCT on the first call and caches the clamped result
// in a function-local static; later calls return the cached value without
// touching the environment again.
//   - variable unset or empty     -> 50
//   - value below 0 / above 100   -> clamped to 0 / 100
//   - text with no leading number -> 0 (same result the atoi-based parse gave)
// Returns: percentage in the range [0, 100].
static inline int sfc_cascade_pct(void) {
    static int pct = -1;  // -1 marks "not parsed yet"
    if (__builtin_expect(pct == -1, 0)) {
        long v = 50;
        const char* e = getenv("HAKMEM_SFC_CASCADE_PCT");
        if (e && *e) {
            // strtol rather than atoi: atoi has undefined behavior when the
            // value does not fit in an int, whereas strtol saturates at
            // LONG_MIN/LONG_MAX, which the clamp below maps to 0/100.
            v = strtol(e, NULL, 10);
        }
        if (v < 0) {
            v = 0;
        }
        if (v > 100) {
            v = 100;
        }
        pct = (int)v;
    }
    return pct;
}
|
||||
|
||||
static inline int sfc_refill_from_sll(int class_idx, int target_count) {
|
||||
int transferred = 0;
|
||||
uint32_t cap = g_sfc_capacity[class_idx];
|
||||
|
||||
while (transferred < target_count && g_tls_sll_count[class_idx] > 0) {
|
||||
// Adjust target based on cascade percentage
|
||||
int pct = sfc_cascade_pct();
|
||||
int want = (target_count * pct) / 100;
|
||||
if (want <= 0) want = target_count / 2; // safety fallback
|
||||
|
||||
while (transferred < want && g_tls_sll_count[class_idx] > 0) {
|
||||
// Check SFC capacity before transfer
|
||||
if (g_sfc_count[class_idx] >= cap) {
|
||||
break; // SFC full, stop
|
||||
@ -426,6 +464,10 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
|
||||
}
|
||||
|
||||
if (sfc_is_enabled_refill && refilled > 0) {
|
||||
// Skip SFC cascade for class5 when dedicated hotpath is enabled
|
||||
if (g_tiny_hotpath_class5 && class_idx == 5) {
|
||||
// no-op: keep refilled blocks in TLS List/SLL
|
||||
} else {
|
||||
// Transfer half of refilled blocks to SFC (keep half in SLL for future)
|
||||
int sfc_target = refilled / 2;
|
||||
if (sfc_target > 0) {
|
||||
@ -436,6 +478,7 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
|
||||
(void)transferred; // Unused, but could track stats
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if !HAKMEM_BUILD_RELEASE
|
||||
@ -472,18 +515,34 @@ static inline void* tiny_alloc_fast(size_t size) {
|
||||
return NULL; // Size > 1KB, not Tiny
|
||||
}
|
||||
ROUTE_BEGIN(class_idx);
|
||||
void* ptr = NULL;
|
||||
const int hot_c5 = (g_tiny_hotpath_class5 && class_idx == 5);
|
||||
|
||||
// 2. Fast path: Frontend pop (FastCache/SFC/SLL)
|
||||
// Try the consolidated fast pop path first (includes FastCache for C0–C3)
|
||||
void* ptr = tiny_alloc_fast_pop(class_idx);
|
||||
if (__builtin_expect(hot_c5, 0)) {
|
||||
// class5: dedicated shortest path (never goes through the generic front)
|
||||
void* p = tiny_class5_minirefill_take();
|
||||
if (p) HAK_RET_ALLOC(class_idx, p);
|
||||
|
||||
int refilled = tiny_alloc_fast_refill(class_idx);
|
||||
if (__builtin_expect(refilled > 0, 1)) {
|
||||
p = tiny_class5_minirefill_take();
|
||||
if (p) HAK_RET_ALLOC(class_idx, p);
|
||||
}
|
||||
|
||||
// Fall back to the slow path (still bypassing the generic front)
|
||||
ptr = hak_tiny_alloc_slow(size, class_idx);
|
||||
if (ptr) HAK_RET_ALLOC(class_idx, ptr);
|
||||
return ptr; // NULL if OOM
|
||||
}
|
||||
|
||||
// Generic front (FastCache/SFC/SLL)
|
||||
ptr = tiny_alloc_fast_pop(class_idx);
|
||||
if (__builtin_expect(ptr != NULL, 1)) {
|
||||
// C7 (1024B, headerless) is never returned by tiny_alloc_fast_pop (returns NULL for C7)
|
||||
HAK_RET_ALLOC(class_idx, ptr);
|
||||
}
|
||||
|
||||
// 3. Miss: Refill from TLS List/SuperSlab and take one into FastCache/front
|
||||
// Generic: refill and take (into FastCache or the TLS List)
|
||||
{
|
||||
// Use header-aware TLS List bulk transfer that prefers FastCache for C0–C3
|
||||
extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
|
||||
void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
|
||||
if (took) {
|
||||
@ -491,12 +550,14 @@ static inline void* tiny_alloc_fast(size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Still miss: Fallback to existing backend refill and retry
|
||||
int refilled = tiny_alloc_fast_refill(class_idx);
|
||||
if (__builtin_expect(refilled > 0, 1)) {
|
||||
ptr = tiny_alloc_fast_pop(class_idx);
|
||||
if (ptr) {
|
||||
HAK_RET_ALLOC(class_idx, ptr);
|
||||
// Retry after the backend refill
|
||||
{
|
||||
int refilled = tiny_alloc_fast_refill(class_idx);
|
||||
if (__builtin_expect(refilled > 0, 1)) {
|
||||
ptr = tiny_alloc_fast_pop(class_idx);
|
||||
if (ptr) {
|
||||
HAK_RET_ALLOC(class_idx, ptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user