Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default

## Major Changes

### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies

### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed

### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review

### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
  - Hotpath works (+24% vs baseline) after POOL_TLS fix
  - Still 9.2x slower than system malloc due to:
    * Heavy initialization (23.85% of cycles)
    * Syscall overhead (2,382 syscalls per 100K ops)
    * Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
    * 9.4x more instructions than system malloc

### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation

## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)

## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-11-12 01:01:23 +09:00
parent 862e8ea7db
commit 6859d589ea
13 changed files with 759 additions and 52 deletions

View File

@ -79,6 +79,23 @@ extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
extern int hak_tiny_size_to_class(size_t size);
extern int tiny_refill_failfast_level(void);
extern const size_t g_tiny_class_sizes[];
// Hot-class toggle: class5 (256B) dedicated TLS fast path
extern int g_tiny_hotpath_class5;
// Class-5 dedicated take: pop one block from the class-5 TLS list, or fall
// back to the generic refill-and-take helper when the list has nothing to pop.
//
// Preconditions (caller-enforced): class_idx == 5 and g_tiny_hotpath_class5 == 1.
// Returns: a pointer for the caller to hand out, or whatever
// tiny_fast_refill_and_take() returns when the fast pop comes back NULL.
static inline void* tiny_class5_minirefill_take(void) {
    extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
    TinyTLSList* c5_list = &g_tls_lists[5];

    void* blk = tls_list_pop_fast(c5_list, 5);
    if (!blk) {
        // Nothing to pop: robust refill via the generic helper
        // (header-aware, boundary-validated).
        // NOTE(review): the helper's result is returned unchanged - confirm it
        // already yields a user pointer rather than a base pointer.
        return tiny_fast_refill_and_take(5, c5_list);
    }

    // The popped block is a BASE pointer; the USER pointer for class 5 starts
    // one byte later (presumably past a 1-byte header - TODO confirm).
    return (void*)((uint8_t*)blk + 1);
}
// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
extern int g_refill_count_global;
@ -212,8 +229,8 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
}
if (__builtin_expect(sfc_is_enabled, 1)) {
void* ptr = sfc_alloc(class_idx);
if (__builtin_expect(ptr != NULL, 1)) {
void* base = sfc_alloc(class_idx);
if (__builtin_expect(base != NULL, 1)) {
// Front Gate: SFC hit
extern unsigned long long g_front_sfc_hit[];
g_front_sfc_hit[class_idx]++;
@ -224,7 +241,9 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
g_tiny_alloc_hits++;
}
#endif
return ptr;
// CRITICAL FIX: Convert base -> user pointer for classes 0-6
void* user_ptr = (class_idx == 7) ? base : (void*)((uint8_t*)base + 1);
return user_ptr;
}
// SFC miss → try SLL (Layer 1)
}
@ -235,8 +254,8 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
// Use Box TLS-SLL API (C7-safe pop)
// CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!)
// Reading head before pop causes stale read → rbp=0xa0 SEGV
void* head = NULL;
if (tls_sll_pop(class_idx, &head)) {
void* base = NULL;
if (tls_sll_pop(class_idx, &base)) {
// Front Gate: SLL hit (fast path 3 instructions)
extern unsigned long long g_front_sll_hit[];
g_front_sll_hit[class_idx]++;
@ -253,7 +272,9 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
g_tiny_alloc_hits++;
}
#endif
return head;
// CRITICAL FIX: Convert base -> user pointer for classes 0-6
void* user_ptr = (class_idx == 7) ? base : (void*)((uint8_t*)base + 1);
return user_ptr;
}
}
@ -272,11 +293,28 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
// - No circular dependency: one-way only
// - Boundary clear: SLL pop → SFC push
// - Fallback safe: if SFC full, stop (no overflow)
// Env-driven cascade percentage, clamped to [0, 100]; default 50.
//
// Reads HAKMEM_SFC_CASCADE_PCT once, on the first call, and caches the result
// in a function-local static; later changes to the environment are ignored.
//
// Parsing rules:
//   - unset or empty variable      -> 50
//   - no leading number ("abc")    -> 50 (the default, not 0)
//   - leading number ("30", "30%") -> that number, clamped to [0, 100]
//   - out-of-range magnitude       -> clamped (strtol saturates to LONG_MIN/LONG_MAX)
//
// The cache is a plain static int with no synchronization; every caller
// computes the same value as long as the environment is not modified.
static inline int sfc_cascade_pct(void) {
    static int pct = -1;
    if (__builtin_expect(pct == -1, 0)) {
        long v = 50;
        const char* e = getenv("HAKMEM_SFC_CASCADE_PCT");
        if (e && *e) {
            char* end = NULL;
            long parsed = strtol(e, &end, 10);
            if (end != e) {
                // At least one digit was consumed; trailing text is ignored.
                v = parsed;
            }
        }
        // Clamp while still in long so huge inputs cannot wrap when narrowed.
        if (v < 0) {
            v = 0;
        }
        if (v > 100) {
            v = 100;
        }
        pct = (int)v;
    }
    return pct;
}
static inline int sfc_refill_from_sll(int class_idx, int target_count) {
int transferred = 0;
uint32_t cap = g_sfc_capacity[class_idx];
while (transferred < target_count && g_tls_sll_count[class_idx] > 0) {
// Adjust target based on cascade percentage
int pct = sfc_cascade_pct();
int want = (target_count * pct) / 100;
if (want <= 0) want = target_count / 2; // safety fallback
while (transferred < want && g_tls_sll_count[class_idx] > 0) {
// Check SFC capacity before transfer
if (g_sfc_count[class_idx] >= cap) {
break; // SFC full, stop
@ -426,6 +464,10 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
}
if (sfc_is_enabled_refill && refilled > 0) {
// Skip SFC cascade for class5 when dedicated hotpath is enabled
if (g_tiny_hotpath_class5 && class_idx == 5) {
// no-op: keep refilled blocks in TLS List/SLL
} else {
// Transfer half of refilled blocks to SFC (keep half in SLL for future)
int sfc_target = refilled / 2;
if (sfc_target > 0) {
@ -436,6 +478,7 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
(void)transferred; // Unused, but could track stats
#endif
}
}
}
#if !HAKMEM_BUILD_RELEASE
@ -472,18 +515,34 @@ static inline void* tiny_alloc_fast(size_t size) {
return NULL; // Size > 1KB, not Tiny
}
ROUTE_BEGIN(class_idx);
void* ptr = NULL;
const int hot_c5 = (g_tiny_hotpath_class5 && class_idx == 5);
// 2. Fast path: Frontend pop (FastCache/SFC/SLL)
// Try the consolidated fast pop path first (includes FastCache for C0-C3)
void* ptr = tiny_alloc_fast_pop(class_idx);
if (__builtin_expect(hot_c5, 0)) {
// class5: dedicated shortest path (never goes through the generic front)
void* p = tiny_class5_minirefill_take();
if (p) HAK_RET_ALLOC(class_idx, p);
int refilled = tiny_alloc_fast_refill(class_idx);
if (__builtin_expect(refilled > 0, 1)) {
p = tiny_class5_minirefill_take();
if (p) HAK_RET_ALLOC(class_idx, p);
}
// Fall through to the slow path (the generic front is bypassed)
ptr = hak_tiny_alloc_slow(size, class_idx);
if (ptr) HAK_RET_ALLOC(class_idx, ptr);
return ptr; // NULL if OOM
}
// Generic front (FastCache/SFC/SLL)
ptr = tiny_alloc_fast_pop(class_idx);
if (__builtin_expect(ptr != NULL, 1)) {
// C7 (1024B, headerless) is never returned by tiny_alloc_fast_pop (returns NULL for C7)
HAK_RET_ALLOC(class_idx, ptr);
}
// 3. Miss: Refill from TLS List/SuperSlab and take one into FastCache/front
// Generic: refill and take (into FastCache / TLS List)
{
// Use header-aware TLS List bulk transfer that prefers FastCache for C0-C3
extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
if (took) {
@ -491,12 +550,14 @@ static inline void* tiny_alloc_fast(size_t size) {
}
}
// 4. Still miss: Fallback to existing backend refill and retry
int refilled = tiny_alloc_fast_refill(class_idx);
if (__builtin_expect(refilled > 0, 1)) {
ptr = tiny_alloc_fast_pop(class_idx);
if (ptr) {
HAK_RET_ALLOC(class_idx, ptr);
// Retry after backend refill
{
int refilled = tiny_alloc_fast_refill(class_idx);
if (__builtin_expect(refilled > 0, 1)) {
ptr = tiny_alloc_fast_pop(class_idx);
if (ptr) {
HAK_RET_ALLOC(class_idx, ptr);
}
}
}