hakmem/core/hakmem_tiny_alloc_new.inc
Moe Charm (CI) 5cc1f93622 Phase 13-A Step 1: TinyHeapV2 NO-REFILL L0 cache implementation
Implement TinyHeapV2 as a minimal "lucky hit" L0 cache that avoids the
circular dependency with FastCache by eliminating self-refill.

Key Changes:
- New: core/front/tiny_heap_v2.h - NO-REFILL L0 cache implementation
  - tiny_heap_v2_alloc(): Pop from magazine if available, else return NULL
  - tiny_heap_v2_refill_mag(): No-op stub (no backend refill)
  - ENV: HAKMEM_TINY_HEAP_V2=1 to enable
  - ENV: HAKMEM_TINY_HEAP_V2_CLASS_MASK=bitmask (C0-C3 control; see the gating sketch after this list)
  - ENV: HAKMEM_TINY_HEAP_V2_STATS=1 to print statistics
- Modified: core/hakmem_tiny_alloc_new.inc - Add TinyHeapV2 hook
  - Hook at entry point (after class_idx calculation)
  - Fallback to existing front if TinyHeapV2 returns NULL
- Modified: core/hakmem_tiny_alloc.inc - Add hook for legacy path
- Modified: core/hakmem_tiny.c - Add TLS variables and stats wrapper
  - TinyHeapV2Mag: Per-class magazine (capacity=16)
  - TinyHeapV2Stats: Per-class counters (alloc_calls, mag_hits, etc.)
  - tiny_heap_v2_print_stats(): Statistics output at exit
- New: TINY_HEAP_V2_TASK_SPEC.md - Phase 13 specification
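
Illustrative gating sketch (referenced from the CLASS_MASK item above): the env
variable names are from this change, but tiny_heap_v2.h itself is not shown here,
so the parsing and the helper name below are assumptions, not the actual implementation:

    #include <stdlib.h>   /* getenv, atoi */

    /* Hypothetical helper; the real gating lives in core/front/tiny_heap_v2.h. */
    static int heap_v2_class_enabled(int class_idx) {
        static int mask = -1;                       /* parsed once, then cached */
        if (mask == -1) {
            const char* en = getenv("HAKMEM_TINY_HEAP_V2");
            if (!en || *en != '1') {
                mask = 0;                           /* feature off: no class enabled */
            } else {
                const char* m = getenv("HAKMEM_TINY_HEAP_V2_CLASS_MASK");
                mask = m ? (atoi(m) & 0xF) : 0xF;   /* default: C0-C3 all enabled */
            }
        }
        return (mask >> class_idx) & 1;             /* bit N controls class N */
    }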

Root Cause Fixed:
- BEFORE: TinyHeapV2 refilled from FastCache → circular dependency
  - TinyHeapV2 intercepted all allocs → FastCache never populated
  - Result: 100% backend OOM, 0% hit rate, 99% slowdown
- AFTER: TinyHeapV2 is a passive L0 cache (no refill; sketched below)
  - Magazine empty → return NULL → existing front handles it
  - Result: 0% overhead, stable baseline performance
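
A minimal sketch of the NO-REFILL (pop-or-NULL) behavior described above, assuming
a per-class TLS magazine of capacity 16 and the counters listed in this commit
(alloc_calls, mag_hits); type, field, and variable names here are illustrative,
not the exact definitions from core/front/tiny_heap_v2.h:

    #include <stdint.h>

    #define TINY_HEAP_V2_MAG_CAP 16

    typedef struct {                                /* per-class TLS magazine */
        void*    slots[TINY_HEAP_V2_MAG_CAP];
        uint32_t top;                               /* number of cached blocks */
    } TinyHeapV2MagSketch;

    static __thread TinyHeapV2MagSketch g_hv2_mag[4];   /* C0-C3 only */
    static __thread uint64_t g_hv2_alloc_calls[4];
    static __thread uint64_t g_hv2_mag_hits[4];

    /* Pop a block if the magazine has one; otherwise return NULL so the caller
     * falls through to the existing front. No refill is ever attempted here,
     * which is what breaks the FastCache circular dependency. */
    static void* hv2_alloc_sketch(int class_idx) {
        TinyHeapV2MagSketch* m = &g_hv2_mag[class_idx];
        g_hv2_alloc_calls[class_idx]++;
        if (m->top == 0) return NULL;               /* empty: "lucky hit" cache gives up */
        g_hv2_mag_hits[class_idx]++;
        return m->slots[--m->top];
    }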

A/B Test Results (100K iterations, fixed-size bench):
- C1 (8B):  Baseline 9,688 ops/s → HeapV2 ON 9,762 ops/s (+0.76%)
- C2 (16B): Baseline 9,804 ops/s → HeapV2 ON 9,845 ops/s (+0.42%)
- C3 (32B): Baseline 9,840 ops/s → HeapV2 ON 9,814 ops/s (-0.26%)
- All within noise range: NO PERFORMANCE REGRESSION 

Statistics (HeapV2 ON, C1-C3):
- alloc_calls: 200K (hook works correctly)
- mag_hits: 0 (0%) - Magazine empty as expected
- refill_calls: 0 - No refill executed (circular dependency avoided)
- backend_oom: 0 - No backend access

Next Steps (Phase 13-A Step 2):
- Implement a magazine supply strategy (from the existing front or the free path)
- Goal: populate the magazine with "leftover" blocks from the existing pipeline (illustrative sketch below)
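
Step 2 is not part of this commit; the following is only a hypothetical sketch of
a free-path supply strategy (the helper name and fields reuse the illustrative
magazine sketch above and are not real APIs): a freed block of an enabled class
could be stashed in the L0 magazine when there is room, and handed to the normal
free pipeline otherwise:

    /* Hypothetical Step 2 sketch: returns 1 if the block was kept in the L0
     * magazine (skip the normal free path), 0 if the caller should free it
     * through the existing pipeline. */
    static int hv2_try_stash_on_free(int class_idx, void* block) {
        TinyHeapV2MagSketch* m = &g_hv2_mag[class_idx];
        if (m->top >= TINY_HEAP_V2_MAG_CAP) return 0;   /* magazine full */
        m->slots[m->top++] = block;
        return 1;                                       /* supplied the magazine */
    }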

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-15 01:42:57 +09:00

// hakmem_tiny_alloc_new.inc
// New 3-layer Tiny Pool allocation (simplified)
//
// Purpose: Reduce from 6-7 layers to 3 layers
// Target: 100+ instructions/op → 20-30 instructions/op
//
// Part of 3-layer architecture simplification (2025-11-01)
// Based on ChatGPT Pro UltraThink recommendations
// === IMPORTANT: Disable old benchmark fastpath ===
// The old HAKMEM_TINY_BENCH_FASTPATH conflicts with new 3-layer architecture
// We must disable it to ensure our new code runs
#ifdef HAKMEM_TINY_BENCH_FASTPATH
#undef HAKMEM_TINY_BENCH_FASTPATH
#endif
// Phase E1-CORRECT: Box API for next pointer operations
#include "box/tiny_next_ptr_box.h"
// Debug counters (thread-local)
static __thread uint64_t g_3layer_bump_hits = 0;
static __thread uint64_t g_3layer_mag_hits = 0;
static __thread uint64_t g_3layer_slow_hits = 0;
static __thread uint64_t g_3layer_refill_count = 0;
static __thread uint64_t g_3layer_refill_items = 0;
static __thread uint64_t g_3layer_fallback_superslab_disabled = 0;
static __thread uint64_t g_3layer_fallback_no_ss = 0;
static __thread uint64_t g_3layer_fallback_no_meta = 0;
static __thread uint64_t g_3layer_batch_carve_count = 0;
// Active accounting helper (env toggle: HAKMEM_TINY_ACTIVE_FIX=0 to disable)
static inline int tiny_active_fix_enabled(void) {
    static int g_active_fix_en = -1;
    if (__builtin_expect(g_active_fix_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_ACTIVE_FIX");
        g_active_fix_en = (e && atoi(e) == 0) ? 0 : 1;
    }
    return g_active_fix_en;
}

static inline void tiny_active_account_alloc(void* ptr) {
    if (!ptr || !g_use_superslab) return;
    if (!tiny_active_fix_enabled()) return;
    SuperSlab* ss = hak_super_lookup(ptr);
    if (ss && ss->magic == SUPERSLAB_MAGIC) {
        ss_active_inc(ss);
    }
}
// Forward declaration for Layer 3
__attribute__((noinline, cold))
static void* tiny_alloc_slow_new(int class_idx);
// ============================================================================
// Main Allocation Function (3-layer architecture)
// ============================================================================
void* hak_tiny_alloc(size_t size) {
    // Initialization check (cold path, once per thread)
#if !HAKMEM_BUILD_RELEASE
    if (!g_tiny_initialized) hak_tiny_init();
#else
    if (__builtin_expect(!g_tiny_initialized, 0)) {
        hak_tiny_init();
    }
#endif

    // Wrapper guard (safety check, rare)
#if !HAKMEM_BUILD_RELEASE
# if HAKMEM_WRAPPER_TLS_GUARD
    if (!g_wrap_tiny_enabled && __builtin_expect(g_tls_in_wrapper != 0, 0)) return NULL;
# else
    extern int hak_in_wrapper(void);
    if (!g_wrap_tiny_enabled && __builtin_expect(hak_in_wrapper() != 0, 0)) return NULL;
# endif
#endif

    // Size to class index
    int class_idx = hak_tiny_size_to_class(size);
    if (class_idx < 0) return NULL; // > 1KB

    // DEBUG: Verify hak_tiny_alloc() is called
    static int g_alloc_dbg = -1;
    if (g_alloc_dbg == -1) {
        const char* e = getenv("HAKMEM_TINY_HEAP_V2_DEBUG");
        g_alloc_dbg = (e && *e && *e != '0') ? 1 : 0;
    }
    if (g_alloc_dbg) {
        static int g_call_count = 0;
        if (g_call_count < 3) {
            fprintf(stderr, "[HAK_TINY_ALLOC] Called #%d, size=%zu, class=%d\n",
                    g_call_count++, size, class_idx);
        }
    }

    // Route fingerprint begin (debug-only; no-op unless HAKMEM_ROUTE=1)
    ROUTE_BEGIN(class_idx);

    // Phase 13-A: Tiny Heap v2 (per-thread heap, experimental)
    // ENV-gated: HAKMEM_TINY_HEAP_V2=1
    // Targets class 0-3 (8-64B) only, falls back to existing path if NULL
    if (__builtin_expect(tiny_heap_v2_enabled(), 0) && class_idx <= 3) {
        static int g_heap_v2_dbg = -1;
        if (g_heap_v2_dbg == -1) {
            const char* e = getenv("HAKMEM_TINY_HEAP_V2_DEBUG");
            g_heap_v2_dbg = (e && *e && *e != '0') ? 1 : 0;
        }
        if (g_heap_v2_dbg) {
            static int g_hook_count = 0;
            if (g_hook_count < 5) {
                fprintf(stderr, "[NEW3L-HOOK] class_idx=%d, size=%zu, hook_count=%d\n",
                        class_idx, size, g_hook_count++);
            }
        }
        void* base = tiny_heap_v2_alloc(size);
        if (base) {
            HAK_RET_ALLOC(class_idx, base); // Header write + return USER pointer
        }
        // Fall through to existing front path if HeapV2 returned NULL (disabled class or OOM)
    }

    // Initialize small magazine (once per thread)
    if (__builtin_expect(!g_tiny_small_mag_initialized, 0)) {
        tiny_small_mag_init();
    }

    // ========================================================================
    // === LAYER 1: TLS Bump Allocator (hot classes 0-2: 8B/16B/32B) ===
    // === Target: 2-3 instructions/op ===
    // ========================================================================
    if (likely(class_idx <= 2)) {
        void* p = tiny_bump_alloc(class_idx);
        if (likely(p)) {
            tiny_active_account_alloc(p);
            g_3layer_bump_hits++;
            // Mark: bump hit (reuses the hot_hit bit, value 8, for convenience)
            ROUTE_MARK(8); ROUTE_COMMIT(class_idx, 0x40);
            HAK_RET_ALLOC(class_idx, p);
        }
    }

    // ========================================================================
    // === LAYER 2: TLS Small Magazine (all classes, 128 items) ===
    // === Target: 5-10 instructions/op ===
    // ========================================================================
    void* p = small_mag_pop(class_idx);
    if (likely(p)) {
        extern unsigned long long g_front_mag_hit[];
        g_front_mag_hit[class_idx]++;
        tiny_active_account_alloc(p);
        g_3layer_mag_hits++;
        // Mark: small mag hit (reuses the bench_hit bit, value 10, for convenience)
        ROUTE_MARK(10); ROUTE_COMMIT(class_idx, 0x41);
        HAK_RET_ALLOC(class_idx, p);
    }

    // ========================================================================
    // === LAYER 3: Slow path (refill, slab allocation) ===
    // === Target: 50-100+ instructions/op (rare) ===
    // ========================================================================
    g_3layer_slow_hits++;
    return tiny_alloc_slow_new(class_idx);
}
// ============================================================================
// Layer 3: Slow Path (refill and slab management)
// ============================================================================
__attribute__((noinline, cold))
static void* tiny_alloc_slow_new(int class_idx) {
    // ReturnFirst Selector: try Ready/Mailbox/Sticky/Hot/Bench/Registry once
    do {
        static int g_return_first = -1; // env: HAKMEM_TINY_RETURN_FIRST (default ON)
        if (__builtin_expect(g_return_first == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_RETURN_FIRST");
            g_return_first = (e && *e == '0') ? 0 : 1;
        }
        if (__builtin_expect(g_return_first, 1)) {
            extern __thread TinyTLSSlab g_tls_slabs[];
            TinyTLSSlab* tls = &g_tls_slabs[class_idx];
            SuperSlab* rs = tiny_refill_try_fast(class_idx, tls);
            (void)rs; // On success, tls->ss is bound and Step 2 will carve
        }
    } while (0);

    // ========================================================================
    // Layer 3: Refill Small Magazine and/or Bump from existing infrastructure
    // ========================================================================
    // Step 1: Try to refill Small Magazine from existing TLS Magazine
    tiny_mag_init_if_needed(class_idx);
    TinyTLSMag* large_mag = &g_tls_mags[class_idx];
    if (large_mag->top > 0) {
        // Batch transfer from large magazine (2048) to small magazine
        int batch_size = 64; // Transfer in batches of 64
        if (batch_size > large_mag->top) batch_size = large_mag->top;
        void* items[64];
        for (int i = 0; i < batch_size; i++) {
            items[i] = large_mag->items[large_mag->top - 1 - i].ptr;
        }
        large_mag->top -= batch_size;
        // Push to Small Magazine
        int pushed = small_mag_batch_push(class_idx, items, batch_size);
        g_3layer_refill_count++;
        g_3layer_refill_items += pushed;
        // Try to pop one and return
        void* p = small_mag_pop(class_idx);
        if (p) {
            tiny_active_account_alloc(p);
            return p;
        }
    }

    // Step 2: Large Magazine empty - batch carve from SuperSlab directly
    // ChatGPT Pro P0: Full batching (based on tls_refill_from_tls_slab:115-126)
    if (!g_use_superslab) {
        g_3layer_fallback_superslab_disabled++;
        return hak_tiny_alloc_slow(0, class_idx);
    }
    TinyTLSSlab* tls_slab = &g_tls_slabs[class_idx];
    if (!tls_slab->ss) {
        if (superslab_refill(class_idx) == NULL) {
            g_3layer_fallback_no_ss++;
            // Optional one-shot debug
            static int g_alloc_dbg = -1;
            if (__builtin_expect(g_alloc_dbg == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_ALLOC_DEBUG");
                g_alloc_dbg = (e && atoi(e) != 0) ? 1 : 0;
            }
            if (g_alloc_dbg) {
                static _Atomic int printed_ss[8]; int exp = 0;
                if (atomic_compare_exchange_strong(&printed_ss[class_idx], &exp, 1)) {
                    fprintf(stderr, "[ALLOC3] refill returned NULL (no SS) class=%d\n", class_idx);
                }
            }
            return hak_tiny_alloc_slow(0, class_idx); // Fallback
        }
    }
    TinySlabMeta* meta = tls_slab->meta;
    if (!meta) {
        g_3layer_fallback_no_meta++;
        // Optional one-shot debug
        static int g_alloc_dbg2 = -1;
        if (__builtin_expect(g_alloc_dbg2 == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_ALLOC_DEBUG");
            g_alloc_dbg2 = (e && atoi(e) != 0) ? 1 : 0;
        }
        if (g_alloc_dbg2) {
            static _Atomic int printed_meta[8]; int exp = 0;
            if (atomic_compare_exchange_strong(&printed_meta[class_idx], &exp, 1)) {
                fprintf(stderr, "[ALLOC3] meta is NULL after refill class=%d\n", class_idx);
            }
        }
        return hak_tiny_alloc_slow(0, class_idx);
    }

    // Batch carve from SuperSlab (P0 optimization - no 64x function calls!)
    uint32_t want = 64; // Refill target
    void* items[64];
    int got = 0;
    // Try freelist first (small amount, usually 0)
    while (got < (int)want && meta->freelist) {
        void* node = meta->freelist;
        meta->freelist = tiny_next_read(node); // Phase E1-CORRECT: Box API
        items[got++] = node;
        meta->used++;
    }
    // Then linear carve (KEY OPTIMIZATION - direct array fill!)
    if (got < (int)want && meta->used < meta->capacity) {
        uint32_t need = want - got;
        uint32_t available = meta->capacity - meta->used;
        if (need > available) need = available;
        size_t block_size = g_tiny_class_sizes[class_idx];
        uint8_t* slab_base = tls_slab->slab_base ? tls_slab->slab_base
                                                 : tiny_slab_base_for(tls_slab->ss, tls_slab->slab_idx);
        uint8_t* cursor = slab_base + ((size_t)meta->used * block_size);
        // Batch carve: directly fill items array (no linked list, no 64 function calls!)
        for (uint32_t i = 0; i < need; ++i) {
            items[got++] = (void*)cursor;
            cursor += block_size;
        }
        meta->used += need; // Reserve to TLS; not active until returned to user
    }
    if (got == 0) {
        // Slab exhausted, try refill and retry once
        if (superslab_refill(class_idx) != NULL) {
            return tiny_alloc_slow_new(class_idx); // Recursive retry
        }
        static int g_alloc_dbg3 = -1;
        if (__builtin_expect(g_alloc_dbg3 == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_ALLOC_DEBUG");
            g_alloc_dbg3 = (e && atoi(e) != 0) ? 1 : 0;
        }
        if (g_alloc_dbg3) {
            static _Atomic int printed_final[8]; int exp = 0;
            if (atomic_compare_exchange_strong(&printed_final[class_idx], &exp, 1)) {
                fprintf(stderr, "[ALLOC3] no items after retry (final fallback) class=%d\n", class_idx);
            }
        }
        return hak_tiny_alloc_slow(0, class_idx); // Ultimate fallback
    }

    // Take one for return, push rest to Small Magazine
    g_3layer_batch_carve_count++;
    void* result = items[0];
    if (got > 1) {
        int pushed = small_mag_batch_push(class_idx, &items[1], got - 1);
        g_3layer_refill_count++;
        g_3layer_refill_items += pushed;
    }
    tiny_active_account_alloc(result);
    // Route: slab carve direct (counted as a linear-carve adoption)
    ROUTE_MARK(11); ROUTE_COMMIT(class_idx, 0x60);
    return result;
}
// Debug function: print layer statistics
__attribute__((destructor))
static void print_3layer_stats(void) {
    uint64_t total = g_3layer_bump_hits + g_3layer_mag_hits + g_3layer_slow_hits;
    if (total > 0) {
        fprintf(stderr, "\n=== 3-Layer Architecture Stats ===\n");
        fprintf(stderr, "Bump hits: %10lu (%5.2f%%)\n",
                g_3layer_bump_hits, 100.0 * g_3layer_bump_hits / total);
        fprintf(stderr, "Mag hits: %10lu (%5.2f%%)\n",
                g_3layer_mag_hits, 100.0 * g_3layer_mag_hits / total);
        fprintf(stderr, "Slow hits: %10lu (%5.2f%%)\n",
                g_3layer_slow_hits, 100.0 * g_3layer_slow_hits / total);
        fprintf(stderr, "Total allocs: %10lu\n", total);
        fprintf(stderr, "Refill count: %10lu\n", g_3layer_refill_count);
        fprintf(stderr, "Refill items: %10lu (avg %.1f/refill)\n",
                g_3layer_refill_items,
                g_3layer_refill_count > 0 ? (double)g_3layer_refill_items / g_3layer_refill_count : 0.0);
        fprintf(stderr, "=== Fallback Paths ===\n");
        fprintf(stderr, "SuperSlab disabled: %lu\n", g_3layer_fallback_superslab_disabled);
        fprintf(stderr, "No SuperSlab: %lu\n", g_3layer_fallback_no_ss);
        fprintf(stderr, "No meta: %lu\n", g_3layer_fallback_no_meta);
        fprintf(stderr, "Batch carve count: %lu\n", g_3layer_batch_carve_count);
    }
}