Files
hakmem/core/hakmem_tiny_alloc_new.inc

351 lines
14 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// hakmem_tiny_alloc_new.inc
// New 3-layer Tiny Pool allocation (simplified)
//
// Purpose: Reduce from 6-7 layers to 3 layers
// Target: 100+ instructions/op → 20-30 instructions/op
//
// Part of 3-layer architecture simplification (2025-11-01)
// Based on ChatGPT Pro UltraThink recommendations
// === IMPORTANT: Disable old benchmark fastpath ===
// The old HAKMEM_TINY_BENCH_FASTPATH conflicts with new 3-layer architecture
// We must disable it to ensure our new code runs
#ifdef HAKMEM_TINY_BENCH_FASTPATH
#undef HAKMEM_TINY_BENCH_FASTPATH
#endif
// Phase E1-CORRECT: Box API for next pointer operations
#include "box/tiny_next_ptr_box.h"
#include "front/tiny_heap_v2.h"
#include "box/c7_meta_used_counter_box.h"
// Debug counters (thread-local): every thread accumulates its own copy.
// Allocations served by Layer 1 (tiny_bump_alloc) in hak_tiny_alloc().
static __thread uint64_t g_3layer_bump_hits = 0;
// Allocations served by Layer 2 (small_mag_pop) in hak_tiny_alloc().
static __thread uint64_t g_3layer_mag_hits = 0;
// Times hak_tiny_alloc() fell through to Layer 3 (tiny_alloc_slow_new).
static __thread uint64_t g_3layer_slow_hits = 0;
// Number of small_mag_batch_push() refills performed by the slow path.
static __thread uint64_t g_3layer_refill_count = 0;
// Sum of the item counts returned by those small_mag_batch_push() calls.
static __thread uint64_t g_3layer_refill_items = 0;
// Slow-path fallbacks to hak_tiny_alloc_slow() because g_use_superslab was off.
static __thread uint64_t g_3layer_fallback_superslab_disabled = 0;
// Slow-path fallbacks because superslab_refill() returned NULL with no slab bound.
static __thread uint64_t g_3layer_fallback_no_ss = 0;
// Slow-path fallbacks because the TLS slab had no meta pointer.
static __thread uint64_t g_3layer_fallback_no_meta = 0;
// Slow-path calls that obtained at least one block from the slab (freelist or linear carve).
static __thread uint64_t g_3layer_batch_carve_count = 0;
// Active accounting toggle (env: HAKMEM_TINY_ACTIVE_FIX, default ON).
//
// Returns 1 when active accounting is enabled, 0 when it is disabled.
// The feature is disabled only when the variable parses as the decimal
// number 0 (e.g. "0", " 0", "00"). Unset, empty, or non-numeric values
// ("on", "true", ...) keep the default of enabled; the previous atoi()-based
// check treated every non-numeric value as 0 and silently disabled it.
// The result is computed on first use and cached in a function-local static.
static inline int tiny_active_fix_enabled(void) {
    static int g_active_fix_en = -1;  // -1 = environment not consulted yet
    if (__builtin_expect(g_active_fix_en == -1, 0)) {
        int enabled = 1;
        const char* e = getenv("HAKMEM_TINY_ACTIVE_FIX");
        if (e) {
            char* end = NULL;
            long v = strtol(e, &end, 10);
            // end == e means no digits were consumed: not a number, keep default.
            if (end != e && v == 0) {
                enabled = 0;
            }
        }
        g_active_fix_en = enabled;
    }
    return g_active_fix_en;
}
// Bump the owning SuperSlab's active counter for a block about to be handed out.
//
// Does nothing when ptr is NULL, when g_use_superslab is off, when the
// HAKMEM_TINY_ACTIVE_FIX toggle is disabled, or when hak_super_lookup() finds
// no SuperSlab carrying SUPERSLAB_MAGIC for ptr.
static inline void tiny_active_account_alloc(void* ptr) {
    if (ptr == NULL) {
        return;
    }
    if (!g_use_superslab) {
        return;
    }
    if (!tiny_active_fix_enabled()) {
        return;
    }

    SuperSlab* owner = hak_super_lookup(ptr);
    if (owner == NULL || owner->magic != SUPERSLAB_MAGIC) {
        return;  // unknown pointer or unexpected magic: leave counters untouched
    }
    ss_active_inc(owner);
}
// Forward declaration for Layer 3 (slow path), so hak_tiny_alloc() can call it
// before its definition. Marked noinline + cold to keep it out of line and out
// of the hot allocation path.
__attribute__((noinline, cold))
static void* tiny_alloc_slow_new(int class_idx);
// ============================================================================
// Main Allocation Function (3-layer architecture)
// ============================================================================
// Allocate a tiny block for `size` bytes through the 3-layer front end.
//
// Order of attempts: optional Tiny Heap v2 front (classes 0-3 when enabled),
// Layer 1 TLS bump allocator (classes 0-2), Layer 2 TLS small magazine,
// then Layer 3 slow path (tiny_alloc_slow_new).
//
// Returns NULL when hak_tiny_size_to_class() reports no tiny class for `size`
// (negative index), or, in non-release builds, when called from inside the
// wrapper while g_wrap_tiny_enabled is off.
void* hak_tiny_alloc(size_t size) {
    // Lazy initialization on first use (cold).
#if HAKMEM_BUILD_RELEASE
    if (__builtin_expect(!g_tiny_initialized, 0)) {
        hak_tiny_init();
    }
#else
    if (!g_tiny_initialized) {
        hak_tiny_init();
    }
#endif

    // Wrapper guard (non-release builds only): refuse to allocate re-entrantly.
#if !HAKMEM_BUILD_RELEASE
#  if HAKMEM_WRAPPER_TLS_GUARD
    if (!g_wrap_tiny_enabled && __builtin_expect(g_tls_in_wrapper != 0, 0)) {
        return NULL;
    }
#  else
    extern int hak_in_wrapper(void);
    if (!g_wrap_tiny_enabled && __builtin_expect(hak_in_wrapper() != 0, 0)) {
        return NULL;
    }
#  endif
#endif

    // Map the request size onto a size class; negative means "not a tiny size".
    int class_idx = hak_tiny_size_to_class(size);
    if (class_idx < 0) {
        return NULL;  // > 1KB
    }

    // DEBUG: trace the first few calls when HAKMEM_TINY_HEAP_V2_DEBUG is set
    // to something other than an empty string or a value starting with '0'.
    static int s_entry_trace = -1;
    if (s_entry_trace == -1) {
        const char* env = getenv("HAKMEM_TINY_HEAP_V2_DEBUG");
        s_entry_trace = (env != NULL && env[0] != '\0' && env[0] != '0') ? 1 : 0;
    }
    if (s_entry_trace) {
        static int s_entry_calls = 0;
        if (s_entry_calls < 3) {
            fprintf(stderr, "[HAK_TINY_ALLOC] Called #%d, size=%zu, class=%d\n",
                    s_entry_calls, size, class_idx);
            s_entry_calls++;
        }
    }

    // Route fingerprint begin (debug-only; no-op unless HAKMEM_ROUTE=1)
    ROUTE_BEGIN(class_idx);

    // Phase 13-A/B: Tiny Heap v2 front (tcache-like, A/B), classes 0-3 only.
    if (__builtin_expect(tiny_heap_v2_enabled() && front_prune_heapv2_enabled() && class_idx <= 3, 0)) {
        static int s_hook_trace = -1;
        if (__builtin_expect(s_hook_trace == -1, 0)) {
            const char* env = getenv("HAKMEM_TINY_HEAP_V2_DEBUG");
            s_hook_trace = (env != NULL && env[0] != '\0' && env[0] != '0') ? 1 : 0;
        }
        if (s_hook_trace) {
            static int s_hook_calls = 0;
            if (s_hook_calls < 5) {
                fprintf(stderr, "[NEW3L-HOOK] class_idx=%d, size=%zu, hook_count=%d\n",
                        class_idx, size, s_hook_calls);
                s_hook_calls++;
            }
        }

        void* heap_base = tiny_heap_v2_alloc_by_class(class_idx);
        if (!heap_base) {
            // Miss: fall through to the regular front path below.
            front_metrics_heapv2_miss(class_idx);
        } else {
            front_metrics_heapv2_hit(class_idx);
            HAK_RET_ALLOC(class_idx, heap_base);  // Header write + return USER pointer
        }
    }

    // Make sure the small magazine is set up before Layer 2 touches it.
    if (__builtin_expect(!g_tiny_small_mag_initialized, 0)) {
        tiny_small_mag_init();
    }

    // ------------------------------------------------------------------------
    // LAYER 1: TLS bump allocator (hot classes 0-2: 8B/16B/32B)
    // Target: 2-3 instructions/op
    // ------------------------------------------------------------------------
    if (likely(class_idx <= 2)) {
        void* bumped = tiny_bump_alloc(class_idx);
        if (likely(bumped)) {
            tiny_active_account_alloc(bumped);
            g_3layer_bump_hits++;
            // Mark: bump hit (reuses hot_hit's bit 8 for convenience)
            ROUTE_MARK(8);
            ROUTE_COMMIT(class_idx, 0x40);
            HAK_RET_ALLOC(class_idx, bumped);
        }
    }

    // ------------------------------------------------------------------------
    // LAYER 2: TLS small magazine (all classes, 128 items)
    // Target: 5-10 instructions/op
    // ------------------------------------------------------------------------
    void* from_mag = small_mag_pop(class_idx);
    if (likely(from_mag)) {
        extern unsigned long long g_front_mag_hit[];
        g_front_mag_hit[class_idx]++;
        tiny_active_account_alloc(from_mag);
        g_3layer_mag_hits++;
        // Mark: small mag hit (reuses bench_hit's bit 10 for convenience)
        ROUTE_MARK(10);
        ROUTE_COMMIT(class_idx, 0x41);
        HAK_RET_ALLOC(class_idx, from_mag);
    }

    // ------------------------------------------------------------------------
    // LAYER 3: slow path (refill, slab allocation)
    // Target: 50-100+ instructions/op (rare)
    // ------------------------------------------------------------------------
    g_3layer_slow_hits++;
    return tiny_alloc_slow_new(class_idx);
}
// ============================================================================
// Layer 3: Slow Path (refill and slab management)
// ============================================================================
__attribute__((noinline, cold))
static void* tiny_alloc_slow_new(int class_idx) {
// ReturnFirst Selector: try Ready/Mailbox/Sticky/Hot/Bench/Registry once
do {
static int g_return_first = -1; // env: HAKMEM_TINY_RETURN_FIRST (default ON)
if (__builtin_expect(g_return_first == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_RETURN_FIRST");
g_return_first = (e && *e == '0') ? 0 : 1;
}
if (__builtin_expect(g_return_first, 1)) {
extern __thread TinyTLSSlab g_tls_slabs[];
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
SuperSlab* rs = tiny_refill_try_fast(class_idx, tls);
(void)rs; // On success, tls->ss is bound and Step 2 will carve
}
} while (0);
// ========================================================================
// Layer 3: Refill Small Magazine and/or Bump from existing infrastructure
// ========================================================================
// Step 1: Try to refill Small Magazine from existing TLS Magazine
tiny_mag_init_if_needed(class_idx);
TinyTLSMag* large_mag = &g_tls_mags[class_idx];
if (large_mag->top > 0) {
// Batch transfer from large magazine (2048) to small magazine
int batch_size = 64; // Transfer in batches of 64
if (batch_size > large_mag->top) batch_size = large_mag->top;
void* items[64];
for (int i = 0; i < batch_size; i++) {
items[i] = large_mag->items[large_mag->top - 1 - i].ptr;
}
large_mag->top -= batch_size;
// Push to Small Magazine
int pushed = small_mag_batch_push(class_idx, items, batch_size);
g_3layer_refill_count++;
g_3layer_refill_items += pushed;
// Try to pop one and return
void* p = small_mag_pop(class_idx);
if (p) {
tiny_active_account_alloc(p);
return p;
}
}
// Step 2: Large Magazine empty - batch carve from SuperSlab directly
// ChatGPT Pro P0: Complete batch化 (based on tls_refill_from_tls_slab:115-126)
if (!g_use_superslab) {
g_3layer_fallback_superslab_disabled++;
return hak_tiny_alloc_slow(0, class_idx);
}
TinyTLSSlab* tls_slab = &g_tls_slabs[class_idx];
if (!tls_slab->ss) {
if (superslab_refill(class_idx) == NULL) {
g_3layer_fallback_no_ss++;
// Optional one-shot debug
static int g_alloc_dbg = -1; if (__builtin_expect(g_alloc_dbg == -1, 0)) { const char* e=getenv("HAKMEM_TINY_ALLOC_DEBUG"); g_alloc_dbg = (e && atoi(e)!=0)?1:0; }
if (g_alloc_dbg) {
static _Atomic int printed_ss[8]; int exp=0;
if (atomic_compare_exchange_strong(&printed_ss[class_idx], &exp, 1)) {
fprintf(stderr, "[ALLOC3] refill returned NULL (no SS) class=%d\n", class_idx);
}
}
return hak_tiny_alloc_slow(0, class_idx); // Fallback
}
}
TinySlabMeta* meta = tls_slab->meta;
if (!meta) {
g_3layer_fallback_no_meta++;
// Optional one-shot debug
static int g_alloc_dbg2 = -1; if (__builtin_expect(g_alloc_dbg2 == -1, 0)) { const char* e=getenv("HAKMEM_TINY_ALLOC_DEBUG"); g_alloc_dbg2 = (e && atoi(e)!=0)?1:0; }
if (g_alloc_dbg2) {
static _Atomic int printed_meta[8]; int exp=0;
if (atomic_compare_exchange_strong(&printed_meta[class_idx], &exp, 1)) {
fprintf(stderr, "[ALLOC3] meta is NULL after refill class=%d\n", class_idx);
}
}
return hak_tiny_alloc_slow(0, class_idx);
}
// Batch carve from SuperSlab (P0 optimization - no 64x function calls!)
uint32_t want = 64; // Refill target
void* items[64];
int got = 0;
// Try freelist first (small amount, usually 0)
while (got < (int)want && meta->freelist) {
void* node = meta->freelist;
meta->freelist = tiny_next_read(node); // Phase E1-CORRECT: Box API
items[got++] = node;
meta->used++;
c7_meta_used_note(class_idx, C7_META_USED_SRC_FRONT);
}
// Then linear carve (KEY OPTIMIZATION - direct array fill!)
if (got < (int)want && meta->used < meta->capacity) {
uint32_t need = want - got;
uint32_t available = meta->capacity - meta->used;
if (need > available) need = available;
size_t block_size = g_tiny_class_sizes[class_idx];
uint8_t* slab_base = tls_slab->slab_base ? tls_slab->slab_base
: tiny_slab_base_for(tls_slab->ss, tls_slab->slab_idx);
uint8_t* cursor = slab_base + ((size_t)meta->used * block_size);
// Batch carve: directly fill items array (no linked list, no 64 function calls!)
for (uint32_t i = 0; i < need; ++i) {
items[got++] = (void*)cursor;
cursor += block_size;
}
meta->used += need; // Reserve to TLS; not active until returned to user
if (class_idx == 7) {
for (uint32_t i = 0; i < need; ++i) {
c7_meta_used_note(class_idx, C7_META_USED_SRC_FRONT);
}
}
}
if (got == 0) {
// Slab exhausted, try refill and retry once
if (superslab_refill(class_idx) != NULL) {
return tiny_alloc_slow_new(class_idx); // Recursive retry
}
static int g_alloc_dbg3 = -1; if (__builtin_expect(g_alloc_dbg3 == -1, 0)) { const char* e=getenv("HAKMEM_TINY_ALLOC_DEBUG"); g_alloc_dbg3 = (e && atoi(e)!=0)?1:0; }
if (g_alloc_dbg3) {
static _Atomic int printed_final[8]; int exp=0;
if (atomic_compare_exchange_strong(&printed_final[class_idx], &exp, 1)) {
fprintf(stderr, "[ALLOC3] no items after retry (final fallback) class=%d\n", class_idx);
}
}
return hak_tiny_alloc_slow(0, class_idx); // Ultimate fallback
}
// Take one for return, push rest to Small Magazine
g_3layer_batch_carve_count++;
void* result = items[0];
if (got > 1) {
int pushed = small_mag_batch_push(class_idx, &items[1], got - 1);
g_3layer_refill_count++;
g_3layer_refill_items += pushed;
}
tiny_active_account_alloc(result);
// Route: slab carve directlinear相当の採用扱い
ROUTE_MARK(11); ROUTE_COMMIT(class_idx, 0x60);
return result;
}
// Debug function: print layer statistics to stderr.
//
// Registered via __attribute__((destructor)). The counters are __thread
// variables, so the report covers only the thread that executes this
// function. Nothing is printed when that thread counted no allocations.
//
// The counters are uint64_t; they are cast to unsigned long long and printed
// with %llu so the format matches the argument type on every platform (the
// previous %lu was only correct where uint64_t is unsigned long).
__attribute__((destructor))
static void print_3layer_stats(void) {
    uint64_t total = g_3layer_bump_hits + g_3layer_mag_hits + g_3layer_slow_hits;
    if (total == 0) {
        return;
    }

    fprintf(stderr, "\n=== 3-Layer Architecture Stats ===\n");
    fprintf(stderr, "Bump hits: %10llu (%5.2f%%)\n",
            (unsigned long long)g_3layer_bump_hits, 100.0 * g_3layer_bump_hits / total);
    fprintf(stderr, "Mag hits: %10llu (%5.2f%%)\n",
            (unsigned long long)g_3layer_mag_hits, 100.0 * g_3layer_mag_hits / total);
    fprintf(stderr, "Slow hits: %10llu (%5.2f%%)\n",
            (unsigned long long)g_3layer_slow_hits, 100.0 * g_3layer_slow_hits / total);
    fprintf(stderr, "Total allocs: %10llu\n", (unsigned long long)total);
    fprintf(stderr, "Refill count: %10llu\n", (unsigned long long)g_3layer_refill_count);
    // Guard the average against division by zero when no refill happened.
    fprintf(stderr, "Refill items: %10llu (avg %.1f/refill)\n",
            (unsigned long long)g_3layer_refill_items,
            g_3layer_refill_count > 0 ? (double)g_3layer_refill_items / g_3layer_refill_count : 0.0);
    fprintf(stderr, "=== Fallback Paths ===\n");
    fprintf(stderr, "SuperSlab disabled: %llu\n",
            (unsigned long long)g_3layer_fallback_superslab_disabled);
    fprintf(stderr, "No SuperSlab: %llu\n", (unsigned long long)g_3layer_fallback_no_ss);
    fprintf(stderr, "No meta: %llu\n", (unsigned long long)g_3layer_fallback_no_meta);
    fprintf(stderr, "Batch carve count: %llu\n", (unsigned long long)g_3layer_batch_carve_count);
}