// hakmem_tiny_ultra_simple.inc
// Phase 6-1.5: Ultra-Simple Fast Path integrated with existing HAKMEM
//
// Design: "Simple Front + Smart Back" (inspired by Mid-Large HAKX +171%)
// - Front: Ultra-simple TLS SLL (reuse existing g_tls_sll_head[])
// - Back: Existing SuperSlab + ACE + Learning layer
//
// Key insight: HAKMEM already HAS the infrastructure!
// - g_tls_sll_head[] exists (line 492 of hakmem_tiny.c)
// - sll_refill_small_from_ss() exists (hakmem_tiny_refill.inc.h:187)
// - Just need to remove the overhead layers!
#ifndef HAKMEM_TINY_ULTRA_SIMPLE_INC
|
|
|
|
|
#define HAKMEM_TINY_ULTRA_SIMPLE_INC
|
|
|
|
|
|
2025-11-07 01:27:04 +09:00
|
|
|
// SFC integration
|
|
|
|
|
#include "tiny_alloc_fast_sfc.inc.h"
|
|
|
|
|
|
// ============================================================================
// Phase 6-1.5: Ultra-Simple Allocator (uses existing infrastructure)
// ============================================================================
// This replaces the complex multi-layer fast path with a 3-4 instruction path
// while keeping all existing backend infrastructure (SuperSlab, ACE, Learning)

// Forward declarations for external TLS variables and functions
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
|
|
|
|
|
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
|
|
|
|
|
|
|
|
|
|
static __thread int g_ultra_simple_called = 0;
|
|
|
|
|
|
|
|
|
|
// NOTE: These functions are NOT static because they need to be called from hakmem.c
|
|
|
|
|
// They MUST be defined in hakmem_tiny.c where TLS variables are accessible
|
|
|
|
|
void* hak_tiny_alloc_ultra_simple(size_t size) {
|
|
|
|
|
// DEBUG: Mark that we're using ultra_simple path (disabled in release)
|
|
|
|
|
#ifdef HAKMEM_DEBUG_VERBOSE
|
|
|
|
|
if (!g_ultra_simple_called) {
|
|
|
|
|
fprintf(stderr, "[PHASE 6-1.5] Ultra-simple path ACTIVE!\n");
|
|
|
|
|
g_ultra_simple_called = 1;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// 1. Size → class (inline function, existing)
|
|
|
|
|
int class_idx = hak_tiny_size_to_class(size);
|
|
|
|
|
if (__builtin_expect(class_idx < 0, 0)) {
|
|
|
|
|
return NULL; // >1KB
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 2. Ultra-fast path: Pop from existing TLS SLL (Phase 6-1 style!)
|
|
|
|
|
// This is IDENTICAL to Phase 6-1 but uses existing g_tls_sll_head[]
|
|
|
|
|
void* head = g_tls_sll_head[class_idx];
|
|
|
|
|
if (__builtin_expect(head != NULL, 1)) {
|
|
|
|
|
g_tls_sll_head[class_idx] = *(void**)head; // 1-instruction pop!
|
|
|
|
|
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
|
|
|
|
|
HAK_RET_ALLOC(class_idx, head);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 3. Miss: Refill from existing SuperSlab infrastructure
|
|
|
|
|
// This gives us ACE, Learning layer, L25 integration for free!
|
|
|
|
|
// Tunable refill count (env: HAKMEM_TINY_REFILL_COUNT, default 32)
|
|
|
|
|
static int s_refill_count = 0;
|
|
|
|
|
if (__builtin_expect(s_refill_count == 0, 0)) {
|
|
|
|
|
int def = 32; // smaller refill improves warm-up and reuse density
|
|
|
|
|
char* env = getenv("HAKMEM_TINY_REFILL_COUNT");
|
|
|
|
|
int v = (env ? atoi(env) : def);
|
|
|
|
|
if (v < 8) v = 8; // clamp to sane range
|
|
|
|
|
if (v > 256) v = 256;
|
|
|
|
|
s_refill_count = v;
|
|
|
|
|
}
|
|
|
|
|
int refill_count = s_refill_count;
|
|
|
|
|
if (sll_refill_small_from_ss(class_idx, refill_count) > 0) {
|
|
|
|
|
head = g_tls_sll_head[class_idx];
|
|
|
|
|
if (head) {
|
|
|
|
|
g_tls_sll_head[class_idx] = *(void**)head;
|
|
|
|
|
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
|
|
|
|
|
HAK_RET_ALLOC(class_idx, head);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 4. Fallback to slow path (existing infrastructure)
|
|
|
|
|
void* slow_ptr = hak_tiny_alloc_slow(size, class_idx);
|
|
|
|
|
if (slow_ptr) {
|
|
|
|
|
HAK_RET_ALLOC(class_idx, slow_ptr);
|
|
|
|
|
}
|
|
|
|
|
return slow_ptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ============================================================================
// Ultra-Simple Free Path (bypasses free.part.0 complexity)
// ============================================================================
// This eliminates the 38.43% free path overhead identified by perf analysis:
// - free.part.0: 15.83%
// - mid_lookup: 9.55%
// - pthread locks: 8.81%
// Just 2-3 instructions: owner check → push to TLS SLL
static __thread int g_ultra_simple_free_called = 0;
|
|
|
|
|
static __thread uint64_t g_ultra_simple_free_count = 0;
|
|
|
|
|
|
|
|
|
|
// Ultra-fast class guess from pointer alignment (Phase 6-1.6: CTZ optimization)
|
|
|
|
|
// This is FAST but may be wrong - validation happens later!
|
|
|
|
|
static inline int guess_class_from_alignment(void* ptr) {
|
|
|
|
|
uintptr_t addr = (uintptr_t)ptr;
|
|
|
|
|
|
|
|
|
|
// Quick check: not 8-byte aligned → not Tiny
|
|
|
|
|
if (__builtin_expect((addr & 7) != 0, 0)) return -1;
|
|
|
|
|
|
|
|
|
|
// Fast path: Use Count Trailing Zeros (1 instruction!)
|
|
|
|
|
// Tiny classes: 8B(cls0), 16B(cls1), 32B(cls2), 64B(cls3), 128B(cls4), 256B(cls5), 512B(cls6), 1KB(cls7)
|
|
|
|
|
// 8B: addr ends ...000 → ctz=3 → cls=0
|
|
|
|
|
// 16B: addr ends ...0000 → ctz=4 → cls=1
|
|
|
|
|
// 32B: addr ends ...00000 → ctz=5 → cls=2
|
|
|
|
|
// 64B: addr ends ...000000 → ctz=6 → cls=3
|
|
|
|
|
int trailing_zeros = __builtin_ctzl(addr);
|
|
|
|
|
int class_idx = trailing_zeros - 3; // Subtract 3 (log2(8))
|
|
|
|
|
|
|
|
|
|
// Clamp to valid range (0-7 for Tiny classes)
|
|
|
|
|
if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
|
|
|
|
|
return -1; // Invalid alignment
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return class_idx;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// NOTE: This function is NOT static because it needs to be called from hakmem.c
|
|
|
|
|
// It MUST be defined in hakmem_tiny.c where TLS variables are accessible
|
|
|
|
|
void hak_tiny_free_ultra_simple(void* ptr) {
|
2025-11-07 01:27:04 +09:00
|
|
|
// DEBUG: Mark that we're using ultra_simple free path (always enabled for SFC debug)
|
|
|
|
|
static __thread int free_entry_count = 0;
|
|
|
|
|
if (getenv("HAKMEM_SFC_DEBUG") && free_entry_count < 20) {
|
|
|
|
|
free_entry_count++;
|
|
|
|
|
fprintf(stderr, "[ULTRA_FREE_ENTRY] ptr=%p, count=%d\n", ptr, free_entry_count);
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
#ifdef HAKMEM_DEBUG_VERBOSE
|
|
|
|
|
if (!g_ultra_simple_free_called) {
|
|
|
|
|
fprintf(stderr, "[PHASE 6-1.5] Ultra-simple FREE path ACTIVE (LAZY VALIDATION)!\n");
|
|
|
|
|
g_ultra_simple_free_called = 1;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// Prefer safe same-thread detection over pure alignment guessing to avoid
|
|
|
|
|
// capturing cross-thread frees into the wrong TLS SLL (Larson MT case).
|
|
|
|
|
|
|
|
|
|
// 1) SuperSlab-backed tiny pointer?
|
|
|
|
|
if (__builtin_expect(g_use_superslab != 0, 1)) {
|
|
|
|
|
SuperSlab* ss = hak_super_lookup(ptr);
|
|
|
|
|
if (__builtin_expect(ss != NULL && ss->magic == SUPERSLAB_MAGIC, 0)) {
|
|
|
|
|
int slab_idx = slab_index_for(ss, ptr);
|
|
|
|
|
TinySlabMeta* meta = &ss->slabs[slab_idx];
|
|
|
|
|
uint32_t self_tid = tiny_self_u32();
|
|
|
|
|
if (__builtin_expect(meta->owner_tid == self_tid, 1)) {
|
|
|
|
|
int class_idx = ss->size_class;
|
2025-11-07 01:27:04 +09:00
|
|
|
|
|
|
|
|
// SFC Integration: Same as tiny_free_fast_ss() in tiny_free_fast.inc.h
|
|
|
|
|
extern int g_sfc_enabled;
|
|
|
|
|
|
|
|
|
|
// Debug: Track ultra_simple free path (SFC integration) - BEFORE SFC call
|
|
|
|
|
static __thread int ultra_free_debug_count = 0;
|
|
|
|
|
if (getenv("HAKMEM_SFC_DEBUG") && ultra_free_debug_count < 20) {
|
|
|
|
|
ultra_free_debug_count++;
|
|
|
|
|
fprintf(stderr, "[ULTRA_FREE_SS] ptr=%p, cls=%d, sfc_enabled=%d\n",
|
|
|
|
|
ptr, class_idx, g_sfc_enabled);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (g_sfc_enabled) {
|
|
|
|
|
// Try SFC (128 slots)
|
|
|
|
|
// Debug: Log before calling sfc_free_push
|
|
|
|
|
static __thread int push_attempt_count = 0;
|
|
|
|
|
if (getenv("HAKMEM_SFC_DEBUG") && push_attempt_count < 20) {
|
|
|
|
|
push_attempt_count++;
|
|
|
|
|
fprintf(stderr, "[ULTRA_FREE_PUSH_ATTEMPT] cls=%d, ptr=%p\n", class_idx, ptr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!sfc_free_push(class_idx, ptr)) {
|
|
|
|
|
// SFC full → skip caching, delegate to slow path
|
|
|
|
|
// Do NOT fall back to SLL - it has no capacity check!
|
|
|
|
|
hak_tiny_free(ptr);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// Old SLL path (16 slots)
|
|
|
|
|
*(void**)ptr = g_tls_sll_head[class_idx];
|
|
|
|
|
g_tls_sll_head[class_idx] = ptr;
|
|
|
|
|
g_tls_sll_count[class_idx]++;
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
// Active accounting on free
|
|
|
|
|
ss_active_dec_one(ss);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
// Cross-thread free → delegate to full tiny free
|
|
|
|
|
hak_tiny_free(ptr);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 2) Legacy TinySlab-backed pointer?
|
|
|
|
|
TinySlab* slab = hak_tiny_owner_slab(ptr);
|
|
|
|
|
if (__builtin_expect(slab != NULL, 0)) {
|
|
|
|
|
if (__builtin_expect(pthread_equal(slab->owner_tid, tiny_self_pt()), 1)) {
|
|
|
|
|
int class_idx = slab->class_idx;
|
2025-11-07 01:27:04 +09:00
|
|
|
|
|
|
|
|
// SFC Integration: Same as tiny_free_fast_legacy() in tiny_free_fast.inc.h
|
|
|
|
|
extern int g_sfc_enabled;
|
|
|
|
|
if (g_sfc_enabled) {
|
|
|
|
|
// Try SFC (128 slots)
|
|
|
|
|
if (!sfc_free_push(class_idx, ptr)) {
|
|
|
|
|
// SFC full → skip caching, delegate to slow path
|
|
|
|
|
// Do NOT fall back to SLL - it has no capacity check!
|
|
|
|
|
hak_tiny_free_with_slab(ptr, slab);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// Old SLL path (16 slots)
|
|
|
|
|
*(void**)ptr = g_tls_sll_head[class_idx];
|
|
|
|
|
g_tls_sll_head[class_idx] = ptr;
|
|
|
|
|
g_tls_sll_count[class_idx]++;
|
|
|
|
|
}
|
2025-11-05 12:31:14 +09:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
// Cross-thread free → precise path with known slab
|
|
|
|
|
hak_tiny_free_with_slab(ptr, slab);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 3) Fallback: Not a tiny allocation (or unknown) → delegate
|
|
|
|
|
hak_free_at(ptr, 0, 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#endif // HAKMEM_TINY_ULTRA_SIMPLE_INC
|