// hakmem_tiny_ultra_simple.inc // Phase 6-1.5: Ultra-Simple Fast Path integrated with existing HAKMEM // // Design: "Simple Front + Smart Back" (inspired by Mid-Large HAKX +171%) // - Front: Ultra-simple TLS SLL (reuse existing g_tls_sll_head[]) // - Back: Existing SuperSlab + ACE + Learning layer // // Key insight: HAKMEM already HAS the infrastructure! // - g_tls_sll_head[] exists (line 492 of hakmem_tiny.c) // - sll_refill_small_from_ss() exists (hakmem_tiny_refill.inc.h:187) // - Just need to remove the overhead layers! #ifndef HAKMEM_TINY_ULTRA_SIMPLE_INC #define HAKMEM_TINY_ULTRA_SIMPLE_INC // SFC integration #include "tiny_alloc_fast_sfc.inc.h" #include "box/tls_sll_box.h" // Box TLS-SLL API // ============================================================================ // Phase 6-1.5: Ultra-Simple Allocator (uses existing infrastructure) // ============================================================================ // This replaces the complex multi-layer fast path with a 3-4 instruction path // while keeping all existing backend infrastructure (SuperSlab, ACE, Learning) // Forward declarations for external TLS variables and functions extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES]; extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES]; static __thread int g_ultra_simple_called = 0; // NOTE: These functions are NOT static because they need to be called from hakmem.c // They MUST be defined in hakmem_tiny.c where TLS variables are accessible void* hak_tiny_alloc_ultra_simple(size_t size) { // DEBUG: Mark that we're using ultra_simple path (disabled in release) #ifdef HAKMEM_DEBUG_VERBOSE if (!g_ultra_simple_called) { fprintf(stderr, "[PHASE 6-1.5] Ultra-simple path ACTIVE!\n"); g_ultra_simple_called = 1; } #endif // 1. Size → class (inline function, existing) int class_idx = hak_tiny_size_to_class(size); if (__builtin_expect(class_idx < 0, 0)) { return NULL; // >1KB } // 2. Ultra-fast path: Pop from existing TLS SLL (Phase 6-1 style!) // This is IDENTICAL to Phase 6-1 but uses existing g_tls_sll_head[] void* head = NULL; if (tls_sll_pop(class_idx, &head)) { HAK_RET_ALLOC(class_idx, head); } // 3. Miss: Refill from existing SuperSlab infrastructure // This gives us ACE, Learning layer, L25 integration for free! // Tunable refill count (env: HAKMEM_TINY_REFILL_COUNT, default 32) static int s_refill_count = 0; if (__builtin_expect(s_refill_count == 0, 0)) { int def = 32; // smaller refill improves warm-up and reuse density char* env = getenv("HAKMEM_TINY_REFILL_COUNT"); int v = (env ? atoi(env) : def); if (v < 8) v = 8; // clamp to sane range if (v > 256) v = 256; s_refill_count = v; } int refill_count = s_refill_count; #if HAKMEM_TINY_P0_BATCH_REFILL if (sll_refill_batch_from_ss(class_idx, refill_count) > 0) { #else if (sll_refill_small_from_ss(class_idx, refill_count) > 0) { #endif if (tls_sll_pop(class_idx, &head)) { HAK_RET_ALLOC(class_idx, head); } } // 4. Fallback to slow path (existing infrastructure) void* slow_ptr = hak_tiny_alloc_slow(size, class_idx); if (slow_ptr) { HAK_RET_ALLOC(class_idx, slow_ptr); } return slow_ptr; } // ============================================================================ // Ultra-Simple Free Path (bypasses free.part.0 complexity) // ============================================================================ // This eliminates the 38.43% free path overhead identified by perf analysis: // - free.part.0: 15.83% // - mid_lookup: 9.55% // - pthread locks: 8.81% // Just 2-3 instructions: owner check → push to TLS SLL static __thread int g_ultra_simple_free_called = 0; static __thread uint64_t g_ultra_simple_free_count = 0; // Ultra-fast class guess from pointer alignment (Phase 6-1.6: CTZ optimization) // This is FAST but may be wrong - validation happens later! static inline int guess_class_from_alignment(void* ptr) { uintptr_t addr = (uintptr_t)ptr; // Quick check: not 8-byte aligned → not Tiny if (__builtin_expect((addr & 7) != 0, 0)) return -1; // Fast path: Use Count Trailing Zeros (1 instruction!) // Tiny classes: 8B(cls0), 16B(cls1), 32B(cls2), 64B(cls3), 128B(cls4), 256B(cls5), 512B(cls6), 1KB(cls7) // 8B: addr ends ...000 → ctz=3 → cls=0 // 16B: addr ends ...0000 → ctz=4 → cls=1 // 32B: addr ends ...00000 → ctz=5 → cls=2 // 64B: addr ends ...000000 → ctz=6 → cls=3 int trailing_zeros = __builtin_ctzl(addr); int class_idx = trailing_zeros - 3; // Subtract 3 (log2(8)) // Clamp to valid range (0-7 for Tiny classes) if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) { return -1; // Invalid alignment } return class_idx; } // NOTE: This function is NOT static because it needs to be called from hakmem.c // It MUST be defined in hakmem_tiny.c where TLS variables are accessible void hak_tiny_free_ultra_simple(void* ptr) { // DEBUG: Mark that we're using ultra_simple free path (always enabled for SFC debug) static __thread int free_entry_count = 0; if (getenv("HAKMEM_SFC_DEBUG") && free_entry_count < 20) { free_entry_count++; fprintf(stderr, "[ULTRA_FREE_ENTRY] ptr=%p, count=%d\n", ptr, free_entry_count); } #ifdef HAKMEM_DEBUG_VERBOSE if (!g_ultra_simple_free_called) { fprintf(stderr, "[PHASE 6-1.5] Ultra-simple FREE path ACTIVE (LAZY VALIDATION)!\n"); g_ultra_simple_free_called = 1; } #endif // Prefer safe same-thread detection over pure alignment guessing to avoid // capturing cross-thread frees into the wrong TLS SLL (Larson MT case). // 1) SuperSlab-backed tiny pointer? if (__builtin_expect(g_use_superslab != 0, 1)) { SuperSlab* ss = hak_super_lookup(ptr); if (__builtin_expect(ss != NULL && ss->magic == SUPERSLAB_MAGIC, 0)) { // ✅ FIX: Phase E1-CORRECT - Convert USER → BASE before slab index calculation void* base = (void*)((uint8_t*)ptr - 1); int slab_idx = slab_index_for(ss, base); TinySlabMeta* meta = &ss->slabs[slab_idx]; uint32_t self_tid = tiny_self_u32(); if (__builtin_expect(meta->owner_tid == self_tid, 1)) { int class_idx = ss->size_class; // SFC Integration: Same as tiny_free_fast_ss() in tiny_free_fast.inc.h extern int g_sfc_enabled; // Debug: Track ultra_simple free path (SFC integration) - BEFORE SFC call static __thread int ultra_free_debug_count = 0; if (getenv("HAKMEM_SFC_DEBUG") && ultra_free_debug_count < 20) { ultra_free_debug_count++; fprintf(stderr, "[ULTRA_FREE_SS] ptr=%p, cls=%d, sfc_enabled=%d\n", ptr, class_idx, g_sfc_enabled); } if (g_sfc_enabled) { // Try SFC (128 slots) // Debug: Log before calling sfc_free_push static __thread int push_attempt_count = 0; if (getenv("HAKMEM_SFC_DEBUG") && push_attempt_count < 20) { push_attempt_count++; fprintf(stderr, "[ULTRA_FREE_PUSH_ATTEMPT] cls=%d, ptr=%p\n", class_idx, ptr); } if (!sfc_free_push(class_idx, ptr)) { // SFC full → skip caching, delegate to slow path // Do NOT fall back to SLL - it has no capacity check! hak_tiny_free(ptr); return; } } else { // Old SLL path (16 slots) - Use Box TLS-SLL API if (!tls_sll_push(class_idx, ptr, UINT32_MAX)) { // C7 rejected or capacity exceeded - fallback to slow path hak_tiny_free(ptr); return; } } // Active accounting on free ss_active_dec_one(ss); return; } // Cross-thread free → delegate to full tiny free hak_tiny_free(ptr); return; } } // 2) Legacy TinySlab-backed pointer? TinySlab* slab = hak_tiny_owner_slab(ptr); if (__builtin_expect(slab != NULL, 0)) { if (__builtin_expect(pthread_equal(slab->owner_tid, tiny_self_pt()), 1)) { int class_idx = slab->class_idx; // SFC Integration: Same as tiny_free_fast_legacy() in tiny_free_fast.inc.h extern int g_sfc_enabled; if (g_sfc_enabled) { // Try SFC (128 slots) if (!sfc_free_push(class_idx, ptr)) { // SFC full → skip caching, delegate to slow path // Do NOT fall back to SLL - it has no capacity check! hak_tiny_free_with_slab(ptr, slab); return; } } else { // Old SLL path (16 slots) - Use Box TLS-SLL API if (!tls_sll_push(class_idx, ptr, UINT32_MAX)) { // C7 rejected or capacity exceeded - fallback to slow path hak_tiny_free_with_slab(ptr, slab); return; } } return; } // Cross-thread free → precise path with known slab hak_tiny_free_with_slab(ptr, slab); return; } // 3) Fallback: Not a tiny allocation (or unknown) → delegate hak_free_at(ptr, 0, 0); } #endif // HAKMEM_TINY_ULTRA_SIMPLE_INC