diff --git a/core/front/malloc_tiny_fast.h b/core/front/malloc_tiny_fast.h
index 8db1c9ba..05b6da1e 100644
--- a/core/front/malloc_tiny_fast.h
+++ b/core/front/malloc_tiny_fast.h
@@ -28,7 +28,7 @@
 #include "../hakmem_build_flags.h"
 #include "../hakmem_tiny_config.h"          // For TINY_NUM_CLASSES
 #include "../hakmem_super_registry.h"       // For cross-thread owner check
-#include "../superslab/superslab_inline.h"  // For slab_index_for
+#include "../superslab/superslab_inline.h"  // For ss_fast_lookup, slab_index_for (Phase 12)
 #include "../box/ss_slab_meta_box.h"        // For ss_slab_meta_owner_tid_low_get
 #include "../box/free_remote_box.h"         // For tiny_free_remote_box
 #include "tiny_unified_cache.h"             // For unified_cache_pop_or_refill
@@ -159,8 +159,9 @@ static inline int free_tiny_fast(void* ptr) {
     }
 
     if (__builtin_expect(g_larson_fix, 0)) {
-        SuperSlab* ss = hak_super_lookup(base);
-        if (ss && ss->magic == SUPERSLAB_MAGIC) {
+        // Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
+        SuperSlab* ss = ss_fast_lookup(base);
+        if (ss) {
             int slab_idx = slab_index_for(ss, base);
             if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
                 uint32_t self_tid = tiny_self_u32_local();
diff --git a/core/superslab/superslab_inline.h b/core/superslab/superslab_inline.h
index 28c6a8d7..6e81573a 100644
--- a/core/superslab/superslab_inline.h
+++ b/core/superslab/superslab_inline.h
@@ -11,6 +11,33 @@ void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMe
 // Optional debug counter (defined in hakmem_tiny_superslab.c)
 extern _Atomic uint64_t g_ss_active_dec_calls;
 
+// ========== Fast SuperSlab Lookup via Mask (Phase 12 optimization) ==========
+// Purpose: Replace expensive hak_super_lookup() with O(1) mask calculation
+// Invariant: All SuperSlabs are aligned to at least SUPERSLAB_SIZE_MIN (1MB)
+// Cost: ~5-10 cycles (vs 50-100 cycles for registry lookup)
+static inline SuperSlab* ss_fast_lookup(void* ptr)
+{
+    if (__builtin_expect(!ptr, 0)) return NULL;
+
+    uintptr_t p = (uintptr_t)ptr;
+    // Step 1: Mask with minimum SuperSlab size (1MB alignment)
+    // Note: 2MB SuperSlabs are also 1MB aligned, so this works for both
+    SuperSlab* ss = (SuperSlab*)(p & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
+
+    // Step 2: Validate magic (quick reject for non-SuperSlab memory)
+    if (__builtin_expect(ss->magic != SUPERSLAB_MAGIC, 0)) {
+        return NULL;
+    }
+
+    // Step 3: Range check (ptr must be within this SuperSlab)
+    size_t ss_size = (size_t)1 << ss->lg_size;
+    if (__builtin_expect(p >= (uintptr_t)ss + ss_size, 0)) {
+        return NULL;
+    }
+
+    return ss;
+}
+
 // Return maximum number of slabs for this SuperSlab based on lg_size.
 static inline int ss_slabs_capacity(SuperSlab* ss) {
diff --git a/core/tiny_free_fast.inc.h b/core/tiny_free_fast.inc.h
index d1f9ac49..e3fb54d3 100644
--- a/core/tiny_free_fast.inc.h
+++ b/core/tiny_free_fast.inc.h
@@ -215,10 +215,11 @@ static inline void tiny_free_fast(void* ptr) {
     }
 
     // 1. SuperSlab-backed tiny pointer?
     if (__builtin_expect(g_use_superslab != 0, 1)) {
-        SuperSlab* ss = hak_super_lookup(ptr);
-        if (__builtin_expect(ss != NULL && ss->magic == SUPERSLAB_MAGIC, 0)) {
-            // ✅ FIX: Phase E1-CORRECT - Convert USER → BASE before slab index calculation
-            void* base = (void*)((uint8_t*)ptr - 1);
+        // Phase 12 optimization: Use fast mask-based lookup instead of registry
+        // ss_fast_lookup does: mask + magic check + range check (~5-10 cycles vs 50-100)
+        void* base = (void*)((uint8_t*)ptr - 1);  // Convert USER → BASE first
+        SuperSlab* ss = ss_fast_lookup(base);
+        if (__builtin_expect(ss != NULL, 1)) {
             int slab_idx = slab_index_for(ss, base);
             uint32_t self_tid = tiny_self_u32();
diff --git a/core/tiny_free_fast_v2.inc.h b/core/tiny_free_fast_v2.inc.h
index 6fdaf126..56dc6a9d 100644
--- a/core/tiny_free_fast_v2.inc.h
+++ b/core/tiny_free_fast_v2.inc.h
@@ -194,7 +194,8 @@ static inline int hak_tiny_free_fast_v2(void* ptr) {
 
     if (__builtin_expect(g_larson_fix, 0)) {
         // Cross-thread check enabled - MT safe mode
-        SuperSlab* ss = hak_super_lookup(base);
+        // Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
+        SuperSlab* ss = ss_fast_lookup(base);
         if (__builtin_expect(ss != NULL, 1)) {
             int slab_idx = slab_index_for(ss, base);
             if (__builtin_expect(slab_idx >= 0, 1)) {
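
For illustration, a minimal standalone sketch of the same mask-based lookup pattern the patch introduces, assuming hypothetical stand-in names (ArenaHeader, ARENA_ALIGN, ARENA_MAGIC, arena_from_ptr) rather than the real SuperSlab types. It restates the patch's stated invariant that every region header sits on an address aligned to the minimum region size and begins with a magic word, and shows why both the magic check and the range check are needed after masking.

/* Sketch only, not part of the patch: mask-based header recovery from an
 * interior pointer. All names below are hypothetical stand-ins. */
#include <stdint.h>
#include <stddef.h>

#define ARENA_ALIGN  ((uintptr_t)1 << 20)   /* minimum region size/alignment (1 MiB) */
#define ARENA_MAGIC  0xA57E51ABu            /* arbitrary magic word for the sketch */

typedef struct {
    uint32_t magic;    /* first field, so the masked address can be probed directly */
    uint8_t  lg_size;  /* region covers (1 << lg_size) bytes starting at the header */
} ArenaHeader;

/* Recover the owning header from an interior pointer in O(1):
 * round the address down to the alignment boundary, then validate. */
static inline ArenaHeader* arena_from_ptr(void* ptr)
{
    if (!ptr) return NULL;
    uintptr_t p = (uintptr_t)ptr;
    ArenaHeader* a = (ArenaHeader*)(p & ~(ARENA_ALIGN - 1u));
    if (a->magic != ARENA_MAGIC) return NULL;             /* masked address is not a header */
    if (p >= (uintptr_t)a + ((size_t)1 << a->lg_size))    /* pointer lies past this region */
        return NULL;
    return a;
}

Masking alone only yields a candidate header, so the magic word filters out non-region memory and the size-based range check filters out addresses beyond the region's end. Unlike a registry lookup, the probe dereferences the candidate address, so the technique relies on that aligned address being mapped whenever the caller can pass a pointer that is not region-backed.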