diff --git a/core/hakmem_tiny_free.inc b/core/hakmem_tiny_free.inc
index 8b90a569..bbf212e7 100644
--- a/core/hakmem_tiny_free.inc
+++ b/core/hakmem_tiny_free.inc
@@ -207,9 +207,15 @@ void hak_tiny_free_with_slab(void* ptr, TinySlab* slab) {
     }
 
     if (g_fast_enable && g_fast_cap[class_idx] != 0) {
-        // Push block base into fast cache
+        // Push block base into array stack for C0–C3, otherwise into TLS fast list
         void* base = (class_idx == 7) ? ptr : (void*)((uint8_t*)ptr - 1);
-        if (tiny_fast_push(class_idx, base)) {
+        int pushed = 0;
+        if (__builtin_expect(g_fastcache_enable && class_idx <= 3, 1)) {
+            pushed = fastcache_push(class_idx, base);
+        } else {
+            pushed = tiny_fast_push(class_idx, base);
+        }
+        if (pushed) {
             tiny_debug_ring_record(TINY_RING_EVENT_FREE_FAST, (uint16_t)class_idx, ptr, slab_idx);
             HAK_STAT_FREE(class_idx);
             return;
diff --git a/core/tiny_alloc_fast_inline.h b/core/tiny_alloc_fast_inline.h
index 8db14b1f..b3977135 100644
--- a/core/tiny_alloc_fast_inline.h
+++ b/core/tiny_alloc_fast_inline.h
@@ -7,8 +7,10 @@
 #define TINY_ALLOC_FAST_INLINE_H
 
 #include
+#include
 #include "hakmem_build_flags.h"
 #include "tiny_remote.h"  // for TINY_REMOTE_SENTINEL (defense-in-depth)
+#include "tiny_nextptr.h"
 
 // External TLS variables (defined in hakmem_tiny.c)
 extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
@@ -49,9 +51,8 @@ extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
         if (g_tls_sll_count[(class_idx)] > 0) g_tls_sll_count[(class_idx)]--; \
         (ptr_out) = NULL; \
     } else { \
-        /* Phase 7: header-aware next (C0-C6: base+1, C7: base) */ \
-        size_t _off = (HAKMEM_TINY_HEADER_CLASSIDX ? (((class_idx) == 7) ? 0 : 1) : 0); \
-        void* _next = *(void**)((uint8_t*)_head + _off); \
+        /* Safe load of header-aware next (avoid UB on unaligned) */ \
+        void* _next = tiny_next_load(_head, (class_idx)); \
         g_tls_sll_head[(class_idx)] = _next; \
         if (g_tls_sll_count[(class_idx)] > 0) { \
             g_tls_sll_count[(class_idx)]--; \
@@ -83,9 +84,8 @@ extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
 //   mov %rsi, g_tls_sll_head(%rdi)
 //
 #define TINY_ALLOC_FAST_PUSH_INLINE(class_idx, ptr) do { \
-    /* Phase 7: header-aware next (C0-C6: base+1, C7: base) */ \
-    size_t _off = (HAKMEM_TINY_HEADER_CLASSIDX ? (((class_idx) == 7) ? 0 : 1) : 0); \
-    *(void**)((uint8_t*)(ptr) + _off) = g_tls_sll_head[(class_idx)]; \
+    /* Safe store of header-aware next (avoid UB on unaligned) */ \
+    tiny_next_store((ptr), (class_idx), g_tls_sll_head[(class_idx)]); \
     g_tls_sll_head[(class_idx)] = (ptr); \
     g_tls_sll_count[(class_idx)]++; \
 } while(0)
diff --git a/core/tiny_superslab_alloc.inc.h b/core/tiny_superslab_alloc.inc.h
index c0fbfad9..209fae50 100644
--- a/core/tiny_superslab_alloc.inc.h
+++ b/core/tiny_superslab_alloc.inc.h
@@ -18,6 +18,25 @@
 static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
     TinySlabMeta* meta = &ss->slabs[slab_idx];
 
+    // Phase 1 (Small): For hottest tiny classes (C0–C3), prefer strict bump-only
+    // when there is no pending remote and the freelist is empty. This avoids
+    // pointer-chasing and header writes entirely on the common path.
+    do {
+        if (__builtin_expect(ss->size_class <= 3, 1)) {
+            // Skip if remote queue has pending nodes
+            if (atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0)
+                break;
+            if (meta->freelist == NULL && meta->used < meta->capacity) {
+                size_t unit_sz = tiny_stride_for_class(ss->size_class);
+                uint8_t* base = tiny_slab_base_for_geometry(ss, slab_idx);
+                void* block = tiny_block_at_index(base, meta->used, unit_sz);
+                meta->used++;
+                ss_active_inc(ss);
+                HAK_RET_ALLOC(ss->size_class, block);
+            }
+        }
+    } while (0);
+
     // Ensure remote queue is drained before handing blocks back to TLS (UNLIKELY in 1T)
     if (__builtin_expect(atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0, 0)) {
         uint32_t self_tid = tiny_self_u32();
diff --git a/core/tiny_superslab_free.inc.h b/core/tiny_superslab_free.inc.h
index d3251764..e4048378 100644
--- a/core/tiny_superslab_free.inc.h
+++ b/core/tiny_superslab_free.inc.h
@@ -32,7 +32,7 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
     static _Atomic int c7_free_count = 0;
     int count = atomic_fetch_add_explicit(&c7_free_count, 1, memory_order_relaxed);
     if (count == 0) {
-        #if !HAKMEM_BUILD_RELEASE
+        #if !HAKMEM_BUILD_RELEASE && HAKMEM_DEBUG_VERBOSE
         fprintf(stderr, "[C7_FIRST_FREE] ptr=%p base=%p slab_idx=%d\n", ptr, base, slab_idx);
         #endif
     }
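
fastcache_push itself is not part of this diff. Judging only from the call site in hak_tiny_free_with_slab (it returns nonzero on success so the caller can fall back to tiny_fast_push), a minimal sketch of the shape an "array stack for C0–C3" might take; the capacity constant, array names, and TLS layout below are assumptions, not the real implementation:

#include <stdint.h>

#define FASTCACHE_CAP 64            /* assumed capacity; the real value may differ */

static __thread void*    g_fastcache_slots[4][FASTCACHE_CAP];  /* C0-C3 only */
static __thread uint32_t g_fastcache_top[4];

static inline int fastcache_push(int class_idx, void* base) {
    uint32_t top = g_fastcache_top[class_idx];
    if (top >= FASTCACHE_CAP)
        return 0;                   /* stack full: caller falls back to the TLS fast list */
    g_fastcache_slots[class_idx][top] = base;
    g_fastcache_top[class_idx] = top + 1;
    return 1;
}

Unlike the linked TLS fast list, a bounded array stack never writes into the freed block's payload on push, which is presumably why the hottest classes are routed through it.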
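
tiny_nextptr.h is likewise not shown. The removed macro lines encode the offset rule (with HAKMEM_TINY_HEADER_CLASSIDX set, C0–C6 keep the next pointer at base+1, C7 at base), and base+1 is misaligned for a void* dereference, which is the UB the new helpers avoid. A memcpy-based sketch, assuming the helpers merely wrap that rule; compilers lower a fixed-size memcpy like this to a single move, so the safety is free:

#include <stdint.h>
#include <string.h>

#ifndef HAKMEM_TINY_HEADER_CLASSIDX
#define HAKMEM_TINY_HEADER_CLASSIDX 1   /* normally from hakmem_build_flags.h; assumed here */
#endif

static inline size_t tiny_next_off(int class_idx) {
    /* Assumed to mirror the removed macro logic (header-aware offset). */
    return (HAKMEM_TINY_HEADER_CLASSIDX ? ((class_idx == 7) ? 0 : 1) : 0);
}

static inline void* tiny_next_load(void* base, int class_idx) {
    void* next;
    /* memcpy keeps the load well-defined even at a misaligned base+1 */
    memcpy(&next, (uint8_t*)base + tiny_next_off(class_idx), sizeof next);
    return next;
}

static inline void tiny_next_store(void* base, int class_idx, void* next) {
    memcpy((uint8_t*)base + tiny_next_off(class_idx), &next, sizeof next);
}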
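
The bump-only path in superslab_alloc_from_slab uses meta->used as its bump cursor: while the freelist is empty, block number meta->used is handed out and the counter advances, so no per-block header or next-pointer write happens until the slab runs dry. The geometry helpers it calls predate this diff; under the simplest layout (blocks packed contiguously from the slab base) tiny_block_at_index reduces to stride arithmetic. A sketch under that assumption:

#include <stddef.h>
#include <stdint.h>

static inline void* tiny_block_at_index(uint8_t* base, uint32_t index, size_t stride) {
    /* Block i starts i strides past the slab base (assumed packed layout). */
    return base + (size_t)index * stride;
}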