diff --git a/core/hakmem_tiny_free.inc b/core/hakmem_tiny_free.inc
index 8b90a569..bbf212e7 100644
--- a/core/hakmem_tiny_free.inc
+++ b/core/hakmem_tiny_free.inc
@@ -207,9 +207,15 @@ void hak_tiny_free_with_slab(void* ptr, TinySlab* slab) {
     }
 
     if (g_fast_enable && g_fast_cap[class_idx] != 0) {
-        // Push block base into fast cache
+        // Push block base into array stack for C0–C3, otherwise into TLS fast list
         void* base = (class_idx == 7) ? ptr : (void*)((uint8_t*)ptr - 1);
-        if (tiny_fast_push(class_idx, base)) {
+        int pushed = 0;
+        if (__builtin_expect(g_fastcache_enable && class_idx <= 3, 1)) {
+            pushed = fastcache_push(class_idx, base);
+        } else {
+            pushed = tiny_fast_push(class_idx, base);
+        }
+        if (pushed) {
             tiny_debug_ring_record(TINY_RING_EVENT_FREE_FAST, (uint16_t)class_idx, ptr, slab_idx);
             HAK_STAT_FREE(class_idx);
             return;
diff --git a/core/tiny_alloc_fast_inline.h b/core/tiny_alloc_fast_inline.h
index 8db14b1f..b3977135 100644
--- a/core/tiny_alloc_fast_inline.h
+++ b/core/tiny_alloc_fast_inline.h
@@ -7,8 +7,10 @@
 #define TINY_ALLOC_FAST_INLINE_H
 
 #include
+#include
 #include "hakmem_build_flags.h"
 #include "tiny_remote.h"  // for TINY_REMOTE_SENTINEL (defense-in-depth)
+#include "tiny_nextptr.h"
 
 // External TLS variables (defined in hakmem_tiny.c)
 extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
@@ -49,9 +51,8 @@ extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
         if (g_tls_sll_count[(class_idx)] > 0) g_tls_sll_count[(class_idx)]--; \
         (ptr_out) = NULL; \
     } else { \
-        /* Phase 7: header-aware next (C0-C6: base+1, C7: base) */ \
-        size_t _off = (HAKMEM_TINY_HEADER_CLASSIDX ? (((class_idx) == 7) ? 0 : 1) : 0); \
-        void* _next = *(void**)((uint8_t*)_head + _off); \
+        /* Safe load of header-aware next (avoid UB on unaligned) */ \
+        void* _next = tiny_next_load(_head, (class_idx)); \
         g_tls_sll_head[(class_idx)] = _next; \
         if (g_tls_sll_count[(class_idx)] > 0) { \
             g_tls_sll_count[(class_idx)]--; \
@@ -83,9 +84,8 @@ extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
 //   mov %rsi, g_tls_sll_head(%rdi)
 //
 #define TINY_ALLOC_FAST_PUSH_INLINE(class_idx, ptr) do { \
-    /* Phase 7: header-aware next (C0-C6: base+1, C7: base) */ \
-    size_t _off = (HAKMEM_TINY_HEADER_CLASSIDX ? (((class_idx) == 7) ? 0 : 1) : 0); \
-    *(void**)((uint8_t*)(ptr) + _off) = g_tls_sll_head[(class_idx)]; \
+    /* Safe store of header-aware next (avoid UB on unaligned) */ \
+    tiny_next_store((ptr), (class_idx), g_tls_sll_head[(class_idx)]); \
     g_tls_sll_head[(class_idx)] = (ptr); \
     g_tls_sll_count[(class_idx)]++; \
 } while(0)
diff --git a/core/tiny_superslab_alloc.inc.h b/core/tiny_superslab_alloc.inc.h
index c0fbfad9..209fae50 100644
--- a/core/tiny_superslab_alloc.inc.h
+++ b/core/tiny_superslab_alloc.inc.h
@@ -18,6 +18,25 @@
 static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
     TinySlabMeta* meta = &ss->slabs[slab_idx];
 
+    // Phase 1 (Small): For hottest tiny classes (C0–C3), prefer strict bump-only
+    // when there is no pending remote and the freelist is empty. This avoids
+    // pointer-chasing and header writes entirely on the common path.
+    do {
+        if (__builtin_expect(ss->size_class <= 3, 1)) {
+            // Skip if remote queue has pending nodes
+            if (atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0)
+                break;
+            if (meta->freelist == NULL && meta->used < meta->capacity) {
+                size_t unit_sz = tiny_stride_for_class(ss->size_class);
+                uint8_t* base = tiny_slab_base_for_geometry(ss, slab_idx);
+                void* block = tiny_block_at_index(base, meta->used, unit_sz);
+                meta->used++;
+                ss_active_inc(ss);
+                HAK_RET_ALLOC(ss->size_class, block);
+            }
+        }
+    } while (0);
+
     // Ensure remote queue is drained before handing blocks back to TLS (UNLIKELY in 1T)
     if (__builtin_expect(atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0, 0)) {
         uint32_t self_tid = tiny_self_u32();
diff --git a/core/tiny_superslab_free.inc.h b/core/tiny_superslab_free.inc.h
index d3251764..e4048378 100644
--- a/core/tiny_superslab_free.inc.h
+++ b/core/tiny_superslab_free.inc.h
@@ -32,7 +32,7 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
     static _Atomic int c7_free_count = 0;
     int count = atomic_fetch_add_explicit(&c7_free_count, 1, memory_order_relaxed);
     if (count == 0) {
-        #if !HAKMEM_BUILD_RELEASE
+        #if !HAKMEM_BUILD_RELEASE && HAKMEM_DEBUG_VERBOSE
         fprintf(stderr, "[C7_FIRST_FREE] ptr=%p base=%p slab_idx=%d\n", ptr, base, slab_idx);
         #endif
     }
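
fastcache_push itself is not part of this diff. Judging only from the call site in hak_tiny_free_with_slab (it returns nonzero on success so the caller can fall back to tiny_fast_push), a minimal sketch of the shape an "array stack for C0–C3" might take; the capacity constant, array names, and TLS layout below are assumptions, not the real implementation:

#include <stdint.h>

#define FASTCACHE_CAP 64            /* assumed capacity; the real value may differ */

static __thread void*    g_fastcache_slots[4][FASTCACHE_CAP];  /* C0-C3 only */
static __thread uint32_t g_fastcache_top[4];

static inline int fastcache_push(int class_idx, void* base) {
    uint32_t top = g_fastcache_top[class_idx];
    if (top >= FASTCACHE_CAP)
        return 0;                   /* stack full: caller falls back to the TLS fast list */
    g_fastcache_slots[class_idx][top] = base;
    g_fastcache_top[class_idx] = top + 1;
    return 1;
}

Unlike the linked TLS fast list, a bounded array stack never writes into the freed block's payload on push, which is presumably why the hottest classes are routed through it.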
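
tiny_nextptr.h is likewise not shown. The removed macro lines encode the offset rule (with HAKMEM_TINY_HEADER_CLASSIDX set, C0–C6 keep the next pointer at base+1, C7 at base), and base+1 is misaligned for a void* dereference, which is the UB the new helpers avoid. A memcpy-based sketch, assuming the helpers merely wrap that rule; compilers lower a fixed-size memcpy like this to a single move, so the safety is free:

#include <stdint.h>
#include <string.h>

#ifndef HAKMEM_TINY_HEADER_CLASSIDX
#define HAKMEM_TINY_HEADER_CLASSIDX 1   /* normally from hakmem_build_flags.h; assumed here */
#endif

static inline size_t tiny_next_off(int class_idx) {
    /* Assumed to mirror the removed macro logic (header-aware offset). */
    return (HAKMEM_TINY_HEADER_CLASSIDX ? ((class_idx == 7) ? 0 : 1) : 0);
}

static inline void* tiny_next_load(void* base, int class_idx) {
    void* next;
    /* memcpy keeps the load well-defined even at a misaligned base+1 */
    memcpy(&next, (uint8_t*)base + tiny_next_off(class_idx), sizeof next);
    return next;
}

static inline void tiny_next_store(void* base, int class_idx, void* next) {
    memcpy((uint8_t*)base + tiny_next_off(class_idx), &next, sizeof next);
}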
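
The bump-only path in superslab_alloc_from_slab uses meta->used as its bump cursor: while the freelist is empty, block number meta->used is handed out and the counter advances, so no per-block header or next-pointer write happens until the slab runs dry. The geometry helpers it calls predate this diff; under the simplest layout (blocks packed contiguously from the slab base) tiny_block_at_index reduces to stride arithmetic. A sketch under that assumption:

#include <stddef.h>
#include <stdint.h>

static inline void* tiny_block_at_index(uint8_t* base, uint32_t index, size_t stride) {
    /* Block i starts i strides past the slab base (assumed packed layout). */
    return base + (size_t)index * stride;
}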