Tiny: fix header/stride mismatch and harden refill paths

- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte
  header during allocation, but linear carve/refill and initial slab capacity
  still used bare class block sizes. This mismatch could overrun slab usable
  space and corrupt freelists, causing reproducible SEGV at ~100k iters.
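
  A worked example of the overrun (a sketch using the 63488-byte slab-0 usable
  size logged by the diagnostics further down; other slabs may differ): for
  class 5 (256B blocks) the old capacity was 63488 / 256 = 248 blocks, but with
  the 1-byte header each block really consumes 257 bytes, so 248 * 257 = 63736
  bytes, i.e. 248 bytes past the usable region. The header-aware capacity is
  63488 / 257 = 247 blocks, which fits.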

Changes
- Superslab: compute capacity with effective stride (block_size + header for
  classes 0..6; class7 remains headerless) in superslab_init_slab(). Add a
  debug-only bounds check in superslab_alloc_from_slab() to fail fast if a
  carve would exceed the slab's usable bytes.
- Refill (non-P0 and P0): use header-aware stride for all linear carving and
  TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h
  also uses stride, not raw class size.
- Drain: keep the existing defense-in-depth for the remote sentinel and the
  node sanitization before splicing into the freelist (both already present).

Notes
- This unifies the memory layout across alloc/linear-carve/refill with a single
  stride definition (sketched below) and keeps class7 (1024B) headerless as
  designed.
- Debug builds add fail-fast checks; release builds remain lean.
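
  The single stride definition referenced above is the helper this commit adds
  (a simplified sketch of tiny_block_stride_for_class() from the diff below;
  the one-shot debug logging is omitted):

    static inline size_t tiny_block_stride_for_class(int class_idx) {
        size_t stride = g_tiny_class_sizes[class_idx];
    #if HAKMEM_TINY_HEADER_CLASSIDX
        if (class_idx != 7) stride += 1; // classes 0..6 reserve a 1-byte header
    #endif
        return stride;                   // class 7 (1024B) stays headerless
    }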

Next
- Re-run the Tiny benches (256B/1024B) in debug to confirm stability, then in
  release. If a crash still reproduces, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0
  to isolate the P0 batch carve, and continue reducing branch misses as planned.
Author: Moe Charm (CI)
Date: 2025-11-09 18:55:50 +09:00
parent ab68ee536d
commit 1010a961fb
171 changed files with 10238 additions and 634 deletions

View File

@ -0,0 +1,130 @@
// ace_pool_connector.c - ACE-Pool Connection Box Implementation
#include "ace_pool_connector.h"
#include "../hakmem_pool.h"
#include "../hakmem_ace_controller.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h> // getenv()/atoi() used in ace_pool_get_health()
// External references (from Pool)
extern struct Pool {
int initialized;
// ... other fields
} g_pool;
extern size_t g_class_sizes[7]; // Pool class sizes
extern int g_wrap_l2_enabled;
// ============================================================================
// Box Implementation
// ============================================================================
AcePoolHealth ace_pool_get_health(void) {
AcePoolHealth health;
memset(&health, 0, sizeof(health));
// Check Pool initialization
health.pool_initialized = g_pool.initialized;
// Check ACE status
const char* ace_env = getenv("HAKMEM_ACE_ENABLED");
health.ace_enabled = (ace_env && atoi(ace_env) == 1);
// Check WRAP_L2 status
health.wrap_l2_enabled = g_wrap_l2_enabled;
// Check Bridge classes
health.bridge_class_5_size = (int)g_class_sizes[5];
health.bridge_class_6_size = (int)g_class_sizes[6];
// TODO: Track pre-allocated pages count
health.preallocated_pages = 0; // Not yet tracked
// Determine overall status
if (!health.pool_initialized) {
health.status = ACE_POOL_NOT_INIT;
health.message = "Pool not initialized";
} else if (!health.ace_enabled) {
health.status = ACE_POOL_NOT_INIT;
health.message = "ACE not enabled (set HAKMEM_ACE_ENABLED=1)";
} else if (!health.wrap_l2_enabled) {
health.status = ACE_POOL_WRAPPER_BLOCKED;
health.message = "WRAP_L2 not enabled (set HAKMEM_WRAP_L2=1)";
} else if (health.bridge_class_5_size == 0 && health.bridge_class_6_size == 0) {
health.status = ACE_POOL_SIZE_MISMATCH;
health.message = "Bridge classes disabled (class 5 and 6 are 0)";
} else if (health.preallocated_pages == 0) {
health.status = ACE_POOL_NO_PAGES;
health.message = "No pre-allocated pages (performance will be degraded)";
} else {
health.status = ACE_POOL_OK;
health.message = "ACE-Pool connection healthy";
}
return health;
}
int ace_pool_validate_connection(AcePoolStatus* out_status) {
AcePoolHealth health = ace_pool_get_health();
if (out_status) {
*out_status = health.status;
}
// Only OK status is considered "ready"
// NO_PAGES is warning but still functional
return (health.status == ACE_POOL_OK || health.status == ACE_POOL_NO_PAGES);
}
void* ace_pool_try_alloc(size_t size, uintptr_t site_id, AcePoolStatus* out_status) {
// Validate connection first
AcePoolStatus status;
if (!ace_pool_validate_connection(&status)) {
if (out_status) *out_status = status;
// Log why allocation failed
AcePoolHealth health = ace_pool_get_health();
static int logged_once = 0;
if (!logged_once) {
fprintf(stderr, "[ACE-Pool Connector] BLOCKED: %s\n", health.message);
logged_once = 1;
}
return NULL;
}
// Connection validated, try Pool allocation
void* ptr = hak_pool_try_alloc(size, site_id);
if (ptr) {
if (out_status) *out_status = ACE_POOL_OK;
} else {
if (out_status) *out_status = ACE_POOL_ALLOC_FAILED;
// Log allocation failure (but only once to avoid spam)
static int fail_logged = 0;
if (!fail_logged) {
fprintf(stderr, "[ACE-Pool Connector] Pool allocation failed for size=%zu (will fallback to mmap)\n", size);
fail_logged = 1;
}
}
return ptr;
}
void ace_pool_print_health(void) {
AcePoolHealth health = ace_pool_get_health();
fprintf(stderr, "\n=== ACE-Pool Connector Health Check ===\n");
fprintf(stderr, "Pool Initialized: %s\n", health.pool_initialized ? "YES" : "NO");
fprintf(stderr, "ACE Enabled: %s\n", health.ace_enabled ? "YES" : "NO");
fprintf(stderr, "WRAP_L2 Enabled: %s\n", health.wrap_l2_enabled ? "YES" : "NO");
fprintf(stderr, "Bridge Class 5: %d KB (%s)\n",
health.bridge_class_5_size / 1024,
health.bridge_class_5_size > 0 ? "ENABLED" : "DISABLED");
fprintf(stderr, "Bridge Class 6: %d KB (%s)\n",
health.bridge_class_6_size / 1024,
health.bridge_class_6_size > 0 ? "ENABLED" : "DISABLED");
fprintf(stderr, "Pre-allocated Pages: %d\n", health.preallocated_pages);
fprintf(stderr, "Status: %s\n", health.message);
fprintf(stderr, "========================================\n\n");
}

View File

@ -0,0 +1,70 @@
// ace_pool_connector.h - ACE-Pool Connection Box
// Box Theory: Single Responsibility - Validate and route ACE ↔ Pool connections
//
// Purpose:
// - Make ACE-Pool connection VISIBLE and VALIDATED
// - Centralize error handling and logging
// - Health check API for diagnostics
//
// Responsibilities:
// ✅ Validate Pool is initialized before ACE uses it
// ✅ Log connection status (success/failure/reason)
// ✅ Provide health check API
// ❌ NOT responsible for: allocation logic, size rounding, or memory management
//
// Box Boundaries:
// INPUT: ACE requests allocation from Pool (size, site_id)
// OUTPUT: Pool allocation result (ptr or NULL) + reason code
// ERROR: Clear error messages (not silent failures!)
#ifndef ACE_POOL_CONNECTOR_H
#define ACE_POOL_CONNECTOR_H
#include <stddef.h>
#include <stdint.h>
// ============================================================================
// Box API: ACE-Pool Connection
// ============================================================================
// Connection status codes
typedef enum {
ACE_POOL_OK = 0, // Connection healthy
ACE_POOL_NOT_INIT, // Pool not initialized
ACE_POOL_NO_PAGES, // Pool has no pre-allocated pages
ACE_POOL_WRAPPER_BLOCKED, // Wrapper protection blocking
ACE_POOL_SIZE_MISMATCH, // Size not in Pool range
ACE_POOL_ALLOC_FAILED, // Pool allocation returned NULL
} AcePoolStatus;
// Health check result
typedef struct {
int pool_initialized; // 1 if Pool is initialized
int ace_enabled; // 1 if ACE is enabled
int wrap_l2_enabled; // 1 if WRAP_L2 is enabled
int bridge_class_5_size; // Size of Bridge class 5 (40KB expected)
int bridge_class_6_size; // Size of Bridge class 6 (52KB expected)
int preallocated_pages; // Number of pre-allocated pages (should be > 0)
AcePoolStatus status; // Overall status
const char* message; // Human-readable status message
} AcePoolHealth;
// ============================================================================
// Box Functions
// ============================================================================
// Get health status (for debugging and monitoring)
AcePoolHealth ace_pool_get_health(void);
// Validate connection is ready (called by ACE before using Pool)
// Returns: 1 if ready, 0 if not (sets reason code)
int ace_pool_validate_connection(AcePoolStatus* out_status);
// Connect ACE to Pool (wrapper around hak_pool_try_alloc with validation)
// Returns: Allocated pointer or NULL (logs reason if NULL)
void* ace_pool_try_alloc(size_t size, uintptr_t site_id, AcePoolStatus* out_status);
// Print health status (for debugging)
void ace_pool_print_health(void);
#endif // ACE_POOL_CONNECTOR_H
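A minimal caller sketch for the API above (illustrative only; alloc_via_connector
and its fallback policy are assumptions, not part of this commit):
#include "ace_pool_connector.h"
static void* alloc_via_connector(size_t size, uintptr_t site_id) {
    AcePoolStatus st = ACE_POOL_OK;
    void* p = ace_pool_try_alloc(size, site_id, &st);
    if (p) return p; // Pool served the request
    // st explains why the connector declined (not initialized, wrapper blocked,
    // size mismatch, or allocation failure); the caller chooses its own
    // fallback, e.g. a direct mmap path.
    return NULL;
}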

24
core/box/free_local_box.d Normal file
View File

@ -0,0 +1,24 @@
core/box/free_local_box.o: core/box/free_local_box.c \
core/box/free_local_box.h core/hakmem_tiny_superslab.h \
core/superslab/superslab_types.h core/hakmem_tiny_superslab_constants.h \
core/superslab/superslab_inline.h core/superslab/superslab_types.h \
core/tiny_debug_ring.h core/tiny_remote.h core/tiny_debug_ring.h \
core/tiny_remote.h core/hakmem_tiny_superslab_constants.h \
core/box/free_publish_box.h core/hakmem_tiny.h core/hakmem_build_flags.h \
core/hakmem_trace.h core/hakmem_tiny_mini_mag.h
core/box/free_local_box.h:
core/hakmem_tiny_superslab.h:
core/superslab/superslab_types.h:
core/hakmem_tiny_superslab_constants.h:
core/superslab/superslab_inline.h:
core/superslab/superslab_types.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/hakmem_tiny_superslab_constants.h:
core/box/free_publish_box.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:

View File

@ -0,0 +1,28 @@
core/box/free_publish_box.o: core/box/free_publish_box.c \
core/box/free_publish_box.h core/hakmem_tiny_superslab.h \
core/superslab/superslab_types.h core/hakmem_tiny_superslab_constants.h \
core/superslab/superslab_inline.h core/superslab/superslab_types.h \
core/tiny_debug_ring.h core/tiny_remote.h core/tiny_debug_ring.h \
core/tiny_remote.h core/hakmem_tiny_superslab_constants.h \
core/hakmem_tiny.h core/hakmem_build_flags.h core/hakmem_trace.h \
core/hakmem_tiny_mini_mag.h core/tiny_route.h core/tiny_ready.h \
core/hakmem_tiny.h core/box/mailbox_box.h
core/box/free_publish_box.h:
core/hakmem_tiny_superslab.h:
core/superslab/superslab_types.h:
core/hakmem_tiny_superslab_constants.h:
core/superslab/superslab_inline.h:
core/superslab/superslab_types.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/hakmem_tiny_superslab_constants.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:
core/tiny_route.h:
core/tiny_ready.h:
core/hakmem_tiny.h:
core/box/mailbox_box.h:

View File

@ -0,0 +1,24 @@
core/box/free_remote_box.o: core/box/free_remote_box.c \
core/box/free_remote_box.h core/hakmem_tiny_superslab.h \
core/superslab/superslab_types.h core/hakmem_tiny_superslab_constants.h \
core/superslab/superslab_inline.h core/superslab/superslab_types.h \
core/tiny_debug_ring.h core/tiny_remote.h core/tiny_debug_ring.h \
core/tiny_remote.h core/hakmem_tiny_superslab_constants.h \
core/box/free_publish_box.h core/hakmem_tiny.h core/hakmem_build_flags.h \
core/hakmem_trace.h core/hakmem_tiny_mini_mag.h
core/box/free_remote_box.h:
core/hakmem_tiny_superslab.h:
core/superslab/superslab_types.h:
core/hakmem_tiny_superslab_constants.h:
core/superslab/superslab_inline.h:
core/superslab/superslab_types.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/hakmem_tiny_superslab_constants.h:
core/box/free_publish_box.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:

11
core/box/front_gate_box.d Normal file
View File

@ -0,0 +1,11 @@
core/box/front_gate_box.o: core/box/front_gate_box.c \
core/box/front_gate_box.h core/hakmem_tiny.h core/hakmem_build_flags.h \
core/hakmem_trace.h core/hakmem_tiny_mini_mag.h \
core/tiny_alloc_fast_sfc.inc.h core/hakmem_tiny.h
core/box/front_gate_box.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:
core/tiny_alloc_fast_sfc.inc.h:
core/hakmem_tiny.h:

View File

@ -6,6 +6,19 @@
#include "../pool_tls.h"
#endif
// Centralized OS mapping boundary to keep syscalls in one place
static inline void* hak_os_map_boundary(size_t size, uintptr_t site_id) {
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_mmap);
#endif
void* p = hak_alloc_mmap_impl(size);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_SYSCALL_MMAP, t_mmap);
#endif
(void)site_id; // reserved for future accounting/learning
return p;
}
__attribute__((always_inline))
inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
#if HAKMEM_DEBUG_TIMING
@ -144,33 +157,24 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
//
// Solution: Use mmap for gap when ACE failed (ACE disabled or OOM)
// Track final fallback mmaps globally
extern _Atomic uint64_t g_final_fallback_mmap_count;
void* ptr;
if (size >= threshold) {
// Large allocation (>= 2MB default): use mmap
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_mmap);
#endif
ptr = hak_alloc_mmap_impl(size);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_SYSCALL_MMAP, t_mmap);
#endif
// Large allocation (>= 2MB default): descend via single boundary
atomic_fetch_add(&g_final_fallback_mmap_count, 1);
ptr = hak_os_map_boundary(size, site_id);
} else if (size >= TINY_MAX_SIZE) {
// Mid-range allocation (1KB-2MB): try mmap as final fallback
// This handles the gap when ACE is disabled or failed
atomic_fetch_add(&g_final_fallback_mmap_count, 1);
static _Atomic int gap_alloc_count = 0;
int count = atomic_fetch_add(&gap_alloc_count, 1);
#if HAKMEM_DEBUG_VERBOSE
if (count < 3) {
fprintf(stderr, "[HAKMEM] INFO: Using mmap for mid-range size=%zu (ACE disabled or failed)\n", size);
}
if (count < 3) fprintf(stderr, "[HAKMEM] INFO: mid-gap fallback size=%zu\n", size);
#endif
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_mmap);
#endif
ptr = hak_alloc_mmap_impl(size);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_SYSCALL_MMAP, t_mmap);
#endif
ptr = hak_os_map_boundary(size, site_id);
} else {
// Should never reach here (size <= TINY_MAX_SIZE should be handled by Tiny)
static _Atomic int oom_count = 0;

View File

@ -117,6 +117,39 @@ static void hak_init_impl(void) {
HAKMEM_LOG("Sampling rate: 1/%d\n", SAMPLING_RATE);
HAKMEM_LOG("Max sites: %d\n", MAX_SITES);
// Build banner (one-shot)
do {
const char* bf = "UNKNOWN";
#ifdef HAKMEM_BUILD_RELEASE
bf = "RELEASE";
#elif defined(HAKMEM_BUILD_DEBUG)
bf = "DEBUG";
#endif
HAKMEM_LOG("[Build] Flavor=%s Flags: HEADER_CLASSIDX=%d, AGGRESSIVE_INLINE=%d, POOL_TLS_PHASE1=%d, POOL_TLS_PREWARM=%d\n",
bf,
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
1,
#else
0,
#endif
#ifdef HAKMEM_TINY_AGGRESSIVE_INLINE
1,
#else
0,
#endif
#ifdef HAKMEM_POOL_TLS_PHASE1
1,
#else
0,
#endif
#ifdef HAKMEM_POOL_TLS_PREWARM
1
#else
0
#endif
);
} while (0);
// Bench preset: Tiny-only (disable non-essential subsystems)
{
char* bt = getenv("HAKMEM_BENCH_TINY_ONLY");

23
core/box/mailbox_box.d Normal file
View File

@ -0,0 +1,23 @@
core/box/mailbox_box.o: core/box/mailbox_box.c core/box/mailbox_box.h \
core/hakmem_tiny_superslab.h core/superslab/superslab_types.h \
core/hakmem_tiny_superslab_constants.h core/superslab/superslab_inline.h \
core/superslab/superslab_types.h core/tiny_debug_ring.h \
core/tiny_remote.h core/tiny_debug_ring.h core/tiny_remote.h \
core/hakmem_tiny_superslab_constants.h core/hakmem_tiny.h \
core/hakmem_build_flags.h core/hakmem_trace.h \
core/hakmem_tiny_mini_mag.h
core/box/mailbox_box.h:
core/hakmem_tiny_superslab.h:
core/superslab/superslab_types.h:
core/hakmem_tiny_superslab_constants.h:
core/superslab/superslab_inline.h:
core/superslab/superslab_types.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/hakmem_tiny_superslab_constants.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:

View File

@ -3,15 +3,54 @@
#define POOL_API_INC_H
void* hak_pool_try_alloc(size_t size, uintptr_t site_id) {
// Debug: IMMEDIATE output to verify function is called
static int first_call = 1;
if (first_call) {
fprintf(stderr, "[Pool] hak_pool_try_alloc FIRST CALL EVER!\n");
first_call = 0;
}
if (size == 40960) { // Exactly 40KB
fprintf(stderr, "[Pool] hak_pool_try_alloc called with 40KB (Bridge class 5)\n");
}
hak_pool_init(); // pthread_once() ensures thread-safe init (no data race!)
// Debug for 33-41KB allocations
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] hak_pool_try_alloc: size=%zu (after init)\n", size);
}
// P1.7 approach: Avoid using pool during ALL wrapper calls (conservative but safe)
extern int hak_in_wrapper(void);
if (hak_in_wrapper() && !g_wrap_l2_enabled) return NULL;
if (!hak_pool_is_poolable(size)) return NULL;
if (hak_in_wrapper() && !g_wrap_l2_enabled) {
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] REJECTED: in_wrapper=%d, wrap_l2=%d\n",
hak_in_wrapper(), g_wrap_l2_enabled);
}
return NULL;
}
if (!hak_pool_is_poolable(size)) {
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] REJECTED: not poolable (min=%d, max=%d)\n",
POOL_MIN_SIZE, POOL_MAX_SIZE);
}
return NULL;
}
// Get class and shard indices
int class_idx = hak_pool_get_class_index(size);
if (class_idx < 0) return NULL;
if (class_idx < 0) {
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] REJECTED: class_idx=%d (size=%zu not mapped)\n",
class_idx, size);
}
return NULL;
}
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] ACCEPTED: class_idx=%d, proceeding with allocation\n", class_idx);
}
// MF2: Per-Page Sharding path
if (g_mf2_enabled) {

View File

@ -5,7 +5,14 @@
// Thread-safe initialization using pthread_once
static pthread_once_t hak_pool_init_once_control = PTHREAD_ONCE_INIT;
static void hak_pool_init_impl(void) {
fprintf(stderr, "[Pool] hak_pool_init_impl() EXECUTING - Bridge class fix applied\n");
const FrozenPolicy* pol = hkm_policy_get();
// Phase 6.21 CRITICAL FIX: Bridge classes are hardcoded in g_class_sizes,
// NOT from Policy. DO NOT overwrite them with 0!
// The code below was disabling Bridge classes by setting them to 0
// because Policy returns mid_dyn1_bytes=0 and mid_dyn2_bytes=0.
/*
if (pol && pol->mid_dyn1_bytes >= POOL_MIN_SIZE && pol->mid_dyn1_bytes <= POOL_MAX_SIZE) {
g_class_sizes[5] = pol->mid_dyn1_bytes;
} else {
@ -16,6 +23,8 @@ static void hak_pool_init_impl(void) {
} else {
g_class_sizes[6] = 0;
}
*/
// Bridge classes remain as initialized: 40KB and 52KB
for (int c = 0; c < POOL_NUM_CLASSES; c++) {
for (int s = 0; s < POOL_NUM_SHARDS; s++) {
g_pool.freelist[c][s] = NULL;
@ -82,20 +91,65 @@ static void hak_pool_init_impl(void) {
HAKMEM_LOG("[MF2] max_queues=%d, lease_ms=%d, idle_threshold_us=%d\n", g_mf2_max_queues, g_mf2_lease_ms, g_mf2_idle_threshold_us);
}
g_pool.initialized = 1;
fprintf(stderr, "[Pool] Initialized (L2 Hybrid Pool) - Bridge classes SHOULD be enabled\n");
fprintf(stderr, "[Pool] Class 5 (40KB): %zu\n", g_class_sizes[5]);
fprintf(stderr, "[Pool] Class 6 (52KB): %zu\n", g_class_sizes[6]);
HAKMEM_LOG("[Pool] Initialized (L2 Hybrid Pool)\n");
if (g_class_sizes[5] != 0 || g_class_sizes[6] != 0) {
HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB%s%s%s\n",
g_class_sizes[5] ? ", dyn1=" : "",
g_class_sizes[5] ? "" : (g_class_sizes[6]?",":""),
(g_class_sizes[5]||g_class_sizes[6]) ? "" : "");
} else {
HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB\n");
#ifdef HAKMEM_DEBUG_VERBOSE
// Debug: Show actual class sizes after initialization
HAKMEM_LOG("[Pool] Class configuration:\n");
for (int i = 0; i < POOL_NUM_CLASSES; i++) {
if (g_class_sizes[i] != 0) {
HAKMEM_LOG(" Class %d: %zu KB (ENABLED)\n", i, g_class_sizes[i]/1024);
} else {
HAKMEM_LOG(" Class %d: DISABLED\n", i);
}
}
#endif
HAKMEM_LOG("[Pool] Page size: %d KB\n", POOL_PAGE_SIZE / 1024);
HAKMEM_LOG("[Pool] Shards: %d (site-based)\n", POOL_NUM_SHARDS);
// ACE Performance Fix: Pre-allocate pages for Bridge classes to avoid cold start
// This ensures ACE can serve Mid-Large allocations (33KB) immediately without mmap fallback
extern int refill_freelist(int class_idx, int shard_idx);
int prewarm_pages = 4; // Pre-allocate 4 pages per shard for hot classes
// Pre-warm Bridge class 5 (40KB) - Critical for 33KB allocations
if (g_class_sizes[5] != 0) {
int allocated = 0;
for (int s = 0; s < prewarm_pages && s < POOL_NUM_SHARDS; s++) {
if (refill_freelist(5, s) != 0) { // FIX: Check for SUCCESS (1), not FAILURE (0)
allocated++;
}
}
fprintf(stderr, "[Pool] Pre-allocated %d pages for Bridge class 5 (%zu KB) - Critical for 33KB allocs\n",
allocated, g_class_sizes[5]/1024);
} else {
fprintf(stderr, "[Pool] WARNING: Bridge class 5 (40KB) is DISABLED - 33KB allocations will fail!\n");
}
// Pre-warm Bridge class 6 (52KB)
if (g_class_sizes[6] != 0) {
int allocated = 0;
for (int s = 0; s < prewarm_pages && s < POOL_NUM_SHARDS; s++) {
if (refill_freelist(6, s) != 0) { // FIX: Check for SUCCESS (1), not FAILURE (0)
allocated++;
}
}
fprintf(stderr, "[Pool] Pre-allocated %d pages for Bridge class 6 (%zu KB)\n",
allocated, g_class_sizes[6]/1024);
}
}
void hak_pool_init(void) { pthread_once(&hak_pool_init_once_control, hak_pool_init_impl); }
void hak_pool_init(void) {
// Always print this to see if it's being called
static int called = 0;
if (called++ == 0) {
fprintf(stderr, "[Pool] hak_pool_init() called for the first time\n");
}
pthread_once(&hak_pool_init_once_control, hak_pool_init_impl);
}
static void mf2_print_debug_stats(void) {
if (!g_mf2_enabled) return;

View File

@ -1,3 +1,4 @@
#include <stdio.h>
#include "hakmem_ace.h"
#include "hakmem_pool.h"
#include "hakmem_l25_pool.h"
@ -50,9 +51,24 @@ void* hkm_ace_alloc(size_t size, uintptr_t site_id, const FrozenPolicy* pol) {
double wmax_large = (pol ? pol->w_max_large : 1.25);
// MidPool: 252KiB (Phase 6.21: with Bridge classes for W_MAX rounding)
if (size >= 33000 && size <= 34000) {
fprintf(stderr, "[ACE] Processing 33KB: size=%zu, POOL_MAX_SIZE=%d\n", size, POOL_MAX_SIZE);
}
if (size <= POOL_MAX_SIZE) {
size_t r = round_to_mid_class(size, wmax_mid, pol);
if (size >= 33000 && size <= 34000) {
fprintf(stderr, "[ACE] round_to_mid_class returned: %zu (0 means no valid class)\n", r);
}
if (r != 0) {
// Debug: Log 33KB allocation routing (only in debug builds)
#ifdef HAKMEM_DEBUG_VERBOSE
if (size >= 33000 && size <= 34000) {
HAKMEM_LOG("[ACE] 33KB alloc: size=%zu → rounded=%zu (class 5: 40KB)\n", size, r);
}
#endif
if (size >= 33000 && size <= 34000) {
fprintf(stderr, "[ACE] Calling hak_pool_try_alloc with size=%zu\n", r);
}
HKM_TIME_START(t_mid_get);
void* p = hak_pool_try_alloc(r, site_id);
HKM_TIME_END(HKM_CAT_POOL_GET, t_mid_get);
@ -74,7 +90,7 @@ void* hkm_ace_alloc(size_t size, uintptr_t site_id, const FrozenPolicy* pol) {
}
} else if (size > POOL_MAX_SIZE && size < L25_MIN_SIZE) {
// Gap 32-64KiB: try rounding up to 64KiB if permitted
size_t r = round_to_large_class(L25_MIN_SIZE, wmax_large); // check 64KiB vs size
// size_t r = round_to_large_class(L25_MIN_SIZE, wmax_large); // check 64KiB vs size (unused)
if ((double)L25_MIN_SIZE <= wmax_large * (double)size) {
HKM_TIME_START(t_l25_get2);
void* p = hak_l25_pool_try_alloc(L25_MIN_SIZE, site_id);

View File

@ -237,6 +237,21 @@ SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) {
int scan_limit = tiny_reg_scan_max();
if (scan_limit > reg_size) scan_limit = reg_size;
uint32_t self_tid = tiny_self_u32();
// Local helper (mirror adopt_bind_if_safe) to avoid including alloc inline here
auto int adopt_bind_if_safe_local(TinyTLSSlab* tls_l, SuperSlab* ss, int slab_idx, int class_idx_l) {
uint32_t self_tid = tiny_self_u32();
SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
if (!slab_is_valid(&h)) return 0;
slab_drain_remote_full(&h);
if (__builtin_expect(slab_is_safe_to_bind(&h), 1)) {
tiny_tls_bind_slab(tls_l, h.ss, h.slab_idx);
slab_release(&h);
return 1;
}
slab_release(&h);
return 0;
}
for (int i = 0; i < scan_limit; i++) {
SuperSlab* cand = g_super_reg_by_class[class_idx][i];
if (!(cand && cand->magic == SUPERSLAB_MAGIC)) continue;
@ -248,25 +263,16 @@ SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) {
}
if (mask == 0) continue; // No visible freelists in this SS
int cap = ss_slabs_capacity(cand);
// Iterate set bits only
while (mask) {
int sidx = __builtin_ctz(mask);
mask &= (mask - 1); // clear lowest set bit
mask &= (mask - 1);
if (sidx >= cap) continue;
SlabHandle h = slab_try_acquire(cand, sidx, self_tid);
if (!slab_is_valid(&h)) continue;
if (slab_remote_pending(&h)) {
slab_drain_remote_full(&h);
}
if (slab_is_safe_to_bind(&h)) {
tiny_tls_bind_slab(tls, h.ss, h.slab_idx);
if (adopt_bind_if_safe_local(tls, cand, sidx, class_idx)) {
g_adopt_gate_success[class_idx]++;
g_reg_scan_hits[class_idx]++;
ROUTE_MARK(14); ROUTE_COMMIT(class_idx, 0x07);
slab_release(&h);
return h.ss;
return cand;
}
slab_release(&h);
}
}
return NULL;
@ -1455,7 +1461,7 @@ static inline int ultra_batch_for_class(int class_idx) {
case 1: return 96; // 16B: best in A/B tests
case 2: return 96; // 32B: best in A/B tests
case 3: return 224; // 64B: best in A/B tests
case 4: return 64; // 128B
case 4: return 96; // 128B (promote front refill a bit)
case 5: return 64; // 256B (promote front refill)
case 6: return 64; // 512B (promote front refill)
default: return 32; // 1024B and others

View File

@ -23,7 +23,7 @@ int hak_is_initializing(void);
#define TINY_NUM_CLASSES 8
#define TINY_SLAB_SIZE (64 * 1024) // 64KB per slab
#define TINY_MAX_SIZE 1024 // Maximum allocation size (1KB)
#define TINY_MAX_SIZE 1536 // Maximum allocation size (1.5KB, accommodate 1024B + header)
// ============================================================================
// Size Classes
@ -244,12 +244,14 @@ void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold);
static inline int hak_tiny_size_to_class(size_t size) {
if (size == 0 || size > TINY_MAX_SIZE) return -1;
#if HAKMEM_TINY_HEADER_CLASSIDX
// Phase 7 CRITICAL FIX (2025-11-08): Add 1-byte header overhead BEFORE class lookup
// Bug: 64B request was mapped to class 3 (64B blocks), leaving only 63B usable → BUS ERROR
// Fix: 64B request → alloc_size=65 → class 4 (128B blocks) → 127B usable ✓
size_t alloc_size = size + 1; // Add header overhead
if (alloc_size > TINY_MAX_SIZE) return -1; // 1024B request becomes 1025B, reject to Mid
return g_size_to_class_lut_1k[alloc_size]; // Look up with header-adjusted size
// Phase 7 header adds +1 byte. Special-case 1024B to remain in Tiny (no header).
// Rationale: Avoid forcing 1024B to Mid/OS which causes frequent mmap/madvise.
if (size == TINY_MAX_SIZE) {
return g_size_to_class_lut_1k[size]; // class 7 (1024B blocks)
}
size_t alloc_size = size + 1; // Add header for other sizes
if (alloc_size > TINY_MAX_SIZE) return -1;
return g_size_to_class_lut_1k[alloc_size];
#else
return g_size_to_class_lut_1k[size]; // 1..1024: single load
#endif

View File

@ -414,6 +414,10 @@ void hak_tiny_init(void) {
char* m = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
if (m) { int v = atoi(m); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_mid = v; }
}
// Sensible default for class 7 (1024B): favor larger refill to reduce refills/syscalls
if (g_refill_count_class[7] == 0) {
g_refill_count_class[7] = 64; // can be overridden by env HAKMEM_TINY_REFILL_COUNT_C7
}
{
char* fast_env = getenv("HAKMEM_TINY_FAST");
if (fast_env && atoi(fast_env) == 0) g_fast_enable = 0;

View File

@ -204,14 +204,20 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
TinySlabMeta* meta = tls->meta;
if (!meta) return 0;
// Class 5/6/7 special-case: simple batch refill (favor linear carve, minimal branching)
if (__builtin_expect(class_idx >= 5, 0)) {
// Class 4/5/6/7 special-case: simple batch refill (favor linear carve, minimal branching)
// Optional gate for class3 via env: HAKMEM_TINY_SIMPLE_REFILL_C3=1
static int g_simple_c3 = -1;
if (__builtin_expect(g_simple_c3 == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_SIMPLE_REFILL_C3");
g_simple_c3 = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(class_idx >= 4 || (class_idx == 3 && g_simple_c3), 0)) {
uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
if (room <= 0) return 0;
int take = max_take < room ? max_take : room;
int taken = 0;
size_t bs = g_tiny_class_sizes[class_idx];
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
for (; taken < take;) {
// Linear first (LIKELY for class7)
if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
@ -251,7 +257,7 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
int take = max_take < room ? max_take : room;
int taken = 0;
size_t bs = g_tiny_class_sizes[class_idx];
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
while (taken < take) {
void* p = NULL;
if (__builtin_expect(meta->freelist != NULL, 0)) {
@ -311,7 +317,7 @@ static inline void* superslab_tls_bump_fast(int class_idx) {
uint32_t avail = (uint32_t)cap - (uint32_t)used;
uint32_t chunk = (g_bump_chunk > 0 ? (uint32_t)g_bump_chunk : 1u);
if (chunk > avail) chunk = avail;
size_t bs = g_tiny_class_sizes[tls->ss->size_class];
size_t bs = g_tiny_class_sizes[tls->ss->size_class] + ((tls->ss->size_class != 7) ? 1 : 0);
uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
uint8_t* start = base + ((size_t)used * bs);
// Reserve the chunk once in header (keeps remote-free accounting valid)
@ -412,7 +418,7 @@ static inline void ultra_refill_sll(int class_idx) {
}
}
if (slab) {
size_t bs = g_tiny_class_sizes[class_idx];
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
int remaining = need;
while (remaining > 0 && slab->free_count > 0) {
if ((int)g_tls_sll_count[class_idx] >= sll_cap) break;

View File

@ -90,7 +90,8 @@ static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
return 0;
}
size_t bs = g_tiny_class_sizes[class_idx];
// Effective stride: class block size + 1-byte header for classes 0..6
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
int total_taken = 0;
// === P0 Batch Carving Loop ===

View File

@ -184,8 +184,13 @@ static void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err) {
g_hakmem_lock_depth--; // Now safe to restore (all libc calls complete)
}
// Global counters for debugging (non-static for external access)
_Atomic uint64_t g_ss_mmap_count = 0;
_Atomic uint64_t g_final_fallback_mmap_count = 0;
static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate) {
void* ptr = NULL;
static int log_count = 0;
#ifdef MAP_ALIGNED_SUPER
int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER;
@ -199,6 +204,7 @@ static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask
map_flags,
-1, 0);
if (ptr != MAP_FAILED) {
atomic_fetch_add(&g_ss_mmap_count, 1);
if (((uintptr_t)ptr & ss_mask) == 0) {
ss_stats_os_alloc(size_class, ss_size);
return ptr;
@ -221,6 +227,14 @@ static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask
PROT_READ | PROT_WRITE,
flags,
-1, 0);
if (raw != MAP_FAILED) {
uint64_t count = atomic_fetch_add(&g_ss_mmap_count, 1) + 1;
if (log_count < 10) {
fprintf(stderr, "[SUPERSLAB_MMAP] #%lu: class=%d size=%zu (total SuperSlab mmaps so far)\n",
(unsigned long)count, size_class, ss_size);
log_count++;
}
}
if (raw == MAP_FAILED) {
log_superslab_oom_once(ss_size, alloc_size, errno);
return NULL;
@ -717,15 +731,22 @@ void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_
//
// Phase 6-2.5: Use constants from hakmem_tiny_superslab_constants.h
size_t usable_size = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE;
int capacity = (int)(usable_size / block_size);
// Header-aware stride: include 1-byte header for classes 0-6 when enabled
size_t stride = block_size;
#if HAKMEM_TINY_HEADER_CLASSIDX
if (__builtin_expect(ss->size_class != 7, 1)) {
stride += 1;
}
#endif
int capacity = (int)(usable_size / stride);
// Diagnostic: Verify capacity for class 7 slab 0 (one-shot)
if (ss->size_class == 7 && slab_idx == 0) {
static _Atomic int g_cap_log_printed = 0;
if (atomic_load(&g_cap_log_printed) == 0 &&
atomic_exchange(&g_cap_log_printed, 1) == 0) {
fprintf(stderr, "[SUPERSLAB_INIT] class 7 slab 0: usable_size=%zu block_size=%zu capacity=%d\n",
usable_size, block_size, capacity);
fprintf(stderr, "[SUPERSLAB_INIT] class 7 slab 0: usable_size=%zu stride=%zu capacity=%d\n",
usable_size, stride, capacity);
fprintf(stderr, "[SUPERSLAB_INIT] Expected: 63488 / 1024 = 62 blocks\n");
if (capacity != 62) {
fprintf(stderr, "[SUPERSLAB_INIT] WARNING: capacity=%d (expected 62!)\n", capacity);

View File

@ -25,6 +25,7 @@
#include "tiny_debug_ring.h"
#include "tiny_remote.h"
#include "hakmem_tiny_superslab_constants.h" // Phase 6-2.5: Centralized layout constants
#include "hakmem_build_flags.h"
// Debug instrumentation flags (defined in hakmem_tiny.c)
extern int g_debug_remote_guard;
@ -33,6 +34,31 @@ extern _Atomic uint64_t g_ss_active_dec_calls;
uint32_t tiny_remote_drain_threshold(void);
// ============================================================================
// Tiny block stride helper (Phase 7 header-aware)
// ============================================================================
// Returns the effective per-block stride used for linear carving within slabs.
// When header-based class indexing is enabled, classes 0-6 reserve an extra
// byte per block for the header. Class 7 (1024B) remains headerless by design.
static inline size_t tiny_block_stride_for_class(int class_idx) {
size_t bs = g_tiny_class_sizes[class_idx];
#if HAKMEM_TINY_HEADER_CLASSIDX
if (__builtin_expect(class_idx != 7, 1)) bs += 1;
#endif
#if !HAKMEM_BUILD_RELEASE
// One-shot debug: confirm stride behavior at runtime for class 0
static _Atomic int g_stride_dbg = 0;
if (class_idx == 0) {
int exp = 0;
if (atomic_compare_exchange_strong(&g_stride_dbg, &exp, 1)) {
fprintf(stderr, "[STRIDE_DBG] HEADER_CLASSIDX=%d class=%d stride=%zu\n",
(int)HAKMEM_TINY_HEADER_CLASSIDX, class_idx, bs);
}
}
#endif
return bs;
}
// ============================================================================
// Phase 2a: Dynamic Expansion - Global per-class SuperSlabHeads
// ============================================================================

View File

@ -0,0 +1,105 @@
#include "pool_refill.h"
#include "pool_tls.h"
#include <sys/mman.h>
#include <stdint.h>
#include <errno.h>
// Get refill count from Box 1
extern int pool_get_refill_count(int class_idx);
// Refill and return first block
void* pool_refill_and_alloc(int class_idx) {
int count = pool_get_refill_count(class_idx);
if (count <= 0) return NULL;
// Batch allocate from existing Pool backend
void* chain = backend_batch_carve(class_idx, count);
if (!chain) return NULL; // OOM
// Pop first block for return
void* ret = chain;
chain = *(void**)chain;
count--;
#if POOL_USE_HEADERS
// Write header for the block we're returning
*((uint8_t*)ret - POOL_HEADER_SIZE) = POOL_MAGIC | class_idx;
#endif
// Install rest in TLS (if any)
if (count > 0 && chain) {
pool_install_chain(class_idx, chain, count);
}
return ret;
}
// Backend batch carve - Phase 1: Direct mmap allocation
void* backend_batch_carve(int class_idx, int count) {
if (class_idx < 0 || class_idx >= POOL_SIZE_CLASSES || count <= 0) {
return NULL;
}
// Get the class size
size_t block_size = POOL_CLASS_SIZES[class_idx];
// For Phase 1: Allocate a single large chunk via mmap
// and carve it into blocks
#if POOL_USE_HEADERS
size_t total_block_size = block_size + POOL_HEADER_SIZE;
#else
size_t total_block_size = block_size;
#endif
// Allocate enough for all requested blocks
size_t total_size = total_block_size * count;
// Round up to page size
size_t page_size = 4096;
total_size = (total_size + page_size - 1) & ~(page_size - 1);
// Allocate memory via mmap
void* chunk = mmap(NULL, total_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (chunk == MAP_FAILED) {
return NULL;
}
// Carve into blocks and chain them
void* head = NULL;
void* tail = NULL;
char* ptr = (char*)chunk;
for (int i = 0; i < count; i++) {
#if POOL_USE_HEADERS
// Skip header space - user data starts after header
void* user_ptr = ptr + POOL_HEADER_SIZE;
#else
void* user_ptr = ptr;
#endif
// Chain the blocks
if (!head) {
head = user_ptr;
tail = user_ptr;
} else {
*(void**)tail = user_ptr;
tail = user_ptr;
}
// Move to next block
ptr += total_block_size;
// Stop if we'd go past the allocated chunk
if ((ptr + total_block_size) > ((char*)chunk + total_size)) {
break;
}
}
// Terminate chain
if (tail) {
*(void**)tail = NULL;
}
return head;
}

View File

@ -2,6 +2,14 @@
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <unistd.h>
#include "pool_tls_registry.h"
static inline pid_t gettid_cached(void){
static __thread pid_t t=0; if (__builtin_expect(t==0,0)) t=(pid_t)syscall(SYS_gettid); return t;
}
#include <stdio.h>
// Class sizes: 8KB, 16KB, 24KB, 32KB, 40KB, 48KB, 52KB
const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES] = {
@ -12,11 +20,27 @@ const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES] = {
__thread void* g_tls_pool_head[POOL_SIZE_CLASSES];
__thread uint32_t g_tls_pool_count[POOL_SIZE_CLASSES];
// Phase 1.5b: Lazy pre-warm flag (per-thread)
#ifdef HAKMEM_POOL_TLS_PREWARM
__thread int g_tls_pool_prewarmed = 0;
#endif
// Fixed refill counts (Phase 1: no learning)
static const uint32_t DEFAULT_REFILL_COUNT[POOL_SIZE_CLASSES] = {
64, 48, 32, 32, 24, 16, 16 // Larger classes = smaller refill
};
// Pre-warm counts optimized for memory usage (Phase 1.5b)
// Total memory: ~1.6MB per thread
// Hot classes (8-24KB): 16 blocks - common in real workloads
// Warm classes (32-40KB): 8 blocks
// Cold classes (48-52KB): 4 blocks - rare
static const int PREWARM_COUNTS[POOL_SIZE_CLASSES] = {
16, 16, 12, // Hot: 8KB, 16KB, 24KB
8, 8, // Warm: 32KB, 40KB
4, 4 // Cold: 48KB, 52KB
};
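// Sanity check on the ~1.6MB figure (assuming the 8/16/24/32/40/48/52 KB
// class sizes in POOL_CLASS_SIZES above):
//   16*8 + 16*16 + 12*24 + 8*32 + 8*40 + 4*48 + 4*52
// = 128 + 256 + 288 + 256 + 320 + 192 + 208 = 1648 KB per thread.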
// Forward declare refill function (from Box 2)
extern void* pool_refill_and_alloc(int class_idx);
@ -36,12 +60,34 @@ static inline int pool_size_to_class(size_t size) {
// Ultra-fast allocation (5-6 cycles)
void* pool_alloc(size_t size) {
// Phase 1.5b: Lazy pre-warm on first allocation per thread
#ifdef HAKMEM_POOL_TLS_PREWARM
if (__builtin_expect(!g_tls_pool_prewarmed, 0)) {
g_tls_pool_prewarmed = 1; // Set flag FIRST to prevent recursion!
pool_tls_prewarm(); // Pre-populate TLS caches
}
#endif
// Quick bounds check
if (size < 8192 || size > 53248) return NULL;
int class_idx = pool_size_to_class(size);
if (class_idx < 0) return NULL;
// Drain a small batch of remote frees for this class
extern int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
void* chain = NULL;
int drained = pool_remote_pop_chain(class_idx, 32, &chain);
if (drained > 0 && chain) {
// Splice into TLS freelist
void* tail = chain;
int n = 1;
while (*(void**)tail) { tail = *(void**)tail; n++; }
*(void**)tail = g_tls_pool_head[class_idx];
g_tls_pool_head[class_idx] = chain;
g_tls_pool_count[class_idx] += n;
}
void* head = g_tls_pool_head[class_idx];
if (__builtin_expect(head != NULL, 1)) { // LIKELY
@ -54,6 +100,17 @@ void* pool_alloc(size_t size) {
*((uint8_t*)head - POOL_HEADER_SIZE) = POOL_MAGIC | class_idx;
#endif
// Low-water integration: if TLS count is low, opportunistically drain remotes
if (g_tls_pool_count[class_idx] < 4) {
extern int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
void* chain2 = NULL; int got = pool_remote_pop_chain(class_idx, 32, &chain2);
if (got > 0 && chain2) {
void* tail = chain2; while (*(void**)tail) tail = *(void**)tail;
*(void**)tail = g_tls_pool_head[class_idx];
g_tls_pool_head[class_idx] = chain2;
g_tls_pool_count[class_idx] += got;
}
}
return head;
}
@ -78,8 +135,18 @@ void pool_free(void* ptr) {
// Need registry lookup (slower fallback) - not implemented in Phase 1
return;
#endif
// Owner resolution via page registry
pid_t owner_tid=0; int reg_cls=-1;
if (pool_reg_lookup(ptr, &owner_tid, &reg_cls)){
pid_t me = gettid_cached();
if (owner_tid != me){
extern int pool_remote_push(int class_idx, void* ptr, int owner_tid);
(void)pool_remote_push(class_idx, ptr, owner_tid);
return;
}
}
// Push to freelist (2-3 instructions)
// Same-thread: Push to TLS freelist (2-3 instructions)
*(void**)ptr = g_tls_pool_head[class_idx];
g_tls_pool_head[class_idx] = ptr;
g_tls_pool_count[class_idx]++;
@ -109,4 +176,25 @@ void pool_thread_init(void) {
void pool_thread_cleanup(void) {
// Phase 1: No cleanup (keep it simple)
// TODO: Drain back to global pool
}
}
// Pre-warm TLS cache (Phase 1.5b optimization)
// Eliminates cold-start penalty by pre-populating TLS freelists
// Expected improvement: +180-740% (based on Phase 7 Task 3 success)
void pool_tls_prewarm(void) {
// Forward declare refill function (from Box 2)
extern void* backend_batch_carve(int class_idx, int count);
for (int class_idx = 0; class_idx < POOL_SIZE_CLASSES; class_idx++) {
int count = PREWARM_COUNTS[class_idx];
// Directly refill TLS cache (bypass alloc/free during init)
// This avoids issues with g_initializing=1 affecting routing
void* chain = backend_batch_carve(class_idx, count);
if (chain) {
// Install entire chain directly into TLS
pool_install_chain(class_idx, chain, count);
}
// If OOM, continue with other classes (graceful degradation)
}
}

View File

@ -14,10 +14,17 @@ void pool_free(void* ptr);
void pool_thread_init(void);
void pool_thread_cleanup(void);
// Pre-warm TLS cache (Phase 1.5b - call once at thread init)
void pool_tls_prewarm(void);
// Internal API (for Box 2 only)
void pool_install_chain(int class_idx, void* chain, int count);
int pool_get_refill_count(int class_idx);
// Remote queue (cross-thread free) API — Phase 1.5c
int pool_remote_push(int class_idx, void* ptr, int owner_tid);
int pool_remote_drain_light(int class_idx);
// Feature flags
#define POOL_USE_HEADERS 1 // 1-byte headers for O(1) free
@ -26,4 +33,4 @@ int pool_get_refill_count(int class_idx);
#define POOL_HEADER_SIZE 1
#endif
#endif // POOL_TLS_H
#endif // POOL_TLS_H

172
core/pool_tls_arena.c Normal file
View File

@ -0,0 +1,172 @@
#include "pool_tls_arena.h"
#include "pool_tls.h" // For POOL_HEADER_SIZE, POOL_USE_HEADERS
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
// TLS storage (automatically zero-initialized)
__thread PoolChunk g_tls_arena[POOL_SIZE_CLASSES];
int g_arena_max_growth_level = 3; // 0:1MB,1:2MB,2:4MB,3:8MB
size_t g_arena_initial_chunk_size = (size_t)1 << 20; // 1MB
static pthread_once_t g_arena_cfg_once = PTHREAD_ONCE_INIT;
static void arena_read_env(void){
const char* s_init = getenv("HAKMEM_POOL_TLS_ARENA_MB_INIT");
const char* s_max = getenv("HAKMEM_POOL_TLS_ARENA_MB_MAX");
const char* s_gl = getenv("HAKMEM_POOL_TLS_ARENA_GROWTH_LEVELS");
if (s_init){ long v = atol(s_init); if (v>=1 && v<=64) g_arena_initial_chunk_size = (size_t)v << 20; }
if (s_max){ long v = atol(s_max); if (v>=1 && v<=1024){
size_t max_bytes = (size_t)v << 20; size_t sz = g_arena_initial_chunk_size; int lvl=0; while (sz < max_bytes && lvl<30){ sz <<= 1; lvl++; }
g_arena_max_growth_level = lvl; if (g_arena_max_growth_level<0) g_arena_max_growth_level=0; }
}
if (s_gl){ long v = atol(s_gl); if (v>=0 && v<=30) g_arena_max_growth_level = (int)v; }
}
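// Worked example for the env parsing above (assumed values, not defaults):
// HAKMEM_POOL_TLS_ARENA_MB_INIT=2 and HAKMEM_POOL_TLS_ARENA_MB_MAX=16 yield
// g_arena_initial_chunk_size = 2MB and g_arena_max_growth_level = 3, so
// chunk_ensure() grows chunks 2MB -> 4MB -> 8MB -> 16MB and then stays at
// 16MB. HAKMEM_POOL_TLS_ARENA_GROWTH_LEVELS, when set, overrides that cap.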
// External imports (from pool config)
extern const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES];
// Debug stats
#ifdef POOL_TLS_ARENA_DEBUG
static __thread struct {
uint64_t mmap_calls;
uint64_t total_carved;
uint64_t chunk_exhaustions;
} g_arena_stats;
#endif
// Ensure chunk has space for at least 'needed' bytes
// Returns 0 on success, -1 on mmap failure
static int chunk_ensure(PoolChunk* chunk, size_t needed) {
// Check if current chunk has space
if (chunk->chunk_base && (chunk->offset + needed <= chunk->chunk_size)) {
return 0; // Space available
}
// Need new chunk - calculate size with exponential growth
pthread_once(&g_arena_cfg_once, arena_read_env);
size_t new_size;
if (chunk->growth_level >= g_arena_max_growth_level) {
new_size = g_arena_initial_chunk_size << g_arena_max_growth_level;
} else {
new_size = g_arena_initial_chunk_size << chunk->growth_level;
chunk->growth_level++;
}
// CRITICAL FIX: DO NOT munmap old chunk!
// Reason: Live allocations may still point into it. Arena chunks are kept
// alive for the thread's lifetime and only freed at thread exit.
// This is standard arena behavior - grow but never shrink.
//
// REMOVED BUGGY CODE:
// if (chunk->chunk_base) {
// munmap(chunk->chunk_base, chunk->chunk_size); // ← SEGV! Live ptrs exist!
// }
//
// OLD CHUNK IS LEAKED INTENTIONALLY - it contains live allocations
#ifdef POOL_TLS_ARENA_DEBUG
if (chunk->chunk_base) {
g_arena_stats.chunk_exhaustions++;
}
#endif
// Allocate new chunk
void* new_base = mmap(NULL, new_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (new_base == MAP_FAILED) {
return -1; // OOM
}
#ifdef POOL_TLS_ARENA_DEBUG
g_arena_stats.mmap_calls++;
#endif
// Register range for owner resolution
pid_t tid = (pid_t)syscall(SYS_gettid);
pool_reg_register(new_base, new_size, tid, -1); // class-less at arena level
chunk->chunk_base = new_base;
chunk->chunk_size = new_size;
chunk->offset = 0;
return 0;
}
// Carve blocks from TLS Arena
int arena_batch_carve(int class_idx, void** out_blocks, int count) {
if (class_idx < 0 || class_idx >= POOL_SIZE_CLASSES) {
return 0; // Invalid class
}
PoolChunk* chunk = &g_tls_arena[class_idx];
size_t block_size = POOL_CLASS_SIZES[class_idx];
// Calculate allocation size with header space
#if POOL_USE_HEADERS
size_t alloc_size = block_size + POOL_HEADER_SIZE;
#else
size_t alloc_size = block_size;
#endif
// Ensure chunk has space for all blocks
size_t needed = alloc_size * count;
if (chunk_ensure(chunk, needed) != 0) {
return 0; // OOM
}
// Carve blocks from chunk
int carved = 0;
for (int i = 0; i < count; i++) {
if (chunk->offset + alloc_size > chunk->chunk_size) {
break; // Chunk exhausted (shouldn't happen after ensure)
}
// Return pointer AFTER header space
out_blocks[i] = (char*)chunk->chunk_base + chunk->offset
#if POOL_USE_HEADERS
+ POOL_HEADER_SIZE
#endif
;
chunk->offset += alloc_size;
carved++;
#ifdef POOL_TLS_ARENA_DEBUG
g_arena_stats.total_carved++;
#endif
}
return carved;
}
// Thread cleanup
static void __attribute__((destructor)) arena_cleanup(void) {
arena_cleanup_thread();
}
void arena_cleanup_thread(void) {
for (int i = 0; i < POOL_SIZE_CLASSES; i++) {
PoolChunk* chunk = &g_tls_arena[i];
if (chunk->chunk_base) {
pid_t tid = (pid_t)syscall(SYS_gettid);
pool_reg_unregister(chunk->chunk_base, chunk->chunk_size, tid);
munmap(chunk->chunk_base, chunk->chunk_size);
chunk->chunk_base = NULL;
}
}
}
#ifdef POOL_TLS_ARENA_DEBUG
#include <stdio.h>
void arena_print_stats(void) {
printf("[Pool TLS Arena Stats]\n");
printf(" mmap calls: %lu\n", g_arena_stats.mmap_calls);
printf(" blocks carved: %lu\n", g_arena_stats.total_carved);
printf(" chunk exhaustions: %lu\n", g_arena_stats.chunk_exhaustions);
}
#endif

4
core/pool_tls_arena.d Normal file
View File

@ -0,0 +1,4 @@
core/pool_tls_arena.o: core/pool_tls_arena.c core/pool_tls_arena.h \
core/pool_tls.h
core/pool_tls_arena.h:
core/pool_tls.h:

31
core/pool_tls_arena.h Normal file
View File

@ -0,0 +1,31 @@
#ifndef HAKMEM_POOL_TLS_ARENA_H
#define HAKMEM_POOL_TLS_ARENA_H
#include <stddef.h>
// Configuration
#define POOL_SIZE_CLASSES 7
extern int g_arena_max_growth_level; // 0..N (3 => 8MB cap)
extern size_t g_arena_initial_chunk_size; // bytes (default 1MB)
// TLS Arena Chunk
typedef struct {
void* chunk_base; // mmap base address (page-aligned)
size_t chunk_size; // Current chunk size (1/2/4/8 MB)
size_t offset; // Next carve offset
int growth_level; // 0=1MB, 1=2MB, 2=4MB, 3=8MB
} PoolChunk;
// API
// Carve 'count' blocks from TLS Arena for 'class_idx'
// Returns number of blocks carved (0 on OOM)
int arena_batch_carve(int class_idx, void** out_blocks, int count);
// Thread cleanup (munmap all chunks)
void arena_cleanup_thread(void);
#ifdef POOL_TLS_ARENA_DEBUG
void arena_print_stats(void);
#endif
#endif // HAKMEM_POOL_TLS_ARENA_H

68
core/pool_tls_registry.c Normal file
View File

@ -0,0 +1,68 @@
#include "pool_tls_registry.h"
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
typedef struct RegEntry {
void* base;
void* end;
pid_t tid;
int class_idx;
struct RegEntry* next;
} RegEntry;
#define REG_BUCKETS 1024
static RegEntry* g_buckets[REG_BUCKETS];
static pthread_mutex_t g_locks[REG_BUCKETS];
static pthread_once_t g_init_once = PTHREAD_ONCE_INIT;
static void reg_init(void){
for (int i=0;i<REG_BUCKETS;i++) pthread_mutex_init(&g_locks[i], NULL);
}
static inline uint64_t hash_ptr(void* p){
uintptr_t x=(uintptr_t)p; x ^= x>>33; x*=0xff51afd7ed558ccdULL; x ^= x>>33; x*=0xc4ceb9fe1a85ec53ULL; x ^= x>>33; return x;
}
void pool_reg_register(void* base, size_t size, pid_t tid, int class_idx){
pthread_once(&g_init_once, reg_init);
void* end = (void*)((char*)base + size);
uint64_t h = hash_ptr(base) & (REG_BUCKETS-1);
pthread_mutex_lock(&g_locks[h]);
RegEntry* e = (RegEntry*)malloc(sizeof(RegEntry));
e->base = base; e->end = end; e->tid = tid; e->class_idx = class_idx; e->next = g_buckets[h];
g_buckets[h] = e;
pthread_mutex_unlock(&g_locks[h]);
}
void pool_reg_unregister(void* base, size_t size, pid_t tid){
pthread_once(&g_init_once, reg_init);
uint64_t h = hash_ptr(base) & (REG_BUCKETS-1);
pthread_mutex_lock(&g_locks[h]);
RegEntry** pp = &g_buckets[h];
while (*pp){
RegEntry* e = *pp;
if (e->base == base && e->tid == tid){
*pp = e->next; free(e); break;
}
pp = &e->next;
}
pthread_mutex_unlock(&g_locks[h]);
}
int pool_reg_lookup(void* ptr, pid_t* tid_out, int* class_idx_out){
pthread_once(&g_init_once, reg_init);
uint64_t h = hash_ptr(ptr) & (REG_BUCKETS-1);
pthread_mutex_lock(&g_locks[h]);
for (RegEntry* e = g_buckets[h]; e; e=e->next){
if (ptr >= e->base && ptr < e->end){
if (tid_out) *tid_out = e->tid;
if (class_idx_out) *class_idx_out = e->class_idx;
pthread_mutex_unlock(&g_locks[h]);
return 1;
}
}
pthread_mutex_unlock(&g_locks[h]);
return 0;
}

16
core/pool_tls_registry.h Normal file
View File

@ -0,0 +1,16 @@
#ifndef HAKMEM_POOL_TLS_REGISTRY_H
#define HAKMEM_POOL_TLS_REGISTRY_H
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
// Register an arena chunk range with owner thread id and class index
void pool_reg_register(void* base, size_t size, pid_t tid, int class_idx);
// Unregister a previously registered chunk
void pool_reg_unregister(void* base, size_t size, pid_t tid);
// Lookup owner for a pointer; returns 1 if found, 0 otherwise
int pool_reg_lookup(void* ptr, pid_t* tid_out, int* class_idx_out);
#endif

72
core/pool_tls_remote.c Normal file
View File

@ -0,0 +1,72 @@
#include "pool_tls_remote.h"
#include <pthread.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>
#define REMOTE_BUCKETS 256
typedef struct RemoteRec {
int tid;
void* head[7];
int count[7];
struct RemoteRec* next;
} RemoteRec;
static RemoteRec* g_buckets[REMOTE_BUCKETS];
static pthread_mutex_t g_locks[REMOTE_BUCKETS];
static pthread_once_t g_once = PTHREAD_ONCE_INIT;
static void rq_init(void){
for (int i=0;i<REMOTE_BUCKETS;i++) pthread_mutex_init(&g_locks[i], NULL);
}
static inline unsigned hb(int tid){ return (unsigned)tid & (REMOTE_BUCKETS-1); }
int pool_remote_push(int class_idx, void* ptr, int owner_tid){
if (class_idx < 0 || class_idx > 6 || ptr == NULL) return 0;
pthread_once(&g_once, rq_init);
unsigned b = hb(owner_tid);
pthread_mutex_lock(&g_locks[b]);
RemoteRec* r = g_buckets[b];
while (r && r->tid != owner_tid) r = r->next;
if (!r){
r = (RemoteRec*)calloc(1, sizeof(RemoteRec));
r->tid = owner_tid; r->next = g_buckets[b]; g_buckets[b] = r;
}
*(void**)ptr = r->head[class_idx];
r->head[class_idx] = ptr;
r->count[class_idx]++;
pthread_mutex_unlock(&g_locks[b]);
return 1;
}
// Drain up to a small batch for this thread and class
int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain){
if (class_idx < 0 || class_idx > 6 || out_chain==NULL) return 0;
pthread_once(&g_once, rq_init);
int mytid = (int)syscall(SYS_gettid);
unsigned b = hb(mytid);
pthread_mutex_lock(&g_locks[b]);
RemoteRec* r = g_buckets[b];
while (r && r->tid != mytid) r = r->next;
int drained = 0;
if (r){
// Pop up to max_take nodes and return chain
void* head = r->head[class_idx];
int batch = 0; if (max_take <= 0) max_take = 32;
void* chain = NULL; void* tail = NULL;
while (head && batch < max_take){
void* nxt = *(void**)head;
if (!chain){ chain = head; tail = head; }
else { *(void**)tail = head; tail = head; }
head = nxt; batch++;
}
r->head[class_idx] = head;
r->count[class_idx] -= batch;
drained = batch;
*out_chain = chain;
}
pthread_mutex_unlock(&g_locks[b]);
return drained;
}

9
core/pool_tls_remote.h Normal file
View File

@ -0,0 +1,9 @@
#ifndef POOL_TLS_REMOTE_H
#define POOL_TLS_REMOTE_H
#include <stdint.h>
int pool_remote_push(int class_idx, void* ptr, int owner_tid);
int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
#endif

View File

@ -336,6 +336,8 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
// Previous: Complex precedence logic on every miss (5-10 cycles overhead)
// Now: Simple TLS cache lookup (1-2 cycles)
static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
// Simple adaptive booster: bump per-class refill size when refills are frequent.
static __thread uint8_t s_refill_calls[TINY_NUM_CLASSES] = {0};
int cnt = s_refill_count[class_idx];
if (__builtin_expect(cnt == 0, 0)) {
// First miss: Initialize from globals (parsed at init time)
@ -375,6 +377,26 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
// Note: g_rf_hit_slab counter is incremented inside sll_refill_small_from_ss()
int refilled = sll_refill_small_from_ss(class_idx, cnt);
// Lightweight adaptation: if refills keep happening, increase per-class refill.
// Focus on class 7 (1024B) to reduce mmap/refill frequency under Tiny-heavy loads.
if (refilled > 0) {
uint8_t c = ++s_refill_calls[class_idx];
if (class_idx == 7) {
// Every 4 refills, increase target by +16 up to 128 (unless overridden).
if ((c & 0x03u) == 0) {
int target = s_refill_count[class_idx];
if (target < 128) {
target += 16;
if (target > 128) target = 128;
s_refill_count[class_idx] = target;
}
}
}
} else {
// No refill performed (capacity full): slowly decay the counter.
if (s_refill_calls[class_idx] > 0) s_refill_calls[class_idx]--;
}
// Phase 2b: Track refill and adapt cache size
if (refilled > 0) {
track_refill_for_adaptation(class_idx);

View File

@ -60,7 +60,8 @@ static inline void trc_splice_to_sll(int class_idx, TinyRefillChain* c,
// CORRUPTION DEBUG: Validate chain before splicing
if (__builtin_expect(trc_refill_guard_enabled(), 0)) {
extern const size_t g_tiny_class_sizes[];
size_t blk = g_tiny_class_sizes[class_idx];
// Validate alignment using effective stride (include header for classes 0..6)
size_t blk = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
fprintf(stderr, "[SPLICE_TO_SLL] cls=%d head=%p tail=%p count=%u\n",
class_idx, c->head, c->tail, c->count);
@ -187,7 +188,13 @@ static inline uint32_t trc_linear_carve(uint8_t* base, size_t bs,
}
// FIX: Use carved counter (monotonic) instead of used (which decrements on free)
uint8_t* cursor = base + ((size_t)meta->carved * bs);
// Effective stride: account for Tiny header when enabled (classes 0..6)
#if HAKMEM_TINY_HEADER_CLASSIDX
size_t stride = (bs == 1024 ? bs : (bs + 1));
#else
size_t stride = bs;
#endif
uint8_t* cursor = base + ((size_t)meta->carved * stride);
void* head = (void*)cursor;
// CORRUPTION DEBUG: Log carve operation
@ -197,7 +204,7 @@ static inline uint32_t trc_linear_carve(uint8_t* base, size_t bs,
}
for (uint32_t i = 1; i < batch; i++) {
uint8_t* next = cursor + bs;
uint8_t* next = cursor + stride;
*(void**)cursor = (void*)next;
cursor = next;
}

View File

@ -44,17 +44,18 @@
static inline void* tiny_region_id_write_header(void* base, int class_idx) {
if (!base) return base;
// Special-case class 7 (1024B blocks): return full block without header.
// Rationale: 1024B requests must not pay an extra 1-byte header (would overflow)
// and routing them to Mid/OS causes excessive mmap/madvise. We keep Tiny owner
// and let free() take the slow path (headerless → slab lookup).
if (__builtin_expect(class_idx == 7, 0)) {
return base; // no header written; user gets full 1024B
}
// Write header at block start
uint8_t* header_ptr = (uint8_t*)base;
// CRITICAL (Phase 7-1.3): ALWAYS write magic byte for safety
// Reason: Free path ALWAYS validates magic (even in release) to detect
// non-Tiny allocations. Without magic, all frees would fail validation.
// Performance: Magic write is FREE (same 1-byte write, just different value)
*header_ptr = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
// Return user pointer (skip header)
return header_ptr + 1;
return header_ptr + 1; // skip header for user pointer
}
// ========== Read Header (Free) ==========

View File

@ -13,6 +13,7 @@
// ============================================================================
// Phase 6.24: Allocate from SuperSlab slab (lazy freelist + linear allocation)
#include "hakmem_tiny_superslab_constants.h"
static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
TinySlabMeta* meta = &ss->slabs[slab_idx];
@ -70,13 +71,36 @@ static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
// This avoids the 4000-8000 cycle cost of building freelist on init
if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
// Linear allocation: use canonical tiny_slab_base_for() only
size_t block_size = g_tiny_class_sizes[ss->size_class];
size_t unit_sz = g_tiny_class_sizes[ss->size_class]
#if HAKMEM_TINY_HEADER_CLASSIDX
+ ((ss->size_class != 7) ? 1 : 0)
#endif
;
uint8_t* base = tiny_slab_base_for(ss, slab_idx);
void* block = (void*)(base + ((size_t)meta->used * block_size));
void* block_base = (void*)(base + ((size_t)meta->used * unit_sz));
#if !HAKMEM_BUILD_RELEASE
// Debug safety: Ensure we never carve past slab usable region (capacity mismatch guard)
size_t dbg_usable = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE;
uintptr_t dbg_off = (uintptr_t)((uint8_t*)block_base - base);
if (__builtin_expect(dbg_off + unit_sz > dbg_usable, 0)) {
fprintf(stderr, "[TINY_ALLOC_BOUNDS] cls=%u slab=%d used=%u cap=%u unit=%zu off=%lu usable=%zu\n",
ss->size_class, slab_idx, meta->used, meta->capacity, unit_sz,
(unsigned long)dbg_off, dbg_usable);
return NULL;
}
#endif
meta->used++;
tiny_remote_track_on_alloc(ss, slab_idx, block, "linear_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, block, "linear_alloc_ret", 0);
return block; // Fast path: O(1) pointer arithmetic
void* user =
#if HAKMEM_TINY_HEADER_CLASSIDX
tiny_region_id_write_header(block_base, ss->size_class);
#else
block_base;
#endif
if (__builtin_expect(g_debug_remote_guard, 0)) {
tiny_remote_track_on_alloc(ss, slab_idx, user, "linear_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, user, "linear_alloc_ret", 0);
}
return user; // Fast path: O(1) pointer arithmetic
}
// Freelist mode (after first free())
@ -125,8 +149,10 @@ static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
}
}
tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0);
if (__builtin_expect(g_debug_remote_guard, 0)) {
tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0);
}
return block;
}