diff --git a/core/box/tiny_front_hot_box.h b/core/box/tiny_front_hot_box.h
index 0e6220d5..aca1ee72 100644
--- a/core/box/tiny_front_hot_box.h
+++ b/core/box/tiny_front_hot_box.h
@@ -50,13 +50,13 @@
 // Debug Metrics (Zero Overhead in Release)
 // ============================================================================
 
-#if !HAKMEM_BUILD_RELEASE
-// Increment cache hit counter (debug only)
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
+// Increment cache hit counter (debug/observe only; zero overhead when compiled-out)
 #define TINY_HOT_METRICS_HIT(class_idx) \
     do { extern __thread uint64_t g_unified_cache_hit[]; \
          g_unified_cache_hit[class_idx]++; } while(0)
 
-// Increment cache miss counter (debug only)
+// Increment cache miss counter (debug/observe only; zero overhead when compiled-out)
 #define TINY_HOT_METRICS_MISS(class_idx) \
     do { extern __thread uint64_t g_unified_cache_miss[]; \
          g_unified_cache_miss[class_idx]++; } while(0)
diff --git a/core/front/tiny_unified_cache.c b/core/front/tiny_unified_cache.c
index 86574d02..527c11ef 100644
--- a/core/front/tiny_unified_cache.c
+++ b/core/front/tiny_unified_cache.c
@@ -97,7 +97,7 @@ __thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES] = {0};
 // Metrics (Phase 23, optional for debugging)
 // ============================================================================
 
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
 __thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES] = {0};
 __thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES] = {0};
 __thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
@@ -262,7 +262,7 @@ void unified_cache_shutdown(void) {
 void unified_cache_print_stats(void) {
     if (!unified_cache_enabled()) return;
 
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
     fprintf(stderr, "\n[Unified-STATS] Unified Cache Metrics:\n");
 
     for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
@@ -296,6 +296,11 @@ void unified_cache_print_stats(void) {
 #endif
 }
 
+__attribute__((destructor))
+static void unified_cache_auto_stats(void) {
+    unified_cache_print_stats();
+}
+
 // ============================================================================
 // Warm Pool Stats (always compiled, ENV-gated at runtime)
 // ============================================================================
diff --git a/core/front/tiny_unified_cache.h b/core/front/tiny_unified_cache.h
index 493e8679..65adb40f 100644
--- a/core/front/tiny_unified_cache.h
+++ b/core/front/tiny_unified_cache.h
@@ -92,7 +92,7 @@ extern __thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];
 // Metrics (Phase 23, optional for debugging)
 // ============================================================================
 
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
 extern __thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES];   // Alloc hits
 extern __thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES];  // Alloc misses
 extern __thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES];  // Free pushes
@@ -143,7 +143,7 @@ static inline size_t unified_capacity(int class_idx) {
         while (pow2 < g_cap[class_idx]) pow2 *= 2;
         g_cap[class_idx] = pow2;
 
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
         fprintf(stderr, "[Unified-INIT] C%d capacity = %zu (power of 2)\n", class_idx, g_cap[class_idx]);
         fflush(stderr);
 #endif
@@ -197,7 +197,7 @@ static inline hak_base_ptr_t unified_cache_pop(int class_idx) {
     // Empty check
     if (__builtin_expect(cache->head == cache->tail, 0)) {
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
         g_unified_cache_miss[class_idx]++;
 #endif
         return HAK_BASE_FROM_RAW(NULL);  // Empty
@@ -207,7 +207,7 @@ static inline hak_base_ptr_t unified_cache_pop(int class_idx) {
     void* base = cache->slots[cache->head];  // 1 cache miss (array access)
     cache->head = (cache->head + 1) & cache->mask;  // Fast modulo (power of 2)
 
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
     g_unified_cache_hit[class_idx]++;
 #endif
@@ -251,7 +251,7 @@ static inline int unified_cache_push(int class_idx, hak_base_ptr_t base) {
     // Full check (leave 1 slot empty to distinguish full/empty)
     if (__builtin_expect(next_tail == cache->head, 0)) {
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
         g_unified_cache_full[class_idx]++;
 #endif
         return 0;  // Full
@@ -261,7 +261,7 @@ static inline int unified_cache_push(int class_idx, hak_base_ptr_t base) {
     cache->slots[cache->tail] = base_raw;  // 1 cache miss (array write)
     cache->tail = next_tail;
 
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
     g_unified_cache_push[class_idx]++;
 #endif
@@ -300,7 +300,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
     // Phase 22: Compile-out when disabled (default OFF)
     void* tcache_base = tiny_tcache_try_pop(class_idx);
     if (tcache_base != NULL) {
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
         g_unified_cache_hit[class_idx]++;
 #endif
 #if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
@@ -320,7 +320,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
     if (__builtin_expect(cache->head != cache->tail, 1)) {
         void* base = cache->slots[cache->head];  // 1 cache miss (array access)
         cache->head = (cache->head + 1) & cache->mask;
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
         g_unified_cache_hit[class_idx]++;
 #endif
 #if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
@@ -337,7 +337,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
     }
 
     // Cache miss → Batch refill from SuperSlab
-#if !HAKMEM_BUILD_RELEASE
+#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
     g_unified_cache_miss[class_idx]++;
 #endif
     return unified_cache_refill(class_idx);  // Refill + return first block (BASE)
diff --git a/core/front/tiny_warm_pool.h b/core/front/tiny_warm_pool.h
index c9b21a48..4e12b73e 100644
--- a/core/front/tiny_warm_pool.h
+++ b/core/front/tiny_warm_pool.h
@@ -38,12 +38,11 @@
 //
 // ============================================================================
 
-// Maximum warm SuperSlabs per thread per class (tunable)
-// Trade-off: Working set size vs warm pool effectiveness
-//   - 4: Original (90% hit rate expected, but broken implementation - hardcoded prefill threshold)
-//   - 12: Optimized capacity with matching prefill threshold (Phase 1)
-//   - Higher values: More memory but better locality
-#define TINY_WARM_POOL_MAX_PER_CLASS 12
+// Warm pool sizing policy:
+//   - DEFAULT is the value used when ENV is unset (keeps TLS footprint reasonable).
+//   - MAX is the absolute cap used for array sizing and clamping (allows safe ENV sweeps).
+#define TINY_WARM_POOL_DEFAULT_PER_CLASS 12
+#define TINY_WARM_POOL_MAX_PER_CLASS 32
 
 typedef struct {
     SuperSlab* slabs[TINY_WARM_POOL_MAX_PER_CLASS];
@@ -106,12 +105,12 @@ static inline int warm_pool_max_per_class(void) {
         const char* env = getenv("HAKMEM_WARM_POOL_SIZE");
         if (env && *env) {
             int v = atoi(env);
-            // Clamp to valid range [1, 12]
+            // Clamp to valid range [1, TINY_WARM_POOL_MAX_PER_CLASS]
             if (v < 1) v = 1;
-            if (v > 12) v = 12;
+            if (v > TINY_WARM_POOL_MAX_PER_CLASS) v = TINY_WARM_POOL_MAX_PER_CLASS;
             g_max = v;
         } else {
-            g_max = TINY_WARM_POOL_MAX_PER_CLASS;
+            g_max = TINY_WARM_POOL_DEFAULT_PER_CLASS;
         }
     }
     return g_max;
diff --git a/core/hakmem_shared_pool_acquire.c b/core/hakmem_shared_pool_acquire.c
index 21167c22..8a98b7b8 100644
--- a/core/hakmem_shared_pool_acquire.c
+++ b/core/hakmem_shared_pool_acquire.c
@@ -266,6 +266,9 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
     SuperSlab* primary_result = NULL;
     int primary_slab_idx = -1;
 
+    // Cache warm pool cap once per acquire call (SSOT: same as unified_cache_refill()).
+    const int warm_cap = warm_pool_max_per_class();
+
     for (int i = 0; i < scan_limit; i++) {
         SuperSlab* ss = super_reg_by_class_at(class_idx, i);
         if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
@@ -274,9 +277,8 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
         if (ss->empty_count == 0) continue;  // No EMPTY slabs in this SS
 
         // WARM POOL PREFILL: Add HOT SuperSlabs to warm pool (if not already primary result)
-        // This is low-cost during registry scan and avoids future expensive scans
-        // Phase 1: Increase threshold from 4 to 12 to match TINY_WARM_POOL_MAX_PER_CLASS
-        if (ss != primary_result && tiny_warm_pool_count(class_idx) < 12) {
+        // This is low-cost during registry scan and avoids future expensive scans.
+        if (ss != primary_result && tiny_warm_pool_count(class_idx) < warm_cap) {
            tiny_warm_pool_push(class_idx, ss);
            // Track prefilled SuperSlabs for metrics
            g_warm_pool_stats[class_idx].prefilled++;
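
Note on the stats gate used throughout this patch: counters that were previously debug-only now also compile in when HAKMEM_UNIFIED_CACHE_STATS_COMPILED is set, and remain compiled out in release builds otherwise. Below is a minimal standalone sketch of the pattern; how the flag is actually defined is up to the project's build system, so the -D values in the comment are assumptions, not project facts.

    /* gate_sketch.c - build with e.g.
     *   cc -DHAKMEM_BUILD_RELEASE=1 -DHAKMEM_UNIFIED_CACHE_STATS_COMPILED=1 gate_sketch.c
     * to keep counters in a release-style build; drop the second -D to compile them out. */
    #include <stdint.h>
    #include <stdio.h>

    #ifndef HAKMEM_BUILD_RELEASE
    #define HAKMEM_BUILD_RELEASE 0
    #endif
    #ifndef HAKMEM_UNIFIED_CACHE_STATS_COMPILED
    #define HAKMEM_UNIFIED_CACHE_STATS_COMPILED 0
    #endif

    #if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
    static __thread uint64_t g_hit;                  /* counter exists (GCC/Clang TLS) */
    #define METRICS_HIT() do { g_hit++; } while (0)
    #else
    #define METRICS_HIT() do { } while (0)           /* compiles to nothing: zero overhead */
    #endif

    int main(void) {
        for (int i = 0; i < 3; i++) METRICS_HIT();
    #if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
        printf("hits = %llu\n", (unsigned long long)g_hit);
    #endif
        return 0;
    }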
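
Note on the auto-stats destructor added to tiny_unified_cache.c: GCC/Clang run __attribute__((destructor)) functions at normal process termination, so unified_cache_print_stats() now runs without the caller invoking it, while the existing unified_cache_enabled() check still gates the output at runtime. Destructors run in the thread that exits the process, so the __thread counters reported are that thread's view. A self-contained sketch of the mechanism (names are local to the sketch):

    /* dtor_sketch.c - the destructor fires after main() returns (or on exit()). */
    #include <stdio.h>

    static unsigned long g_ops;

    __attribute__((destructor))
    static void print_stats_at_exit(void) {
        fprintf(stderr, "[STATS] ops = %lu\n", g_ops);
    }

    int main(void) {
        for (int i = 0; i < 1000; i++) g_ops++;
        return 0;   /* print_stats_at_exit() runs after this */
    }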
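
Note on the warm pool sizing change: TINY_WARM_POOL_DEFAULT_PER_CLASS (12) is what a thread gets when HAKMEM_WARM_POOL_SIZE is unset, while TINY_WARM_POOL_MAX_PER_CLASS (32) both sizes the per-class slab array and clamps whatever the environment supplies, so ENV sweeps above 12 are no longer truncated. The acquire path also reads the cap once per call (warm_cap) instead of comparing against a hardcoded 12, keeping it in sync with unified_cache_refill(). A standalone sketch of the resolve-once-and-clamp policy; the helper name warm_pool_cap() is local to this sketch, not the project's API:

    /* warm_pool_cap_sketch.c - try: HAKMEM_WARM_POOL_SIZE=24 ./a.out */
    #include <stdio.h>
    #include <stdlib.h>

    #define TINY_WARM_POOL_DEFAULT_PER_CLASS 12
    #define TINY_WARM_POOL_MAX_PER_CLASS     32

    /* Resolve the cap once per thread: ENV value clamped to [1, MAX], DEFAULT when unset. */
    static int warm_pool_cap(void) {
        static __thread int g_max = 0;   /* 0 = not resolved yet */
        if (g_max == 0) {
            const char* env = getenv("HAKMEM_WARM_POOL_SIZE");
            if (env && *env) {
                int v = atoi(env);
                if (v < 1) v = 1;
                if (v > TINY_WARM_POOL_MAX_PER_CLASS) v = TINY_WARM_POOL_MAX_PER_CLASS;
                g_max = v;
            } else {
                g_max = TINY_WARM_POOL_DEFAULT_PER_CLASS;
            }
        }
        return g_max;
    }

    int main(void) {
        /* In the allocator this value is cached once per acquire call as warm_cap. */
        printf("warm pool cap = %d\n", warm_pool_cap());
        return 0;
    }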