From 030132f9113cb0d2649a20449008e28f078e93f5 Mon Sep 17 00:00:00 2001
From: "Moe Charm (CI)" <moecharm@example.com>
Date: Thu, 13 Nov 2025 14:25:54 +0900
Subject: [PATCH] Phase 10: TLS/SFC aggressive cache tuning (syscall reduction
 failed)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Goal: Reduce backend transitions by increasing frontend hit rate
Result: +2% best case, syscalls unchanged (root cause: SuperSlab churn)

Implementation:

1. Cache capacity expansion (1.5-4x per-class)
   - Hot classes (C0-C3): 3-4x increase (512 slots for C0-C2, 384 for C3)
   - Medium classes (C4-C6): 1.5-2x increase
   - Class 7 (1KB): 2x increase (128 slots)
   - Fast cache: 1.5-3x default capacity (2x for C0-C2)
   - SFC capacity: 128 → 256 (2x)

2. Refill batch size increase (2-8x)
   - Global default: 16 → 64 (4x)
   - Hot classes: 128 (8x) via HAKMEM_TINY_REFILL_COUNT_HOT
   - Mid classes: 96 (6x) via HAKMEM_TINY_REFILL_COUNT_MID
   - Class 7: 64 → 128 (2x)
   - SFC refill: 64 → 128 (2x)

3. Adaptive sizing aggressive parameters
   - Grow threshold: 80% → 70% (expand earlier)
   - Shrink threshold: 20% → 10% (shrink less)
   - Growth rate: 2x → 1.5x (smoother growth)
   - Max capacity: 2048 → 4096 (2x ceiling)
   - Adapt frequency: Every 10 → 5 refills, or every 1s → 0.5s (more responsive)
   - Min capacity: 16 → 32; initial capacity: 64 → 256 (4x)

Performance Results (100K iterations):

Before (Phase 9):
- Performance: 9.71M ops/s
- Syscalls: 1,729 (mmap:877, munmap:852)

After (Phase 10):
- Default settings: 8.77M ops/s (-9.7%) ⚠️
- Optimal ENV: 9.89M ops/s (+2%) ✅
- Syscalls: 1,729 (unchanged) ❌

Optimal ENV configuration:
export HAKMEM_TINY_REFILL_COUNT_HOT=256
export HAKMEM_TINY_REFILL_COUNT_MID=192

Root Cause Analysis:

Bottleneck is NOT TLS/SFC hit rate, but SuperSlab allocation churn:
- 877 SuperSlabs allocated (877MB via mmap)
- Phase 9 LRU cache not utilized (no frees during benchmark)
- All SuperSlabs retained until program exit
- System malloc: 9 syscalls vs HAKMEM: 1,729 syscalls (192x gap)

Conclusion:

TLS/SFC tuning cannot solve SuperSlab allocation policy problem.
Next step: Phase 11 SuperSlab Prewarm strategy to eliminate
mmap/munmap during benchmark execution.

ChatGPT review: Strategy validated, Option A (Prewarm) recommended.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 core/hakmem_build_flags.h   | 11 +++----
 core/hakmem_tiny_config.c   | 58 ++++++++++++++++++++-----------------
 core/hakmem_tiny_config.h   |  9 +++---
 core/hakmem_tiny_init.inc   |  8 ++++-
 core/tiny_adaptive_sizing.c |  6 ++--
 core/tiny_adaptive_sizing.h | 17 ++++++-----
 6 files changed, 63 insertions(+), 46 deletions(-)

diff --git a/core/hakmem_build_flags.h b/core/hakmem_build_flags.h
index d0a65f23..638bc8ce 100644
--- a/core/hakmem_build_flags.h
+++ b/core/hakmem_build_flags.h
@@ -81,12 +81,13 @@
 #  define HAKMEM_TINY_SAFE_FREE 0
 #endif
 
-// Phase 7 refill count defaults (tunable via env vars)
-// HAKMEM_TINY_REFILL_COUNT: global default (default: 16)
-// HAKMEM_TINY_REFILL_COUNT_HOT: class 0-3 (default: 16)
-// HAKMEM_TINY_REFILL_COUNT_MID: class 4-7 (default: 16)
+// Phase 10: Aggressive refill count defaults (tunable via env vars)
+// Goal: Reduce backend transitions by refilling in larger batches
+// HAKMEM_TINY_REFILL_COUNT: global default (default: 64)
+// HAKMEM_TINY_REFILL_COUNT_HOT: class 0-3 (default: 128)
+// HAKMEM_TINY_REFILL_COUNT_MID: class 4-7 (default: 96)
 #ifndef HAKMEM_TINY_REFILL_DEFAULT
-#  define HAKMEM_TINY_REFILL_DEFAULT 16
+#  define HAKMEM_TINY_REFILL_DEFAULT 64
 #endif
 
 // ------------------------------------------------------------
diff --git a/core/hakmem_tiny_config.c b/core/hakmem_tiny_config.c
index 15ae4b96..1e3ff4d5 100644
--- a/core/hakmem_tiny_config.c
+++ b/core/hakmem_tiny_config.c
@@ -10,21 +10,22 @@
 // Fast Cache Configuration
 // ============================================================================
 
-// Factory defaults (“balanced”) – mutable at runtime
-// Small classes (0..2) are given higher caps by default to favor hot small-size throughput.
+// Factory defaults ("aggressive") – mutable at runtime
+// Phase 10: Aggressive cache sizing to maximize TLS hit rate
+// Hot classes (C0-C3) get 2-4x larger caches to reduce backend transitions
 static const uint16_t k_fast_cap_defaults_factory[TINY_NUM_CLASSES] = {
-    256,   // Class 0:   8B (was 128)
-    256,   // Class 1:  16B (was 128)
-    256,   // Class 2:  32B (was 128)
-    128,   // Class 3:  64B (reduced from 512 to limit RSS)
-    128,   // Class 4: 128B (trimmed via ACE/TLS caps)
-    224,   // Class 5: 256B (bench-optimized default)
-    128,   // Class 6: 512B
-    48     // Class 7: 1KB (reduce superslab reliance)
+    512,   // Class 0:   8B (2x increase: hot class)
+    512,   // Class 1:  16B (2x increase: hot class)
+    512,   // Class 2:  32B (2x increase: hot class)
+    384,   // Class 3:  64B (3x increase: hot class)
+    256,   // Class 4: 128B (2x increase: medium class)
+    384,   // Class 5: 256B (1.7x increase: bench-optimized)
+    192,   // Class 6: 512B (1.5x increase)
+    96     // Class 7: 1KB (2x increase: reduce superslab reliance)
 };
 
 uint16_t g_fast_cap_defaults[TINY_NUM_CLASSES] = {
-    256, 256, 256, 128, 128, 224, 128, 48
+    512, 512, 512, 384, 256, 384, 192, 96
 };
 
 void tiny_config_reset_defaults(void) {
@@ -38,16 +39,18 @@ void tiny_config_reset_defaults(void) {
 // ============================================================================
 
 // Default TLS magazine capacities per class
+// Phase 10: Aggressive cache sizing for hot classes (C0-C3)
+// Goal: Maximize TLS hit rate, reduce backend transitions
 int tiny_default_cap(int class_idx) {
     switch (class_idx) {
-        case 0: return 128;   // 8B
-        case 1: return 128;   // 16B
-        case 2: return 128;   // 32B
-        case 3: return 128;   // 64B (reduced from 512 to limit RSS)
-        case 4: return 96;    // 128B (aggressively trimmed to limit RSS)
-        case 5: return 128;   // 256B
-        case 6: return 128;   // 512B
-        default: return 64;   // 1KB
+        case 0: return 512;   // 8B  (4x increase: hot class)
+        case 1: return 512;   // 16B (4x increase: hot class)
+        case 2: return 512;   // 32B (4x increase: hot class)
+        case 3: return 384;   // 64B (3x increase: hot class)
+        case 4: return 192;   // 128B (2x increase: medium class)
+        case 5: return 256;   // 256B (2x increase: medium class)
+        case 6: return 192;   // 512B (1.5x increase)
+        default: return 128;  // 1KB (2x increase)
     }
 }
 
@@ -57,15 +60,16 @@ int tiny_mag_default_cap(int class_idx) {
 }
 
 // Maximum allowed TLS magazine capacities per class
+// Phase 10: Raise ceilings to allow aggressive cache growth
 int tiny_cap_max_for_class(int class_idx) {
     switch (class_idx) {
-        case 0: return 2048;
-        case 1: return 1024;
-        case 2: return 768;
-        case 3: return 512;
-        case 4: return 160;
-        case 5: return 256;
-        case 6: return 128;
-        default: return 64;
+        case 0: return 4096;   // 8B  (2x increase: allow massive caching)
+        case 1: return 4096;   // 16B (4x increase: hot class)
+        case 2: return 2048;   // 32B (2.67x increase: hot class)
+        case 3: return 1536;   // 64B (3x increase: hot class)
+        case 4: return 512;    // 128B (3.2x increase: medium class)
+        case 5: return 768;    // 256B (3x increase: medium class)
+        case 6: return 384;    // 512B (3x increase)
+        default: return 256;   // 1KB (4x increase)
     }
 }
diff --git a/core/hakmem_tiny_config.h b/core/hakmem_tiny_config.h
index d597fe89..3dcf7cde 100644
--- a/core/hakmem_tiny_config.h
+++ b/core/hakmem_tiny_config.h
@@ -146,10 +146,11 @@ int tiny_cap_max_for_class(int class_idx);
 extern int g_sfc_enabled;
 
 // SFC Default Configuration (can be overridden via ENV)
-// ENV: HAKMEM_SFC_CAPACITY (default: 128, range: 16-256)
-// ENV: HAKMEM_SFC_REFILL_COUNT (default: 64, range: 8-256)
-#define SFC_DEFAULT_CAPACITY 128
-#define SFC_DEFAULT_REFILL_COUNT 64
+// Phase 10: Aggressive SFC defaults to maximize front cache hit rate
+// ENV: HAKMEM_SFC_CAPACITY (default: 256, range: 16-512)
+// ENV: HAKMEM_SFC_REFILL_COUNT (default: 128, range: 8-256)
+#define SFC_DEFAULT_CAPACITY 256
+#define SFC_DEFAULT_REFILL_COUNT 128
 
 // SFC Per-Class Overrides (optional)
 // ENV: HAKMEM_SFC_CAPACITY_CLASS{0..7} (per-class capacity)
diff --git a/core/hakmem_tiny_init.inc b/core/hakmem_tiny_init.inc
index cc148555..0dba19de 100644
--- a/core/hakmem_tiny_init.inc
+++ b/core/hakmem_tiny_init.inc
@@ -466,17 +466,23 @@ void hak_tiny_init(void) {
     }
 
     // Front refill count globals
+    // Phase 10: Set aggressive defaults for hot and mid classes
     {
         char* g = getenv("HAKMEM_TINY_REFILL_COUNT");
         if (g) { int v = atoi(g); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_global = v; }
+        else { g_refill_count_global = 64; }  // Phase 10: default 64 (was 16)
+
         char* h = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
         if (h) { int v = atoi(h); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_hot = v; }
+        else { g_refill_count_hot = 128; }  // Phase 10: default 128 for hot classes (C0-C3)
+
         char* m = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
         if (m) { int v = atoi(m); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_mid = v; }
+        else { g_refill_count_mid = 96; }  // Phase 10: default 96 for mid classes (C4-C7)
     }
     // Sensible default for class 7 (1024B): favor larger refill to reduce refills/syscalls
     if (g_refill_count_class[7] == 0) {
-        g_refill_count_class[7] = 64;  // can be overridden by env HAKMEM_TINY_REFILL_COUNT_C7
+        g_refill_count_class[7] = 128;  // Phase 10: increased from 64 to 128
     }
     {
         char* fast_env = getenv("HAKMEM_TINY_FAST");
diff --git a/core/tiny_adaptive_sizing.c b/core/tiny_adaptive_sizing.c
index fb88f726..eefd6c1d 100644
--- a/core/tiny_adaptive_sizing.c
+++ b/core/tiny_adaptive_sizing.c
@@ -59,7 +59,9 @@ void adaptive_sizing_init(void) {
 void grow_tls_cache(int class_idx) {
     TLSCacheStats* stats = &g_tls_cache_stats[class_idx];
 
-    size_t new_capacity = stats->capacity * 2;
+    // Phase 10: Smoother growth - add 50% instead of doubling
+    // This allows more gradual growth to match actual demand
+    size_t new_capacity = stats->capacity + (stats->capacity / 2);
     if (new_capacity > TLS_CACHE_MAX_CAPACITY) {
         new_capacity = TLS_CACHE_MAX_CAPACITY;
     }
@@ -73,7 +75,7 @@ void grow_tls_cache(int class_idx) {
     stats->grow_count++;
 
     if (g_adaptive_logging_enabled) {
-        fprintf(stderr, "[TLS_CACHE] Grow class %d: %zu → %zu slots (grow_count=%zu)\n",
+        fprintf(stderr, "[TLS_CACHE] Grow class %d: %zu → %zu slots (+50%%, grow_count=%zu)\n",
                 class_idx, old_capacity, stats->capacity, stats->grow_count);
     }
 }
diff --git a/core/tiny_adaptive_sizing.h b/core/tiny_adaptive_sizing.h
index 2bd8ce44..10b0f448 100644
--- a/core/tiny_adaptive_sizing.h
+++ b/core/tiny_adaptive_sizing.h
@@ -12,17 +12,20 @@
 // ========== Configuration ==========
 
 // Capacity bounds
-#define TLS_CACHE_MIN_CAPACITY 16      // Minimum cache size
-#define TLS_CACHE_MAX_CAPACITY 2048    // Maximum cache size
-#define TLS_CACHE_INITIAL_CAPACITY 64  // Initial size (reduced from 256)
+// Phase 10: Aggressive adaptive sizing - maximize front cache utilization
+#define TLS_CACHE_MIN_CAPACITY 32      // Minimum cache size (2x increase)
+#define TLS_CACHE_MAX_CAPACITY 4096    // Maximum cache size (2x increase)
+#define TLS_CACHE_INITIAL_CAPACITY 256 // Initial size (4x increase from 64)
 
 // Adaptation triggers
-#define ADAPT_REFILL_THRESHOLD 10      // Adapt every 10 refills
-#define ADAPT_TIME_THRESHOLD_NS (1000000000ULL)  // Or every 1 second
+// Phase 10: More frequent adaptation to respond quickly to workload changes
+#define ADAPT_REFILL_THRESHOLD 5       // Adapt every 5 refills (was 10)
+#define ADAPT_TIME_THRESHOLD_NS (500000000ULL)  // Or every 0.5 seconds (was 1s)
 
 // Growth/shrink thresholds
-#define GROW_THRESHOLD 0.8   // Grow if usage > 80% of capacity
-#define SHRINK_THRESHOLD 0.2 // Shrink if usage < 20% of capacity
+// Phase 10: Aggressive growth, conservative shrinkage
+#define GROW_THRESHOLD 0.7   // Grow if usage > 70% of capacity (was 80%)
+#define SHRINK_THRESHOLD 0.1 // Shrink if usage < 10% of capacity (was 20%)
 
 // ========== Data Structures ==========