Phase 10: TLS/SFC aggressive cache tuning (syscall reduction failed)
Goal: Reduce backend transitions by increasing frontend hit rate
Result: +2% best case, syscalls unchanged (root cause: SuperSlab churn)

Implementation:

1. Cache capacity expansion (2-8x per-class)
   - Hot classes (C0-C3): 4x increase (512 slots)
   - Medium classes (C4-C6): 2-3x increase
   - Class 7 (1KB): 2x increase (128 slots)
   - Fast cache: 2x default capacity

2. Refill batch size increase (4-8x)
   - Global default: 16 → 64 (4x)
   - Hot classes: 128 (8x) via HAKMEM_TINY_REFILL_COUNT_HOT
   - Mid classes: 96 (6x) via HAKMEM_TINY_REFILL_COUNT_MID
   - Class 7: 64 → 128 (2x)
   - SFC refill: 64 → 128 (2x)

3. Adaptive sizing aggressive parameters
   - Grow threshold: 80% → 70% (expand earlier)
   - Shrink threshold: 20% → 10% (shrink less)
   - Growth rate: 2x → 1.5x (smoother growth)
   - Max capacity: 2048 → 4096 (2x ceiling)
   - Adapt frequency: every 10 → every 5 refills (more responsive)

Performance Results (100K iterations):

Before (Phase 9):
- Performance: 9.71M ops/s
- Syscalls: 1,729 (mmap: 877, munmap: 852)

After (Phase 10):
- Default settings: 8.77M ops/s (-9.7%) ⚠️
- Optimal ENV: 9.89M ops/s (+2%) ✅
- Syscalls: 1,729 (unchanged) ❌

Optimal ENV configuration:
export HAKMEM_TINY_REFILL_COUNT_HOT=256
export HAKMEM_TINY_REFILL_COUNT_MID=192

Root Cause Analysis:
The bottleneck is NOT the TLS/SFC hit rate but SuperSlab allocation churn:
- 877 SuperSlabs allocated (877MB via mmap)
- Phase 9 LRU cache not utilized (no frees during the benchmark)
- All SuperSlabs retained until program exit
- System malloc: 9 syscalls vs HAKMEM: 1,729 syscalls (192x gap)

Conclusion:
TLS/SFC tuning cannot solve the SuperSlab allocation policy problem.
Next step: Phase 11 SuperSlab Prewarm strategy to eliminate mmap/munmap during benchmark execution.

ChatGPT review: Strategy validated, Option A (Prewarm) recommended.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
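To make the proposed next step concrete, here is a rough sketch of what an Option A prewarm could look like. This is not code from this commit: the slab size, pool depth, and function names (superslab_prewarm, superslab_acquire) are assumptions chosen only to illustrate the idea of mapping SuperSlabs once at startup so the benchmark's hot path never calls mmap.

```c
#include <stddef.h>
#include <sys/mman.h>

#define SLAB_SIZE  (1u << 20)  // assumption: ~1 MB SuperSlabs (877 mmaps ≈ 877 MB)
#define PREWARM_N  64          // illustrative prewarm depth, would need tuning

static void* g_prewarm_pool[PREWARM_N];
static int   g_prewarm_top = 0;

// Called once at allocator init: map a pool of slabs up front.
static void superslab_prewarm(void) {
    for (int i = 0; i < PREWARM_N; i++) {
        void* p = mmap(NULL, SLAB_SIZE, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) break;        // stop prewarming on failure
        g_prewarm_pool[g_prewarm_top++] = p;
    }
}

// Hot path: pop a prewarmed slab; fall back to mmap only when the pool is empty.
static void* superslab_acquire(void) {
    if (g_prewarm_top > 0) return g_prewarm_pool[--g_prewarm_top];
    return mmap(NULL, SLAB_SIZE, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
}

// Tiny driver so the sketch compiles and runs standalone.
int main(void) {
    superslab_prewarm();
    void* s = superslab_acquire();
    return (s == MAP_FAILED) ? 1 : 0;
}
```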
@@ -81,12 +81,13 @@
 # define HAKMEM_TINY_SAFE_FREE 0
 #endif
 
-// Phase 7 refill count defaults (tunable via env vars)
-// HAKMEM_TINY_REFILL_COUNT: global default (default: 16)
-// HAKMEM_TINY_REFILL_COUNT_HOT: class 0-3 (default: 16)
-// HAKMEM_TINY_REFILL_COUNT_MID: class 4-7 (default: 16)
+// Phase 10: Aggressive refill count defaults (tunable via env vars)
+// Goal: Reduce backend transitions by refilling in larger batches
+// HAKMEM_TINY_REFILL_COUNT: global default (default: 64)
+// HAKMEM_TINY_REFILL_COUNT_HOT: class 0-3 (default: 128)
+// HAKMEM_TINY_REFILL_COUNT_MID: class 4-7 (default: 96)
 #ifndef HAKMEM_TINY_REFILL_DEFAULT
-# define HAKMEM_TINY_REFILL_DEFAULT 16
+# define HAKMEM_TINY_REFILL_DEFAULT 64
 #endif
 
 // ------------------------------------------------------------
@@ -10,21 +10,22 @@
 // Fast Cache Configuration
 // ============================================================================
 
-// Factory defaults (“balanced”) – mutable at runtime
-// Small classes (0..2) are given higher caps by default to favor hot small-size throughput.
+// Factory defaults ("aggressive") – mutable at runtime
+// Phase 10: Aggressive cache sizing to maximize TLS hit rate
+// Hot classes (C0-C3) get 2-4x larger caches to reduce backend transitions
 static const uint16_t k_fast_cap_defaults_factory[TINY_NUM_CLASSES] = {
-    256,  // Class 0: 8B (was 128)
-    256,  // Class 1: 16B (was 128)
-    256,  // Class 2: 32B (was 128)
-    128,  // Class 3: 64B (reduced from 512 to limit RSS)
-    128,  // Class 4: 128B (trimmed via ACE/TLS caps)
-    224,  // Class 5: 256B (bench-optimized default)
-    128,  // Class 6: 512B
-    48    // Class 7: 1KB (reduce superslab reliance)
+    512,  // Class 0: 8B (2x increase: hot class)
+    512,  // Class 1: 16B (2x increase: hot class)
+    512,  // Class 2: 32B (2x increase: hot class)
+    384,  // Class 3: 64B (3x increase: hot class)
+    256,  // Class 4: 128B (2x increase: medium class)
+    384,  // Class 5: 256B (1.7x increase: bench-optimized)
+    192,  // Class 6: 512B (1.5x increase)
+    96    // Class 7: 1KB (2x increase: reduce superslab reliance)
 };
 
 uint16_t g_fast_cap_defaults[TINY_NUM_CLASSES] = {
-    256, 256, 256, 128, 128, 224, 128, 48
+    512, 512, 512, 384, 256, 384, 192, 96
 };
 
 void tiny_config_reset_defaults(void) {
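As a side note on the memory cost of these new factory defaults, the standalone calculation below (an editorial illustration, not allocator code) bounds the bytes one thread's fast cache can pin, assuming class i holds blocks of (8 << i) bytes as the class comments above indicate (8B for class 0 up to 1KB for class 7):

```c
#include <stdio.h>

int main(void) {
    // Phase 10 factory defaults from k_fast_cap_defaults_factory above.
    const unsigned caps[8] = { 512, 512, 512, 384, 256, 384, 192, 96 };
    unsigned long total = 0;
    for (int i = 0; i < 8; i++) {
        unsigned long bytes = (unsigned long)caps[i] * (8u << i);
        printf("class %d: %3u slots x %4u B = %6lu B\n", i, caps[i], 8u << i, bytes);
        total += bytes;
    }
    printf("upper bound per thread: %lu B (~%lu KB)\n", total, total / 1024);
    return 0;
}
```

Under these assumptions the fast cache alone can hold roughly 372 KB per thread, up from about 206 KB with the previous defaults.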
@@ -38,16 +39,18 @@ void tiny_config_reset_defaults(void) {
 // ============================================================================
 
 // Default TLS magazine capacities per class
+// Phase 10: Aggressive cache sizing for hot classes (C0-C3)
+// Goal: Maximize TLS hit rate, reduce backend transitions
 int tiny_default_cap(int class_idx) {
     switch (class_idx) {
-        case 0: return 128;   // 8B
-        case 1: return 128;   // 16B
-        case 2: return 128;   // 32B
-        case 3: return 128;   // 64B (reduced from 512 to limit RSS)
-        case 4: return 96;    // 128B (aggressively trimmed to limit RSS)
-        case 5: return 128;   // 256B
-        case 6: return 128;   // 512B
-        default: return 64;   // 1KB
+        case 0: return 512;   // 8B (4x increase: hot class)
+        case 1: return 512;   // 16B (4x increase: hot class)
+        case 2: return 512;   // 32B (4x increase: hot class)
+        case 3: return 384;   // 64B (3x increase: hot class)
+        case 4: return 192;   // 128B (2x increase: medium class)
+        case 5: return 256;   // 256B (2x increase: medium class)
+        case 6: return 192;   // 512B (1.5x increase)
+        default: return 128;  // 1KB (2x increase)
     }
 }
 
@@ -57,15 +60,16 @@ int tiny_mag_default_cap(int class_idx) {
 }
 
 // Maximum allowed TLS magazine capacities per class
+// Phase 10: Raise ceilings to allow aggressive cache growth
 int tiny_cap_max_for_class(int class_idx) {
     switch (class_idx) {
-        case 0: return 2048;
-        case 1: return 1024;
-        case 2: return 768;
-        case 3: return 512;
-        case 4: return 160;
-        case 5: return 256;
-        case 6: return 128;
-        default: return 64;
+        case 0: return 4096;   // 8B (2x increase: allow massive caching)
+        case 1: return 4096;   // 16B (4x increase: hot class)
+        case 2: return 2048;   // 32B (2.67x increase: hot class)
+        case 3: return 1536;   // 64B (3x increase: hot class)
+        case 4: return 512;    // 128B (3.2x increase: medium class)
+        case 5: return 768;    // 256B (3x increase: medium class)
+        case 6: return 384;    // 512B (3x increase)
+        default: return 256;   // 1KB (4x increase)
     }
 }
@@ -146,10 +146,11 @@ int tiny_cap_max_for_class(int class_idx);
 extern int g_sfc_enabled;
 
 // SFC Default Configuration (can be overridden via ENV)
-// ENV: HAKMEM_SFC_CAPACITY (default: 128, range: 16-256)
-// ENV: HAKMEM_SFC_REFILL_COUNT (default: 64, range: 8-256)
-#define SFC_DEFAULT_CAPACITY 128
-#define SFC_DEFAULT_REFILL_COUNT 64
+// Phase 10: Aggressive SFC defaults to maximize front cache hit rate
+// ENV: HAKMEM_SFC_CAPACITY (default: 256, range: 16-512)
+// ENV: HAKMEM_SFC_REFILL_COUNT (default: 128, range: 8-256)
+#define SFC_DEFAULT_CAPACITY 256
+#define SFC_DEFAULT_REFILL_COUNT 128
 
 // SFC Per-Class Overrides (optional)
 // ENV: HAKMEM_SFC_CAPACITY_CLASS{0..7} (per-class capacity)
@@ -466,17 +466,23 @@ void hak_tiny_init(void) {
     }
 
     // Front refill count globals
+    // Phase 10: Set aggressive defaults for hot and mid classes
     {
        char* g = getenv("HAKMEM_TINY_REFILL_COUNT");
        if (g) { int v = atoi(g); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_global = v; }
+       else { g_refill_count_global = 64; }  // Phase 10: default 64 (was 16)
+
        char* h = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
        if (h) { int v = atoi(h); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_hot = v; }
+       else { g_refill_count_hot = 128; }  // Phase 10: default 128 for hot classes (C0-C3)
+
        char* m = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
        if (m) { int v = atoi(m); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_mid = v; }
+       else { g_refill_count_mid = 96; }  // Phase 10: default 96 for mid classes (C4-C7)
     }
     // Sensible default for class 7 (1024B): favor larger refill to reduce refills/syscalls
     if (g_refill_count_class[7] == 0) {
-        g_refill_count_class[7] = 64;   // can be overridden by env HAKMEM_TINY_REFILL_COUNT_C7
+        g_refill_count_class[7] = 128;  // Phase 10: increased from 64 to 128
     }
     {
        char* fast_env = getenv("HAKMEM_TINY_FAST");
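The three getenv/clamp/else blocks above repeat the same pattern; the sketch below shows one way it could be factored into a helper. The helper name read_refill_env is hypothetical and not part of this commit; the 0..256 clamp and the Phase 10 fallback values mirror the code above.

```c
#include <stdio.h>
#include <stdlib.h>

// Hypothetical helper (illustration only): parse an integer env var, clamp it
// to the same 0..256 range used above, and fall back to the Phase 10 default
// when the variable is unset.
static int read_refill_env(const char* name, int fallback) {
    const char* s = getenv(name);
    if (!s) return fallback;
    int v = atoi(s);
    if (v < 0)   v = 0;
    if (v > 256) v = 256;
    return v;
}

// Equivalent usage to the block above would be:
//   g_refill_count_global = read_refill_env("HAKMEM_TINY_REFILL_COUNT",     64);
//   g_refill_count_hot    = read_refill_env("HAKMEM_TINY_REFILL_COUNT_HOT", 128);
//   g_refill_count_mid    = read_refill_env("HAKMEM_TINY_REFILL_COUNT_MID", 96);
int main(void) {
    printf("hot refill = %d\n", read_refill_env("HAKMEM_TINY_REFILL_COUNT_HOT", 128));
    return 0;
}
```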
@@ -59,7 +59,9 @@ void adaptive_sizing_init(void) {
 void grow_tls_cache(int class_idx) {
     TLSCacheStats* stats = &g_tls_cache_stats[class_idx];
 
-    size_t new_capacity = stats->capacity * 2;
+    // Phase 10: Aggressive growth - add 50% instead of doubling
+    // This allows more gradual growth to match actual demand
+    size_t new_capacity = stats->capacity + (stats->capacity / 2);
     if (new_capacity > TLS_CACHE_MAX_CAPACITY) {
         new_capacity = TLS_CACHE_MAX_CAPACITY;
     }
@@ -73,7 +75,7 @@ void grow_tls_cache(int class_idx) {
     stats->grow_count++;
 
     if (g_adaptive_logging_enabled) {
-        fprintf(stderr, "[TLS_CACHE] Grow class %d: %zu → %zu slots (grow_count=%zu)\n",
+        fprintf(stderr, "[TLS_CACHE] Grow class %d: %zu → %zu slots (+50%%, grow_count=%zu)\n",
                 class_idx, old_capacity, stats->capacity, stats->grow_count);
     }
 }
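To make the growth-rate change concrete, the standalone program below (plain arithmetic, not allocator code) compares the old doubling policy with the new +50% policy, assuming the 256-slot initial capacity and 4096-slot ceiling that this commit sets in the adaptive sizing configuration (next hunk):

```c
#include <stdio.h>

int main(void) {
    size_t cap2x = 256, cap15x = 256;   // Phase 10 initial capacity
    printf("step  2x-policy  1.5x-policy\n");
    for (int step = 1; cap2x < 4096 || cap15x < 4096; step++) {
        if (cap2x  < 4096) { cap2x  *= 2;           if (cap2x  > 4096) cap2x  = 4096; }
        if (cap15x < 4096) { cap15x += cap15x / 2;  if (cap15x > 4096) cap15x = 4096; }
        printf("%4d  %9zu  %11zu\n", step, cap2x, cap15x);
    }
    return 0;
}
```

Doubling reaches the 4096 ceiling in 4 grows; the +50% policy takes 7, which is the more gradual ramp the comment above describes.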
@@ -12,17 +12,20 @@
 // ========== Configuration ==========
 
 // Capacity bounds
-#define TLS_CACHE_MIN_CAPACITY 16          // Minimum cache size
-#define TLS_CACHE_MAX_CAPACITY 2048        // Maximum cache size
-#define TLS_CACHE_INITIAL_CAPACITY 64      // Initial size (reduced from 256)
+// Phase 10: Aggressive adaptive sizing - maximize front cache utilization
+#define TLS_CACHE_MIN_CAPACITY 32          // Minimum cache size (2x increase)
+#define TLS_CACHE_MAX_CAPACITY 4096        // Maximum cache size (2x increase)
+#define TLS_CACHE_INITIAL_CAPACITY 256     // Initial size (4x increase from 64)
 
 // Adaptation triggers
-#define ADAPT_REFILL_THRESHOLD 10                 // Adapt every 10 refills
-#define ADAPT_TIME_THRESHOLD_NS (1000000000ULL)   // Or every 1 second
+// Phase 10: More frequent adaptation to respond quickly to workload changes
+#define ADAPT_REFILL_THRESHOLD 5                  // Adapt every 5 refills (was 10)
+#define ADAPT_TIME_THRESHOLD_NS (500000000ULL)    // Or every 0.5 seconds (was 1s)
 
 // Growth/shrink thresholds
-#define GROW_THRESHOLD 0.8     // Grow if usage > 80% of capacity
-#define SHRINK_THRESHOLD 0.2   // Shrink if usage < 20% of capacity
+// Phase 10: Aggressive growth, conservative shrinkage
+#define GROW_THRESHOLD 0.7     // Grow if usage > 70% of capacity (was 80%)
+#define SHRINK_THRESHOLD 0.1   // Shrink if usage < 10% of capacity (was 20%)
 
 // ========== Data Structures ==========
 
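Finally, a minimal sketch of how the new 70%/10% thresholds drive the grow/shrink decision. The struct and function names here are illustrative assumptions, not the allocator's real API; only the threshold values and the 1.5x growth step are taken from the diff.

```c
#include <stddef.h>
#include <stdio.h>

#define GROW_THRESHOLD   0.7   // grow if usage > 70% of capacity
#define SHRINK_THRESHOLD 0.1   // shrink if usage < 10% of capacity

typedef struct { size_t used, capacity; } CacheUsageSketch;  // hypothetical

// Returns +1 to grow (by 1.5x, as in grow_tls_cache above), -1 to shrink, 0 to keep.
static int adapt_decision(const CacheUsageSketch* s) {
    double usage = s->capacity ? (double)s->used / (double)s->capacity : 0.0;
    if (usage > GROW_THRESHOLD)   return +1;
    if (usage < SHRINK_THRESHOLD) return -1;
    return 0;
}

int main(void) {
    CacheUsageSketch s = { .used = 300, .capacity = 384 };  // ~78% full -> grow
    printf("decision: %+d\n", adapt_decision(&s));
    return 0;
}
```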