// tiny_front_c23.h - Ultra-Simple Front Path for C2/C3 (Phase B) // Purpose: Bypass SFC/SLL/Magazine complexity for 128B/256B allocations // Target: 15-20M ops/s (vs current 8-9M ops/s) // // Architecture: // - C2/C3 only (class_idx 2 or 3) // - Direct FastCache access (no SLL/Magazine overhead) // - Direct SuperSlab refill (ss_refill_fc_fill) // - ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1 // // Performance Strategy: // - Minimize layers: FC → SS (2 layers instead of 5+) // - Minimize branches: ENV check cached in TLS // - Minimize overhead: No stats, no logging in hot path // // Box Theory Compliance: // - Clear boundary: Front ← Backend (ss_refill_fc_fill) // - Safe fallback: NULL return → caller handles slow path // - Header preservation: BASE pointers only, HAK_RET_ALLOC at caller #ifndef TINY_FRONT_C23_H #define TINY_FRONT_C23_H #include #include #include #include #include "../hakmem_build_flags.h" // Forward declarations (functions from other modules) // These are declared in hakmem_tiny_fastcache.inc.h and refill/ss_refill_fc.h extern void* fastcache_pop(int class_idx); extern int fastcache_push(int class_idx, void* ptr); extern int ss_refill_fc_fill(int class_idx, int want); // ENV-gated enable/disable (TLS cached for zero overhead after first check) static inline int tiny_front_c23_enabled(void) { static __thread int cached = -1; if (__builtin_expect(cached == -1, 0)) { const char* env = getenv("HAKMEM_TINY_FRONT_C23_SIMPLE"); cached = (env && atoi(env) == 1) ? 1 : 0; if (cached) { fprintf(stderr, "[TINY_FRONT_C23] Enabled for C2/C3 (128B/256B)\n"); } } return cached; } // Refill target: 64 blocks (optimized via A/B testing) // A/B Results (100K iterations): // 128B: refill=64 → 9.55M ops/s (+15.5% vs baseline 8.27M) // 256B: refill=64 → 8.47M ops/s (+7.2% vs baseline 7.90M) // 256B: refill=32 → 8.61M ops/s (+9.0%, slightly better for 256B) // Decision: refill=64 for balanced performance across C2/C3 static inline int tiny_front_c23_refill_target(int class_idx) { (void)class_idx; static __thread int target = -1; if (__builtin_expect(target == -1, 0)) { const char* env = getenv("HAKMEM_TINY_FRONT_C23_REFILL"); target = (env && *env) ? atoi(env) : 64; // Default: 64 (A/B optimized) if (target <= 0) target = 64; if (target > 128) target = 128; // Cap at 128 to avoid excessive latency } return target; } // Ultra-simple alloc for C2/C3 // Returns: BASE pointer or NULL // // Flow: // 1. Try FastCache pop (L1, ultra-fast array access) // 2. If miss, call ss_refill_fc_fill (SuperSlab → FC direct, bypass SLL) // 3. Try FastCache pop again (should succeed after refill) // 4. Return NULL if all failed (caller handles slow path) // // Contract: // - Input: size (64-1024B), class_idx (2 or 3) // - Output: BASE pointer (header at ptr-1 for C2/C3) // - Caller: Must call HAK_RET_ALLOC(class_idx, ptr) to convert BASE → USER // - Safety: NULL checks, class_idx bounds checks, fallback to slow path // // Performance: // - Hot path (FC hit): ~3-5 instructions (array[top--]) // - Cold path (FC miss): ~20-50 instructions (ss_refill_fc_fill + retry) // - Expected hit rate: 90-95% (based on Phase 7 results) static inline void* tiny_front_c23_alloc(size_t size, int class_idx) { // Safety: Bounds check (should never fail, but defense-in-depth) if (__builtin_expect(class_idx < 2 || class_idx > 3, 0)) { return NULL; // Not C2/C3, caller should use generic path } (void)size; // Unused, class_idx already determined by caller // Step 1: Try FastCache pop (L1, ultra-fast) void* ptr = fastcache_pop(class_idx); if (__builtin_expect(ptr != NULL, 1)) { // FastCache hit! Return BASE pointer (caller will apply HAK_RET_ALLOC) return ptr; } // Step 2: FastCache miss → Refill from SuperSlab int want = tiny_front_c23_refill_target(class_idx); int refilled = ss_refill_fc_fill(class_idx, want); if (__builtin_expect(refilled <= 0, 0)) { // Refill failed (OOM or capacity exhausted) return NULL; // Caller will try slow path } // Step 3: Retry FastCache pop (should succeed now) ptr = fastcache_pop(class_idx); if (__builtin_expect(ptr != NULL, 1)) { // Success! Return BASE pointer return ptr; } // Step 4: Still NULL (rare, indicates FC capacity issue or race) // Fallback: Let caller try slow path return NULL; } // Performance Notes: // // Expected improvement over generic path: // - Generic: FC → SLL → Magazine → Backend (4-5 layers) // - C23: FC → SS (2 layers) // - Reduction: -50-60% instructions in refill path // // Expected latency: // - Hot path (FC hit): 3-5 instructions (1-2 cycles) // - Cold path (refill): 20-50 instructions (10-20 cycles) // - vs Generic cold: 50-100+ instructions (25-50 cycles) // // Memory impact: // - Zero additional memory (reuses existing FastCache) // - No new TLS state (uses existing ss_refill_fc_fill backend) // // Integration Notes: // // Usage (from tiny_alloc_fast.inc.h): // if (tiny_front_c23_enabled() && (class_idx == 2 || class_idx == 3)) { // void* ptr = tiny_front_c23_alloc(size, class_idx); // if (ptr) return ptr; // Success via C23 fast path // // Fall through to existing path if C23 path failed // } // // ENV Controls: // HAKMEM_TINY_FRONT_C23_SIMPLE=1 - Enable C23 fast path // HAKMEM_TINY_FRONT_C23_REFILL=N - Set refill target (default: 16) // // A/B Testing: // export HAKMEM_TINY_FRONT_C23_SIMPLE=1 // export HAKMEM_TINY_FRONT_C23_REFILL=16 # Conservative // export HAKMEM_TINY_FRONT_C23_REFILL=32 # Balanced // export HAKMEM_TINY_FRONT_C23_REFILL=64 # Aggressive #endif // TINY_FRONT_C23_H