diff --git a/core/front/tiny_front_c23.h b/core/front/tiny_front_c23.h new file mode 100644 index 00000000..30599b61 --- /dev/null +++ b/core/front/tiny_front_c23.h @@ -0,0 +1,155 @@ +// tiny_front_c23.h - Ultra-Simple Front Path for C2/C3 (Phase B) +// Purpose: Bypass SFC/SLL/Magazine complexity for 128B/256B allocations +// Target: 15-20M ops/s (vs current 8-9M ops/s) +// +// Architecture: +// - C2/C3 only (class_idx 2 or 3) +// - Direct FastCache access (no SLL/Magazine overhead) +// - Direct SuperSlab refill (ss_refill_fc_fill) +// - ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1 +// +// Performance Strategy: +// - Minimize layers: FC → SS (2 layers instead of 5+) +// - Minimize branches: ENV check cached in TLS +// - Minimize overhead: No stats, no logging in hot path +// +// Box Theory Compliance: +// - Clear boundary: Front ← Backend (ss_refill_fc_fill) +// - Safe fallback: NULL return → caller handles slow path +// - Header preservation: BASE pointers only, HAK_RET_ALLOC at caller + +#ifndef TINY_FRONT_C23_H +#define TINY_FRONT_C23_H + +#include +#include +#include +#include +#include "../hakmem_build_flags.h" + +// Forward declarations (functions from other modules) +// These are declared in hakmem_tiny_fastcache.inc.h and refill/ss_refill_fc.h +extern void* fastcache_pop(int class_idx); +extern int fastcache_push(int class_idx, void* ptr); +extern int ss_refill_fc_fill(int class_idx, int want); + +// ENV-gated enable/disable (TLS cached for zero overhead after first check) +static inline int tiny_front_c23_enabled(void) { + static __thread int cached = -1; + if (__builtin_expect(cached == -1, 0)) { + const char* env = getenv("HAKMEM_TINY_FRONT_C23_SIMPLE"); + cached = (env && atoi(env) == 1) ? 1 : 0; + if (cached) { + fprintf(stderr, "[TINY_FRONT_C23] Enabled for C2/C3 (128B/256B)\n"); + } + } + return cached; +} + +// Refill target (conservative start: 16 blocks) +// Tunable via A/B testing: 16/32/64 +// Smaller = lower latency, higher refill frequency +// Larger = higher latency, lower refill frequency +static inline int tiny_front_c23_refill_target(int class_idx) { + (void)class_idx; + static __thread int target = -1; + if (__builtin_expect(target == -1, 0)) { + const char* env = getenv("HAKMEM_TINY_FRONT_C23_REFILL"); + target = (env && *env) ? atoi(env) : 16; + if (target <= 0) target = 16; + if (target > 128) target = 128; // Cap at 128 to avoid excessive latency + } + return target; +} + +// Ultra-simple alloc for C2/C3 +// Returns: BASE pointer or NULL +// +// Flow: +// 1. Try FastCache pop (L1, ultra-fast array access) +// 2. If miss, call ss_refill_fc_fill (SuperSlab → FC direct, bypass SLL) +// 3. Try FastCache pop again (should succeed after refill) +// 4. Return NULL if all failed (caller handles slow path) +// +// Contract: +// - Input: size (64-1024B), class_idx (2 or 3) +// - Output: BASE pointer (header at ptr-1 for C2/C3) +// - Caller: Must call HAK_RET_ALLOC(class_idx, ptr) to convert BASE → USER +// - Safety: NULL checks, class_idx bounds checks, fallback to slow path +// +// Performance: +// - Hot path (FC hit): ~3-5 instructions (array[top--]) +// - Cold path (FC miss): ~20-50 instructions (ss_refill_fc_fill + retry) +// - Expected hit rate: 90-95% (based on Phase 7 results) +static inline void* tiny_front_c23_alloc(size_t size, int class_idx) { + // Safety: Bounds check (should never fail, but defense-in-depth) + if (__builtin_expect(class_idx < 2 || class_idx > 3, 0)) { + return NULL; // Not C2/C3, caller should use generic path + } + + (void)size; // Unused, class_idx already determined by caller + + // Step 1: Try FastCache pop (L1, ultra-fast) + void* ptr = fastcache_pop(class_idx); + if (__builtin_expect(ptr != NULL, 1)) { + // FastCache hit! Return BASE pointer (caller will apply HAK_RET_ALLOC) + return ptr; + } + + // Step 2: FastCache miss → Refill from SuperSlab + int want = tiny_front_c23_refill_target(class_idx); + int refilled = ss_refill_fc_fill(class_idx, want); + + if (__builtin_expect(refilled <= 0, 0)) { + // Refill failed (OOM or capacity exhausted) + return NULL; // Caller will try slow path + } + + // Step 3: Retry FastCache pop (should succeed now) + ptr = fastcache_pop(class_idx); + if (__builtin_expect(ptr != NULL, 1)) { + // Success! Return BASE pointer + return ptr; + } + + // Step 4: Still NULL (rare, indicates FC capacity issue or race) + // Fallback: Let caller try slow path + return NULL; +} + +// Performance Notes: +// +// Expected improvement over generic path: +// - Generic: FC → SLL → Magazine → Backend (4-5 layers) +// - C23: FC → SS (2 layers) +// - Reduction: -50-60% instructions in refill path +// +// Expected latency: +// - Hot path (FC hit): 3-5 instructions (1-2 cycles) +// - Cold path (refill): 20-50 instructions (10-20 cycles) +// - vs Generic cold: 50-100+ instructions (25-50 cycles) +// +// Memory impact: +// - Zero additional memory (reuses existing FastCache) +// - No new TLS state (uses existing ss_refill_fc_fill backend) +// +// Integration Notes: +// +// Usage (from tiny_alloc_fast.inc.h): +// if (tiny_front_c23_enabled() && (class_idx == 2 || class_idx == 3)) { +// void* ptr = tiny_front_c23_alloc(size, class_idx); +// if (ptr) return ptr; // Success via C23 fast path +// // Fall through to existing path if C23 path failed +// } +// +// ENV Controls: +// HAKMEM_TINY_FRONT_C23_SIMPLE=1 - Enable C23 fast path +// HAKMEM_TINY_FRONT_C23_REFILL=N - Set refill target (default: 16) +// +// A/B Testing: +// export HAKMEM_TINY_FRONT_C23_SIMPLE=1 +// export HAKMEM_TINY_FRONT_C23_REFILL=16 # Conservative +// export HAKMEM_TINY_FRONT_C23_REFILL=32 # Balanced +// export HAKMEM_TINY_FRONT_C23_REFILL=64 # Aggressive + +#endif // TINY_FRONT_C23_H diff --git a/core/tiny_alloc_fast.inc.h b/core/tiny_alloc_fast.inc.h index 080356db..1b26a4cb 100644 --- a/core/tiny_alloc_fast.inc.h +++ b/core/tiny_alloc_fast.inc.h @@ -26,6 +26,9 @@ #include "box/front_gate_box.h" #endif #include "hakmem_tiny_integrity.h" // PRIORITY 1-4: Corruption detection +#ifdef HAKMEM_TINY_HEADER_CLASSIDX +#include "front/tiny_front_c23.h" // Phase B: Ultra-simple C2/C3 front +#endif #include // Phase 7 Task 2: Aggressive inline TLS cache access @@ -583,6 +586,19 @@ static inline void* tiny_alloc_fast(size_t size) { void* ptr = NULL; const int hot_c5 = (g_tiny_hotpath_class5 && class_idx == 5); + // Phase B: Ultra-simple front for C2/C3 (128B/256B) + // ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1 + // Target: 15-20M ops/s (vs current 8-9M ops/s) +#ifdef HAKMEM_TINY_HEADER_CLASSIDX + if (tiny_front_c23_enabled() && (class_idx == 2 || class_idx == 3)) { + void* c23_ptr = tiny_front_c23_alloc(size, class_idx); + if (c23_ptr) { + HAK_RET_ALLOC(class_idx, c23_ptr); + } + // Fall through to existing path if C23 path failed (NULL) + } +#endif + // NEW: Front-Direct/SLL-OFF bypass control (TLS cached, lazy init) static __thread int s_front_direct_alloc = -1; if (__builtin_expect(s_front_direct_alloc == -1, 0)) {