Files
hakmem/core/front/tiny_front_c23.h

158 lines
5.8 KiB
C
Raw Normal View History

Phase B: TinyFrontC23Box - Ultra-simple front path for C2/C3 Implemented dedicated fast path for C2/C3 (128B/256B) to bypass SFC/SLL/Magazine complexity and directly access FastCache + SuperSlab. Changes: - core/front/tiny_front_c23.h: New ultra-simple front path (NEW) - Direct FC → SS refill (2 layers vs 5+ in generic path) - ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1 - Refill target: 64 blocks (optimized via A/B testing) - core/tiny_alloc_fast.inc.h: Hook at entry point (+11 lines) - Early return for C2/C3 when C23 path enabled - Safe fallback to generic path on failure Results (100K iterations, A/B tested refill=16/32/64/128): - 128B: 8.27M → 9.55M ops/s (+15.5% with refill=64) ✅ - 256B: 7.90M → 8.61M ops/s (+9.0% with refill=32) ✅ - 256B: 7.90M → 8.47M ops/s (+7.2% with refill=64) ✅ Optimal Refill: 64 blocks - Balanced performance across C2/C3 - 128B best case: +15.5% - 256B good performance: +7.2% - Simple single-value default Architecture: - Flow: FC pop → (miss) → ss_refill_fc_fill(64) → FC pop retry - Bypassed layers: SLL, Magazine, SFC, MidTC - Preserved: Box boundaries, safety checks, fallback paths - Free path: Unchanged (TLS SLL + drain) Box Theory Compliance: - Clear Front ← Backend boundary (ss_refill_fc_fill) - ENV-gated A/B testing (default OFF, opt-in) - Safe fallback: NULL → generic path handles slow case - Zero impact when disabled Performance Gap Analysis: - Current: 8-9M ops/s - After Phase B: 9-10M ops/s (+10-15%) - Target: 15-20M ops/s - Remaining gap: ~2x (suggests deeper bottlenecks remain) Next Steps: - Perf profiling to identify next bottleneck - Current hypotheses: classify_ptr, drain overhead, refill path - Phase C candidates: FC-direct free, inline optimizations ENV Usage: # Enable C23 fast path (default: OFF) export HAKMEM_TINY_FRONT_C23_SIMPLE=1 # Optional: Override refill target (default: 64) export HAKMEM_TINY_FRONT_C23_REFILL=32 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 19:27:45 +09:00
// tiny_front_c23.h - Ultra-Simple Front Path for C2/C3 (Phase B)
// Purpose: Bypass SFC/SLL/Magazine complexity for 128B/256B allocations
// Target: 15-20M ops/s (vs current 8-9M ops/s)
//
// Architecture:
// - C2/C3 only (class_idx 2 or 3)
// - Direct FastCache access (no SLL/Magazine overhead)
// - Direct SuperSlab refill (ss_refill_fc_fill)
// - ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1
//
// Performance Strategy:
// - Minimize layers: FC → SS (2 layers instead of 5+)
// - Minimize branches: ENV check cached in TLS
// - Minimize overhead: No stats, no logging in hot path
//
// Box Theory Compliance:
// - Clear boundary: Front ← Backend (ss_refill_fc_fill)
// - Safe fallback: NULL return → caller handles slow path
// - Header preservation: BASE pointers only, HAK_RET_ALLOC at caller
#ifndef TINY_FRONT_C23_H
#define TINY_FRONT_C23_H
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include "../hakmem_build_flags.h"
// Forward declarations (functions from other modules)
// These are declared in hakmem_tiny_fastcache.inc.h and refill/ss_refill_fc.h
extern void* fastcache_pop(int class_idx);
extern int fastcache_push(int class_idx, void* ptr);
extern int ss_refill_fc_fill(int class_idx, int want);
// ENV-gated enable/disable (TLS cached for zero overhead after first check)
static inline int tiny_front_c23_enabled(void) {
static __thread int cached = -1;
if (__builtin_expect(cached == -1, 0)) {
const char* env = getenv("HAKMEM_TINY_FRONT_C23_SIMPLE");
cached = (env && atoi(env) == 1) ? 1 : 0;
if (cached) {
fprintf(stderr, "[TINY_FRONT_C23] Enabled for C2/C3 (128B/256B)\n");
}
}
return cached;
}
// Refill target: 64 blocks (optimized via A/B testing)
// A/B Results (100K iterations):
// 128B: refill=64 → 9.55M ops/s (+15.5% vs baseline 8.27M)
// 256B: refill=64 → 8.47M ops/s (+7.2% vs baseline 7.90M)
// 256B: refill=32 → 8.61M ops/s (+9.0%, slightly better for 256B)
// Decision: refill=64 for balanced performance across C2/C3
Phase B: TinyFrontC23Box - Ultra-simple front path for C2/C3 Implemented dedicated fast path for C2/C3 (128B/256B) to bypass SFC/SLL/Magazine complexity and directly access FastCache + SuperSlab. Changes: - core/front/tiny_front_c23.h: New ultra-simple front path (NEW) - Direct FC → SS refill (2 layers vs 5+ in generic path) - ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1 - Refill target: 64 blocks (optimized via A/B testing) - core/tiny_alloc_fast.inc.h: Hook at entry point (+11 lines) - Early return for C2/C3 when C23 path enabled - Safe fallback to generic path on failure Results (100K iterations, A/B tested refill=16/32/64/128): - 128B: 8.27M → 9.55M ops/s (+15.5% with refill=64) ✅ - 256B: 7.90M → 8.61M ops/s (+9.0% with refill=32) ✅ - 256B: 7.90M → 8.47M ops/s (+7.2% with refill=64) ✅ Optimal Refill: 64 blocks - Balanced performance across C2/C3 - 128B best case: +15.5% - 256B good performance: +7.2% - Simple single-value default Architecture: - Flow: FC pop → (miss) → ss_refill_fc_fill(64) → FC pop retry - Bypassed layers: SLL, Magazine, SFC, MidTC - Preserved: Box boundaries, safety checks, fallback paths - Free path: Unchanged (TLS SLL + drain) Box Theory Compliance: - Clear Front ← Backend boundary (ss_refill_fc_fill) - ENV-gated A/B testing (default OFF, opt-in) - Safe fallback: NULL → generic path handles slow case - Zero impact when disabled Performance Gap Analysis: - Current: 8-9M ops/s - After Phase B: 9-10M ops/s (+10-15%) - Target: 15-20M ops/s - Remaining gap: ~2x (suggests deeper bottlenecks remain) Next Steps: - Perf profiling to identify next bottleneck - Current hypotheses: classify_ptr, drain overhead, refill path - Phase C candidates: FC-direct free, inline optimizations ENV Usage: # Enable C23 fast path (default: OFF) export HAKMEM_TINY_FRONT_C23_SIMPLE=1 # Optional: Override refill target (default: 64) export HAKMEM_TINY_FRONT_C23_REFILL=32 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 19:27:45 +09:00
static inline int tiny_front_c23_refill_target(int class_idx) {
(void)class_idx;
static __thread int target = -1;
if (__builtin_expect(target == -1, 0)) {
const char* env = getenv("HAKMEM_TINY_FRONT_C23_REFILL");
target = (env && *env) ? atoi(env) : 64; // Default: 64 (A/B optimized)
if (target <= 0) target = 64;
Phase B: TinyFrontC23Box - Ultra-simple front path for C2/C3 Implemented dedicated fast path for C2/C3 (128B/256B) to bypass SFC/SLL/Magazine complexity and directly access FastCache + SuperSlab. Changes: - core/front/tiny_front_c23.h: New ultra-simple front path (NEW) - Direct FC → SS refill (2 layers vs 5+ in generic path) - ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1 - Refill target: 64 blocks (optimized via A/B testing) - core/tiny_alloc_fast.inc.h: Hook at entry point (+11 lines) - Early return for C2/C3 when C23 path enabled - Safe fallback to generic path on failure Results (100K iterations, A/B tested refill=16/32/64/128): - 128B: 8.27M → 9.55M ops/s (+15.5% with refill=64) ✅ - 256B: 7.90M → 8.61M ops/s (+9.0% with refill=32) ✅ - 256B: 7.90M → 8.47M ops/s (+7.2% with refill=64) ✅ Optimal Refill: 64 blocks - Balanced performance across C2/C3 - 128B best case: +15.5% - 256B good performance: +7.2% - Simple single-value default Architecture: - Flow: FC pop → (miss) → ss_refill_fc_fill(64) → FC pop retry - Bypassed layers: SLL, Magazine, SFC, MidTC - Preserved: Box boundaries, safety checks, fallback paths - Free path: Unchanged (TLS SLL + drain) Box Theory Compliance: - Clear Front ← Backend boundary (ss_refill_fc_fill) - ENV-gated A/B testing (default OFF, opt-in) - Safe fallback: NULL → generic path handles slow case - Zero impact when disabled Performance Gap Analysis: - Current: 8-9M ops/s - After Phase B: 9-10M ops/s (+10-15%) - Target: 15-20M ops/s - Remaining gap: ~2x (suggests deeper bottlenecks remain) Next Steps: - Perf profiling to identify next bottleneck - Current hypotheses: classify_ptr, drain overhead, refill path - Phase C candidates: FC-direct free, inline optimizations ENV Usage: # Enable C23 fast path (default: OFF) export HAKMEM_TINY_FRONT_C23_SIMPLE=1 # Optional: Override refill target (default: 64) export HAKMEM_TINY_FRONT_C23_REFILL=32 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 19:27:45 +09:00
if (target > 128) target = 128; // Cap at 128 to avoid excessive latency
}
return target;
}
// Ultra-simple alloc for C2/C3
// Returns: BASE pointer or NULL
//
// Flow:
// 1. Try FastCache pop (L1, ultra-fast array access)
// 2. If miss, call ss_refill_fc_fill (SuperSlab → FC direct, bypass SLL)
// 3. Try FastCache pop again (should succeed after refill)
// 4. Return NULL if all failed (caller handles slow path)
//
// Contract:
// - Input: size (64-1024B), class_idx (2 or 3)
// - Output: BASE pointer (header at ptr-1 for C2/C3)
// - Caller: Must call HAK_RET_ALLOC(class_idx, ptr) to convert BASE → USER
// - Safety: NULL checks, class_idx bounds checks, fallback to slow path
//
// Performance:
// - Hot path (FC hit): ~3-5 instructions (array[top--])
// - Cold path (FC miss): ~20-50 instructions (ss_refill_fc_fill + retry)
// - Expected hit rate: 90-95% (based on Phase 7 results)
static inline void* tiny_front_c23_alloc(size_t size, int class_idx) {
// Safety: Bounds check (should never fail, but defense-in-depth)
if (__builtin_expect(class_idx < 2 || class_idx > 3, 0)) {
return NULL; // Not C2/C3, caller should use generic path
}
(void)size; // Unused, class_idx already determined by caller
// Step 1: Try FastCache pop (L1, ultra-fast)
void* ptr = fastcache_pop(class_idx);
if (__builtin_expect(ptr != NULL, 1)) {
// FastCache hit! Return BASE pointer (caller will apply HAK_RET_ALLOC)
return ptr;
}
// Step 2: FastCache miss → Refill from SuperSlab
int want = tiny_front_c23_refill_target(class_idx);
int refilled = ss_refill_fc_fill(class_idx, want);
if (__builtin_expect(refilled <= 0, 0)) {
// Refill failed (OOM or capacity exhausted)
return NULL; // Caller will try slow path
}
// Step 3: Retry FastCache pop (should succeed now)
ptr = fastcache_pop(class_idx);
if (__builtin_expect(ptr != NULL, 1)) {
// Success! Return BASE pointer
return ptr;
}
// Step 4: Still NULL (rare, indicates FC capacity issue or race)
// Fallback: Let caller try slow path
return NULL;
}
// Performance Notes:
//
// Expected improvement over generic path:
// - Generic: FC → SLL → Magazine → Backend (4-5 layers)
// - C23: FC → SS (2 layers)
// - Reduction: -50-60% instructions in refill path
//
// Expected latency:
// - Hot path (FC hit): 3-5 instructions (1-2 cycles)
// - Cold path (refill): 20-50 instructions (10-20 cycles)
// - vs Generic cold: 50-100+ instructions (25-50 cycles)
//
// Memory impact:
// - Zero additional memory (reuses existing FastCache)
// - No new TLS state (uses existing ss_refill_fc_fill backend)
//
// Integration Notes:
//
// Usage (from tiny_alloc_fast.inc.h):
// if (tiny_front_c23_enabled() && (class_idx == 2 || class_idx == 3)) {
// void* ptr = tiny_front_c23_alloc(size, class_idx);
// if (ptr) return ptr; // Success via C23 fast path
// // Fall through to existing path if C23 path failed
// }
//
// ENV Controls:
// HAKMEM_TINY_FRONT_C23_SIMPLE=1 - Enable C23 fast path
// HAKMEM_TINY_FRONT_C23_REFILL=N - Set refill target (default: 16)
//
// A/B Testing:
// export HAKMEM_TINY_FRONT_C23_SIMPLE=1
// export HAKMEM_TINY_FRONT_C23_REFILL=16 # Conservative
// export HAKMEM_TINY_FRONT_C23_REFILL=32 # Balanced
// export HAKMEM_TINY_FRONT_C23_REFILL=64 # Aggressive
#endif // TINY_FRONT_C23_H