// tiny_ring_cache.h - Phase 21-1: Array-based hot cache (C2/C3 only) // // Goal: Eliminate pointer chasing in TLS SLL by using ring buffer // Target: +15-20% performance (54.4M → 62-65M ops/s) // // Design (ChatGPT feedback): // - Ring → SLL → SuperSlab (3-layer hierarchy) // - Ring size: 128 slots (ENV: 64/128/256 A/B test) // - C2/C3 only (hot classes, 33-128B) // - Replaces UltraHot (Phase 19-3: +12.9% by removing UltraHot) // // Performance: // - Alloc: 1-2 instructions (array access, no pointer chasing) // - Free: 1-2 instructions (array write, no pointer chasing) // - vs TLS SLL: 3 mem accesses → 2 mem accesses, 1 cache miss → 0 // // ENV Variables: // HAKMEM_TINY_HOT_RING_ENABLE=1 # Enable Ring cache (default: 0) // HAKMEM_TINY_HOT_RING_C2=128 # C2 ring size (default: 128) // HAKMEM_TINY_HOT_RING_C3=128 # C3 ring size (default: 128) // HAKMEM_TINY_HOT_RING_CASCADE=1 # Enable SLL → Ring refill (default: 0) #ifndef HAK_FRONT_TINY_RING_CACHE_H #define HAK_FRONT_TINY_RING_CACHE_H #include #include #include #include "../hakmem_build_flags.h" // ============================================================================ // Ring Buffer Structure // ============================================================================ typedef struct { void** slots; // Dynamic array (allocated at init, power-of-2 size) uint16_t head; // Pop index (consumer) uint16_t tail; // Push index (producer) uint16_t capacity; // Ring size (power of 2 for fast modulo: & (capacity-1)) uint16_t mask; // Capacity - 1 (for fast modulo) } TinyRingCache; // ============================================================================ // External TLS Variables (defined in hakmem_tiny.c) // ============================================================================ extern __thread TinyRingCache g_ring_cache_c2; extern __thread TinyRingCache g_ring_cache_c3; // ============================================================================ // ENV Control (cached, lazy init) // ============================================================================ // Enable flag (default: 0, OFF) static inline int ring_cache_enabled(void) { static int g_enable = -1; if (__builtin_expect(g_enable == -1, 0)) { const char* e = getenv("HAKMEM_TINY_HOT_RING_ENABLE"); g_enable = (e && *e && *e != '0') ? 1 : 0; #if !HAKMEM_BUILD_RELEASE if (g_enable) { fprintf(stderr, "[Ring-INIT] ring_cache_enabled() = %d\n", g_enable); fflush(stderr); } #endif } return g_enable; } // C2 capacity (default: 128) static inline size_t ring_capacity_c2(void) { static size_t g_cap = 0; if (__builtin_expect(g_cap == 0, 0)) { const char* e = getenv("HAKMEM_TINY_HOT_RING_C2"); g_cap = (e && *e) ? (size_t)atoi(e) : 128; // Default: 128 // Round up to power of 2 (for fast modulo) if (g_cap < 32) g_cap = 32; if (g_cap > 256) g_cap = 256; // Ensure power of 2 size_t pow2 = 32; while (pow2 < g_cap) pow2 *= 2; g_cap = pow2; #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[Ring-INIT] C2 capacity = %zu (power of 2)\n", g_cap); fflush(stderr); #endif } return g_cap; } // C3 capacity (default: 128) static inline size_t ring_capacity_c3(void) { static size_t g_cap = 0; if (__builtin_expect(g_cap == 0, 0)) { const char* e = getenv("HAKMEM_TINY_HOT_RING_C3"); g_cap = (e && *e) ? (size_t)atoi(e) : 128; // Default: 128 // Round up to power of 2 if (g_cap < 32) g_cap = 32; if (g_cap > 256) g_cap = 256; size_t pow2 = 32; while (pow2 < g_cap) pow2 *= 2; g_cap = pow2; #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[Ring-INIT] C3 capacity = %zu (power of 2)\n", g_cap); fflush(stderr); #endif } return g_cap; } // Cascade enable flag (default: 0, OFF) static inline int ring_cascade_enabled(void) { static int g_enable = -1; if (__builtin_expect(g_enable == -1, 0)) { const char* e = getenv("HAKMEM_TINY_HOT_RING_CASCADE"); g_enable = (e && *e && *e != '0') ? 1 : 0; #if !HAKMEM_BUILD_RELEASE if (g_enable) { fprintf(stderr, "[Ring-INIT] ring_cascade_enabled() = %d\n", g_enable); fflush(stderr); } #endif } return g_enable; } // ============================================================================ // Ultra-Fast Pop/Push (1-2 instructions) // ============================================================================ // Pop from ring (alloc fast path) // Returns: BASE pointer (caller must convert to USER with +1) static inline void* ring_cache_pop(int class_idx) { TinyRingCache* ring = (class_idx == 2) ? &g_ring_cache_c2 : &g_ring_cache_c3; // Empty check if (__builtin_expect(ring->head == ring->tail, 0)) { return NULL; // Empty } // Pop from head (consumer) void* base = ring->slots[ring->head]; ring->head = (ring->head + 1) & ring->mask; // Fast modulo (power of 2) return base; // Return BASE pointer } // Push to ring (free fast path) // Input: BASE pointer (caller must pass BASE, not USER) // Returns: 1=SUCCESS, 0=FULL static inline int ring_cache_push(int class_idx, void* base) { TinyRingCache* ring = (class_idx == 2) ? &g_ring_cache_c2 : &g_ring_cache_c3; uint16_t next_tail = (ring->tail + 1) & ring->mask; // Full check (leave 1 slot empty to distinguish full/empty) if (__builtin_expect(next_tail == ring->head, 0)) { return 0; // Full } // Push to tail (producer) ring->slots[ring->tail] = base; ring->tail = next_tail; return 1; // SUCCESS } // ============================================================================ // Refill from TLS SLL (cascade, Phase 21-1-C) // ============================================================================ // Forward declaration (defined in tiny_ring_cache.c) int ring_refill_from_sll(int class_idx, int target_count); // ============================================================================ // Init/Shutdown (called from hakmem_tiny.c) // ============================================================================ void ring_cache_init(void); void ring_cache_shutdown(void); void ring_cache_print_stats(void); #endif // HAK_FRONT_TINY_RING_CACHE_H