hakmem/core/front/tiny_unified_cache.h

// tiny_unified_cache.h - Phase 23: Unified Frontend Cache (tcache-style)
//
// Goal: Flatten 4-5 layer frontend cascade into single-layer array cache
// Target: +50-100% performance (20.3M → 30-40M ops/s)
//
// Design (Task-sensei analysis):
//   - Replace: Ring → FastCache → SFC → TLS SLL (4 layers, 8-10 cache misses)
//   - With: Single unified array cache per class (1 layer, 2-3 cache misses)
//   - Fallback: Direct SuperSlab refill (skip intermediate layers)
//
// Performance:
//   - Alloc: 2-3 cache misses (TLS access + array access)
//   - Free: 2-3 cache misses (similar to System malloc tcache)
//   - vs Current: 8-10 cache misses → 2-3 cache misses (70% reduction)
//
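// ENV Variables:
//   HAKMEM_TINY_UNIFIED_CACHE=1  # Enable Unified cache (default: 0, OFF; any non-empty value not starting with '0' enables it)
//   HAKMEM_TINY_UNIFIED_C0=128   # C0 cache size (default: 128)
//   ...
//   HAKMEM_TINY_UNIFIED_C7=128   # C7 cache size (default: 128)
//   Cache sizes are clamped to [32, 512] and rounded up to a power of 2.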

#ifndef HAK_FRONT_TINY_UNIFIED_CACHE_H
#define HAK_FRONT_TINY_UNIFIED_CACHE_H

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include "../hakmem_build_flags.h"
#include "../hakmem_tiny_config.h"  // For TINY_NUM_CLASSES

// ============================================================================
// Unified Cache Structure (per class)
// ============================================================================

// Per-class ring buffer of cached block pointers.
// The helpers in this header wrap head/tail with `& mask`, treat head == tail
// as empty, and refuse a push when advancing tail would make it equal head,
// so one slot is always left unused.
typedef struct {
    void** slots;      // Slot array; NULL until initialized (pop/push call unified_cache_init() when it is NULL)
    uint16_t head;     // Pop index (consumer): next slot to read
    uint16_t tail;     // Push index (producer): next slot to write
    uint16_t capacity; // Cache size (power of 2 for fast modulo: & (capacity-1)); not read by the helpers in this header
    uint16_t mask;     // Wrap mask applied to head/tail; expected to be capacity - 1 (assigned outside this header)
} TinyUnifiedCache;

// ============================================================================
// External TLS Variables (defined in tiny_unified_cache.c)
// ============================================================================

// Thread-local array holding one ring per size class; the pop/push helpers
// in this header index it directly with class_idx (no bounds check there).
extern __thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];

// ============================================================================
// Metrics (Phase 23, optional for debugging)
// ============================================================================

// Thread-local, per-class counters; declared and updated only when
// HAKMEM_BUILD_RELEASE is 0 (or undefined).
#if !HAKMEM_BUILD_RELEASE
extern __thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES];    // Alloc hits: bumped by pop / pop_or_refill when a block is returned from the ring
extern __thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES];   // Alloc misses: bumped by pop / pop_or_refill when the ring is empty
extern __thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES];   // Free pushes: bumped by push when the block was stored
extern __thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES];   // Free rejected because the ring was full (push returns 0; the fallback is up to the caller)
#endif

// ============================================================================
// ENV Control (cached, lazy init)
// ============================================================================

// Reports whether the unified cache is switched on (default: off).
// The answer is derived from HAKMEM_TINY_UNIFIED_CACHE on the first call and
// cached in a function-local static: the cache is on when the variable is set,
// non-empty, and its first character is not '0'.
// Returns: 1 = enabled, 0 = disabled.
static inline int unified_cache_enabled(void) {
    static int cached_flag = -1;  // -1 means "environment not consulted yet"

    // Common case: the flag was already resolved by an earlier call.
    if (__builtin_expect(cached_flag != -1, 1)) {
        return cached_flag;
    }

    const char* env_val = getenv("HAKMEM_TINY_UNIFIED_CACHE");
    int on = 0;
    if (env_val != NULL && env_val[0] != '\0' && env_val[0] != '0') {
        on = 1;
    }
    cached_flag = on;

#if !HAKMEM_BUILD_RELEASE
    // Non-release builds announce activation once on stderr.
    if (on) {
        fprintf(stderr, "[Unified-INIT] unified_cache_enabled() = %d\n", on);
        fflush(stderr);
    }
#endif

    return on;
}

// Per-class ring capacity, resolved once per class and then cached.
//
// Reads HAKMEM_TINY_UNIFIED_C<class_idx>; when the variable is unset, empty,
// or does not start with a decimal number, the default of 128 is used.
// The requested value is clamped to [32, 512] and rounded up to a power of 2
// so that callers can wrap indices with `& (capacity - 1)`.
//
// class_idx must be a valid index into a TINY_NUM_CLASSES-sized array; it is
// not range-checked here.
// Returns: the cached capacity (a power of 2 in [32, 512]).
static inline size_t unified_capacity(int class_idx) {
    static size_t g_cap[TINY_NUM_CLASSES] = {0};
    if (__builtin_expect(g_cap[class_idx] == 0, 0)) {
        char env_name[64];
        snprintf(env_name, sizeof(env_name), "HAKMEM_TINY_UNIFIED_C%d", class_idx);
        const char* e = getenv(env_name);

        // Parse in the signed domain: a negative setting must clamp to the
        // minimum instead of wrapping to a huge unsigned value, and strtol
        // saturates on out-of-range input where atoi would be undefined.
        long requested = 128;  // Default: 128
        if (e && *e) {
            char* end = NULL;
            long parsed = strtol(e, &end, 10);
            if (end != e) requested = parsed;  // Keep the default when no digits were consumed
        }

        // Clamp to the supported range
        if (requested < 32) requested = 32;
        if (requested > 512) requested = 512;

        // Round up to power of 2 (for fast modulo)
        size_t pow2 = 32;
        while (pow2 < (size_t)requested) pow2 *= 2;

        // Store only the final value: g_cap is shared by all threads, so a
        // concurrent caller must never see a non-zero, unrounded intermediate.
        g_cap[class_idx] = pow2;

#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[Unified-INIT] C%d capacity = %zu (power of 2)\n", class_idx, g_cap[class_idx]);
        fflush(stderr);
#endif
    }
    return g_cap[class_idx];
}

// ============================================================================
// Init/Shutdown Forward Declarations
// ============================================================================

// Invoked lazily by the pop/push helpers in this header when a class's
// `slots` pointer is NULL; those helpers re-check `slots` afterwards and give
// up if it is still NULL.
void unified_cache_init(void);
// Defined outside this header. NOTE(review): presumably releases whatever
// unified_cache_init() set up — confirm against the .c file.
void unified_cache_shutdown(void);
// Defined outside this header. NOTE(review): presumably reports the
// hit/miss/push/full counters — confirm against the .c file.
void unified_cache_print_stats(void);

// ============================================================================
// Phase 23-D: Self-Contained Refill (Box U1 + Box U2 integration)
// ============================================================================

// Batch refill from SuperSlab (called on cache miss)
// Returns: BASE pointer (first block), or NULL if failed
// Within this header the only caller is unified_cache_pop_or_refill(), which
// hands this function's result straight back to its own caller when the ring
// is empty. The implementation lives outside this header.
void* unified_cache_refill(int class_idx);

// ============================================================================
// Ultra-Fast Pop/Push (2-3 cache misses, tcache-style)
// ============================================================================

// Alloc fast path: take one block out of this thread's ring for class_idx.
// Returns: the stored BASE pointer (the caller converts it to a USER pointer),
//          or NULL when the cache is disabled, could not be initialized,
//          or is currently empty.
// class_idx is used as a direct array index and is not range-checked.
static inline void* unified_cache_pop(int class_idx) {
    // Disabled → nothing to pop.
    if (__builtin_expect(!unified_cache_enabled(), 0)) {
        return NULL;
    }

    TinyUnifiedCache* uc = &g_unified_cache[class_idx];

    // A NULL slot array means this thread's cache is not set up yet;
    // run init and bail out if the array is still missing afterwards.
    if (__builtin_expect(uc->slots == NULL, 0)) {
        unified_cache_init();
        if (uc->slots == NULL) {
            return NULL;
        }
    }

    const uint16_t rd = uc->head;

    // Likely case: at least one entry is available (head != tail).
    if (__builtin_expect(rd != uc->tail, 1)) {
        void* popped = uc->slots[rd];
        uc->head = (uint16_t)((rd + 1) & uc->mask);  // wrap via mask
#if !HAKMEM_BUILD_RELEASE
        g_unified_cache_hit[class_idx]++;
#endif
        return popped;
    }

    // head == tail → ring is empty.
#if !HAKMEM_BUILD_RELEASE
    g_unified_cache_miss[class_idx]++;
#endif
    return NULL;
}

// Free fast path: store one block in this thread's ring for class_idx.
// Input:   base - BASE pointer of the block (not the USER pointer).
// Returns: 1 when the block was stored; 0 when the cache is disabled,
//          could not be initialized, or the ring is full.
// class_idx is used as a direct array index and is not range-checked.
static inline int unified_cache_push(int class_idx, void* base) {
    // Disabled → report "not handled".
    if (__builtin_expect(!unified_cache_enabled(), 0)) {
        return 0;
    }

    TinyUnifiedCache* uc = &g_unified_cache[class_idx];

    // A NULL slot array means this thread's cache is not set up yet;
    // run init and bail out if the array is still missing afterwards.
    if (__builtin_expect(uc->slots == NULL, 0)) {
        unified_cache_init();
        if (uc->slots == NULL) {
            return 0;
        }
    }

    const uint16_t wr = uc->tail;
    const uint16_t wr_next = (uint16_t)((wr + 1) & uc->mask);  // wrap via mask

    // Likely case: advancing tail does not collide with head, so there is room.
    // (One slot is deliberately left unused so that full and empty differ.)
    if (__builtin_expect(wr_next != uc->head, 1)) {
        uc->slots[wr] = base;
        uc->tail = wr_next;
#if !HAKMEM_BUILD_RELEASE
        g_unified_cache_push[class_idx]++;
#endif
        return 1;
    }

    // Ring is full.
#if !HAKMEM_BUILD_RELEASE
    g_unified_cache_full[class_idx]++;
#endif
    return 0;
}

// ============================================================================
// Phase 23-D: Self-Contained Pop-or-Refill (tcache-style, single-layer)
// ============================================================================

// Combined alloc path: return a cached block if the ring has one, otherwise
// delegate to unified_cache_refill() and return whatever it yields.
// Returns: BASE pointer (the caller converts it to a USER pointer), or NULL
//          when the cache is disabled, could not be initialized, or the
//          refill returned NULL.
// class_idx is used as a direct array index and is not range-checked.
static inline void* unified_cache_pop_or_refill(int class_idx) {
    // Disabled → NULL, so the caller can take its other allocation path.
    if (__builtin_expect(!unified_cache_enabled(), 0)) {
        return NULL;
    }

    TinyUnifiedCache* uc = &g_unified_cache[class_idx];

    // A NULL slot array means this thread's cache is not set up yet;
    // run init and bail out if the array is still missing afterwards.
    if (__builtin_expect(uc->slots == NULL, 0)) {
        unified_cache_init();
        if (uc->slots == NULL) {
            return NULL;
        }
    }

    const uint16_t rd = uc->head;

    // Unlikely case: ring is empty (head == tail) → count the miss and refill.
    if (__builtin_expect(rd == uc->tail, 0)) {
#if !HAKMEM_BUILD_RELEASE
        g_unified_cache_miss[class_idx]++;
#endif
        return unified_cache_refill(class_idx);
    }

    // Hit: hand out the entry at head and advance head with mask wrap-around.
    void* popped = uc->slots[rd];
    uc->head = (uint16_t)((rd + 1) & uc->mask);
#if !HAKMEM_BUILD_RELEASE
    g_unified_cache_hit[class_idx]++;
#endif
    return popped;
}

#endif // HAK_FRONT_TINY_UNIFIED_CACHE_H
Phase 23 Unified Cache + PageFaultTelemetry generalization: Mid/VM page-fault bottleneck identified Summary: - Phase 23 Unified Cache: +30% improvement (Random Mixed 256B: 18.18M → 23.68M ops/s) - PageFaultTelemetry: Extended to generic buckets (C0-C7, MID, L25, SSM) - Measurement-driven decision: Mid/VM page-faults (80-100K) >> Tiny (6K) → prioritize Mid/VM optimization Phase 23 Changes: 1. Unified Cache implementation (core/front/tiny_unified_cache.{c,h}) - Direct SuperSlab carve (TLS SLL bypass) - Self-contained pop-or-refill pattern - ENV: HAKMEM_TINY_UNIFIED_CACHE=1, HAKMEM_TINY_UNIFIED_C{0-7}=128 2. Fast path pruning (tiny_alloc_fast.inc.h, tiny_free_fast_v2.inc.h) - Unified ON → direct cache access (skip all intermediate layers) - Alloc: unified_cache_pop_or_refill() → immediate fail to slow - Free: unified_cache_push() → fallback to SLL only if full PageFaultTelemetry Changes: 3. Generic bucket architecture (core/box/pagefault_telemetry_box.{c,h}) - PF_BUCKET_{C0-C7, MID, L25, SSM} for domain-specific measurement - Integration: hak_pool_try_alloc(), l25_alloc_new_run(), shared_pool_allocate_superslab_unlocked() 4. Measurement results (Random Mixed 500K / 256B): - Tiny C2-C7: 2-33 pages, high reuse (64-3.8 touches/page) - SSM: 512 pages (initialization footprint) - MID/L25: 0 (unused in this workload) - Mid/Large VM benchmarks: 80-100K page-faults (13-16x higher than Tiny) Ring Cache Enhancements: 5. Hot Ring Cache (core/front/tiny_ring_cache.{c,h}) - ENV: HAKMEM_TINY_HOT_RING_ENABLE=1, HAKMEM_TINY_HOT_RING_C{0-7}=size - Conditional compilation cleanup Documentation: 6. 
Analysis reports - RANDOM_MIXED_BOTTLENECK_ANALYSIS.md: Page-fault breakdown - RANDOM_MIXED_SUMMARY.md: Phase 23 summary - RING_CACHE_ACTIVATION_GUIDE.md: Ring cache usage - CURRENT_TASK.md: Updated with Phase 23 results and Phase 24 plan Next Steps (Phase 24): - Target: Mid/VM PageArena/HotSpanBox (page-fault reduction 80-100K → 30-40K) - Tiny SSM optimization deferred (low ROI, ~6K page-faults already optimal) - Expected improvement: +30-50% for Mid/Large workloads Generated with Claude Code Co-Authored-By: Claude <noreply@anthropic.com> 2025-11-17 02:47:58 +09:00			`// tiny_unified_cache.h - Phase 23: Unified Frontend Cache (tcache-style)`
			`//`
			`// Goal: Flatten 4-5 layer frontend cascade into single-layer array cache`
			`// Target: +50-100% performance (20.3M → 30-40M ops/s)`
			`//`
			`// Design (Task-sensei analysis):`
			`// - Replace: Ring → FastCache → SFC → TLS SLL (4 layers, 8-10 cache misses)`
			`// - With: Single unified array cache per class (1 layer, 2-3 cache misses)`
			`// - Fallback: Direct SuperSlab refill (skip intermediate layers)`
			`//`
			`// Performance:`
			`// - Alloc: 2-3 cache misses (TLS access + array access)`
			`// - Free: 2-3 cache misses (similar to System malloc tcache)`
			`// - vs Current: 8-10 cache misses → 2-3 cache misses (70% reduction)`
			`//`
			`// ENV Variables:`
			`// HAKMEM_TINY_UNIFIED_CACHE=1 # Enable Unified cache (default: 0, OFF)`
			`// HAKMEM_TINY_UNIFIED_C0=128 # C0 cache size (default: 128)`
			`// ...`
			`// HAKMEM_TINY_UNIFIED_C7=128 # C7 cache size (default: 128)`

			`#ifndef HAK_FRONT_TINY_UNIFIED_CACHE_H`
			`#define HAK_FRONT_TINY_UNIFIED_CACHE_H`

			`#include <stdint.h>`
			`#include <stdlib.h>`
			`#include <stdio.h>`
			`#include "../hakmem_build_flags.h"`
			`#include "../hakmem_tiny_config.h" // For TINY_NUM_CLASSES`

			`// ============================================================================`
			`// Unified Cache Structure (per class)`
			`// ============================================================================`

			`typedef struct {`
			`void** slots; // Dynamic array (allocated at init, power-of-2 size)`
			`uint16_t head; // Pop index (consumer)`
			`uint16_t tail; // Push index (producer)`
			`uint16_t capacity; // Cache size (power of 2 for fast modulo: & (capacity-1))`
			`uint16_t mask; // Capacity - 1 (for fast modulo)`
			`} TinyUnifiedCache;`

			`// ============================================================================`
			`// External TLS Variables (defined in tiny_unified_cache.c)`
			`// ============================================================================`

			`extern __thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];`

			`// ============================================================================`
			`// Metrics (Phase 23, optional for debugging)`
			`// ============================================================================`

			`#if !HAKMEM_BUILD_RELEASE`
			`extern __thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES]; // Alloc hits`
			`extern __thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES]; // Alloc misses`
			`extern __thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES]; // Free pushes`
			`extern __thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES]; // Free full (fallback to SuperSlab)`
			`#endif`

			`// ============================================================================`
			`// ENV Control (cached, lazy init)`
			`// ============================================================================`

			`// Enable flag (default: 0, OFF)`
			`static inline int unified_cache_enabled(void) {`
			`static int g_enable = -1;`
			`if (__builtin_expect(g_enable == -1, 0)) {`
			`const char* e = getenv("HAKMEM_TINY_UNIFIED_CACHE");`
			`g_enable = (e && *e && *e != '0') ? 1 : 0;`
			`#if !HAKMEM_BUILD_RELEASE`
			`if (g_enable) {`
			`fprintf(stderr, "[Unified-INIT] unified_cache_enabled() = %d\n", g_enable);`
			`fflush(stderr);`
			`}`
			`#endif`
			`}`
			`return g_enable;`
			`}`

			`// Per-class capacity (default: 128 for all classes)`
			`static inline size_t unified_capacity(int class_idx) {`
			`static size_t g_cap[TINY_NUM_CLASSES] = {0};`
			`if (__builtin_expect(g_cap[class_idx] == 0, 0)) {`
			`char env_name[64];`
			`snprintf(env_name, sizeof(env_name), "HAKMEM_TINY_UNIFIED_C%d", class_idx);`
			`const char* e = getenv(env_name);`
			`g_cap[class_idx] = (e && *e) ? (size_t)atoi(e) : 128; // Default: 128`

			`// Round up to power of 2 (for fast modulo)`
			`if (g_cap[class_idx] < 32) g_cap[class_idx] = 32;`
			`if (g_cap[class_idx] > 512) g_cap[class_idx] = 512;`

			`// Ensure power of 2`
			`size_t pow2 = 32;`
			`while (pow2 < g_cap[class_idx]) pow2 *= 2;`
			`g_cap[class_idx] = pow2;`

			`#if !HAKMEM_BUILD_RELEASE`
			`fprintf(stderr, "[Unified-INIT] C%d capacity = %zu (power of 2)\n", class_idx, g_cap[class_idx]);`
			`fflush(stderr);`
			`#endif`
			`}`
			`return g_cap[class_idx];`
			`}`

			`// ============================================================================`
			`// Init/Shutdown Forward Declarations`
			`// ============================================================================`

			`void unified_cache_init(void);`
			`void unified_cache_shutdown(void);`
			`void unified_cache_print_stats(void);`

			`// ============================================================================`
			`// Phase 23-D: Self-Contained Refill (Box U1 + Box U2 integration)`
			`// ============================================================================`

			`// Batch refill from SuperSlab (called on cache miss)`
			`// Returns: BASE pointer (first block), or NULL if failed`
			`void* unified_cache_refill(int class_idx);`

			`// ============================================================================`
			`// Ultra-Fast Pop/Push (2-3 cache misses, tcache-style)`
			`// ============================================================================`

			`// Pop from unified cache (alloc fast path)`
			`// Returns: BASE pointer (caller must convert to USER with +1)`
			`static inline void* unified_cache_pop(int class_idx) {`
			`// Fast path: Unified cache disabled → return NULL immediately`
			`if (__builtin_expect(!unified_cache_enabled(), 0)) return NULL;`

			`TinyUnifiedCache* cache = &g_unified_cache[class_idx]; // 1 cache miss (TLS)`

			`// Lazy init check (once per thread, per class)`
			`if (__builtin_expect(cache->slots == NULL, 0)) {`
			`unified_cache_init(); // First call in this thread`
			`// Re-check after init (may fail if allocation failed)`
			`if (cache->slots == NULL) return NULL;`
			`}`

			`// Empty check`
			`if (__builtin_expect(cache->head == cache->tail, 0)) {`
			`#if !HAKMEM_BUILD_RELEASE`
			`g_unified_cache_miss[class_idx]++;`
			`#endif`
			`return NULL; // Empty`
			`}`

			`// Pop from head (consumer)`
			`void* base = cache->slots[cache->head]; // 1 cache miss (array access)`
			`cache->head = (cache->head + 1) & cache->mask; // Fast modulo (power of 2)`

			`#if !HAKMEM_BUILD_RELEASE`
			`g_unified_cache_hit[class_idx]++;`
			`#endif`

			`return base; // Return BASE pointer (2-3 cache misses total)`
			`}`

			`// Push to unified cache (free fast path)`
			`// Input: BASE pointer (caller must pass BASE, not USER)`
			`// Returns: 1=SUCCESS, 0=FULL`
			`static inline int unified_cache_push(int class_idx, void* base) {`
			`// Fast path: Unified cache disabled → return 0 (not handled)`
			`if (__builtin_expect(!unified_cache_enabled(), 0)) return 0;`

			`TinyUnifiedCache* cache = &g_unified_cache[class_idx]; // 1 cache miss (TLS)`

			`// Lazy init check (once per thread, per class)`
			`if (__builtin_expect(cache->slots == NULL, 0)) {`
			`unified_cache_init(); // First call in this thread`
			`// Re-check after init (may fail if allocation failed)`
			`if (cache->slots == NULL) return 0;`
			`}`

			`uint16_t next_tail = (cache->tail + 1) & cache->mask;`

			`// Full check (leave 1 slot empty to distinguish full/empty)`
			`if (__builtin_expect(next_tail == cache->head, 0)) {`
			`#if !HAKMEM_BUILD_RELEASE`
			`g_unified_cache_full[class_idx]++;`
			`#endif`
			`return 0; // Full`
			`}`

			`// Push to tail (producer)`
			`cache->slots[cache->tail] = base; // 1 cache miss (array write)`
			`cache->tail = next_tail;`

			`#if !HAKMEM_BUILD_RELEASE`
			`g_unified_cache_push[class_idx]++;`
			`#endif`

			`return 1; // SUCCESS (2-3 cache misses total)`
			`}`

			`// ============================================================================`
			`// Phase 23-D: Self-Contained Pop-or-Refill (tcache-style, single-layer)`
			`// ============================================================================`

			`// All-in-one: Pop from cache, or refill from SuperSlab on miss`
			`// Returns: BASE pointer (caller converts to USER), or NULL if failed`
			`// Design: Self-contained, bypasses all other frontend layers (Ring/FC/SFC/SLL)`
			`static inline void* unified_cache_pop_or_refill(int class_idx) {`
			`// Fast path: Unified cache disabled → return NULL (caller uses legacy cascade)`
			`if (__builtin_expect(!unified_cache_enabled(), 0)) return NULL;`

			`TinyUnifiedCache* cache = &g_unified_cache[class_idx]; // 1 cache miss (TLS)`

			`// Lazy init check (once per thread, per class)`
			`if (__builtin_expect(cache->slots == NULL, 0)) {`
			`unified_cache_init();`
			`if (cache->slots == NULL) return NULL;`
			`}`

			`// Try pop from cache (fast path)`
			`if (__builtin_expect(cache->head != cache->tail, 1)) {`
			`void* base = cache->slots[cache->head]; // 1 cache miss (array access)`
			`cache->head = (cache->head + 1) & cache->mask;`
			`#if !HAKMEM_BUILD_RELEASE`
			`g_unified_cache_hit[class_idx]++;`
			`#endif`
			`return base; // Hit! (2-3 cache misses total)`
			`}`

			`// Cache miss → Batch refill from SuperSlab`
			`#if !HAKMEM_BUILD_RELEASE`
			`g_unified_cache_miss[class_idx]++;`
			`#endif`
			`return unified_cache_refill(class_idx); // Refill + return first block`
			`}`

			`#endif // HAK_FRONT_TINY_UNIFIED_CACHE_H`