// tiny_unified_cache.c - Phase 23: Unified Frontend Cache Implementation
#include "tiny_unified_cache.h"
#include "../box/unified_batch_box.h"       // Phase 23-D: Box U2 batch alloc (deprecated in 23-E)
#include "../tiny_tls.h"                    // Phase 23-E: TinyTLSSlab, TinySlabMeta
#include "../tiny_box_geometry.h"           // Phase 23-E: tiny_stride_for_class, tiny_slab_base_for_geometry
#include "../box/tiny_next_ptr_box.h"       // Phase 23-E: tiny_next_read (freelist traversal)
#include "../hakmem_tiny_superslab.h"       // Phase 23-E: SuperSlab
#include "../superslab/superslab_inline.h"  // Phase 23-E: ss_active_add
#include "../box/pagefault_telemetry_box.h" // Phase 24: Box PageFaultTelemetry (Tiny page touch stats)
#include <stdio.h>                          // fprintf/fflush in debug (!HAKMEM_BUILD_RELEASE) paths
#include <stdlib.h>
#include <string.h>

// Phase 23-E: Forward declarations
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c
extern int superslab_refill(int class_idx);                // From hakmem_tiny_superslab.c

// ============================================================================
// TLS Variables (defined here, extern in header)
// ============================================================================
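
// Per-thread (TLS) ring caches, one per Tiny size class. Each thread accesses
// only its own array, so no synchronization is used in this file.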
__thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];

// ============================================================================
// Metrics (Phase 23, optional for debugging)
// ============================================================================
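
// Counter semantics (as used by unified_cache_print_stats() below):
//   hit/miss  = alloc side, served from ring vs. refill from SuperSlab
//   push/full = free side, stored in ring vs. rejected because the ring was full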
#if !HAKMEM_BUILD_RELEASE
__thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES] = {0};
#endif

// ============================================================================
// Init (called at thread start or lazily on first access)
// ============================================================================

void unified_cache_init(void) {
    if (!unified_cache_enabled()) return;

    // Initialize all classes (C0-C7)
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        if (g_unified_cache[cls].slots != NULL) continue; // Already initialized

        size_t cap = unified_capacity(cls);
        g_unified_cache[cls].slots = (void**)calloc(cap, sizeof(void*));

        if (!g_unified_cache[cls].slots) {
#if !HAKMEM_BUILD_RELEASE
            fprintf(stderr, "[Unified-INIT] Failed to allocate C%d cache (%zu slots)\n", cls, cap);
            fflush(stderr);
#endif
            continue; // Skip this class, try others
        }

        g_unified_cache[cls].capacity = (uint16_t)cap;
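        // NOTE: the (index & mask) wraparound used throughout assumes that
        // unified_capacity() returns a power of two; otherwise this mask would
        // have to be replaced with an explicit modulo.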
        g_unified_cache[cls].mask = (uint16_t)(cap - 1);
        g_unified_cache[cls].head = 0;
        g_unified_cache[cls].tail = 0;

#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[Unified-INIT] C%d: %zu slots (%zu bytes)\n",
                cls, cap, cap * sizeof(void*));
        fflush(stderr);
#endif
    }
}

// ============================================================================
// Shutdown (called at thread exit, optional)
// ============================================================================

void unified_cache_shutdown(void) {
    if (!unified_cache_enabled()) return;

    // TODO: Drain caches to SuperSlab before shutdown (prevent leak)

    // Free cache buffers
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        if (g_unified_cache[cls].slots) {
            free(g_unified_cache[cls].slots);
            g_unified_cache[cls].slots = NULL;
        }
    }

#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "[Unified-SHUTDOWN] All caches freed\n");
    fflush(stderr);
#endif
}

// ============================================================================
// Stats (Phase 23 metrics)
// ============================================================================

void unified_cache_print_stats(void) {
    if (!unified_cache_enabled()) return;

#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "\n[Unified-STATS] Unified Cache Metrics:\n");

    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        uint64_t total_allocs = g_unified_cache_hit[cls] + g_unified_cache_miss[cls];
        uint64_t total_frees  = g_unified_cache_push[cls] + g_unified_cache_full[cls];

        if (total_allocs == 0 && total_frees == 0) continue; // Skip unused classes

        double hit_rate  = (total_allocs > 0) ? (100.0 * g_unified_cache_hit[cls] / total_allocs) : 0.0;
        double full_rate = (total_frees > 0) ? (100.0 * g_unified_cache_full[cls] / total_frees) : 0.0;

        // Current occupancy
        uint16_t count = (g_unified_cache[cls].tail >= g_unified_cache[cls].head)
            ? (g_unified_cache[cls].tail - g_unified_cache[cls].head)
            : (g_unified_cache[cls].capacity - g_unified_cache[cls].head + g_unified_cache[cls].tail);

        fprintf(stderr, "  C%d: %u/%u slots occupied, hit=%llu miss=%llu (%.1f%% hit), push=%llu full=%llu (%.1f%% full)\n",
                cls,
                count, g_unified_cache[cls].capacity,
                (unsigned long long)g_unified_cache_hit[cls],
                (unsigned long long)g_unified_cache_miss[cls],
                hit_rate,
                (unsigned long long)g_unified_cache_push[cls],
                (unsigned long long)g_unified_cache_full[cls],
                full_rate);
    }
    fflush(stderr);
#endif
}

// ============================================================================
// Phase 23-E: Direct SuperSlab Carve (TLS SLL Bypass)
// ============================================================================
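
// Illustrative fast path (hypothetical sketch; the actual pop/push fast-path
// helpers are not in this file, presumably in tiny_unified_cache.h):
//
//   void* p = /* pop a BASE pointer from g_unified_cache[cls] (head side) */;
//   if (!p) p = unified_cache_refill(cls); // miss: batch-carve from SuperSlab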

// Batch refill from SuperSlab (called on cache miss)
// Returns: BASE pointer (first block), or NULL if failed
// Design: Direct carve from SuperSlab to array (no TLS SLL intermediate layer)
void* unified_cache_refill(int class_idx) {
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];

    // Step 1: Ensure SuperSlab available
    if (!tls->ss) {
        if (!superslab_refill(class_idx)) return NULL;
        tls = &g_tls_slabs[class_idx]; // Reload after refill
    }

    TinyUnifiedCache* cache = &g_unified_cache[class_idx];

    // Step 2: Calculate available room in unified cache
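    // Ring math: head == tail means empty; one slot is always left unused so a
    // full ring can be distinguished from an empty one. Example with capacity = 8:
    //   head = 6, tail = 2  ->  room = 6 - 2 - 1 = 3
    //   head = 2, tail = 6  ->  room = 8 - (6 - 2) - 1 = 3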
    int room = (int)cache->capacity - 1; // Leave 1 slot for full detection
    if (cache->head > cache->tail) {
        room = cache->head - cache->tail - 1;
    } else if (cache->head < cache->tail) {
        room = cache->capacity - (cache->tail - cache->head) - 1;
    }

    if (room <= 0) return NULL;
    if (room > 128) room = 128; // Batch size limit
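    // (128 matches the size of the local out[] staging buffer in Step 3 below.)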

    // Step 3: Direct carve from SuperSlab into local array (bypass TLS SLL!)
    void* out[128];
    int produced = 0;
    TinySlabMeta* m = tls->meta;
    size_t bs = tiny_stride_for_class(class_idx);
    uint8_t* base = tls->slab_base
        ? tls->slab_base
        : tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);

    while (produced < room) {
        if (m->freelist) {
            // Freelist pop
            void* p = m->freelist;
            m->freelist = tiny_next_read(class_idx, p);

            // PageFaultTelemetry: record page touch for this BASE
            pagefault_telemetry_touch(class_idx, p);

            // ✅ CRITICAL: Restore header (overwritten by freelist link)
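            // Header layout under HAKMEM_TINY_HEADER_CLASSIDX: one byte at BASE,
            // 0xa0 | class_idx, i.e. high nibble 0xa, low nibble = class index.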
#if HAKMEM_TINY_HEADER_CLASSIDX
            *(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
#endif

            m->used++;
            out[produced++] = p;

        } else if (m->carved < m->capacity) {
            // Linear carve (fresh block, no freelist link)
            void* p = (void*)(base + ((size_t)m->carved * bs));

            // PageFaultTelemetry: record page touch for this BASE
            pagefault_telemetry_touch(class_idx, p);

            // ✅ CRITICAL: Write header (new block)
#if HAKMEM_TINY_HEADER_CLASSIDX
            *(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
#endif

            m->carved++;
            m->used++;
            out[produced++] = p;

        } else {
            // SuperSlab exhausted → refill and retry
            if (!superslab_refill(class_idx)) break;

            // ✅ CRITICAL: Reload TLS pointers after refill (avoid stale pointer bug)
            tls = &g_tls_slabs[class_idx];
            m = tls->meta;
            base = tls->slab_base
                ? tls->slab_base
                : tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
        }
    }

    if (produced == 0) return NULL;

    // Step 4: Update active counter
    ss_active_add(tls->ss, (uint32_t)produced);

    // Step 5: Store blocks into unified cache (skip first, return it)
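    // No overflow check needed here: produced <= room, and room was computed
    // with one slot reserved, so at most room - 1 blocks are pushed below.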
    void* first = out[0];
    for (int i = 1; i < produced; i++) {
        cache->slots[cache->tail] = out[i];
        cache->tail = (cache->tail + 1) & cache->mask;
    }

#if !HAKMEM_BUILD_RELEASE
    g_unified_cache_miss[class_idx]++;
#endif

    return first; // Return first block (BASE pointer)
}