hakmem/core/tiny_c7_ultra.c

// tiny_c7_ultra.c - Phase PERF-ULTRA-ALLOC-OPT-1: Optimized array-based TLS cache for C7 ULTRA

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include "box/tiny_c7_ultra_box.h"
#include "box/smallobject_hotbox_v3_box.h"
#include "box/tiny_geometry_box.h"
#include "tiny_region_id.h"
#include "box/tiny_c7_ultra_segment_box.h"
#include "box/tiny_front_v3_env_box.h"
#include "box/free_path_stats_box.h"
#include "box/c7_ultra_alloc_depchain_opt_box.h"

// Phase PERF-ULTRA-REFILL-OPT-1a: Import page size shift macro
// (defined in tiny_c7_ultra_segment.c for consistency)
// We'll define it locally here as well for convenience
#define TINY_C7_ULTRA_PAGE_SHIFT 16  // 64KiB = 2^16

#ifndef likely
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

// TLS context
static __thread tiny_c7_ultra_tls_t g_tiny_c7_ultra_tls = {0};

tiny_c7_ultra_tls_t* tiny_c7_ultra_tls_get(void) {
    return &g_tiny_c7_ultra_tls;
}

// ============================================================================
// Phase PERF-ULTRA-ALLOC-OPT-1: Pure TLS pop alloc (hot path)
// Phase 62A: Dependency Chain Trim optimization
// ============================================================================

void* tiny_c7_ultra_alloc(size_t size) {
    (void)size;  // C7 dedicated, size unused
    tiny_c7_ultra_tls_t* tls = &g_tiny_c7_ultra_tls;

    // Original path (baseline for compatibility/fallback)
    const bool header_light = tiny_front_v3_c7_ultra_header_light_enabled();

    // Phase 62A: Check optimization flag (compile-time in BENCH_MINIMAL)
    if (!c7_ultra_alloc_depchain_opt_enabled()) {
        // Baseline path (default, for compatibility)

        // Hot path: TLS cache hit (single branch)
        uint16_t n = tls->count;
        if (__builtin_expect(n > 0, 1)) {
            void* base = tls->freelist[n - 1];
            tls->count = n - 1;

            // Convert BASE -> USER pointer
            if (header_light) {
                return (uint8_t*)base + 1;  // Header already written
            }
            return tiny_region_id_write_header(base, 7);
        }

        // Cold path: Refill TLS cache from segment
        if (!tiny_c7_ultra_refill(tls)) {
            return so_alloc(7);  // Fallback to v3
        }

        // Retry after refill
        n = tls->count;
        if (__builtin_expect(n > 0, 1)) {
            void* base = tls->freelist[n - 1];
            tls->count = n - 1;

            if (header_light) {
                return (uint8_t*)base + 1;
            }
            return tiny_region_id_write_header(base, 7);
        }

        return so_alloc(7);  // Final fallback
    }

    // Optimized path: Use TLS headers_initialized instead of per-call check
    // This eliminates the per-call tiny_front_v3_c7_ultra_header_light_enabled() check

    // Hot path: TLS cache hit (minimal branches)
    uint16_t n = tls->count;
    if (__builtin_expect(n > 0, 1)) {
        void* base = tls->freelist[n - 1];
        tls->count = n - 1;

        // Skip header write if already initialized during refill
        if (tls->headers_initialized) {
            return (uint8_t*)base + 1;  // Header already written
        }
        return tiny_region_id_write_header(base, 7);
    }

    // Cold path: Refill TLS cache from segment
    if (!tiny_c7_ultra_refill(tls)) {
        return so_alloc(7);  // Fallback to v3
    }

    // Retry after refill (same path as hot hit, headers_initialized set by refill)
    n = tls->count;
    if (__builtin_expect(n > 0, 1)) {
        void* base = tls->freelist[n - 1];
        tls->count = n - 1;

        if (tls->headers_initialized) {
            return (uint8_t*)base + 1;
        }
        return tiny_region_id_write_header(base, 7);
    }

    return so_alloc(7);  // Final fallback
}

// ============================================================================
// Cold path: Refill TLS cache from segment
// ============================================================================

__attribute__((noinline))
bool tiny_c7_ultra_refill(tiny_c7_ultra_tls_t* tls) {
    tiny_c7_ultra_segment_t* seg = tls->seg;
    if (!seg) {
        seg = tiny_c7_ultra_segment_acquire();
        if (!seg) return false;
        tls->seg = seg;
        tls->seg_base = (uintptr_t)seg->base;
        // Phase PERF-ULTRA-REFILL-OPT-1a: Use bit shift instead of multiplication
        tls->seg_end = tls->seg_base + ((size_t)seg->num_pages << TINY_C7_ULTRA_PAGE_SHIFT);
    }

    size_t block_sz = tls->block_size;
    if (block_sz == 0) {
        block_sz = (size_t)tiny_stride_for_class(7);
        tls->block_size = block_sz;
    }
    if (block_sz == 0) return false;

    uint32_t capacity = (uint32_t)(seg->page_size / block_sz);
    if (capacity == 0) return false;

    const bool header_light = tiny_front_v3_c7_ultra_header_light_enabled();

    // Find an empty or partially used page
    uint32_t chosen = seg->num_pages;
    for (uint32_t i = 0; i < seg->num_pages; i++) {
        tiny_c7_ultra_page_meta_t* pm = &seg->pages[i];
        if (pm->capacity == 0 || pm->used < pm->capacity) {
            chosen = i;
            break;
        }
    }
    if (chosen == seg->num_pages) {
        return false;  // No available pages
    }

    tiny_c7_ultra_page_meta_t* page = &seg->pages[chosen];
    // Phase PERF-ULTRA-REFILL-OPT-1a: Use bit shift instead of multiplication
    uint8_t* base = (uint8_t*)seg->base + ((size_t)chosen << TINY_C7_ULTRA_PAGE_SHIFT);

    // If page is uninitialized, carve it
    if (page->capacity == 0) {
        page->capacity = capacity;
        page->used = 0;
        page->freelist = NULL;

        // Carve blocks into TLS cache (fill from end to preserve order)
        uint16_t n = 0;
        for (uint32_t i = 0; i < capacity && n < TINY_C7_ULTRA_CAP; i++) {
            uint8_t* blk = base + ((size_t)i * block_sz);
            if (header_light) {
                tiny_region_id_write_header(blk, 7);  // Write header once
            }
            tls->freelist[n++] = blk;
        }
        tls->count = n;
        tls->page_base = base;
        tls->page_idx = chosen;
        tls->page_meta = page;
        tls->headers_initialized = header_light;
        page->used = n;
        return (n > 0);
    }

    // Page already initialized - collect available blocks into TLS cache
    uint16_t n = 0;
    for (uint32_t i = 0; i < capacity && n < TINY_C7_ULTRA_CAP; i++) {
        if (page->used >= capacity) break;

        uint8_t* blk = base + ((size_t)i * block_sz);
        // Simple heuristic: if used < capacity, try to allocate next block
        // (Real implementation would track per-block state or use a bitmap)
        tls->freelist[n++] = blk;
        page->used++;
    }

    if (n > 0) {
        tls->count = n;
        tls->page_base = base;
        tls->page_idx = chosen;
        tls->page_meta = page;
        tls->headers_initialized = header_light;
        return true;
    }

    return false;
}

// ============================================================================
// Free path: UF-3 segment learning + TLS cache push
// ============================================================================

void tiny_c7_ultra_free(void* ptr) {
    if (!ptr) {
        so_free(7, ptr);
        return;
    }

    tiny_c7_ultra_tls_t* tls = &g_tiny_c7_ultra_tls;
    void* base = (uint8_t*)ptr - 1;  // Convert USER -> BASE pointer

    // Phase PERF-ULTRA-REFILL-OPT-1b: Segment learning moved to refill (alloc cold path)
    // In normal allocation patterns, alloc is always called before free on each thread.
    // Therefore, seg_base/seg_end are guaranteed to be initialized by refill's
    // tiny_c7_ultra_segment_acquire() call (line 82-87).
    //
    // This optimization removes the per-free segment learning overhead.
    // Risk: If a thread does free() before any alloc(), it will fallback to so_free().
    // This is acceptable because it's an unusual pattern.

    // Fast path: assume segment already learned by refill
    // No unlikely() guard needed because refill always runs first in normal patterns
    uintptr_t addr = (uintptr_t)base;
    if (likely(tls->seg_base != 0 &&
               addr >= tls->seg_base &&
               addr < tls->seg_end &&
               tls->count < TINY_C7_ULTRA_CAP)) {
        tls->freelist[tls->count++] = base;
        FREE_PATH_STAT_INC(c7_ultra_fast);
        return;
    }

    // 3) Slow path: fallback to v3 (out of segment or cache full)
    so_free(7, ptr);
}