hakmem/core/tiny_fastcache.h

// tiny_fastcache.h - Ultra-Simple Tiny Fast Path (System tcache style)
// Phase 6-3: Bypass Magazine/SuperSlab for Tiny allocations (<=128B)
// Goal: 3-4 instruction fast path, 70-80% of System tcache performance
#pragma once
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>     // For fprintf() one-shot diagnostics in tiny_fast_alloc()
#include <stdatomic.h> // For the _Atomic diagnostic counter in tiny_fast_alloc()
#include <stdlib.h> // For getenv()
#include "box/tiny_next_ptr_box.h" // Box API: Next pointer read/write
// ========== Configuration ==========
// Enable Tiny Fast Path (default: ON for Phase 6-3)
#ifndef HAKMEM_TINY_FAST_PATH
#define HAKMEM_TINY_FAST_PATH 1
#endif
// Tiny size classes: 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128 bytes (11 classes; per-class arrays are dimensioned to 16 entries)
#define TINY_FAST_CLASS_COUNT 16
// Fast cache capacity per class (default: 64 slots, like System tcache)
#ifndef TINY_FAST_CACHE_CAP
#define TINY_FAST_CACHE_CAP 64
#endif
// Tiny size threshold (<=128B goes to fast path)
#define TINY_FAST_THRESHOLD 128
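// Worked capacity bound (illustrative): with the defaults above, each class caches
// at most TINY_FAST_CACHE_CAP = 64 blocks and a tiny block is at most
// TINY_FAST_THRESHOLD = 128B, so one class holds <= 64 * 128B = 8 KiB and the whole
// per-thread cache is bounded by 16 classes * 8 KiB = 128 KiB (the actual total is
// lower because most classes are smaller than 128B).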
// ========== TLS Cache (System tcache style) ==========
// Per-thread fast cache: array of freelist heads (defined in tiny_fastcache.c)
extern __thread void* g_tiny_fast_cache[TINY_FAST_CLASS_COUNT];
// Per-thread cache counts (for capacity management)
extern __thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
// Initialized flag
extern __thread int g_tiny_fast_initialized;
// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
// Separate free staging area to reduce cache line bouncing
extern __thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
extern __thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];
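// Minimal sketch (illustrative, not compiled) of the dual-free-list idea: a
// same-thread free pushes onto free_head so that alloc_head and free_head stay on
// separate cache lines. The real tiny_fast_free() further down also handles
// profiling; the drain threshold used here is an assumption.
#if 0
static inline void tiny_fast_free_sketch(void* ptr, int cls) {
    // Link the block in front of the current free_head via the Box API.
    tiny_next_write(cls, ptr, g_tiny_fast_free_head[cls]);
    g_tiny_fast_free_head[cls] = ptr;
    g_tiny_fast_free_count[cls]++;
    // When the staging list exceeds capacity, surplus blocks go back to the
    // Magazine/SuperSlab layers via tiny_fast_drain() (declared further below).
    if (g_tiny_fast_free_count[cls] > TINY_FAST_CACHE_CAP) {
        tiny_fast_drain(cls);
    }
}
#endif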
// ========== RDTSC Profiling (Phase 6-8) ==========
// Extern declarations for inline functions to access profiling counters
extern __thread uint64_t g_tiny_malloc_count;
extern __thread uint64_t g_tiny_malloc_cycles;
extern __thread uint64_t g_tiny_free_count;
extern __thread uint64_t g_tiny_free_cycles;
extern __thread uint64_t g_tiny_refill_cycles;
extern __thread uint64_t g_tiny_migration_count;
extern __thread uint64_t g_tiny_migration_cycles;
#ifdef __x86_64__
static inline uint64_t tiny_fast_rdtsc(void) {
unsigned int lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return ((uint64_t)hi << 32) | lo;
}
#else
static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
#endif
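// Note: plain RDTSC is not a serializing instruction, so very short regions can be
// measured out of order. A sketch of a serialized variant is below (assumption:
// the CPU supports RDTSCP, which holds for all recent x86_64 parts); the profiling
// here keeps the cheaper plain RDTSC.
#if 0
static inline uint64_t tiny_fast_rdtscp(void) {
    unsigned int lo, hi, aux;
    // RDTSCP waits for prior instructions to retire and also returns IA32_TSC_AUX in ECX.
    __asm__ __volatile__ ("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
    return ((uint64_t)hi << 32) | lo;
}
#endif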
extern int g_profile_enabled;
static inline int tiny_fast_profile_enabled(void) {
#if !HAKMEM_BUILD_RELEASE
extern int g_profile_enabled;
if (__builtin_expect(g_profile_enabled == -1, 0)) {
const char* env = getenv("HAKMEM_TINY_PROFILE");
g_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
}
return g_profile_enabled;
#else
return 0;
#endif
}
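// Illustrative sketch (not compiled): how the per-thread counters above turn into
// the averages printed by the profile report. The real reporting lives in
// tiny_fastcache.c; the helper name here is hypothetical.
#if 0
static inline void tiny_fast_profile_report_sketch(void) {
    if (!tiny_fast_profile_enabled()) return;
    if (g_tiny_malloc_count) {
        fprintf(stderr, "[MALLOC] count=%llu avg_cycles=%llu\n",
                (unsigned long long)g_tiny_malloc_count,
                (unsigned long long)(g_tiny_malloc_cycles / g_tiny_malloc_count));
    }
    if (g_tiny_free_count) {
        fprintf(stderr, "[FREE] count=%llu avg_cycles=%llu\n",
                (unsigned long long)g_tiny_free_count,
                (unsigned long long)(g_tiny_free_cycles / g_tiny_free_count));
    }
    if (g_tiny_migration_count) {
        fprintf(stderr, "[MIGRATE] count=%llu avg_cycles=%llu\n",
                (unsigned long long)g_tiny_migration_count,
                (unsigned long long)(g_tiny_migration_cycles / g_tiny_migration_count));
    }
}
#endif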
// ========== Size to Class Mapping ==========
// Inline size-to-class for fast path (O(1) lookup table)
static inline int tiny_fast_size_to_class(size_t size) {
// Optimized: Lookup table for O(1) mapping (vs 11-branch linear search)
// Class mapping: 0:16B, 1:24B, 2:32B, 3:40B, 4:48B, 5:56B, 6:64B, 7:80B, 8:96B, 9:112B, 10:128B
// Indexed by (size + 7) >> 3 so every request rounds up into the smallest class that fits it
static const int8_t size_to_class_lut[17] = {
0,  // 0       → 16B (class 0)
0,  // 1-8     → 16B (class 0)
0,  // 9-16    → 16B (class 0)
1,  // 17-24   → 24B (class 1)
2,  // 25-32   → 32B (class 2)
3,  // 33-40   → 40B (class 3)
4,  // 41-48   → 48B (class 4)
5,  // 49-56   → 56B (class 5)
6,  // 57-64   → 64B (class 6)
7,  // 65-72   → 80B (class 7)
7,  // 73-80   → 80B (class 7)
8,  // 81-88   → 96B (class 8)
8,  // 89-96   → 96B (class 8)
9,  // 97-104  → 112B (class 9)
9,  // 105-112 → 112B (class 9)
10, // 113-120 → 128B (class 10)
10  // 121-128 → 128B (class 10)
};
if (__builtin_expect(size > TINY_FAST_THRESHOLD, 0)) return -1; // Not tiny
// Fast path: direct lookup (shift + add + load)
unsigned int idx = (unsigned int)((size + 7) >> 3); // always < 17 here
return size_to_class_lut[idx];
}
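// Illustrative self-check (not compiled): verifies that every size in each class's
// range maps to that class under the (size + 7) >> 3 lookup above. The size list
// mirrors the class-size comment in this header; nothing here is part of the
// allocator itself.
#if 0
static void tiny_fast_size_to_class_selftest(void) {
    static const size_t class_size[11] = {16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128};
    for (int c = 0; c < 11; c++) {
        size_t lo = (c == 0) ? 1 : class_size[c - 1] + 1;
        size_t hi = class_size[c];
        // Every size in (previous class size, this class size] must map to class c.
        for (size_t s = lo; s <= hi; s++) {
            if (tiny_fast_size_to_class(s) != c) {
                fprintf(stderr, "size %zu mapped to %d, expected %d\n",
                        s, tiny_fast_size_to_class(s), c);
            }
        }
    }
    // Anything above the threshold is rejected.
    if (tiny_fast_size_to_class(TINY_FAST_THRESHOLD + 1) != -1) {
        fprintf(stderr, "threshold check failed\n");
    }
}
#endif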
// ========== Forward Declarations ==========
// Slow path: refill from Magazine/SuperSlab (implemented in tiny_fastcache.c)
void* tiny_fast_refill(int class_idx);
void tiny_fast_drain(int class_idx);
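// Minimal sketch (illustrative, not compiled) of the refill contract: the slow path
// obtains a batch of blocks from the backing allocator, chains all but one into the
// TLS cache via the Box API, and returns the remaining block to the caller. The
// batch size and the backing call name are assumptions; the real implementation is
// in tiny_fastcache.c.
#if 0
void* tiny_fast_refill_sketch(int cls) {
    enum { REFILL_BATCH = 16 };                    // assumed batch size
    void* first = NULL;
    for (int i = 0; i < REFILL_BATCH; i++) {
        void* blk = hak_tiny_alloc_for_class(cls); // hypothetical backing call
        if (!blk) break;
        if (!first) {
            first = blk;                           // hand the first block to the caller
        } else {
            tiny_next_write(cls, blk, g_tiny_fast_cache[cls]);
            g_tiny_fast_cache[cls] = blk;
            g_tiny_fast_count[cls]++;
        }
    }
    return first;
}
#endif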
// ========== Fast Path: Alloc (3-4 instructions!) ==========
static inline void* tiny_fast_alloc(size_t size) {
uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;
// Step 1: Size to class (1-2 instructions, branch predictor friendly)
int cls = tiny_fast_size_to_class(size);
if (__builtin_expect(cls < 0, 0)) return NULL; // Not tiny (rare)
// One-shot diagnostic: log the first few calls (only after cls has been validated)
do {
static _Atomic uint32_t g_tfa_diag = 0;
uint32_t n = atomic_fetch_add_explicit(&g_tfa_diag, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[TINY_FAST_ALLOC_DIAG] size=%zu cls=%d cache_head=%p free_head=%p\n",
size, cls, g_tiny_fast_cache[cls], g_tiny_fast_free_head[cls]);
}
} while (0);
// Step 2: Pop from alloc_head (hot allocation path)
void* ptr = g_tiny_fast_cache[cls];
if (__builtin_expect(ptr != NULL, 1)) {
// Fast path: Pop head, decrement count
g_tiny_fast_cache[cls] = tiny_next_read(cls, ptr);
g_tiny_fast_count[cls]--;
if (start) {
g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
g_tiny_malloc_count++;
}
return ptr;
}
// ========================================================================
// Phase 6-7: Step 2.5: Lazy Migration from free_head (Phase 2)
// If alloc_head empty but free_head has blocks, migrate with pointer swap
// This is mimalloc's key optimization: batched migration, zero overhead
// ========================================================================
if (__builtin_expect(g_tiny_fast_free_head[cls] != NULL, 0)) {
uint64_t mig_start = start ? tiny_fast_rdtsc() : 0;
// Migrate entire free_head → alloc_head (pointer swap, instant!)
g_tiny_fast_cache[cls] = g_tiny_fast_free_head[cls];
g_tiny_fast_count[cls] = g_tiny_fast_free_count[cls];
g_tiny_fast_free_head[cls] = NULL;
g_tiny_fast_free_count[cls] = 0;
// Now pop one from newly migrated list
ptr = g_tiny_fast_cache[cls];
g_tiny_fast_cache[cls] = tiny_next_read(cls, ptr);
g_tiny_fast_count[cls]--;
if (mig_start) {
g_tiny_migration_cycles += (tiny_fast_rdtsc() - mig_start);
g_tiny_migration_count++;
}
if (start) {
g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
g_tiny_malloc_count++;
}
return ptr;
}
// Step 3: Slow path - refill from Magazine/SuperSlab
ptr = tiny_fast_refill(cls);
if (start) {
g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start);
g_tiny_malloc_count++;
}
return ptr;
}
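// ============================================================================
// Illustrative sketch (not compiled): per the Phase 6-7 dual free-list design,
// the alloc head is refilled from the free staging list by a plain pointer
// swap once it runs dry; the real logic lives in tiny_fast_alloc() above.
// The helper name below is hypothetical; only the TLS arrays used elsewhere
// in this file are assumed.
// ============================================================================
#if 0
static inline void tiny_fast_migrate_sketch(int cls) {
    if (g_tiny_fast_cache[cls] == NULL && g_tiny_fast_free_head[cls] != NULL) {
        // Hand the whole staged list to the alloc head in O(1).
        g_tiny_fast_cache[cls]      = g_tiny_fast_free_head[cls];
        g_tiny_fast_count[cls]      = g_tiny_fast_free_count[cls];
        g_tiny_fast_free_head[cls]  = NULL;
        g_tiny_fast_free_count[cls] = 0;
    }
}
#endif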
// ========== Fast Path: Free (TLS free-list push) ==========
static inline void tiny_fast_free(void* ptr, size_t size) {
uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;
// Step 1: Size to class
int cls = tiny_fast_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) return; // Not a tiny class; this fast path cannot free it
// ========================================================================
// Phase 6-7: Push to free_head (Phase 2)
// Separate free staging area reduces cache line contention with alloc_head
// mimalloc's key insight: alloc/free touch different cache lines
// ========================================================================
// Step 2: Check free_head capacity
if (__builtin_expect(g_tiny_fast_free_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
// Free cache full - drain to Magazine/SuperSlab
tiny_fast_drain(cls);
}
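    // NOTE: tiny_fast_drain() hands the staged blocks back to the slower
    // Magazine/SuperSlab layers, so the push below has room again.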
// Step 3: Push to free_head (separate cache line from alloc_head!)
    // Phase E1-CORRECT: all tiny classes have a 1-byte header; use the BASE pointer.
void* base = (uint8_t*)ptr - 1;
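    // Intrusive push: the freed block itself stores the link to the previous
    // head (written by tiny_next_write() for this class), then becomes the new head.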
tiny_next_write(cls, base, g_tiny_fast_free_head[cls]);
g_tiny_fast_free_head[cls] = base;
g_tiny_fast_free_count[cls]++;
if (start) {
g_tiny_free_cycles += (tiny_fast_rdtsc() - start);
g_tiny_free_count++;
}
}
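// ============================================================================
// Usage sketch (illustrative, not compiled): how the fast-path entry points
// are expected to pair up. It assumes tiny_fast_alloc(size) returns the
// payload pointer (the 1-byte header sits just below it, see tiny_fast_free()
// above) and that the caller passes the same request size on free.
// ============================================================================
#if 0
static void tiny_fast_usage_sketch(void) {
    tiny_fast_init();               // idempotent TLS cache setup
    void* p = tiny_fast_alloc(64);  // tiny-sized request -> fast path
    if (p) {
        memset(p, 0, 64);           // payload is directly usable
        tiny_fast_free(p, 64);      // same pointer and size as the allocation
    }
}
#endif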
// ========== Initialization ==========
static inline void tiny_fast_init(void) {
if (g_tiny_fast_initialized) return;
memset(g_tiny_fast_cache, 0, sizeof(g_tiny_fast_cache));
memset(g_tiny_fast_count, 0, sizeof(g_tiny_fast_count));
// Phase 6-7: Initialize dual free lists (Phase 2)
memset(g_tiny_fast_free_head, 0, sizeof(g_tiny_fast_free_head));
memset(g_tiny_fast_free_count, 0, sizeof(g_tiny_fast_free_count));
g_tiny_fast_initialized = 1;
}
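// ============================================================================
// Profiling note: the g_tiny_malloc_*/g_tiny_free_* cycle counters above are
// only updated when tiny_fast_profile_enabled() reports that profiling was
// requested (HAKMEM_TINY_PROFILE=1 in the environment), so the rdtsc reads
// stay off the default path.
// ============================================================================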