hakmem/core/hakmem_tiny_hot_pop_v4.inc.h

// hakmem_tiny_hot_pop_v4.inc.h
// Phase 4-A1: TLS-BUMP Immediate-Value Hot Functions
//
// This file contains Phase 4-A1 optimized hot-path functions with:
// - Immediate-value block sizes (no g_tiny_class_sizes[] lookup)
// - Direct TLS bump allocation (2-register path)
// - Branch minimization
//
// Expected improvement: +5-8% (16.53 → 17.5-18.0 M ops/sec)

#ifndef HAKMEM_TINY_HOT_POP_V4_INC_H
#define HAKMEM_TINY_HOT_POP_V4_INC_H

#include "hakmem_tiny.h"
#include <stdint.h>

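// External state: process-wide settings (g_fast_enable, g_fast_cap) and
// per-thread (__thread) free-list / bump-window variables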
extern int g_fast_enable;
extern uint16_t g_fast_cap[TINY_NUM_CLASSES];
extern __thread void* g_fast_head[TINY_NUM_CLASSES];
extern __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
extern __thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES];
extern __thread uint8_t* g_tls_bend[TINY_NUM_CLASSES];

// ============================================================================
// Phase 4-A1: Immediate-Value TLS-BUMP Functions
// ============================================================================

// Class 0 (8-byte blocks): bump allocation with the stride as a literal.
//
// Carves one block out of the calling thread's class-0 bump window, which is
// described by g_tls_bcur[0] (cursor) and g_tls_bend[0] (limit).
//
// Returns the previous cursor on success. Returns NULL when no window is
// installed, or when advancing by 8 bytes would pass the limit; in the latter
// case both window pointers are reset to NULL before returning so the caller
// can fall through to the next tier.
static inline __attribute__((always_inline))
void* tiny_hot_bump_class0_v4(void) {
    uint8_t* const cur = g_tls_bcur[0];
    if (__builtin_expect(cur == NULL, 0)) {
        return NULL;  // No window installed
    }

    uint8_t* const next = cur + 8;  // Literal stride, no size-table lookup
    if (__builtin_expect(next > g_tls_bend[0], 0)) {
        // Window exhausted: drop it so later calls exit at the NULL check
        g_tls_bcur[0] = NULL;
        g_tls_bend[0] = NULL;
        return NULL;
    }

    g_tls_bcur[0] = next;
    return cur;
}

// Class 1 (16-byte blocks): bump allocation with the stride as a literal.
//
// Carves one block out of the calling thread's class-1 bump window, which is
// described by g_tls_bcur[1] (cursor) and g_tls_bend[1] (limit).
//
// Returns the previous cursor on success. Returns NULL when no window is
// installed, or when advancing by 16 bytes would pass the limit; in the
// latter case both window pointers are reset to NULL before returning.
static inline __attribute__((always_inline))
void* tiny_hot_bump_class1_v4(void) {
    uint8_t* const cur = g_tls_bcur[1];
    if (__builtin_expect(cur == NULL, 0)) {
        return NULL;  // No window installed
    }

    uint8_t* const next = cur + 16;  // Literal stride, no size-table lookup
    if (__builtin_expect(next > g_tls_bend[1], 0)) {
        // Window exhausted: drop it so later calls exit at the NULL check
        g_tls_bcur[1] = NULL;
        g_tls_bend[1] = NULL;
        return NULL;
    }

    g_tls_bcur[1] = next;
    return cur;
}

// Class 2 (32-byte blocks): bump allocation with the stride as a literal.
//
// Carves one block out of the calling thread's class-2 bump window, which is
// described by g_tls_bcur[2] (cursor) and g_tls_bend[2] (limit).
//
// Returns the previous cursor on success. Returns NULL when no window is
// installed, or when advancing by 32 bytes would pass the limit; in the
// latter case both window pointers are reset to NULL before returning.
static inline __attribute__((always_inline))
void* tiny_hot_bump_class2_v4(void) {
    uint8_t* const cur = g_tls_bcur[2];
    if (__builtin_expect(cur == NULL, 0)) {
        return NULL;  // No window installed
    }

    uint8_t* const next = cur + 32;  // Literal stride, no size-table lookup
    if (__builtin_expect(next > g_tls_bend[2], 0)) {
        // Window exhausted: drop it so later calls exit at the NULL check
        g_tls_bcur[2] = NULL;
        g_tls_bend[2] = NULL;
        return NULL;
    }

    g_tls_bcur[2] = next;
    return cur;
}

// ============================================================================
// Phase 4-A1: Hot-Class Wrapper Functions (BUMP → Linked-List)
// ============================================================================

// Phase 4-A1: Replace original hot-pop functions (keep same names for compatibility)
// Pop one class-0 block: bump window first, then the per-thread free list.
//
// Returns a block pointer, or NULL when the bump tier yields nothing and the
// free list is disabled (g_fast_enable == 0), has zero capacity
// (g_fast_cap[0] == 0), or is empty (g_fast_head[0] == NULL).
static inline __attribute__((always_inline))
void* tiny_hot_pop_class0(void) {
    // Tier 1: bump allocation with a literal stride
    void* const fresh = tiny_hot_bump_class0_v4();
    if (__builtin_expect(fresh != NULL, 1)) {
        return fresh;
    }

    // Tier 2: singly linked free list
    if (__builtin_expect(g_fast_enable == 0, 0)) {
        return NULL;
    }
    if (__builtin_expect(g_fast_cap[0] == 0, 0)) {
        return NULL;
    }
    void* const node = g_fast_head[0];
    if (__builtin_expect(node == NULL, 0)) {
        return NULL;
    }

    // The first pointer-sized word of a free node stores the next node
    g_fast_head[0] = *(void**)node;

    // Decrement the cached length, never letting it wrap below zero
    const uint16_t len = g_fast_count[0];
    g_fast_count[0] = (len != 0) ? (uint16_t)(len - 1) : (uint16_t)0;
    return node;
}

// Pop one class-1 block: bump window first, then the per-thread free list.
//
// Returns a block pointer, or NULL when the bump tier yields nothing and the
// free list is disabled (g_fast_enable == 0), has zero capacity
// (g_fast_cap[1] == 0), or is empty (g_fast_head[1] == NULL).
static inline __attribute__((always_inline))
void* tiny_hot_pop_class1(void) {
    // Tier 1: bump allocation with a literal stride
    void* const fresh = tiny_hot_bump_class1_v4();
    if (__builtin_expect(fresh != NULL, 1)) {
        return fresh;
    }

    // Tier 2: singly linked free list
    if (__builtin_expect(g_fast_enable == 0, 0)) {
        return NULL;
    }
    if (__builtin_expect(g_fast_cap[1] == 0, 0)) {
        return NULL;
    }
    void* const node = g_fast_head[1];
    if (__builtin_expect(node == NULL, 0)) {
        return NULL;
    }

    // The first pointer-sized word of a free node stores the next node
    g_fast_head[1] = *(void**)node;

    // Decrement the cached length, never letting it wrap below zero
    const uint16_t len = g_fast_count[1];
    g_fast_count[1] = (len != 0) ? (uint16_t)(len - 1) : (uint16_t)0;
    return node;
}

// Pop one class-2 block: bump window first, then the per-thread free list.
//
// Returns a block pointer, or NULL when the bump tier yields nothing and the
// free list is disabled (g_fast_enable == 0), has zero capacity
// (g_fast_cap[2] == 0), or is empty (g_fast_head[2] == NULL).
static inline __attribute__((always_inline))
void* tiny_hot_pop_class2(void) {
    // Tier 1: bump allocation with a literal stride
    void* const fresh = tiny_hot_bump_class2_v4();
    if (__builtin_expect(fresh != NULL, 1)) {
        return fresh;
    }

    // Tier 2: singly linked free list
    if (__builtin_expect(g_fast_enable == 0, 0)) {
        return NULL;
    }
    if (__builtin_expect(g_fast_cap[2] == 0, 0)) {
        return NULL;
    }
    void* const node = g_fast_head[2];
    if (__builtin_expect(node == NULL, 0)) {
        return NULL;
    }

    // The first pointer-sized word of a free node stores the next node
    g_fast_head[2] = *(void**)node;

    // Decrement the cached length, never letting it wrap below zero
    const uint16_t len = g_fast_count[2];
    g_fast_count[2] = (len != 0) ? (uint16_t)(len - 1) : (uint16_t)0;
    return node;
}

// Class 3: 64B - Keep original (no immediate optimization for now)
// Forward declaration
static inline void* superslab_tls_bump_fast(int class_idx);

// Pop one class-3 block. Unlike classes 0-2 this keeps the generic bump
// helper (per the original note: same logic as hakmem_tiny_hot_pop.inc.h).
//
// When g_ultra_bump_shadow is non-zero, superslab_tls_bump_fast(3) is tried
// first and its result returned if non-NULL. Otherwise, or if that yields
// NULL, the per-thread free list is used.
//
// Returns a block pointer, or NULL when the free list is disabled
// (g_fast_enable == 0), has zero capacity (g_fast_cap[3] == 0), or is empty
// (g_fast_head[3] == NULL).
static inline __attribute__((always_inline))
void* tiny_hot_pop_class3(void) {
    extern int g_ultra_bump_shadow;

    // Tier 1: generic bump helper; the flag is hinted as normally enabled
    if (__builtin_expect(g_ultra_bump_shadow != 0, 1)) {
        void* const fresh = superslab_tls_bump_fast(3);
        if (__builtin_expect(fresh != NULL, 1)) {
            return fresh;
        }
    }

    // Tier 2: singly linked free list
    if (__builtin_expect(g_fast_enable == 0, 0)) {
        return NULL;
    }
    if (__builtin_expect(g_fast_cap[3] == 0, 0)) {
        return NULL;
    }
    void* const node = g_fast_head[3];
    if (__builtin_expect(node == NULL, 0)) {
        return NULL;
    }

    // The first pointer-sized word of a free node stores the next node
    g_fast_head[3] = *(void**)node;

    // Decrement the cached length, never letting it wrap below zero
    const uint16_t len = g_fast_count[3];
    g_fast_count[3] = (len != 0) ? (uint16_t)(len - 1) : (uint16_t)0;
    return node;
}

#endif // HAKMEM_TINY_HOT_POP_V4_INC_H