hakmem/core/hakmem_tiny.h

#ifndef HAKMEM_TINY_H
#define HAKMEM_TINY_H

#include <stddef.h>
#include <stdint.h>
#include "hakmem_build_flags.h"
#include "hakmem_trace.h"  // Optional USDT (perf) tracepoints
#include <pthread.h>
#include <stdatomic.h>

// Include page mini-magazine module (Phase 1: Hybrid optimization)
#include "hakmem_tiny_mini_mag.h"

// Forward declaration for initialization guard
int hak_is_initializing(void);

// Phase 6.12: Tiny Pool - Slab Allocator for ≤1KB allocations
// 8 size classes: 8B, 16B, 32B, 64B, 128B, 256B, 512B, 1KB

// ============================================================================
// Configuration
// ============================================================================

#define TINY_NUM_CLASSES 8
#define TINY_SLAB_SIZE (64 * 1024)  // 64KB per slab
// Phase E1-CORRECT: All Tiny classes use a 1-byte header.
// C7 stride=1024B → usable 1023B (1024-1). 1024B は Mid allocator に委譲する。
#define TINY_MAX_SIZE 1024          // Tiny handles up to 1024B (C7 total size) - default

// Phase 16: Dynamic Tiny max size control (ENV: HAKMEM_TINY_MAX_CLASS)
// Strategy: Reduce Tiny coverage to ~256B, delegate 512/1024B to Mid
// ENV values:
//   HAKMEM_TINY_MAX_CLASS=5 → Tiny handles up to 255B (C0-C5)
//   HAKMEM_TINY_MAX_CLASS=7 → Tiny handles up to 1023B (C0-C7, default)
// Forward declaration (implementation in hakmem_tiny.c)
size_t tiny_get_max_size(void);

// ============================================================================
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL Structure
// ============================================================================
//
// Goal: Improve L1D cache hit rate by merging head+count into same struct.
//
// OLD (cache line split):
//   __thread void*    g_tls_sll_head[8];   // 64 bytes (cache line 0)
//   __thread uint32_t g_tls_sll_count[8];  // 32 bytes (cache line 1)
//   → 2 L1D loads per operation (head from line 0, count from line 1)
//
// NEW (unified):
//   __thread TinyTLSSLL g_tls_sll[8];  // 128 bytes = 2 cache lines
//   → 1 L1D load per operation (head+count in same 16B struct)
//
// Expected: +12-18% improvement from cache locality
//
typedef struct {
    void*    head;   // SLL head pointer (8 bytes)
    uint32_t count;  // Number of elements in SLL (4 bytes)
    uint32_t _pad;   // Padding to 16 bytes for cache alignment (4 bytes)
} TinyTLSSLL;

// ============================================================================
// Size Classes
// ============================================================================

// Size class table (branchless lookup)
// Note: Definition is in hakmem_tiny.c to avoid multiple definition errors
// Declaration is in hakmem_tiny_config.h as: extern const size_t g_tiny_class_sizes[TINY_NUM_CLASSES];
// Box 3 (tiny_box_geometry.h) uses this via hakmem_tiny_config.h
// (Definition removed from header - see hakmem_tiny.c)

// Full LUT (1..2048) for branchless size-to-class mapping (index by size).
// Phase C7-UPGRADE: Expanded from 1025 -> 2049 to support 2048B stride (C7).
// Memory cost ~2KB. Zero hot-path arithmetic for all Tiny sizes.
// Generate repeated values via helper macros to keep the source compact.
#define HAK_R1(x)   x
#define HAK_R2(x)   HAK_R1(x), HAK_R1(x)
#define HAK_R4(x)   HAK_R2(x), HAK_R2(x)
#define HAK_R8(x)   HAK_R4(x), HAK_R4(x)
#define HAK_R16(x)  HAK_R8(x), HAK_R8(x)
#define HAK_R32(x)  HAK_R16(x), HAK_R16(x)
#define HAK_R64(x)  HAK_R32(x), HAK_R32(x)
#define HAK_R128(x) HAK_R64(x), HAK_R64(x)
#define HAK_R256(x) HAK_R128(x), HAK_R128(x)
#define HAK_R512(x) HAK_R256(x), HAK_R256(x)
#define HAK_R1024(x) HAK_R512(x), HAK_R512(x)

static const int8_t g_size_to_class_lut_2k[2049] = {
    -1,                 // index 0: invalid
    HAK_R8(0),          // 1..8    -> class 0
    HAK_R8(1),          // 9..16   -> class 1
    HAK_R16(2),         // 17..32  -> class 2
    HAK_R32(3),         // 33..64  -> class 3
    HAK_R64(4),         // 65..128 -> class 4
    HAK_R128(5),        // 129..256 -> class 5
    HAK_R256(6),        // 257..512 -> class 6
    HAK_R1024(7),       // 513..1536 -> class 7 (1024 entries)
    HAK_R512(7),        // 1537..2048 -> class 7 (512 entries)
};

#undef HAK_R1024
#undef HAK_R512
#undef HAK_R256
#undef HAK_R128
#undef HAK_R64
#undef HAK_R32
#undef HAK_R16
#undef HAK_R8
#undef HAK_R4
#undef HAK_R2
#undef HAK_R1

// Blocks per slab for each class
static const uint16_t g_tiny_blocks_per_slab[TINY_NUM_CLASSES] = {
    8192,   // Class 0:  64KB /    8B = 8192 blocks
    4096,   // Class 1:  64KB /   16B = 4096 blocks
    2048,   // Class 2:  64KB /   32B = 2048 blocks
    1024,   // Class 3:  64KB /   64B = 1024 blocks
    512,    // Class 4:  64KB /  128B =  512 blocks
    256,    // Class 5:  64KB /  256B =  256 blocks
    128,    // Class 6:  64KB /  512B =  128 blocks
    32      // Class 7:  64KB / 2048B =   32 blocks
};

// Bitmap size (uint64_t words) for each class
static const uint8_t g_tiny_bitmap_words[TINY_NUM_CLASSES] = {
    128,    // Class 0: 8192 blocks / 64 = 128 words
    64,     // Class 1: 4096 blocks / 64 =  64 words
    32,     // Class 2: 2048 blocks / 64 =  32 words
    16,     // Class 3: 1024 blocks / 64 =  16 words
    8,      // Class 4:  512 blocks / 64 =   8 words
    4,      // Class 5:  256 blocks / 64 =   4 words
    2,      // Class 6:  128 blocks / 64 =   2 words
    1       // Class 7:   64 blocks / 64 =   1 word
};

// ============================================================================
// Data Structures
// ============================================================================

// Forward declaration
typedef struct TinySlab TinySlab;

// Step 2: Slab Registry (Hash Table for O(1) lookup)
#define SLAB_REGISTRY_SIZE 1024
#define SLAB_REGISTRY_MASK (SLAB_REGISTRY_SIZE - 1)
#define SLAB_REGISTRY_MAX_PROBE 8

typedef struct {
    uintptr_t slab_base;         // 64KB aligned base address (0 = empty slot)
    _Atomic(TinySlab*) owner;    // Atomic pointer to TinySlab metadata (MT-safe)
} SlabRegistryEntry;

// Global registry (extern for access from multiple translation units)
extern SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];

// Tiny Pool initialization flag (extern for inline function access)
extern int g_tiny_initialized;

// Per-class locks to protect slab lists and bitmaps (padded to avoid false sharing)
typedef struct __attribute__((aligned(64))) { pthread_mutex_t m; char _pad[64]; } PaddedLock;
extern PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];

// Slab header (one per 64KB slab)
typedef struct TinySlab {
    void* base;                     // Base address (64KB aligned)
    uint64_t* bitmap;               // Free block bitmap (dynamic size)
    uint16_t free_count;            // Number of free blocks
    uint16_t total_count;           // Total blocks in slab
    uint8_t class_idx;              // Size class index (0-7)
    uint8_t _padding[3];
    struct TinySlab* next;          // Next slab in list
    // MPSC remote-free stack head (lock-free). Stores user ptrs; next is embedded in block.
    atomic_uintptr_t remote_head;
    // Approximate count of pending remote frees (for drain thresholding)
    atomic_uint remote_count;
    // Targeted remote-drain queue linkage and state (for BG drain targeting)
    struct TinySlab* remote_q_next;   // Intrusive next pointer for target stack
    atomic_uint remote_queued;        // 0=not enqueued, 1=enqueued (CAS guarded)
    // Owning thread (for remote detection). Allocations from this thread use TLS fast path.
    pthread_t owner_tid;
    // Hint for next scan start (reduces bitmap word scanning)
    uint16_t hint_word;
    // Summary bitmap (2nd level): per 64-word group, bit=1 if the group has any free block
    uint8_t summary_words;          // number of summary words (=(bitmap_words+63)/64)
    uint8_t _pad_sum[1];
    uint64_t* summary;              // length = summary_words

    // Phase 1: Page Mini-Magazine (Hybrid bitmap+free-list optimization)
    // Fast LIFO cache (16-32 items) for O(1) allocation without bitmap scan
    // Cost: 1-2 ns (vs 5-6 ns bitmap scan)
    PageMiniMag mini_mag;           // LIFO free-list cache
} TinySlab;

// Global Tiny Pool state
typedef struct {
    TinySlab* free_slabs[TINY_NUM_CLASSES];   // Slabs with free blocks
    TinySlab* full_slabs[TINY_NUM_CLASSES];   // Full slabs (no free blocks)
    uint64_t alloc_count[TINY_NUM_CLASSES];   // Allocation count per class
    uint64_t free_count[TINY_NUM_CLASSES];    // Free count per class
    uint64_t slab_count[TINY_NUM_CLASSES];    // Total slabs per class
} TinyPool;

// Global pool instance (defined in hakmem_tiny.c)
extern TinyPool g_tiny_pool;

// ============================================================================
// API Functions
// ============================================================================

// Initialize Tiny Pool
void hak_tiny_init(void);

// Allocate from Tiny Pool (returns NULL if size > 1KB)
void* hak_tiny_alloc(size_t size);

// Free to Tiny Pool (no-op if ptr is not managed by Tiny Pool)
void hak_tiny_free(void* ptr);

// Phase 6.12.1: Free with pre-calculated slab (avoids duplicate owner_slab lookup)
void hak_tiny_free_with_slab(void* ptr, TinySlab* slab);

// Check if pointer is managed by Tiny Pool
int hak_tiny_is_managed(void* ptr);
int hak_tiny_is_managed_superslab(void* ptr);

// Return the usable size for a Tiny-managed pointer (0 if unknown/not tiny).
// For SuperSlab-backed blocks, uses size class from the owning SuperSlab.
// For TinySlab-backed blocks, uses class_idx from the owning slab.
size_t hak_tiny_usable_size(void* ptr);

// Get statistics
void hak_tiny_get_stats(uint64_t* alloc_count, uint64_t* free_count, uint64_t* slab_count);

// Print statistics (debug)
void hak_tiny_print_stats(void);

// Phase 7.7: Magazine flush API (reduce memory footprint)
// Flush Magazine cache to freelists, enabling empty SuperSlab detection
void hak_tiny_magazine_flush(int class_idx);
void hak_tiny_magazine_flush_all(void);

// Trim empty Tiny slabs by releasing fully-free slabs back to the system.
// Safe to call anytime; holds per-class locks while trimming.
void hak_tiny_trim(void);

// Optional shutdown hook for Tiny subsystem.
// Stops background threads (e.g., Deferred Intelligence) and performs
// any best-effort cleanup needed during process shutdown.
void hak_tiny_shutdown(void);

// Phase 8.2: Memory profiling (toggle with HAKMEM_DEBUG_MEMORY)
// Print detailed breakdown of memory usage by component
void hak_tiny_print_memory_profile(void);

// Debug: dump Ultra Tiny counters (pop hits/refills/resets)
void hak_tiny_ultra_debug_dump(void);
void hak_tiny_path_debug_dump(void);

// ============================================================================
// ACE Learning Layer: Runtime parameter adjustment
// ============================================================================

// Exported for ACE controller access
extern int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES];

// Set remote drain threshold for a specific size class
void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold);

// ============================================================================
// Internal Helpers (branchless size-to-class)
// ============================================================================

// Convert size to class index (branchless lookup)
// Phase C7-UPGRADE: ALL classes have 1-byte header
// C7 max usable: 2047B (2048B total with header)
// malloc(2048+) → routed to Mid allocator
static inline int hak_tiny_size_to_class(size_t size) {
    if (size == 0) return -1;
#if HAKMEM_TINY_HEADER_CLASSIDX
    // Phase C7-UPGRADE: ALL classes have 1-byte header
    // Box: [Header 1B][Data NB] = (N+1) bytes total
    // g_tiny_class_sizes stores TOTAL size, so we need size+1 bytes
    // User requests N bytes → need (N+1) total → look up class with stride ≥ (N+1)
    // Max usable: 2047B (C7 stride=2048B)
    if (size > 2047) return -1;  // 2048+ → Mid allocator
    // Find smallest class where stride ≥ (size + 1)
    // LUT maps total_size → class, so lookup (size + 1) to find class with that stride
    size_t needed = size + 1;  // total bytes needed (data + header)
    if (needed > 2048) return -1;
    return g_size_to_class_lut_2k[needed];
#else
    if (size > 1024) return -1;
    return g_size_to_class_lut_2k[size];  // 1..1024
#endif
}

// ============================================================================
// Phase 6.12.1: O(1) Slab Lookup (Embedded Metadata)
// ============================================================================

// Phase 6.12.1: Find slab owner by pointer
// NOTE: Reverted from O(1) embedded metadata to O(N) linear search for safety
// Embedded metadata requires dereferencing potentially unmapped memory
// This is still faster than before because Option C eliminates duplicate calls
TinySlab* hak_tiny_owner_slab(void* ptr);

// ============================================================================
// Bitmap Operations (inline for speed)
// ============================================================================

// Set block as used
static inline void hak_tiny_set_used(TinySlab* slab, int block_idx) {
    int word_idx = block_idx / 64;
    int bit_idx = block_idx % 64;
    uint64_t v = slab->bitmap[word_idx] | (1ULL << bit_idx);
    slab->bitmap[word_idx] = v;
    // update summary: set to 1 if any free bit remains, else 0
    int sum_word = word_idx / 64;
    int sum_bit  = word_idx % 64;
    uint64_t has_free = ~v;  // any zero in word means free
    if (has_free != 0) {
        slab->summary[sum_word] |= (1ULL << sum_bit);
    } else {
        slab->summary[sum_word] &= ~(1ULL << sum_bit);
    }
}

// Set block as free
static inline void hak_tiny_set_free(TinySlab* slab, int block_idx) {
    int word_idx = block_idx / 64;
    int bit_idx = block_idx % 64;
    uint64_t v = slab->bitmap[word_idx] & ~(1ULL << bit_idx);
    slab->bitmap[word_idx] = v;
    // update summary: this word now certainly has a free bit
    int sum_word = word_idx / 64;
    int sum_bit  = word_idx % 64;
    slab->summary[sum_word] |= (1ULL << sum_bit);
}

// Check if block is used
static inline int hak_tiny_is_used(TinySlab* slab, int block_idx) {
    int word_idx = block_idx / 64;
    int bit_idx = block_idx % 64;
    return (slab->bitmap[word_idx] & (1ULL << bit_idx)) != 0;
}

// Find first free block (returns -1 if none)
static inline int hak_tiny_find_free_block(TinySlab* slab) {
    // Trace bitmap scan attempts
    HAK_TP1(bitmap_scan, slab->class_idx);
    const int bw = g_tiny_bitmap_words[slab->class_idx];
    const int sw = slab->summary_words;
    if (bw <= 0 || sw <= 0) return -1;

    int start_word = slab->hint_word % bw;
    int start_sw = start_word / 64;
    int start_sb = start_word % 64;

    for (int k = 0; k < sw; k++) {
        int idx = start_sw + k;
        if (idx >= sw) idx -= sw;
        uint64_t bits = slab->summary[idx];
        // mask low bits on first iteration
        if (k == 0) {
            bits &= (~0ULL) << start_sb;
        }
        // mask out-of-range bits in last summary word
        if (idx == sw - 1 && (bw % 64) != 0) {
            uint64_t mask = (bw % 64) == 64 ? ~0ULL : ((1ULL << (bw % 64)) - 1ULL);
            bits &= mask;
        }
        if (bits == 0) continue;
        int woff = __builtin_ctzll(bits);              // word offset within this summary word
        int word_idx = idx * 64 + woff;                // bitmap word index
        if (word_idx >= bw) continue;                  // safety
        uint64_t used = slab->bitmap[word_idx];
        uint64_t free_bits = ~used;
        if (free_bits == 0) continue;                  // should not happen if summary correct
        int bit_idx = __builtin_ctzll(free_bits);      // first free block within word
        slab->hint_word = (uint16_t)((word_idx + 1) % bw);
        return word_idx * 64 + bit_idx;
    }
    return -1;
}

#endif  // HAKMEM_TINY_H