// hakmem/core/hakmem_super_registry.h

#pragma once

#include <stdio.h>
#include <stdlib.h>

// Phase 1: SuperSlab Registry - Thread-safe O(1) lookup for SuperSlab ownership
//
// Purpose: Replace mincore() syscall (50-100ns) with userspace hash table lookup
// Performance: ~5-10ns per lookup, 10-20x faster than mincore()
//
// Thread Safety:
//   - Readers: Lock-free with acquire semantics
//   - Writers: Mutex-protected with release semantics
//   - Publish order: ss initialization → release fence → base write
//   - Unpublish order: base = 0 (release) → munmap (prevents reader deref)

#include <stdatomic.h>
#include <pthread.h>
#include <stdint.h>
#include "hakmem_tiny_superslab.h"  // For SuperSlab and SUPERSLAB_MAGIC
#include "box/ss_addr_map_box.h"    // Phase 9-1: O(1) hash table lookup

// Registry configuration
// The table holds 1,048,576 (2^20) slots. It was enlarged over time to avoid
// registry exhaustion under high-churn microbenchmarks (e.g., larson with many
// active SuperSlabs). The size must stay a power of two so that SUPER_REG_MASK
// can be used instead of a modulo when wrapping hash values and probe indices.
#define SUPER_REG_SIZE      1048576   // Power of 2 for fast modulo (1M entries)
#define SUPER_REG_MASK      (SUPER_REG_SIZE - 1)
#define SUPER_MAX_PROBE     32     // Linear probing limit (increased from 8 for Phase 15 fix)

// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Purpose: Avoid a linear scan of the whole registry by indexing SuperSlabs by size class
// - Each class has SUPER_REG_PER_CLASS (16384) slots; with 8 classes that is
//   8 × 16384 = 131072 (128K) entries in total
// - Refill scan: O(class_size) instead of O(total registry size)
//   (an earlier revision of this comment cited a 262144-entry registry;
//   SUPER_REG_SIZE is now 1048576)
// - Expected speedup: +200-300% for Larson (2.59M → 7.8M ops/s)
#define SUPER_REG_PER_CLASS 16384  // Per-class registry capacity (increased for high-churn workloads)

// Registry entry: base address → SuperSlab pointer mapping
// One slot of g_super_reg. The lock-free reader in hak_super_lookup() loads
// `base` with acquire semantics, treats 0 as an empty slot, and on a base
// match compares `lg_size` before loading `ss` (also with acquire semantics).
typedef struct {
    _Atomic(uintptr_t) base;  // Aligned base address (1MB or 2MB, 0 = empty slot) [atomic for proper sync]
    _Atomic(SuperSlab*) ss;   // Atomic SuperSlab pointer (MT-safe, prevents TOCTOU race)
    // NOTE(review): lg_size is a plain (non-atomic) field that readers access
    // after the acquire load of `base`; confirm writers store it before
    // publishing `base`.
    uint8_t   lg_size;        // Phase 8.3: ACE - SuperSlab size (20=1MB, 21=2MB)
    uint8_t   _pad[7];        // Padding to 24 bytes (assumes both atomics are 8 bytes wide)
} SuperRegEntry;

// Global registry (lock-free reads, mutex-protected writes)
// g_super_reg is the table probed by the fallback path of hak_super_lookup();
// indices are wrapped with SUPER_REG_MASK.
extern SuperRegEntry g_super_reg[SUPER_REG_SIZE];
// Writer-side lock (also shared with the per-class registry).
extern pthread_mutex_t g_super_reg_lock;
// hak_super_lookup() returns NULL while this flag is zero.
// NOTE(review): this is a plain int that the lock-free lookup path reads
// without synchronization — confirm it is only written before other threads
// start calling hak_super_lookup().
extern int g_super_reg_initialized;

// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Note: TINY_NUM_CLASSES is defined in hakmem_tiny.h (typically 8 for 16B-1KB)
// - g_super_reg_by_class[class][i] = SuperSlab pointer (NULL = empty slot)
// - g_super_reg_class_size[class] = number of active SuperSlabs for this class
// - Protected by g_super_reg_lock (shared with main registry)
// NOTE(review): the fallback below only takes effect when TINY_NUM_CLASSES is
// not yet defined at this point; if hakmem_tiny.h is included afterwards with a
// different value the array dimensions declared here would disagree — verify
// include order.
#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8  // Fallback if hakmem_tiny.h not included yet
#endif
extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
extern int g_super_reg_class_size[TINY_NUM_CLASSES];

// ============================================================================
// Phase 9: Lazy Deallocation - LRU Cache Manager
// ============================================================================

// Global LRU cache for empty SuperSlabs (lazy deallocation)
// Holds the two ends of the LRU list plus the limits used by the eviction
// policy (count, memory and time-to-live). The default values quoted below are
// not set in this header; presumably hak_ss_lru_init() applies them — verify
// against the implementation.
typedef struct {
    SuperSlab* lru_head;           // LRU list head (most recently used)
    SuperSlab* lru_tail;           // LRU list tail (least recently used)
    uint32_t total_count;          // Total SuperSlabs in cache
    uint32_t max_cached;           // Maximum cached SuperSlabs (default: 256)
    uint64_t total_memory_mb;      // Total memory in cache (MB)
    uint64_t max_memory_mb;        // Maximum memory limit (MB, default: 512)
    uint64_t ttl_ns;               // Time-to-live (nanoseconds, default: 60s)
    uint32_t generation;           // Current generation counter
} SuperSlabLRUCache;

// Single process-wide cache instance; defined in the implementation file.
extern SuperSlabLRUCache g_ss_lru_cache;

// Initialize LRU cache (called once at startup)
void hak_ss_lru_init(void);

// Try to reuse a cached SuperSlab (returns NULL if cache is empty)
// NOTE(review): size_class is uint8_t here but int in hak_ss_prewarm_class();
// confirm callers pass values that fit both.
SuperSlab* hak_ss_lru_pop(uint8_t size_class);

// Add SuperSlab to LRU cache (returns 1 if cached, 0 if evicted immediately)
// A return of 0 means the SuperSlab was not retained by the cache.
int hak_ss_lru_push(SuperSlab* ss);

// Evict old SuperSlabs based on policy (TTL, max_cached, max_memory_mb)
// The policy limits are the corresponding fields of SuperSlabLRUCache.
void hak_ss_lru_evict(void);

// Mark SuperSlab as recently used (update timestamp, move to head)
void hak_ss_lru_touch(SuperSlab* ss);

// ============================================================================
// Phase 11: SuperSlab Prewarm - Eliminate mmap/munmap bottleneck
// ============================================================================

// Prewarm: Allocate SuperSlabs at startup and add to LRU cache
void hak_ss_prewarm_init(void);

// Prewarm specific size class with count SuperSlabs
//   size_class: index of the size class to prewarm
//               (presumably 0 .. TINY_NUM_CLASSES-1 — verify against the implementation)
//   count:      number of SuperSlabs to allocate for that class
void hak_ss_prewarm_class(int size_class, uint32_t count);

// Prewarm all classes (counts[i] = number of SuperSlabs for class i)
// The array must provide TINY_NUM_CLASSES entries.
void hak_ss_prewarm_all(const uint32_t counts[TINY_NUM_CLASSES]);

// Initialize registry (call once at startup)
// hak_super_lookup() returns NULL for every pointer while
// g_super_reg_initialized is zero; presumably this function is what sets that
// flag — verify against the implementation.
void hak_super_registry_init(void);

// Hash function for aligned addresses (variable size)
//   base:    SuperSlab base address, already aligned to 2^lg_size
//   lg_size: log2 of the SuperSlab size (Phase 8.3 ACE: 20 for 1MB, 21 for 2MB)
// Returns the starting slot index in g_super_reg, in [0, SUPER_REG_SIZE).
static inline int hak_super_hash(uintptr_t base, int lg_size) {
    // Drop the always-zero alignment bits so consecutive SuperSlabs map to
    // consecutive indices, then wrap into the table with the size mask.
    const uintptr_t slab_index = base >> lg_size;
    const uintptr_t slot = slab_index & SUPER_REG_MASK;
    return (int)slot;
}

// Lookup SuperSlab by pointer (lock-free, thread-safe)
//
// Lookup order:
//   1. Return NULL immediately while g_super_reg_initialized is zero.
//   2. Query the Phase 9-1 address map (ss_map_lookup on g_ss_addr_map).
//   3. On a map miss, fall back to the legacy g_super_reg table: for each lg
//      in [SUPERSLAB_LG_MIN, SUPERSLAB_LG_MAX], align ptr down to 2^lg and
//      linearly probe at most SUPER_MAX_PROBE slots, stopping at the first
//      empty slot.
//   4. Reject a result whose magic field is not SUPERSLAB_MAGIC.
//
// Returns: SuperSlab* if found, NULL otherwise
static inline SuperSlab* hak_super_lookup(void* ptr) {
    if (!g_super_reg_initialized) {
        return NULL;
    }

    // Fast path: address map lookup.
    SuperSlab* found = ss_map_lookup(&g_ss_addr_map, ptr);

    // Slow path: the map missed (e.g., map not populated yet), so probe the
    // legacy registry table to avoid returning NULL for a valid SuperSlab.
    if (__builtin_expect(found == NULL, 0)) {
        const uintptr_t addr = (uintptr_t)ptr;
        int matched = 0;  // set once a slot with matching base and lg_size is seen

        for (int lg = SUPERSLAB_LG_MIN; !matched && lg <= SUPERSLAB_LG_MAX; lg++) {
            const uintptr_t align_mask = ((uintptr_t)1 << lg) - 1;
            const uintptr_t want_base = addr & ~align_mask;
            const int start = hak_super_hash(want_base, lg);

            for (int step = 0; step < SUPER_MAX_PROBE; step++) {
                SuperRegEntry* slot = &g_super_reg[(start + step) & SUPER_REG_MASK];
                const uintptr_t slot_base =
                    atomic_load_explicit(&slot->base, memory_order_acquire);

                if (slot_base == 0) {
                    break;  // empty slot ends the probe sequence for this lg
                }
                if (slot_base != want_base || slot->lg_size != lg) {
                    continue;  // occupied by a different SuperSlab
                }

                // The search ends here even if the loaded pointer is NULL.
                found = atomic_load_explicit(&slot->ss, memory_order_acquire);
                matched = 1;
                break;
            }
        }
    }

#if !HAKMEM_BUILD_RELEASE
    // Debug logging (ENV-gated); the environment is read once per thread.
    // NOTE(review): the "hash table hit" message is also printed when the
    // result came from the legacy fallback probe above.
    static __thread int s_trace = -1;
    if (__builtin_expect(s_trace == -1, 0)) {
        const char* env = getenv("HAKMEM_SUPER_LOOKUP_DEBUG");
        if (env && *env && *env != '0') {
            s_trace = 1;
        } else {
            s_trace = 0;
        }
    }

    if (s_trace == 1) {
        if (found) {
            fprintf(stderr, "[SUPER_LOOKUP] ptr=%p -> ss=%p (hash table hit)\n", ptr, (void*)found);
        } else {
            fprintf(stderr, "[SUPER_LOOKUP] ptr=%p -> NULL (hash table miss)\n", ptr);
        }
    }
#endif

    if (found == NULL) {
        return NULL;
    }

    // Magic check for safety: only hand out SuperSlabs whose header is intact.
    if (found->magic == SUPERSLAB_MAGIC) {
        return found;
    }

#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "[SUPER_LOOKUP] WARNING: ss=%p has bad magic=%llx (being freed)\n",
            (void*)found, (unsigned long long)found->magic);
#endif
    return NULL;  // Being freed
}

// Register SuperSlab (mutex-protected, called after SuperSlab initialization)
//   base: aligned base address of the SuperSlab (must be non-zero, since
//         lookups treat a base of 0 as an empty slot)
//   ss:   the fully initialized SuperSlab to publish
// Returns: 1 on success, 0 if registry is full
int hak_super_register(uintptr_t base, SuperSlab* ss);

// Unregister SuperSlab (mutex-protected, MUST call before munmap)
// Critical: base = 0 happens BEFORE munmap to prevent reader segfault
// NOTE(review): the lookup fallback stops probing at the first slot whose base
// is 0, so a cleared slot in the middle of a probe run hides later entries —
// confirm the implementation accounts for this.
void hak_super_unregister(uintptr_t base);

// Debug: Get registry statistics
// Field meanings below are inferred from the names — TODO confirm against
// hak_super_registry_stats().
typedef struct {
    int total_slots;      // presumably the registry capacity
    int used_slots;       // presumably the number of occupied slots
    int max_probe_depth;  // presumably the longest probe sequence observed
} SuperRegStats;

// Fills *stats with the current registry statistics (output parameter).
void hak_super_registry_stats(SuperRegStats* stats);