Files
hakmem/core/hakmem_super_registry.h
Moe Charm (CI) d9991f39ff Phase ALLOC-TINY-FAST-DUALHOT-1 & Optimization Roadmap Update
Add comprehensive design docs and research boxes:
- docs/analysis/ALLOC_TINY_FAST_DUALHOT_1_DESIGN.md: ALLOC DUALHOT investigation
- docs/analysis/FREE_TINY_FAST_DUALHOT_1_DESIGN.md: FREE DUALHOT final specs
- docs/analysis/FREE_TINY_FAST_HOTCOLD_OPT_1_DESIGN.md: Hot/Cold split research
- docs/analysis/POOL_MID_INUSE_DEFERRED_DN_BATCH_DESIGN.md: Deferred batching design
- docs/analysis/POOL_MID_INUSE_DEFERRED_REGRESSION_ANALYSIS.md: Stats overhead findings
- docs/analysis/MID_DESC_CACHE_BENCHMARK_2025-12-12.md: Cache measurement results
- docs/analysis/LAST_MATCH_CACHE_IMPLEMENTATION.md: TLS cache investigation

Research boxes (SS page table):
- core/box/ss_pt_env_box.h: HAKMEM_SS_LOOKUP_KIND gate
- core/box/ss_pt_types_box.h: 2-level page table structures
- core/box/ss_pt_lookup_box.h: ss_pt_lookup() implementation
- core/box/ss_pt_register_box.h: Page table registration
- core/box/ss_pt_impl.c: Global definitions

Updates:
- docs/specs/ENV_VARS_COMPLETE.md: HOTCOLD, DEFERRED, SS_LOOKUP env vars
- core/box/hak_free_api.inc.h: FREE-DISPATCH-SSOT integration
- core/box/pool_mid_inuse_deferred_box.h: Deferred API updates
- core/box/pool_mid_inuse_deferred_stats_box.h: Stats collection
- core/hakmem_super_registry.h: SS page table integration

Current Status:
- FREE-TINY-FAST-DUALHOT-1: +13% improvement, ready for adoption
- ALLOC-TINY-FAST-DUALHOT-1: -2% regression, frozen as research box
- Next: Optimization roadmap per ROI (mimalloc gap 2.5x)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-13 05:35:46 +09:00

211 lines
8.6 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include <stdio.h>
#include <stdlib.h>
// Phase 1: SuperSlab Registry - Thread-safe O(1) lookup for SuperSlab ownership
//
// Purpose: Replace mincore() syscall (50-100ns) with userspace hash table lookup
// Performance: ~5-10ns per lookup, 10-20x faster than mincore()
//
// Thread Safety:
// - Readers: Lock-free with acquire semantics
// - Writers: Mutex-protected with release semantics
// - Publish order: ss initialization → release fence → base write
// - Unpublish order: base = 0 (release) → munmap (prevents reader deref)
#include <stdatomic.h>
#include <pthread.h>
#include <stdint.h>
#include "hakmem_tiny_superslab.h" // For SuperSlab and SUPERSLAB_MAGIC
#include "box/ss_addr_map_box.h" // Phase 9-1: O(1) hash table lookup
#include "box/super_reg_box.h" // Phase X: profile-aware logical registry sizing
#include "box/ss_pt_lookup_box.h" // Phase 9-2: O(1) page table lookup
#include "box/ss_pt_env_box.h" // Phase 9-2: ENV gate for PT vs hash
// Registry configuration
// Sized as a power of two for fast masking. Capacity has grown over time to
// avoid registry exhaustion under high-churn microbenchmarks (e.g., larson
// with many active SuperSlabs); the current capacity is 1M entries.
// NOTE(review): an earlier comment here said "4096 -> 32768", which was stale
// relative to the value below.
#define SUPER_REG_SIZE 1048576 // Power of 2 for fast modulo (1M entries)
#define SUPER_REG_MASK (SUPER_REG_SIZE - 1) // Mask for (index & MASK) == (index % SIZE)
#define SUPER_MAX_PROBE 32 // Linear probing limit (increased from 8 for Phase 15 fix)
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Purpose: Avoid 262K linear scan by indexing SuperSlabs by size class
// - Each class has 16384 slots (total: 8 classes × 16384 = 128K entries)
// - Refill scan: O(class_size) instead of O(262144)
// - Expected speedup: +200-300% for Larson (2.59M → 7.8M ops/s)
#define SUPER_REG_PER_CLASS 16384 // Per-class registry capacity (increased for high-churn workloads)
// Registry entry: base address → SuperSlab pointer mapping
// Registry entry: base address → SuperSlab pointer mapping.
// Both fields that readers dereference are atomic so lock-free lookups can
// pair acquire loads with the writer's release stores (see publish/unpublish
// ordering notes at the top of this file).
typedef struct SuperRegEntry {
    _Atomic(uintptr_t) base; // Aligned base address (1MB or 2MB, 0 = empty slot) [atomic for proper sync]
    _Atomic(SuperSlab*) ss; // Atomic SuperSlab pointer (MT-safe, prevents TOCTOU race)
    uint8_t lg_size; // Phase 8.3: ACE - SuperSlab size (20=1MB, 21=2MB)
    uint8_t _pad[7]; // Padding to 24 bytes (cache-friendly)
} SuperRegEntry;
// Global registry (lock-free reads, mutex-protected writes)
extern pthread_mutex_t g_super_reg_lock; // Serializes all writers (register/unregister)
extern int g_super_reg_initialized; // Nonzero once hak_super_registry_init() has run
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Note: TINY_NUM_CLASSES is defined in hakmem_tiny.h (typically 8 for 16B-1KB)
// - g_super_reg_by_class[class][i] = SuperSlab pointer (NULL = empty slot)
// - g_super_reg_class_size[class] = number of active SuperSlabs for this class
// - Protected by g_super_reg_lock (shared with main registry)
#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8 // Fallback if hakmem_tiny.h not included yet
#endif
extern int g_super_reg_class_size[TINY_NUM_CLASSES];
// ============================================================================
// Phase 9: Lazy Deallocation - LRU Cache Manager
// ============================================================================
// Global LRU cache for empty SuperSlabs (lazy deallocation).
// Instead of munmap-ing an empty SuperSlab immediately, it is parked on this
// doubly-linked LRU list and can be handed back out by hak_ss_lru_pop(),
// avoiding repeated mmap/munmap churn.
typedef struct {
    SuperSlab* lru_head; // LRU list head (most recently used)
    SuperSlab* lru_tail; // LRU list tail (least recently used)
    uint32_t total_count; // Total SuperSlabs in cache
    uint32_t max_cached; // Maximum cached SuperSlabs (default: 256)
    uint64_t total_memory_mb; // Total memory in cache (MB)
    uint64_t max_memory_mb; // Maximum memory limit (MB, default: 512)
    uint64_t ttl_ns; // Time-to-live (nanoseconds, default: 60s)
    uint32_t generation; // Current generation counter
} SuperSlabLRUCache;
extern SuperSlabLRUCache g_ss_lru_cache;
// Initialize LRU cache (called once at startup)
void hak_ss_lru_init(void);
// Try to reuse a cached SuperSlab (returns NULL if cache is empty)
SuperSlab* hak_ss_lru_pop(uint8_t size_class);
// Add SuperSlab to LRU cache (returns 1 if cached, 0 if evicted immediately)
int hak_ss_lru_push(SuperSlab* ss);
// Evict old SuperSlabs based on policy (TTL, max_cached, max_memory_mb)
void hak_ss_lru_evict(void);
// Mark SuperSlab as recently used (update timestamp, move to head)
void hak_ss_lru_touch(SuperSlab* ss);
// ============================================================================
// Phase 11: SuperSlab Prewarm - Eliminate mmap/munmap bottleneck
// ============================================================================
// Prewarm: Allocate SuperSlabs at startup and add to LRU cache
void hak_ss_prewarm_init(void);
// Prewarm specific size class with count SuperSlabs
void hak_ss_prewarm_class(int size_class, uint32_t count);
// Prewarm all classes (counts[i] = number of SuperSlabs for class i)
void hak_ss_prewarm_all(const uint32_t counts[TINY_NUM_CLASSES]);
// Initialize registry (call once at startup). Must run before any
// hak_super_lookup(): the lookup fast-paths check g_super_reg_initialized.
void hak_super_registry_init(void);
// Hash a SuperSlab base address into a registry slot index.
// Phase 8.3: ACE - size-aware hashing: lg_size is the log2 of the SuperSlab
// size (20 = 1MB, 21 = 2MB), so shifting by it strips the alignment bits and
// leaves a dense slab index; the effective mask wraps it into table range.
static inline int hak_super_hash(uintptr_t base, int lg_size) {
    uintptr_t slab_index = base >> lg_size;
    uintptr_t slot = slab_index & (uintptr_t)super_reg_effective_mask();
    return (int)slot;
}
// Lookup SuperSlab by pointer (lock-free, thread-safe)
// Returns: SuperSlab* if found, NULL otherwise (not registered, or the owning
// SuperSlab is mid-teardown and fails the magic check).
// Phase 9-2: Dispatch between page table (O(1) absolute) vs hash table (O(1) amortized)
// Lookup order: page table (if ENV-enabled) → hash map → legacy probe table.
static inline SuperSlab* hak_super_lookup(void* ptr) {
    // Before init, nothing is registered; bail out cheaply.
    if (!g_super_reg_initialized) return NULL;
    SuperSlab* ss = NULL;
    // Phase 9-2: Try page table first if enabled
    if (hak_ss_lookup_pt_enabled()) {
        ss = ss_pt_lookup(ptr);
        if (ss) return ss;
        // Fallback to hash on miss (out_of_range or not registered)
    }
    // Phase 9-1: Use hash table lookup
    // Replaces old linear probing (50-80 cycles → 10-20 cycles)
    ss = ss_map_lookup(&g_ss_addr_map, ptr);
    // Fallback: If hash map misses (e.g., map not populated yet), probe the
    // legacy registry table to avoid NULL for valid SuperSlabs.
    if (__builtin_expect(ss == NULL, 0)) {
        SuperRegEntry* reg = super_reg_entries();
        if (!reg) return NULL;
        uintptr_t p = (uintptr_t)ptr;
        // The SuperSlab size is unknown here, so try each supported size:
        // round ptr down to that alignment and probe for the candidate base.
        for (int lg = SUPERSLAB_LG_MIN; lg <= SUPERSLAB_LG_MAX; lg++) {
            uintptr_t base = p & ~(((uintptr_t)1 << lg) - 1); // align down to 2^lg
            int h = hak_super_hash(base, lg);
            int eff_mask = super_reg_effective_mask();
            // Never probe more slots than the table actually has.
            int probe_limit = super_reg_effective_size() > SUPER_MAX_PROBE
                ? SUPER_MAX_PROBE
                : super_reg_effective_size();
            for (int i = 0; i < probe_limit; i++) {
                SuperRegEntry* e = &reg[(h + i) & eff_mask];
                // Acquire pairs with the writer's release store of base, so a
                // nonzero base guarantees the SuperSlab was fully published.
                uintptr_t reg_base = atomic_load_explicit(&e->base, memory_order_acquire);
                if (reg_base == 0) {
                    break; // empty slot terminates this probe chain
                }
                // Match requires both the base AND the size class: two slabs of
                // different sizes can share an aligned-down base address.
                if (reg_base == base && e->lg_size == lg) {
                    ss = atomic_load_explicit(&e->ss, memory_order_acquire);
                    goto reg_probe_done;
                }
            }
        }
reg_probe_done:
        ;
    }
#if !HAKMEM_BUILD_RELEASE
    // Debug logging (ENV-gated); the gate is cached per thread after the
    // first getenv() so the hot path pays only a TLS read.
    static __thread int s_dbg = -1;
    if (__builtin_expect(s_dbg == -1, 0)) {
        const char* e = getenv("HAKMEM_SUPER_LOOKUP_DEBUG");
        s_dbg = (e && *e && *e != '0') ? 1 : 0;
    }
    if (s_dbg == 1) {
        if (ss) {
            fprintf(stderr, "[SUPER_LOOKUP] ptr=%p -> ss=%p (hash table hit)\n", ptr, (void*)ss);
        } else {
            fprintf(stderr, "[SUPER_LOOKUP] ptr=%p -> NULL (hash table miss)\n", ptr);
        }
    }
#endif
    // Magic check for safety (same as before): a SuperSlab being torn down may
    // still be reachable via the registry for a moment; treat it as not found.
    if (ss && ss->magic != SUPERSLAB_MAGIC) {
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[SUPER_LOOKUP] WARNING: ss=%p has bad magic=%llx (being freed)\n",
            (void*)ss, (unsigned long long)ss->magic);
#endif
        return NULL; // Being freed
    }
    return ss;
}
// Register SuperSlab (mutex-protected, called after SuperSlab initialization)
// Returns: 1 on success, 0 if registry is full
int hak_super_register(uintptr_t base, SuperSlab* ss);
// Unregister SuperSlab (mutex-protected, MUST call before munmap)
// Critical: base = 0 happens BEFORE munmap to prevent reader segfault
void hak_super_unregister(uintptr_t base);
// Debug: Get registry statistics
typedef struct {
    int total_slots; // Registry capacity
    int used_slots; // Slots currently holding a live entry
    int max_probe_depth; // Worst-case linear probe length observed
} SuperRegStats;
// Fill *stats with a snapshot of the registry's occupancy/probing state.
void hak_super_registry_stats(SuperRegStats* stats);