Files
hakmem/core/hakmem_super_registry.h
Moe Charm (CI) 2be754853f Phase 11: SuperSlab Prewarm implementation (startup pre-allocation)
## Summary
Pre-allocate SuperSlabs at startup to eliminate runtime mmap overhead.
Result: +6.4% improvement (8.82M → 9.38M ops/s) but still 9x slower than System malloc.

## Key Findings (Lesson Learned)
- Syscall reduction strategy targeted WRONG bottleneck
- Real bottleneck: SuperSlab allocation churn (877 SuperSlabs needed)
- Prewarm reduces mmap frequency but doesn't solve fundamental architecture issue

## Implementation
- Two-phase allocation with atomic bypass flag
- Environment variable: HAKMEM_PREWARM_SUPERSLABS (default: 0)
- Best result: Prewarm=8 → 9.38M ops/s (+6.4%)

## Next Step
Pivot to Phase 12: Shared SuperSlab Pool (mimalloc-style)
- Expected: 877 → 100-200 SuperSlabs (-70-80%)
- This addresses ROOT CAUSE (allocation churn) not symptoms

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:45:43 +09:00

169 lines
7.2 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include <stdio.h>
#include <stdlib.h>
// Phase 1: SuperSlab Registry - Thread-safe O(1) lookup for SuperSlab ownership
//
// Purpose: Replace mincore() syscall (50-100ns) with userspace hash table lookup
// Performance: ~5-10ns per lookup, 10-20x faster than mincore()
//
// Thread Safety:
// - Readers: Lock-free with acquire semantics
// - Writers: Mutex-protected with release semantics
// - Publish order: ss initialization → release fence → base write
// - Unpublish order: base = 0 (release) → munmap (prevents reader deref)
#include <stdatomic.h>
#include <pthread.h>
#include <stdint.h>
#include "hakmem_tiny_superslab.h" // For SuperSlab and SUPERSLAB_MAGIC
// Registry configuration
// The table was grown from 4096 to 32768 slots and later 8x again to 262144
// to avoid registry exhaustion under high-churn microbenchmarks (e.g., larson
// with many active SuperSlabs).
// The size MUST remain a power of two: slot indices are produced by masking
// with SUPER_REG_MASK rather than by a modulo, which is only equivalent for
// power-of-two sizes. The static assertions below turn a bad edit into a
// compile error instead of a silently broken hash.
#define SUPER_REG_SIZE 262144 // Number of slots in the global registry (2^18)
#define SUPER_REG_MASK (SUPER_REG_SIZE - 1) // Index mask (valid only for power-of-two sizes)
#define SUPER_MAX_PROBE 8 // Linear probing limit (slots examined per hash)
_Static_assert(SUPER_REG_SIZE > 0 && (SUPER_REG_SIZE & (SUPER_REG_SIZE - 1)) == 0,
               "SUPER_REG_SIZE must be a power of two (SUPER_REG_MASK relies on it)");
_Static_assert(SUPER_MAX_PROBE > 0 && SUPER_MAX_PROBE <= SUPER_REG_SIZE,
               "SUPER_MAX_PROBE must be in the range 1..SUPER_REG_SIZE");
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Purpose: let a refill scan walk only the SuperSlabs of one size class
// instead of linearly scanning all 262144 slots of the global registry.
// - Each class has 16384 slots (8 classes x 16384 = 131072 entries in total)
// - Refill scan: bounded by the per-class table instead of O(262144)
// - Expected speedup: +200-300% for Larson (2.59M -> 7.8M ops/s)
#define SUPER_REG_PER_CLASS 16384 // Per-class registry capacity (increased for high-churn workloads)
// Registry entry: base address -> SuperSlab pointer mapping.
// One slot of the g_super_reg hash table. Readers access `base` and `ss`
// without holding g_super_reg_lock, which is why both are atomic.
typedef struct {
_Atomic(uintptr_t) base; // Aligned base address (1MB or 2MB); 0 marks an empty slot. Loaded with acquire semantics by lock-free readers.
_Atomic(SuperSlab*) ss; // SuperSlab registered for `base`; loaded atomically by readers (MT-safe, prevents TOCTOU race with unregister)
uint8_t lg_size; // Phase 8.3: ACE - log2 of the SuperSlab size (20=1MB, 21=2MB). NOTE(review): plain field, yet hak_super_lookup() reads it without the lock - confirm writers store it before publishing `base`.
uint8_t _pad[7]; // Explicit padding: 8 + 8 + 1 + 7 = 24 bytes when both atomics are 8 bytes wide (assumes a 64-bit target - TODO confirm)
} SuperRegEntry;
// Global registry (lock-free reads, mutex-protected writes)
// Declarations only; the objects themselves are defined outside this header.
extern SuperRegEntry g_super_reg[SUPER_REG_SIZE]; // Hash table probed by hak_super_lookup() starting at hak_super_hash()
extern pthread_mutex_t g_super_reg_lock; // Writer lock (also shared with the per-class registry)
extern int g_super_reg_initialized; // While this is 0, hak_super_lookup() returns NULL without touching the table
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Note: TINY_NUM_CLASSES is defined in hakmem_tiny.h (typically 8 for 16B-1KB)
// - g_super_reg_by_class[class][i] = SuperSlab pointer (NULL = empty slot)
// - g_super_reg_class_size[class] = number of active SuperSlabs for this class
// - Protected by g_super_reg_lock (shared with main registry)
// NOTE(review): the fallback value below has to equal the one in hakmem_tiny.h.
// If that header is included later with a different value, the extents of the
// arrays declared here would disagree with their definitions - confirm the
// include order used by the project.
#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8 // Fallback if hakmem_tiny.h not included yet
#endif
extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
extern int g_super_reg_class_size[TINY_NUM_CLASSES];
// ============================================================================
// Phase 9: Lazy Deallocation - LRU Cache Manager
// ============================================================================
// Global LRU cache for empty SuperSlabs (lazy deallocation).
// Bundles the two ends of the LRU list with the current usage counters and
// the limits an eviction policy can compare them against.
// NOTE(review): no lock for this structure is declared in this header;
// how concurrent access is synchronized must be checked in the implementation.
typedef struct {
SuperSlab* lru_head; // LRU list head (most recently used)
SuperSlab* lru_tail; // LRU list tail (least recently used)
uint32_t total_count; // Total SuperSlabs currently in the cache
uint32_t max_cached; // Maximum cached SuperSlabs (default: 256)
uint64_t total_memory_mb; // Total memory currently in the cache (MB)
uint64_t max_memory_mb; // Maximum memory limit (MB, default: 512)
uint64_t ttl_ns; // Time-to-live (nanoseconds, default: 60s)
uint32_t generation; // Current generation counter
} SuperSlabLRUCache;
// The single cache instance; defined outside this header.
extern SuperSlabLRUCache g_ss_lru_cache;
// Initialize LRU cache (called once at startup)
// Takes no arguments and returns nothing.
void hak_ss_lru_init(void);
// Try to reuse a cached SuperSlab (returns NULL if cache is empty)
// size_class: size class the caller needs a SuperSlab for
//             (presumably 0..TINY_NUM_CLASSES-1 - confirm against the implementation)
SuperSlab* hak_ss_lru_pop(uint8_t size_class);
// Add SuperSlab to LRU cache (returns 1 if cached, 0 if evicted immediately)
// NOTE(review): what happens to `ss` on a 0 return (who releases it) is not
// visible in this header - confirm ownership in the implementation.
int hak_ss_lru_push(SuperSlab* ss);
// Evict old SuperSlabs based on policy (TTL, max_cached, max_memory_mb)
void hak_ss_lru_evict(void);
// Mark SuperSlab as recently used (update timestamp, move to head)
void hak_ss_lru_touch(SuperSlab* ss);
// ============================================================================
// Phase 11: SuperSlab Prewarm - Eliminate mmap/munmap bottleneck
// ============================================================================
// Prewarm: Allocate SuperSlabs at startup and add to LRU cache
// Takes no arguments; NOTE(review): presumably obtains the prewarm counts from
// configuration on its own - confirm in the implementation.
void hak_ss_prewarm_init(void);
// Prewarm specific size class with count SuperSlabs
// size_class: class to prewarm; count: number of SuperSlabs to allocate for it.
// NOTE(review): size_class is `int` here but `uint8_t` in hak_ss_lru_pop();
// the valid range is not visible in this header.
void hak_ss_prewarm_class(int size_class, uint32_t count);
// Prewarm all classes (counts[i] = number of SuperSlabs for class i)
// The [TINY_NUM_CLASSES] extent is not enforced by the compiler (the parameter
// is a pointer); callers are expected to pass one count per class.
void hak_ss_prewarm_all(const uint32_t counts[TINY_NUM_CLASSES]);
// Initialize registry (call once at startup)
// Until g_super_reg_initialized is non-zero, hak_super_lookup() returns NULL
// for every pointer. NOTE(review): presumably this function is what sets that
// flag - confirm in the implementation.
void hak_super_registry_init(void);
// Hash function for aligned addresses (variable size)
// Maps a SuperSlab base address to its home slot in g_super_reg.
// Phase 8.3: ACE - the shift amount is a parameter so that 1MB (lg_size = 20)
// and 2MB (lg_size = 21) SuperSlabs can share one table.
//   base    - base address of the SuperSlab
//   lg_size - log2 of the SuperSlab size
// Returns: slot index in the range [0, SUPER_REG_MASK]
static inline int hak_super_hash(uintptr_t base, int lg_size) {
    const uintptr_t slab_number = base >> lg_size; // discard the low lg_size bits
    const uintptr_t slot = slab_number & SUPER_REG_MASK; // wrap into the table
    return (int)slot;
}
// Lookup SuperSlab by pointer (lock-free, thread-safe)
// Phase 8.3: ACE - Supports both 1MB and 2MB SuperSlabs
//   ptr - any address; it does not have to be a SuperSlab base
// Returns: SuperSlab* if found, NULL otherwise. NULL is returned when
//   - the registry has not been initialized,
//   - no entry within SUPER_MAX_PROBE slots matches either alignment,
//   - the matching entry's ss pointer has already been cleared, or
//   - the SuperSlab's magic is no longer SUPERSLAB_MAGIC.
static inline SuperSlab* hak_super_lookup(void* ptr) {
    if (!g_super_reg_initialized) {
        return NULL;
    }
    // Try both 1MB and 2MB alignments (1MB first for Step 1 default)
    // ACE will use both sizes dynamically in Step 3
    for (int lg = 20; lg <= 21; lg++) {
        // Build the mask in uintptr_t so its width never depends on `unsigned long`.
        const uintptr_t mask = ((uintptr_t)1 << lg) - 1;
        const uintptr_t base = (uintptr_t)ptr & ~mask;
        const int h = hak_super_hash(base, lg);
        // Linear probing with acquire semantics
        for (int i = 0; i < SUPER_MAX_PROBE; i++) {
            SuperRegEntry* e = &g_super_reg[(h + i) & SUPER_REG_MASK];
            const uintptr_t b = atomic_load_explicit(&e->base, memory_order_acquire);
            // Skip empty slots but KEEP PROBING. Unregister empties a slot by
            // writing base = 0 and leaves no tombstone, so an empty slot does
            // not prove that later slots in the probe window are unused: an
            // entry that was placed further along because of a collision must
            // still be found after its neighbour is removed. The cost is at
            // most SUPER_MAX_PROBE loads per alignment on a miss.
            // Checking for 0 first also means a pointer whose rounded-down
            // base is 0 can never "match" an empty slot.
            if (b == 0) {
                continue;
            }
            // Match both base address AND lg_size
            // NOTE(review): lg_size is a plain field read without the lock;
            // this relies on writers storing it before publishing base.
            if (b != base || e->lg_size != lg) {
                continue;
            }
            // Atomic load to prevent TOCTOU race with unregister
            SuperSlab* ss = atomic_load_explicit(&e->ss, memory_order_acquire);
            if (!ss) {
                return NULL; // Entry cleared by unregister
            }
            // CRITICAL: Check magic BEFORE returning pointer to prevent TOCTOU
            // Race scenario: lookup -> free (clear magic, munmap) -> caller checks magic
            // Fix: Check magic HERE while we're certain ss is still registered
            if (ss->magic != SUPERSLAB_MAGIC) {
                return NULL; // Being freed
            }
            return ss;
        }
    }
    return NULL; // Not found
}
// Register SuperSlab (mutex-protected, called after SuperSlab initialization)
//   base - aligned base address of the SuperSlab (the key later used by hak_super_lookup())
//   ss   - fully initialized SuperSlab to publish
// Returns: 1 on success, 0 if registry is full
// NOTE(review): "full" presumably means no free slot within SUPER_MAX_PROBE of
// the hash position, not that all SUPER_REG_SIZE slots are used - confirm.
int hak_super_register(uintptr_t base, SuperSlab* ss);
// Unregister SuperSlab (mutex-protected, MUST call before munmap)
// Critical: base = 0 happens BEFORE munmap to prevent reader segfault
//   base - the same base address that was passed to hak_super_register()
void hak_super_unregister(uintptr_t base);
// Debug: registry statistics record.
// Plain value struct that the caller owns and hak_super_registry_stats()
// receives by pointer. Field meanings follow their names; the exact
// definitions live in the implementation, not in this header.
typedef struct {
    int total_slots;     // slot count reported for the registry
    int used_slots;      // slots reported as occupied
    int max_probe_depth; // deepest probe distance reported
} SuperRegStats;

// Debug: Get registry statistics
//   stats - destination record supplied by the caller
void hak_super_registry_stats(SuperRegStats* stats);