2025-11-05 12:31:14 +09:00
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 1: SuperSlab Registry - Thread-safe O(1) lookup for SuperSlab ownership
|
|
|
|
|
|
//
|
|
|
|
|
|
// Purpose: Replace mincore() syscall (50-100ns) with userspace hash table lookup
|
|
|
|
|
|
// Performance: ~5-10ns per lookup, 10-20x faster than mincore()
|
|
|
|
|
|
//
|
|
|
|
|
|
// Thread Safety:
|
|
|
|
|
|
// - Readers: Lock-free with acquire semantics
|
|
|
|
|
|
// - Writers: Mutex-protected with release semantics
|
|
|
|
|
|
// - Publish order: ss initialization → release fence → base write
|
|
|
|
|
|
// - Unpublish order: base = 0 (release) → munmap (prevents reader deref)
|
|
|
|
|
|
|
|
|
|
|
|
#include <stdatomic.h>
|
|
|
|
|
|
#include <pthread.h>
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
|
#include "hakmem_tiny_superslab.h" // For SuperSlab and SUPERSLAB_MAGIC
|
|
|
|
|
|
|
|
|
|
|
|
// Registry configuration
|
|
|
|
|
|
// Increased from 4096 to 32768 to avoid registry exhaustion under
|
|
|
|
|
|
// high-churn microbenchmarks (e.g., larson with many active SuperSlabs).
|
|
|
|
|
|
// Still a power of two for fast masking.
|
|
|
|
|
|
#define SUPER_REG_SIZE 262144 // Power of 2 for fast modulo (8x larger for workloads)
|
|
|
|
|
|
#define SUPER_REG_MASK (SUPER_REG_SIZE - 1)
|
2025-11-16 00:38:29 +09:00
|
|
|
|
#define SUPER_MAX_PROBE 32 // Linear probing limit (increased from 8 for Phase 15 fix)
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
2025-11-05 17:02:31 +09:00
|
|
|
|
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
|
|
|
|
|
|
// Purpose: Avoid 262K linear scan by indexing SuperSlabs by size class
|
|
|
|
|
|
// - Each class has 16384 slots (total: 8 classes × 16384 = 128K entries)
|
|
|
|
|
|
// - Refill scan: O(class_size) instead of O(262144)
|
|
|
|
|
|
// - Expected speedup: +200-300% for Larson (2.59M → 7.8M ops/s)
|
|
|
|
|
|
#define SUPER_REG_PER_CLASS 16384 // Per-class registry capacity (increased for high-churn workloads)
|
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Registry entry: base address → SuperSlab pointer mapping
|
|
|
|
|
|
typedef struct {
|
2025-11-07 18:07:48 +09:00
|
|
|
|
_Atomic(uintptr_t) base; // Aligned base address (1MB or 2MB, 0 = empty slot) [atomic for proper sync]
|
2025-11-05 12:31:14 +09:00
|
|
|
|
_Atomic(SuperSlab*) ss; // Atomic SuperSlab pointer (MT-safe, prevents TOCTOU race)
|
|
|
|
|
|
uint8_t lg_size; // Phase 8.3: ACE - SuperSlab size (20=1MB, 21=2MB)
|
|
|
|
|
|
uint8_t _pad[7]; // Padding to 24 bytes (cache-friendly)
|
|
|
|
|
|
} SuperRegEntry;
|
|
|
|
|
|
|
|
|
|
|
|
// Global registry (lock-free reads, mutex-protected writes)
|
|
|
|
|
|
extern SuperRegEntry g_super_reg[SUPER_REG_SIZE];
|
|
|
|
|
|
extern pthread_mutex_t g_super_reg_lock;
|
|
|
|
|
|
extern int g_super_reg_initialized;
|
|
|
|
|
|
|
2025-11-05 17:02:31 +09:00
|
|
|
|
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
|
|
|
|
|
|
// Note: TINY_NUM_CLASSES is defined in hakmem_tiny.h (typically 8 for 16B-1KB)
|
|
|
|
|
|
// - g_super_reg_by_class[class][i] = SuperSlab pointer (NULL = empty slot)
|
|
|
|
|
|
// - g_super_reg_class_size[class] = number of active SuperSlabs for this class
|
|
|
|
|
|
// - Protected by g_super_reg_lock (shared with main registry)
|
|
|
|
|
|
#ifndef TINY_NUM_CLASSES
|
|
|
|
|
|
#define TINY_NUM_CLASSES 8 // Fallback if hakmem_tiny.h not included yet
|
|
|
|
|
|
#endif
|
|
|
|
|
|
extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
|
|
|
|
|
|
extern int g_super_reg_class_size[TINY_NUM_CLASSES];
|
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 9: Lazy Deallocation - LRU Cache Manager
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// Global LRU cache for empty SuperSlabs (lazy deallocation)
|
|
|
|
|
|
typedef struct {
|
|
|
|
|
|
SuperSlab* lru_head; // LRU list head (most recently used)
|
|
|
|
|
|
SuperSlab* lru_tail; // LRU list tail (least recently used)
|
|
|
|
|
|
uint32_t total_count; // Total SuperSlabs in cache
|
|
|
|
|
|
uint32_t max_cached; // Maximum cached SuperSlabs (default: 256)
|
|
|
|
|
|
uint64_t total_memory_mb; // Total memory in cache (MB)
|
|
|
|
|
|
uint64_t max_memory_mb; // Maximum memory limit (MB, default: 512)
|
|
|
|
|
|
uint64_t ttl_ns; // Time-to-live (nanoseconds, default: 60s)
|
|
|
|
|
|
uint32_t generation; // Current generation counter
|
|
|
|
|
|
} SuperSlabLRUCache;
|
|
|
|
|
|
|
|
|
|
|
|
extern SuperSlabLRUCache g_ss_lru_cache;
|
|
|
|
|
|
|
|
|
|
|
|
// Initialize LRU cache (called once at startup)
|
|
|
|
|
|
void hak_ss_lru_init(void);
|
|
|
|
|
|
|
|
|
|
|
|
// Try to reuse a cached SuperSlab (returns NULL if cache is empty)
|
|
|
|
|
|
SuperSlab* hak_ss_lru_pop(uint8_t size_class);
|
|
|
|
|
|
|
|
|
|
|
|
// Add SuperSlab to LRU cache (returns 1 if cached, 0 if evicted immediately)
|
|
|
|
|
|
int hak_ss_lru_push(SuperSlab* ss);
|
|
|
|
|
|
|
|
|
|
|
|
// Evict old SuperSlabs based on policy (TTL, max_cached, max_memory_mb)
|
|
|
|
|
|
void hak_ss_lru_evict(void);
|
|
|
|
|
|
|
|
|
|
|
|
// Mark SuperSlab as recently used (update timestamp, move to head)
|
|
|
|
|
|
void hak_ss_lru_touch(SuperSlab* ss);
|
|
|
|
|
|
|
2025-11-13 14:45:43 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 11: SuperSlab Prewarm - Eliminate mmap/munmap bottleneck
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// Prewarm: Allocate SuperSlabs at startup and add to LRU cache
|
|
|
|
|
|
void hak_ss_prewarm_init(void);
|
|
|
|
|
|
|
|
|
|
|
|
// Prewarm specific size class with count SuperSlabs
|
|
|
|
|
|
void hak_ss_prewarm_class(int size_class, uint32_t count);
|
|
|
|
|
|
|
|
|
|
|
|
// Prewarm all classes (counts[i] = number of SuperSlabs for class i)
|
|
|
|
|
|
void hak_ss_prewarm_all(const uint32_t counts[TINY_NUM_CLASSES]);
|
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Initialize registry (call once at startup)
|
|
|
|
|
|
void hak_super_registry_init(void);
|
|
|
|
|
|
|
|
|
|
|
|
// Hash function for aligned addresses (variable size)
|
|
|
|
|
|
static inline int hak_super_hash(uintptr_t base, int lg_size) {
|
|
|
|
|
|
// Phase 8.3: ACE - Variable size hash (lg_size = 20 for 1MB, 21 for 2MB)
|
|
|
|
|
|
return (int)((base >> lg_size) & SUPER_REG_MASK);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Lookup SuperSlab by pointer (lock-free, thread-safe)
|
|
|
|
|
|
// Returns: SuperSlab* if found, NULL otherwise
|
|
|
|
|
|
// Phase 8.3: ACE - Supports both 1MB and 2MB SuperSlabs
|
|
|
|
|
|
static inline SuperSlab* hak_super_lookup(void* ptr) {
|
|
|
|
|
|
if (!g_super_reg_initialized) return NULL;
|
|
|
|
|
|
|
2025-11-28 01:45:45 +09:00
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-16 00:38:29 +09:00
|
|
|
|
// Debug logging (ENV-gated)
|
|
|
|
|
|
static __thread int s_dbg = -1;
|
|
|
|
|
|
if (__builtin_expect(s_dbg == -1, 0)) {
|
|
|
|
|
|
const char* e = getenv("HAKMEM_SUPER_LOOKUP_DEBUG");
|
|
|
|
|
|
s_dbg = (e && *e && *e != '0') ? 1 : 0;
|
|
|
|
|
|
}
|
2025-11-28 01:45:45 +09:00
|
|
|
|
#else
|
|
|
|
|
|
static const int s_dbg = 0;
|
|
|
|
|
|
#endif
|
2025-11-16 00:38:29 +09:00
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Try both 1MB and 2MB alignments (1MB first for Step 1 default)
|
|
|
|
|
|
// ACE will use both sizes dynamically in Step 3
|
|
|
|
|
|
for (int lg = 20; lg <= 21; lg++) {
|
|
|
|
|
|
uintptr_t mask = (1UL << lg) - 1;
|
|
|
|
|
|
uintptr_t base = (uintptr_t)ptr & ~mask;
|
|
|
|
|
|
int h = hak_super_hash(base, lg);
|
|
|
|
|
|
|
2025-11-16 00:38:29 +09:00
|
|
|
|
if (s_dbg == 1) {
|
|
|
|
|
|
fprintf(stderr, "[SUPER_LOOKUP] ptr=%p lg=%d aligned_base=%p hash=%d\n",
|
|
|
|
|
|
ptr, lg, (void*)base, h);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Linear probing with acquire semantics
|
|
|
|
|
|
for (int i = 0; i < SUPER_MAX_PROBE; i++) {
|
|
|
|
|
|
SuperRegEntry* e = &g_super_reg[(h + i) & SUPER_REG_MASK];
|
2025-11-07 18:07:48 +09:00
|
|
|
|
uintptr_t b = atomic_load_explicit(&e->base, memory_order_acquire);
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
2025-11-16 00:38:29 +09:00
|
|
|
|
if (s_dbg == 1 && b != 0) {
|
|
|
|
|
|
fprintf(stderr, "[SUPER_LOOKUP] probe[%d] entry_base=%p entry_lg=%d (match=%d)\n",
|
|
|
|
|
|
i, (void*)b, e->lg_size, (b == base && e->lg_size == lg));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Match both base address AND lg_size
|
|
|
|
|
|
if (b == base && e->lg_size == lg) {
|
|
|
|
|
|
// Atomic load to prevent TOCTOU race with unregister
|
|
|
|
|
|
SuperSlab* ss = atomic_load_explicit(&e->ss, memory_order_acquire);
|
2025-11-16 00:38:29 +09:00
|
|
|
|
if (!ss) {
|
|
|
|
|
|
if (s_dbg == 1) {
|
|
|
|
|
|
fprintf(stderr, "[SUPER_LOOKUP] MATCH but ss=NULL (unregistered)\n");
|
|
|
|
|
|
}
|
|
|
|
|
|
return NULL; // Entry cleared by unregister
|
|
|
|
|
|
}
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
|
|
|
|
|
// CRITICAL: Check magic BEFORE returning pointer to prevent TOCTOU
|
|
|
|
|
|
// Race scenario: lookup → free (clear magic, munmap) → caller checks magic
|
|
|
|
|
|
// Fix: Check magic HERE while we're certain ss is still registered
|
2025-11-16 00:38:29 +09:00
|
|
|
|
if (ss->magic != SUPERSLAB_MAGIC) {
|
|
|
|
|
|
if (s_dbg == 1) {
|
|
|
|
|
|
fprintf(stderr, "[SUPER_LOOKUP] MATCH but bad magic=%llx (being freed)\n",
|
|
|
|
|
|
(unsigned long long)ss->magic);
|
|
|
|
|
|
}
|
|
|
|
|
|
return NULL; // Being freed
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (s_dbg == 1) {
|
|
|
|
|
|
fprintf(stderr, "[SUPER_LOOKUP] FOUND: ss=%p magic=%llx\n",
|
|
|
|
|
|
(void*)ss, (unsigned long long)ss->magic);
|
|
|
|
|
|
}
|
2025-11-05 12:31:14 +09:00
|
|
|
|
return ss;
|
|
|
|
|
|
}
|
|
|
|
|
|
if (b == 0) break; // Empty slot, try next lg_size
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-16 00:38:29 +09:00
|
|
|
|
|
|
|
|
|
|
if (s_dbg == 1) {
|
|
|
|
|
|
fprintf(stderr, "[SUPER_LOOKUP] NOT FOUND (all lg sizes exhausted)\n");
|
|
|
|
|
|
}
|
2025-11-05 12:31:14 +09:00
|
|
|
|
return NULL; // Not found
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Register SuperSlab (mutex-protected, called after SuperSlab initialization)
|
|
|
|
|
|
// Returns: 1 on success, 0 if registry is full
|
|
|
|
|
|
int hak_super_register(uintptr_t base, SuperSlab* ss);
|
|
|
|
|
|
|
|
|
|
|
|
// Unregister SuperSlab (mutex-protected, MUST call before munmap)
|
|
|
|
|
|
// Critical: base = 0 happens BEFORE munmap to prevent reader segfault
|
|
|
|
|
|
void hak_super_unregister(uintptr_t base);
|
|
|
|
|
|
|
|
|
|
|
|
// Debug: Get registry statistics
|
|
|
|
|
|
typedef struct {
|
|
|
|
|
|
int total_slots;
|
|
|
|
|
|
int used_slots;
|
|
|
|
|
|
int max_probe_depth;
|
|
|
|
|
|
} SuperRegStats;
|
|
|
|
|
|
|
|
|
|
|
|
void hak_super_registry_stats(SuperRegStats* stats);
|