Files
hakmem/core/hakmem_super_registry.h
Moe Charm (CI) fb10d1710b Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance

Implementation:

1. mincore removal (100% elimination)
   - Deleted: hakmem_internal.h hak_is_memory_readable() syscall
   - Deleted: tiny_free_fast_v2.inc.h safety checks
   - Alternative: Internal metadata (Registry + Header magic validation)
   - Result: 841 mincore calls → 0 calls 

2. SuperSlab Lazy Deallocation
   - Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
   - Extended SuperSlab: last_used_ns, generation, lru_prev/next
   - Deallocation policy: Count/Memory/TTL based eviction
   - Environment variables:
     * HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
     * HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
     * HAKMEM_SUPERSLAB_TTL_SEC=60 (default)

3. Integration
   - superslab_allocate: Try LRU cache first before mmap
   - superslab_free: Push to LRU cache instead of immediate munmap
   - Lazy deallocation: Defer munmap until cache limits exceeded

Performance Results (100K iterations, 256B allocations):

Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)

After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)

Key Achievements:
-  mincore: 100% elimination (841 → 0)
-  mmap: -30% reduction (1,250 → 877)
-  munmap: -35% reduction (1,321 → 852)
-  Total syscalls: -49% reduction (3,412 → 1,729)
-  Performance: +251% improvement (2.76M → 9.71M ops/s)

System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)

Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache

Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00

156 lines
6.6 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include <stdio.h>
#include <stdlib.h>
// Phase 1: SuperSlab Registry - Thread-safe O(1) lookup for SuperSlab ownership
//
// Purpose: Replace mincore() syscall (50-100ns) with userspace hash table lookup
// Performance: ~5-10ns per lookup, 10-20x faster than mincore()
//
// Thread Safety:
// - Readers: Lock-free with acquire semantics
// - Writers: Mutex-protected with release semantics
// - Publish order: ss initialization → release fence → base write
// - Unpublish order: base = 0 (release) → munmap (prevents reader deref)
#include <stdatomic.h>
#include <pthread.h>
#include <stdint.h>
#include "hakmem_tiny_superslab.h" // For SuperSlab and SUPERSLAB_MAGIC
// Registry configuration
// Increased from 4096 to 32768 to avoid registry exhaustion under
// high-churn microbenchmarks (e.g., larson with many active SuperSlabs).
// Still a power of two for fast masking.
#define SUPER_REG_SIZE 262144 // Power of 2 for fast modulo (8x larger for workloads)
#define SUPER_REG_MASK (SUPER_REG_SIZE - 1)
#define SUPER_MAX_PROBE 8 // Linear probing limit
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Purpose: Avoid 262K linear scan by indexing SuperSlabs by size class
// - Each class has 16384 slots (total: 8 classes × 16384 = 128K entries)
// - Refill scan: O(class_size) instead of O(262144)
// - Expected speedup: +200-300% for Larson (2.59M → 7.8M ops/s)
#define SUPER_REG_PER_CLASS 16384 // Per-class registry capacity (increased for high-churn workloads)
// Registry entry: base address → SuperSlab pointer mapping
typedef struct {
_Atomic(uintptr_t) base; // Aligned base address (1MB or 2MB, 0 = empty slot) [atomic for proper sync]
_Atomic(SuperSlab*) ss; // Atomic SuperSlab pointer (MT-safe, prevents TOCTOU race)
uint8_t lg_size; // Phase 8.3: ACE - SuperSlab size (20=1MB, 21=2MB)
uint8_t _pad[7]; // Padding to 24 bytes (cache-friendly)
} SuperRegEntry;
// Global registry (lock-free reads, mutex-protected writes)
extern SuperRegEntry g_super_reg[SUPER_REG_SIZE];
extern pthread_mutex_t g_super_reg_lock;
extern int g_super_reg_initialized;
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Note: TINY_NUM_CLASSES is defined in hakmem_tiny.h (typically 8 for 16B-1KB)
// - g_super_reg_by_class[class][i] = SuperSlab pointer (NULL = empty slot)
// - g_super_reg_class_size[class] = number of active SuperSlabs for this class
// - Protected by g_super_reg_lock (shared with main registry)
#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8 // Fallback if hakmem_tiny.h not included yet
#endif
extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
extern int g_super_reg_class_size[TINY_NUM_CLASSES];
// ============================================================================
// Phase 9: Lazy Deallocation - LRU Cache Manager
// ============================================================================
// Global LRU cache for empty SuperSlabs (lazy deallocation)
typedef struct {
SuperSlab* lru_head; // LRU list head (most recently used)
SuperSlab* lru_tail; // LRU list tail (least recently used)
uint32_t total_count; // Total SuperSlabs in cache
uint32_t max_cached; // Maximum cached SuperSlabs (default: 256)
uint64_t total_memory_mb; // Total memory in cache (MB)
uint64_t max_memory_mb; // Maximum memory limit (MB, default: 512)
uint64_t ttl_ns; // Time-to-live (nanoseconds, default: 60s)
uint32_t generation; // Current generation counter
} SuperSlabLRUCache;
extern SuperSlabLRUCache g_ss_lru_cache;
// Initialize LRU cache (called once at startup)
void hak_ss_lru_init(void);
// Try to reuse a cached SuperSlab (returns NULL if cache is empty)
SuperSlab* hak_ss_lru_pop(uint8_t size_class);
// Add SuperSlab to LRU cache (returns 1 if cached, 0 if evicted immediately)
int hak_ss_lru_push(SuperSlab* ss);
// Evict old SuperSlabs based on policy (TTL, max_cached, max_memory_mb)
void hak_ss_lru_evict(void);
// Mark SuperSlab as recently used (update timestamp, move to head)
void hak_ss_lru_touch(SuperSlab* ss);
// Initialize registry (call once at startup)
void hak_super_registry_init(void);
// Hash function for aligned addresses (variable size)
static inline int hak_super_hash(uintptr_t base, int lg_size) {
// Phase 8.3: ACE - Variable size hash (lg_size = 20 for 1MB, 21 for 2MB)
return (int)((base >> lg_size) & SUPER_REG_MASK);
}
// Lookup SuperSlab by pointer (lock-free, thread-safe)
// Returns: SuperSlab* if found, NULL otherwise
// Phase 8.3: ACE - Supports both 1MB and 2MB SuperSlabs
static inline SuperSlab* hak_super_lookup(void* ptr) {
if (!g_super_reg_initialized) return NULL;
// Try both 1MB and 2MB alignments (1MB first for Step 1 default)
// ACE will use both sizes dynamically in Step 3
for (int lg = 20; lg <= 21; lg++) {
uintptr_t mask = (1UL << lg) - 1;
uintptr_t base = (uintptr_t)ptr & ~mask;
int h = hak_super_hash(base, lg);
// Linear probing with acquire semantics
for (int i = 0; i < SUPER_MAX_PROBE; i++) {
SuperRegEntry* e = &g_super_reg[(h + i) & SUPER_REG_MASK];
uintptr_t b = atomic_load_explicit(&e->base, memory_order_acquire);
// Match both base address AND lg_size
if (b == base && e->lg_size == lg) {
// Atomic load to prevent TOCTOU race with unregister
SuperSlab* ss = atomic_load_explicit(&e->ss, memory_order_acquire);
if (!ss) return NULL; // Entry cleared by unregister
// CRITICAL: Check magic BEFORE returning pointer to prevent TOCTOU
// Race scenario: lookup → free (clear magic, munmap) → caller checks magic
// Fix: Check magic HERE while we're certain ss is still registered
if (ss->magic != SUPERSLAB_MAGIC) return NULL; // Being freed
return ss;
}
if (b == 0) break; // Empty slot, try next lg_size
}
}
return NULL; // Not found
}
// Register SuperSlab (mutex-protected, called after SuperSlab initialization)
// Returns: 1 on success, 0 if registry is full
int hak_super_register(uintptr_t base, SuperSlab* ss);
// Unregister SuperSlab (mutex-protected, MUST call before munmap)
// Critical: base = 0 happens BEFORE munmap to prevent reader segfault
void hak_super_unregister(uintptr_t base);
// Debug: Get registry statistics
typedef struct {
int total_slots;
int used_slots;
int max_probe_depth;
} SuperRegStats;
void hak_super_registry_stats(SuperRegStats* stats);