2025-11-05 12:31:14 +09:00
|
|
|
#include "hakmem_super_registry.h"
|
|
|
|
|
#include "hakmem_tiny_superslab.h"
|
2025-11-29 06:22:49 +09:00
|
|
|
#include "box/ss_allocation_box.h" // For superslab_allocate() declaration
|
2025-11-30 07:16:50 +09:00
|
|
|
#include "box/ss_addr_map_box.h" // Phase 9-1: SuperSlab address map
|
2025-12-02 19:43:23 +09:00
|
|
|
#include "box/ss_cold_start_box.inc.h" // Phase 11+: Cold Start prewarm defaults
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
#include "hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
|
2025-12-07 03:12:27 +09:00
|
|
|
#include <stdlib.h>
|
2025-11-05 12:31:14 +09:00
|
|
|
#include <string.h>
|
|
|
|
|
#include <stdio.h>
|
C7 Stride Upgrade: Fix 1024B→2048B alignment corruption (ROOT CAUSE)
## Problem
C7 (1KB class) blocks were being carved with 1024B stride but expected
to align with 2048B stride, causing systematic NXT_MISALIGN errors with
characteristic pattern: delta_mod = 1026, 1028, 1030, 1032... (1024*N + offset).
This caused crashes, double-frees, and alignment violations in 1024B workloads.
## Root Cause
The global array `g_tiny_class_sizes[]` was correctly updated to 2048B,
but `tiny_block_stride_for_class()` contained a LOCAL static const array
with the old 1024B value:
```c
// hakmem_tiny_superslab.h:52 (BEFORE)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 1024};
^^^^
```
This local table was used by ALL carve operations, causing every C7 block
to be allocated with 1024B stride despite the 2048B upgrade.
## Fix
Updated local stride table in `tiny_block_stride_for_class()`:
```c
// hakmem_tiny_superslab.h:52 (AFTER)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 2048};
^^^^
```
## Verification
**Before**: NXT_MISALIGN delta_mod shows 1024B pattern (1026, 1028, 1030...)
**After**: NXT_MISALIGN delta_mod shows random values (227, 994, 195...)
→ No more 1024B alignment pattern = stride upgrade successful ✓
## Additional Safety Layers (Defense in Depth)
1. **Validation Logic Fix** (tiny_nextptr.h:100)
- Changed stride check to use `tiny_block_stride_for_class()` (includes header)
- Was using `g_tiny_class_sizes[]` (raw size without header)
2. **TLS SLL Purge** (hakmem_tiny_lazy_init.inc.h:83-87)
- Clear TLS SLL on lazy class initialization
- Prevents stale blocks from previous runs
3. **Pre-Carve Geometry Validation** (hakmem_tiny_refill_p0.inc.h:273-297)
- Validates slab capacity matches current stride before carving
- Reinitializes if geometry is stale (e.g., after stride upgrade)
4. **LRU Stride Validation** (hakmem_super_registry.c:369-458)
- Validates cached SuperSlabs have compatible stride
- Evicts incompatible SuperSlabs immediately
5. **Shared Pool Geometry Fix** (hakmem_shared_pool.c:722-733)
- Reinitializes slab geometry on acquisition if capacity mismatches
6. **Legacy Backend Validation** (ss_legacy_backend_box.c:138-155)
- Validates geometry before allocation in legacy path
## Impact
- Eliminates 100% of 1024B-pattern alignment errors
- Fixes crashes in 1024B workloads (bench_random_mixed 1024B now stable)
- Establishes multiple validation layers to prevent future stride issues
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 22:55:17 +09:00
|
|
|
#include <sys/mman.h> // munmap for incompatible SuperSlab eviction
|
2025-11-05 12:31:14 +09:00
|
|
|
|
2025-12-07 03:12:27 +09:00
|
|
|
// Global registry storage (allocated via SuperRegBox)
|
|
|
|
|
static SuperRegEntry* reg_entries(void) {
|
|
|
|
|
return super_reg_entries();
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
// Mutex taken by hak_super_register(), hak_super_unregister(),
// hak_ss_lru_init() and hak_ss_lru_touch() around registry and LRU updates.
pthread_mutex_t g_super_reg_lock = PTHREAD_MUTEX_INITIALIZER;
|
|
|
|
|
// Set to 1 as the last step of hak_super_registry_init(); read without the
// lock as an "already initialized" check by the register/unregister paths.
int g_super_reg_initialized = 0;
|
|
|
|
|
|
2025-11-05 17:02:31 +09:00
|
|
|
// Per-class registry storage (Phase 6: Registry Optimization)
|
|
|
|
|
// Zeroed by hak_super_registry_init(). NOTE(review): presumably a per-class
// entry count; no other use is visible in this part of the file - confirm.
int g_super_reg_class_size[TINY_NUM_CLASSES];
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
// Phase 9: Lazy Deallocation - LRU Cache Storage
|
|
|
|
|
// Global LRU cache state (limits, list head/tail, counters). The fields are
// populated by hak_ss_lru_init().
SuperSlabLRUCache g_ss_lru_cache = {0};
|
|
|
|
|
// Set to 1 by hak_ss_lru_init() while it holds g_super_reg_lock.
static int g_ss_lru_initialized = 0;
|
|
|
|
|
|
2025-11-13 14:45:43 +09:00
|
|
|
// Phase 11: Prewarm bypass flag (disable LRU pop during prewarm)
|
|
|
|
|
// NOTE(review): not referenced in the portion of the file reviewed here;
// confirm where this flag is set and cleared.
static _Atomic int g_ss_prewarm_bypass = 0;
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
// Initialize registry (call once at startup)
|
|
|
|
|
void hak_super_registry_init(void) {
|
|
|
|
|
if (g_super_reg_initialized) return;
|
|
|
|
|
|
2025-12-07 03:12:27 +09:00
|
|
|
super_reg_init(NULL, NULL);
|
|
|
|
|
|
|
|
|
|
SuperRegEntry* entries = reg_entries();
|
|
|
|
|
int reg_cap = super_reg_effective_size();
|
|
|
|
|
if (!entries) {
|
|
|
|
|
fprintf(stderr, "[SUPER_REG] init failed: no registry entries\n");
|
|
|
|
|
abort();
|
|
|
|
|
}
|
2025-11-05 17:02:31 +09:00
|
|
|
// Zero-initialize all entries (hash table)
|
2025-12-07 03:12:27 +09:00
|
|
|
memset(entries, 0, (size_t)reg_cap * sizeof(SuperRegEntry));
|
2025-11-05 12:31:14 +09:00
|
|
|
|
2025-11-05 17:02:31 +09:00
|
|
|
// Zero-initialize per-class registry (Phase 6: Registry Optimization)
|
2025-12-07 03:12:27 +09:00
|
|
|
SuperSlab** by_class = super_reg_by_class_slots();
|
|
|
|
|
int stride = super_reg_by_class_stride();
|
|
|
|
|
if (by_class && stride > 0) {
|
|
|
|
|
memset(by_class, 0, (size_t)TINY_NUM_CLASSES * (size_t)stride * sizeof(SuperSlab*));
|
|
|
|
|
}
|
2025-11-05 17:02:31 +09:00
|
|
|
memset(g_super_reg_class_size, 0, sizeof(g_super_reg_class_size));
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
// Memory fence to ensure initialization is visible to all threads
|
|
|
|
|
atomic_thread_fence(memory_order_release);
|
|
|
|
|
|
|
|
|
|
g_super_reg_initialized = 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Register SuperSlab (mutex-protected)
|
|
|
|
|
// CRITICAL: Call AFTER SuperSlab is fully initialized
|
|
|
|
|
// Publish order: ss init → release fence → base write
|
|
|
|
|
// Phase 8.3: ACE - lg_size aware registration
|
2025-11-05 17:02:31 +09:00
|
|
|
// Phase 6: Registry Optimization - Also add to per-class registry for fast refill scan
|
2025-11-05 12:31:14 +09:00
|
|
|
int hak_super_register(uintptr_t base, SuperSlab* ss) {
|
|
|
|
|
if (!g_super_reg_initialized) {
|
|
|
|
|
hak_super_registry_init();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pthread_mutex_lock(&g_super_reg_lock);
|
|
|
|
|
|
|
|
|
|
int lg = ss->lg_size; // Phase 8.3: Get lg_size from SuperSlab
|
2025-11-16 00:38:29 +09:00
|
|
|
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate debug syscall overhead)
|
2025-11-28 01:46:50 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
int dbg = HAK_ENV_SUPER_REG_DEBUG();
|
2025-11-28 01:46:50 +09:00
|
|
|
#else
|
|
|
|
|
const int dbg = 0;
|
|
|
|
|
#endif
|
2025-11-16 00:38:29 +09:00
|
|
|
|
2025-12-07 03:12:27 +09:00
|
|
|
SuperRegEntry* entries = reg_entries();
|
|
|
|
|
if (!entries) {
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
int h = hak_super_hash(base, lg);
|
2025-12-07 03:12:27 +09:00
|
|
|
const int mask = super_reg_effective_mask();
|
|
|
|
|
const int probe_limit = super_reg_effective_size() > SUPER_MAX_PROBE
|
|
|
|
|
? SUPER_MAX_PROBE
|
|
|
|
|
: super_reg_effective_size();
|
2025-11-05 12:31:14 +09:00
|
|
|
|
2025-11-05 17:02:31 +09:00
|
|
|
// Step 1: Register in hash table (for address → SuperSlab lookup)
|
|
|
|
|
int hash_registered = 0;
|
2025-12-07 03:12:27 +09:00
|
|
|
for (int i = 0; i < probe_limit; i++) {
|
|
|
|
|
SuperRegEntry* e = &entries[(h + i) & mask];
|
2025-11-05 12:31:14 +09:00
|
|
|
|
2025-11-07 18:07:48 +09:00
|
|
|
if (atomic_load_explicit(&e->base, memory_order_acquire) == 0) {
|
2025-11-05 12:31:14 +09:00
|
|
|
// Found empty slot
|
|
|
|
|
// Step 1: Write SuperSlab pointer and lg_size (atomic for MT-safety)
|
|
|
|
|
atomic_store_explicit(&e->ss, ss, memory_order_release);
|
|
|
|
|
e->lg_size = lg; // Phase 8.3: Store lg_size for fast lookup
|
|
|
|
|
|
|
|
|
|
// Step 2: Release fence (ensures ss/lg_size write is visible before base)
|
|
|
|
|
atomic_thread_fence(memory_order_release);
|
|
|
|
|
|
|
|
|
|
// Step 3: Publish base address (makes entry visible to readers)
|
2025-11-07 18:07:48 +09:00
|
|
|
atomic_store_explicit(&e->base, base, memory_order_release);
|
2025-11-05 12:31:14 +09:00
|
|
|
|
2025-11-05 17:02:31 +09:00
|
|
|
hash_registered = 1;
|
2025-11-16 00:38:29 +09:00
|
|
|
if (dbg == 1) {
|
2025-11-13 16:33:03 +09:00
|
|
|
fprintf(stderr, "[SUPER_REG] register base=%p lg=%d slot=%d magic=%llx\n",
|
2025-12-07 03:12:27 +09:00
|
|
|
(void*)base, lg, (h + i) & mask,
|
2025-11-07 18:07:48 +09:00
|
|
|
(unsigned long long)ss->magic);
|
|
|
|
|
}
|
2025-11-05 17:02:31 +09:00
|
|
|
break;
|
2025-11-05 12:31:14 +09:00
|
|
|
}
|
|
|
|
|
|
2025-11-07 18:07:48 +09:00
|
|
|
if (atomic_load_explicit(&e->base, memory_order_acquire) == base && e->lg_size == lg) {
|
2025-11-05 12:31:14 +09:00
|
|
|
// Already registered (duplicate registration)
|
2025-11-05 17:02:31 +09:00
|
|
|
hash_registered = 1;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!hash_registered) {
|
|
|
|
|
// Hash table full (probing limit reached)
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
fprintf(stderr, "HAKMEM: SuperSlab registry full! Increase SUPER_REG_SIZE\n");
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-13 16:33:03 +09:00
|
|
|
// Phase 12: per-class registry not keyed by ss->size_class anymore.
|
|
|
|
|
// Keep existing global hash registration only.
|
2025-11-30 07:16:50 +09:00
|
|
|
|
|
|
|
|
// Phase 9-1: Also register in new hash table (for optimized lookup)
|
|
|
|
|
ss_map_insert(&g_ss_addr_map, (void*)base, ss);
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
2025-11-05 17:02:31 +09:00
|
|
|
return 1;
|
2025-11-05 12:31:14 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Unregister SuperSlab (mutex-protected)
|
|
|
|
|
// CRITICAL: Call BEFORE munmap to prevent reader segfault
|
|
|
|
|
// Unpublish order: base = 0 (release) → munmap outside this function
|
|
|
|
|
// Phase 8.3: ACE - Try both lg_sizes (we don't know which one was used)
|
2025-11-05 17:02:31 +09:00
|
|
|
// Phase 6: Registry Optimization - Also remove from per-class registry
|
2025-11-05 12:31:14 +09:00
|
|
|
void hak_super_unregister(uintptr_t base) {
|
2025-11-28 01:46:50 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-07 18:07:48 +09:00
|
|
|
static int dbg_once = -1; // shared with register path for debug toggle
|
2025-11-28 01:46:50 +09:00
|
|
|
#else
|
|
|
|
|
static const int dbg_once = 0;
|
|
|
|
|
#endif
|
2025-12-10 09:08:18 +09:00
|
|
|
(void)dbg_once;
|
2025-11-05 12:31:14 +09:00
|
|
|
if (!g_super_reg_initialized) return;
|
|
|
|
|
|
|
|
|
|
pthread_mutex_lock(&g_super_reg_lock);
|
|
|
|
|
|
2025-11-05 17:02:31 +09:00
|
|
|
// Step 1: Find and remove from hash table
|
|
|
|
|
SuperSlab* ss = NULL; // Save SuperSlab pointer for per-class removal
|
2025-12-07 03:12:27 +09:00
|
|
|
SuperRegEntry* entries = reg_entries();
|
|
|
|
|
if (!entries) {
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
for (int lg = 20; lg <= 21; lg++) {
|
|
|
|
|
int h = hak_super_hash(base, lg);
|
2025-12-07 03:12:27 +09:00
|
|
|
const int mask = super_reg_effective_mask();
|
|
|
|
|
const int probe_limit = super_reg_effective_size() > SUPER_MAX_PROBE
|
|
|
|
|
? SUPER_MAX_PROBE
|
|
|
|
|
: super_reg_effective_size();
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
|
|
|
// Linear probing to find matching entry
|
2025-12-07 03:12:27 +09:00
|
|
|
for (int i = 0; i < probe_limit; i++) {
|
|
|
|
|
SuperRegEntry* e = &entries[(h + i) & mask];
|
2025-11-05 12:31:14 +09:00
|
|
|
|
2025-11-07 18:07:48 +09:00
|
|
|
if (atomic_load_explicit(&e->base, memory_order_acquire) == base && e->lg_size == lg) {
|
2025-11-05 12:31:14 +09:00
|
|
|
// Found entry to remove
|
2025-11-05 17:02:31 +09:00
|
|
|
// Save SuperSlab pointer BEFORE clearing (for per-class removal)
|
|
|
|
|
ss = atomic_load_explicit(&e->ss, memory_order_acquire);
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
// Step 1: Clear SuperSlab pointer (atomic, prevents TOCTOU race)
|
|
|
|
|
atomic_store_explicit(&e->ss, NULL, memory_order_release);
|
|
|
|
|
|
|
|
|
|
// Step 2: Unpublish base (makes entry invisible to readers)
|
2025-11-07 18:07:48 +09:00
|
|
|
atomic_store_explicit(&e->base, 0, memory_order_release);
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
|
|
|
// Step 3: Clear lg_size (optional cleanup)
|
|
|
|
|
e->lg_size = 0;
|
2025-11-28 01:46:50 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate lazy-init overhead)
|
2025-11-07 18:07:48 +09:00
|
|
|
if (__builtin_expect(dbg_once == -1, 0)) {
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
dbg_once = HAK_ENV_SUPER_REG_DEBUG();
|
2025-11-07 18:07:48 +09:00
|
|
|
}
|
|
|
|
|
if (dbg_once == 1) {
|
|
|
|
|
fprintf(stderr, "[SUPER_REG] unregister base=%p\n", (void*)base);
|
|
|
|
|
}
|
2025-11-28 01:46:50 +09:00
|
|
|
#endif
|
2025-11-05 12:31:14 +09:00
|
|
|
|
2025-11-05 17:02:31 +09:00
|
|
|
// Found in hash table, continue to per-class removal
|
|
|
|
|
goto hash_removed;
|
2025-11-05 12:31:14 +09:00
|
|
|
}
|
|
|
|
|
|
2025-11-07 18:07:48 +09:00
|
|
|
if (atomic_load_explicit(&e->base, memory_order_acquire) == 0) {
|
2025-11-05 12:31:14 +09:00
|
|
|
// Not found in this lg_size, try next
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-05 17:02:31 +09:00
|
|
|
hash_removed:
|
|
|
|
|
// Step 2: Remove from per-class registry (Phase 6: Registry Optimization)
|
|
|
|
|
if (ss && ss->magic == SUPERSLAB_MAGIC) {
|
2025-11-13 16:33:03 +09:00
|
|
|
// Phase 12: per-class registry no longer keyed; no per-class removal required.
|
2025-11-05 17:02:31 +09:00
|
|
|
}
|
|
|
|
|
|
2025-11-30 07:16:50 +09:00
|
|
|
// Phase 9-1: Also remove from new hash table
|
|
|
|
|
ss_map_remove(&g_ss_addr_map, (void*)base);
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
// Not found is not an error (could be duplicate unregister)
|
|
|
|
|
}
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
// ============================================================================
|
|
|
|
|
// Phase 9: Lazy Deallocation - LRU Cache Implementation
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
// hak_now_ns() is defined in superslab/superslab_inline.h - use that
|
|
|
|
|
#include <sys/mman.h> // For munmap
|
|
|
|
|
|
|
|
|
|
// Initialize LRU cache (called once at startup)
|
|
|
|
|
void hak_ss_lru_init(void) {
|
|
|
|
|
if (g_ss_lru_initialized) return;
|
|
|
|
|
|
|
|
|
|
pthread_mutex_lock(&g_super_reg_lock);
|
|
|
|
|
|
|
|
|
|
if (g_ss_lru_initialized) {
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate config syscall overhead)
|
|
|
|
|
g_ss_lru_cache.max_cached = (uint32_t)HAK_ENV_SUPERSLAB_MAX_CACHED();
|
|
|
|
|
g_ss_lru_cache.max_memory_mb = (uint64_t)HAK_ENV_SUPERSLAB_MAX_MEMORY_MB();
|
|
|
|
|
uint32_t ttl_sec = (uint32_t)HAK_ENV_SUPERSLAB_TTL_SEC();
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
g_ss_lru_cache.ttl_ns = (uint64_t)ttl_sec * 1000000000ULL;
|
|
|
|
|
|
|
|
|
|
g_ss_lru_cache.lru_head = NULL;
|
|
|
|
|
g_ss_lru_cache.lru_tail = NULL;
|
|
|
|
|
g_ss_lru_cache.total_count = 0;
|
|
|
|
|
g_ss_lru_cache.total_memory_mb = 0;
|
|
|
|
|
g_ss_lru_cache.generation = 0;
|
|
|
|
|
|
|
|
|
|
g_ss_lru_initialized = 1;
|
|
|
|
|
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
fprintf(stderr, "[SS_LRU_INIT] max_cached=%u max_memory_mb=%llu ttl_sec=%u\n",
|
|
|
|
|
g_ss_lru_cache.max_cached,
|
|
|
|
|
(unsigned long long)g_ss_lru_cache.max_memory_mb,
|
|
|
|
|
ttl_sec);
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Remove SuperSlab from LRU list (does NOT free memory)
|
|
|
|
|
static void ss_lru_remove(SuperSlab* ss) {
|
|
|
|
|
if (!ss) return;
|
|
|
|
|
|
|
|
|
|
if (ss->lru_prev) {
|
|
|
|
|
ss->lru_prev->lru_next = ss->lru_next;
|
|
|
|
|
} else {
|
|
|
|
|
g_ss_lru_cache.lru_head = ss->lru_next;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (ss->lru_next) {
|
|
|
|
|
ss->lru_next->lru_prev = ss->lru_prev;
|
|
|
|
|
} else {
|
|
|
|
|
g_ss_lru_cache.lru_tail = ss->lru_prev;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ss->lru_prev = NULL;
|
|
|
|
|
ss->lru_next = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Insert SuperSlab at head of LRU list (most recently used)
|
|
|
|
|
static void ss_lru_insert_head(SuperSlab* ss) {
|
|
|
|
|
if (!ss) return;
|
|
|
|
|
|
|
|
|
|
ss->lru_next = g_ss_lru_cache.lru_head;
|
|
|
|
|
ss->lru_prev = NULL;
|
|
|
|
|
|
|
|
|
|
if (g_ss_lru_cache.lru_head) {
|
|
|
|
|
g_ss_lru_cache.lru_head->lru_prev = ss;
|
|
|
|
|
} else {
|
|
|
|
|
g_ss_lru_cache.lru_tail = ss;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
g_ss_lru_cache.lru_head = ss;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Mark SuperSlab as recently used (move to head)
|
|
|
|
|
void hak_ss_lru_touch(SuperSlab* ss) {
|
|
|
|
|
if (!ss || !g_ss_lru_initialized) return;
|
|
|
|
|
|
|
|
|
|
pthread_mutex_lock(&g_super_reg_lock);
|
|
|
|
|
|
|
|
|
|
ss->last_used_ns = hak_now_ns();
|
|
|
|
|
|
|
|
|
|
// If already in list, remove and re-insert at head
|
|
|
|
|
if (ss->lru_prev || ss->lru_next || g_ss_lru_cache.lru_head == ss) {
|
|
|
|
|
ss_lru_remove(ss);
|
|
|
|
|
ss_lru_insert_head(ss);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Evict one SuperSlab from tail (oldest)
|
|
|
|
|
// Returns: 1 if evicted, 0 if cache is empty
|
|
|
|
|
static int ss_lru_evict_one(void) {
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate lazy-init static overhead)
|
2025-11-28 01:48:02 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-14 06:49:32 +09:00
|
|
|
static int dbg = -1;
|
|
|
|
|
if (__builtin_expect(dbg == -1, 0)) {
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
dbg = HAK_ENV_SS_LRU_DEBUG();
|
2025-11-14 06:49:32 +09:00
|
|
|
}
|
2025-11-28 01:48:02 +09:00
|
|
|
#else
|
|
|
|
|
static const int dbg = 0;
|
|
|
|
|
#endif
|
2025-11-14 06:49:32 +09:00
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
SuperSlab* victim = g_ss_lru_cache.lru_tail;
|
|
|
|
|
if (!victim) return 0;
|
|
|
|
|
|
2025-12-04 14:22:48 +09:00
|
|
|
// Safety guard: if the tail SuperSlab is no longer registered in the
|
|
|
|
|
// global registry, its memory may already have been unmapped by another
|
|
|
|
|
// path. In that case, dereferencing victim (or its lru_prev/next) is
|
|
|
|
|
// unsafe. Treat this as a stale LRU entry and conservatively reset the
|
|
|
|
|
// cache to an empty state instead of evicting.
|
|
|
|
|
//
|
|
|
|
|
// NOTE: hak_super_lookup() only consults the registry / address map and
|
|
|
|
|
// never dereferences the SuperSlab pointer itself, so this check is safe
|
|
|
|
|
// even if victim has been munmapped.
|
|
|
|
|
if (hak_super_lookup((void*)victim) == NULL) {
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
static int stale_log_count = 0;
|
|
|
|
|
if (stale_log_count < 4) {
|
|
|
|
|
fprintf(stderr,
|
|
|
|
|
"[SS_LRU_STALE_TAIL] victim=%p not in registry; resetting LRU cache\n",
|
|
|
|
|
(void*)victim);
|
|
|
|
|
stale_log_count++;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
g_ss_lru_cache.lru_head = NULL;
|
|
|
|
|
g_ss_lru_cache.lru_tail = NULL;
|
|
|
|
|
g_ss_lru_cache.total_count = 0;
|
|
|
|
|
g_ss_lru_cache.total_memory_mb = 0;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
// Remove from LRU list
|
|
|
|
|
ss_lru_remove(victim);
|
|
|
|
|
g_ss_lru_cache.total_count--;
|
|
|
|
|
size_t ss_size = (size_t)1 << victim->lg_size;
|
|
|
|
|
g_ss_lru_cache.total_memory_mb -= (ss_size / (1024 * 1024));
|
|
|
|
|
|
|
|
|
|
// Unregister and free
|
|
|
|
|
uintptr_t base = (uintptr_t)victim;
|
2025-12-10 09:08:18 +09:00
|
|
|
(void)base;
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
|
2025-11-14 06:49:32 +09:00
|
|
|
// Debug logging for LRU EVICT
|
|
|
|
|
if (dbg == 1) {
|
|
|
|
|
fprintf(stderr, "[LRU_EVICT] ss=%p size=%zu KB (freed)\n",
|
|
|
|
|
(void*)victim, ss_size / 1024);
|
|
|
|
|
}
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
// Already unregistered when added to cache, just munmap
|
|
|
|
|
victim->magic = 0;
|
|
|
|
|
munmap(victim, ss_size);
|
|
|
|
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
static int evict_log_count = 0;
|
|
|
|
|
if (evict_log_count < 10) {
|
2025-11-13 16:33:03 +09:00
|
|
|
fprintf(stderr, "[SS_LRU_EVICT] ss=%p size=%zu (cache_count=%u)\n",
|
|
|
|
|
victim, ss_size, g_ss_lru_cache.total_count);
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
evict_log_count++;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Evict old SuperSlabs based on policy
|
|
|
|
|
void hak_ss_lru_evict(void) {
|
|
|
|
|
if (!g_ss_lru_initialized) return;
|
|
|
|
|
|
|
|
|
|
pthread_mutex_lock(&g_super_reg_lock);
|
|
|
|
|
|
|
|
|
|
uint64_t now = hak_now_ns();
|
|
|
|
|
|
|
|
|
|
// Policy 1: Evict until count <= max_cached
|
|
|
|
|
while (g_ss_lru_cache.total_count > g_ss_lru_cache.max_cached) {
|
|
|
|
|
if (!ss_lru_evict_one()) break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Policy 2: Evict until memory <= max_memory_mb
|
|
|
|
|
while (g_ss_lru_cache.total_memory_mb > g_ss_lru_cache.max_memory_mb) {
|
|
|
|
|
if (!ss_lru_evict_one()) break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Policy 3: Evict expired SuperSlabs (TTL)
|
|
|
|
|
SuperSlab* curr = g_ss_lru_cache.lru_tail;
|
|
|
|
|
while (curr) {
|
|
|
|
|
SuperSlab* prev = curr->lru_prev;
|
|
|
|
|
|
|
|
|
|
uint64_t age = now - curr->last_used_ns;
|
|
|
|
|
if (age > g_ss_lru_cache.ttl_ns) {
|
|
|
|
|
ss_lru_remove(curr);
|
|
|
|
|
g_ss_lru_cache.total_count--;
|
|
|
|
|
size_t ss_size = (size_t)1 << curr->lg_size;
|
|
|
|
|
g_ss_lru_cache.total_memory_mb -= (ss_size / (1024 * 1024));
|
|
|
|
|
|
|
|
|
|
curr->magic = 0;
|
|
|
|
|
munmap(curr, ss_size);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
curr = prev;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Try to reuse a cached SuperSlab
|
|
|
|
|
SuperSlab* hak_ss_lru_pop(uint8_t size_class) {
|
|
|
|
|
if (!g_ss_lru_initialized) {
|
|
|
|
|
hak_ss_lru_init();
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-13 14:45:43 +09:00
|
|
|
// Phase 11: Bypass LRU cache during prewarm
|
|
|
|
|
if (atomic_load_explicit(&g_ss_prewarm_bypass, memory_order_acquire)) {
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate lazy-init TLS overhead)
|
2025-11-28 01:48:02 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-14 06:49:32 +09:00
|
|
|
static __thread int dbg = -1;
|
|
|
|
|
if (__builtin_expect(dbg == -1, 0)) {
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
dbg = HAK_ENV_SS_LRU_DEBUG();
|
2025-11-14 06:49:32 +09:00
|
|
|
}
|
2025-11-28 01:48:02 +09:00
|
|
|
#else
|
|
|
|
|
static const int dbg = 0;
|
|
|
|
|
#endif
|
2025-11-14 06:49:32 +09:00
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
pthread_mutex_lock(&g_super_reg_lock);
|
|
|
|
|
|
C7 Stride Upgrade: Fix 1024B→2048B alignment corruption (ROOT CAUSE)
## Problem
C7 (1KB class) blocks were being carved with 1024B stride but expected
to align with 2048B stride, causing systematic NXT_MISALIGN errors with
characteristic pattern: delta_mod = 1026, 1028, 1030, 1032... (1024*N + offset).
This caused crashes, double-frees, and alignment violations in 1024B workloads.
## Root Cause
The global array `g_tiny_class_sizes[]` was correctly updated to 2048B,
but `tiny_block_stride_for_class()` contained a LOCAL static const array
with the old 1024B value:
```c
// hakmem_tiny_superslab.h:52 (BEFORE)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 1024};
^^^^
```
This local table was used by ALL carve operations, causing every C7 block
to be allocated with 1024B stride despite the 2048B upgrade.
## Fix
Updated local stride table in `tiny_block_stride_for_class()`:
```c
// hakmem_tiny_superslab.h:52 (AFTER)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 2048};
^^^^
```
## Verification
**Before**: NXT_MISALIGN delta_mod shows 1024B pattern (1026, 1028, 1030...)
**After**: NXT_MISALIGN delta_mod shows random values (227, 994, 195...)
→ No more 1024B alignment pattern = stride upgrade successful ✓
## Additional Safety Layers (Defense in Depth)
1. **Validation Logic Fix** (tiny_nextptr.h:100)
- Changed stride check to use `tiny_block_stride_for_class()` (includes header)
- Was using `g_tiny_class_sizes[]` (raw size without header)
2. **TLS SLL Purge** (hakmem_tiny_lazy_init.inc.h:83-87)
- Clear TLS SLL on lazy class initialization
- Prevents stale blocks from previous runs
3. **Pre-Carve Geometry Validation** (hakmem_tiny_refill_p0.inc.h:273-297)
- Validates slab capacity matches current stride before carving
- Reinitializes if geometry is stale (e.g., after stride upgrade)
4. **LRU Stride Validation** (hakmem_super_registry.c:369-458)
- Validates cached SuperSlabs have compatible stride
- Evicts incompatible SuperSlabs immediately
5. **Shared Pool Geometry Fix** (hakmem_shared_pool.c:722-733)
- Reinitializes slab geometry on acquisition if capacity mismatches
6. **Legacy Backend Validation** (ss_legacy_backend_box.c:138-155)
- Validates geometry before allocation in legacy path
## Impact
- Eliminates 100% of 1024B-pattern alignment errors
- Fixes crashes in 1024B workloads (bench_random_mixed 1024B now stable)
- Establishes multiple validation layers to prevent future stride issues
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 22:55:17 +09:00
|
|
|
// Find a compatible SuperSlab in cache (stride must match current config)
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
SuperSlab* curr = g_ss_lru_cache.lru_head;
|
C7 Stride Upgrade: Fix 1024B→2048B alignment corruption (ROOT CAUSE)
## Problem
C7 (1KB class) blocks were being carved with 1024B stride but expected
to align with 2048B stride, causing systematic NXT_MISALIGN errors with
characteristic pattern: delta_mod = 1026, 1028, 1030, 1032... (1024*N + offset).
This caused crashes, double-frees, and alignment violations in 1024B workloads.
## Root Cause
The global array `g_tiny_class_sizes[]` was correctly updated to 2048B,
but `tiny_block_stride_for_class()` contained a LOCAL static const array
with the old 1024B value:
```c
// hakmem_tiny_superslab.h:52 (BEFORE)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 1024};
^^^^
```
This local table was used by ALL carve operations, causing every C7 block
to be allocated with 1024B stride despite the 2048B upgrade.
## Fix
Updated local stride table in `tiny_block_stride_for_class()`:
```c
// hakmem_tiny_superslab.h:52 (AFTER)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 2048};
^^^^
```
## Verification
**Before**: NXT_MISALIGN delta_mod shows 1024B pattern (1026, 1028, 1030...)
**After**: NXT_MISALIGN delta_mod shows random values (227, 994, 195...)
→ No more 1024B alignment pattern = stride upgrade successful ✓
## Additional Safety Layers (Defense in Depth)
1. **Validation Logic Fix** (tiny_nextptr.h:100)
- Changed stride check to use `tiny_block_stride_for_class()` (includes header)
- Was using `g_tiny_class_sizes[]` (raw size without header)
2. **TLS SLL Purge** (hakmem_tiny_lazy_init.inc.h:83-87)
- Clear TLS SLL on lazy class initialization
- Prevents stale blocks from previous runs
3. **Pre-Carve Geometry Validation** (hakmem_tiny_refill_p0.inc.h:273-297)
- Validates slab capacity matches current stride before carving
- Reinitializes if geometry is stale (e.g., after stride upgrade)
4. **LRU Stride Validation** (hakmem_super_registry.c:369-458)
- Validates cached SuperSlabs have compatible stride
- Evicts incompatible SuperSlabs immediately
5. **Shared Pool Geometry Fix** (hakmem_shared_pool.c:722-733)
- Reinitializes slab geometry on acquisition if capacity mismatches
6. **Legacy Backend Validation** (ss_legacy_backend_box.c:138-155)
- Validates geometry before allocation in legacy path
## Impact
- Eliminates 100% of 1024B-pattern alignment errors
- Fixes crashes in 1024B workloads (bench_random_mixed 1024B now stable)
- Establishes multiple validation layers to prevent future stride issues
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 22:55:17 +09:00
|
|
|
extern const size_t g_tiny_class_sizes[];
|
|
|
|
|
size_t expected_stride = g_tiny_class_sizes[size_class];
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
while (curr) {
|
C7 Stride Upgrade: Fix 1024B→2048B alignment corruption (ROOT CAUSE)
## Problem
C7 (1KB class) blocks were being carved with 1024B stride but expected
to align with 2048B stride, causing systematic NXT_MISALIGN errors with
characteristic pattern: delta_mod = 1026, 1028, 1030, 1032... (1024*N + offset).
This caused crashes, double-frees, and alignment violations in 1024B workloads.
## Root Cause
The global array `g_tiny_class_sizes[]` was correctly updated to 2048B,
but `tiny_block_stride_for_class()` contained a LOCAL static const array
with the old 1024B value:
```c
// hakmem_tiny_superslab.h:52 (BEFORE)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 1024};
^^^^
```
This local table was used by ALL carve operations, causing every C7 block
to be allocated with 1024B stride despite the 2048B upgrade.
## Fix
Updated local stride table in `tiny_block_stride_for_class()`:
```c
// hakmem_tiny_superslab.h:52 (AFTER)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 2048};
^^^^
```
## Verification
**Before**: NXT_MISALIGN delta_mod shows 1024B pattern (1026, 1028, 1030...)
**After**: NXT_MISALIGN delta_mod shows random values (227, 994, 195...)
→ No more 1024B alignment pattern = stride upgrade successful ✓
## Additional Safety Layers (Defense in Depth)
1. **Validation Logic Fix** (tiny_nextptr.h:100)
- Changed stride check to use `tiny_block_stride_for_class()` (includes header)
- Was using `g_tiny_class_sizes[]` (raw size without header)
2. **TLS SLL Purge** (hakmem_tiny_lazy_init.inc.h:83-87)
- Clear TLS SLL on lazy class initialization
- Prevents stale blocks from previous runs
3. **Pre-Carve Geometry Validation** (hakmem_tiny_refill_p0.inc.h:273-297)
- Validates slab capacity matches current stride before carving
- Reinitializes if geometry is stale (e.g., after stride upgrade)
4. **LRU Stride Validation** (hakmem_super_registry.c:369-458)
- Validates cached SuperSlabs have compatible stride
- Evicts incompatible SuperSlabs immediately
5. **Shared Pool Geometry Fix** (hakmem_shared_pool.c:722-733)
- Reinitializes slab geometry on acquisition if capacity mismatches
6. **Legacy Backend Validation** (ss_legacy_backend_box.c:138-155)
- Validates geometry before allocation in legacy path
## Impact
- Eliminates 100% of 1024B-pattern alignment errors
- Fixes crashes in 1024B workloads (bench_random_mixed 1024B now stable)
- Establishes multiple validation layers to prevent future stride issues
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 22:55:17 +09:00
|
|
|
// Validate: Check if cached SuperSlab slabs match current stride
|
|
|
|
|
// This prevents reusing old 1024B SuperSlabs for new 2048B C7 allocations
|
|
|
|
|
int is_compatible = 1;
|
|
|
|
|
|
|
|
|
|
// Scan active slabs for stride mismatch
|
|
|
|
|
int cap = ss_slabs_capacity(curr);
|
|
|
|
|
for (int i = 0; i < cap; i++) {
|
|
|
|
|
if (curr->slab_bitmap & (1u << i)) {
|
|
|
|
|
TinySlabMeta* meta = &curr->slabs[i];
|
|
|
|
|
if (meta->capacity > 0) {
|
|
|
|
|
// Calculate implied stride from slab geometry
|
|
|
|
|
// Slab 0: 63488B usable, Others: 65536B usable
|
|
|
|
|
size_t slab_usable = (i == 0) ? 63488 : 65536;
|
|
|
|
|
size_t implied_stride = slab_usable / meta->capacity;
|
|
|
|
|
|
|
|
|
|
// Stride mismatch detected
|
|
|
|
|
if (implied_stride != expected_stride) {
|
|
|
|
|
is_compatible = 0;
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
static _Atomic uint32_t g_incomp_log = 0;
|
|
|
|
|
uint32_t n = atomic_fetch_add(&g_incomp_log, 1);
|
|
|
|
|
if (n < 8) {
|
|
|
|
|
fprintf(stderr,
|
|
|
|
|
"[LRU_INCOMPATIBLE] class=%d ss=%p slab=%d expect_stride=%zu implied=%zu (evicting)\n",
|
|
|
|
|
size_class, (void*)curr, i, expected_stride, implied_stride);
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (is_compatible) {
|
|
|
|
|
// Compatible - reuse this SuperSlab
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
ss_lru_remove(curr);
|
|
|
|
|
g_ss_lru_cache.total_count--;
|
|
|
|
|
size_t ss_size = (size_t)1 << curr->lg_size;
|
|
|
|
|
g_ss_lru_cache.total_memory_mb -= (ss_size / (1024 * 1024));
|
|
|
|
|
|
2025-11-14 06:49:32 +09:00
|
|
|
uint32_t cache_count_after = g_ss_lru_cache.total_count;
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
|
2025-11-14 06:49:32 +09:00
|
|
|
// Debug logging for LRU POP (hit)
|
|
|
|
|
if (dbg == 1) {
|
|
|
|
|
fprintf(stderr, "[LRU_POP] class=%d ss=%p (hit) (cache_size=%u/%u)\n",
|
|
|
|
|
size_class, (void*)curr, cache_count_after, g_ss_lru_cache.max_cached);
|
|
|
|
|
}
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
static int pop_log_count = 0;
|
|
|
|
|
if (pop_log_count < 10) {
|
2025-11-13 16:33:03 +09:00
|
|
|
fprintf(stderr, "[SS_LRU_POP] Reusing ss=%p size=%zu (cache_count=%u)\n",
|
2025-11-14 06:49:32 +09:00
|
|
|
curr, ss_size, cache_count_after);
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
pop_log_count++;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// Re-initialize SuperSlab (magic, timestamps, etc.)
|
|
|
|
|
curr->magic = SUPERSLAB_MAGIC;
|
|
|
|
|
curr->last_used_ns = hak_now_ns();
|
|
|
|
|
curr->lru_prev = NULL;
|
|
|
|
|
curr->lru_next = NULL;
|
|
|
|
|
|
2025-12-03 09:15:59 +09:00
|
|
|
// ROOT CAUSE FIX: Re-register in global registry (idempotent)
|
|
|
|
|
// Without this, hak_super_lookup() fails in free() path
|
|
|
|
|
hak_super_register((uintptr_t)curr, curr);
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
return curr;
|
|
|
|
|
}
|
C7 Stride Upgrade: Fix 1024B→2048B alignment corruption (ROOT CAUSE)
## Problem
C7 (1KB class) blocks were being carved with 1024B stride but expected
to align with 2048B stride, causing systematic NXT_MISALIGN errors with
characteristic pattern: delta_mod = 1026, 1028, 1030, 1032... (1024*N + offset).
This caused crashes, double-frees, and alignment violations in 1024B workloads.
## Root Cause
The global array `g_tiny_class_sizes[]` was correctly updated to 2048B,
but `tiny_block_stride_for_class()` contained a LOCAL static const array
with the old 1024B value:
```c
// hakmem_tiny_superslab.h:52 (BEFORE)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 1024};
^^^^
```
This local table was used by ALL carve operations, causing every C7 block
to be allocated with 1024B stride despite the 2048B upgrade.
## Fix
Updated local stride table in `tiny_block_stride_for_class()`:
```c
// hakmem_tiny_superslab.h:52 (AFTER)
static const size_t class_sizes[8] = {8, 16, 32, 64, 128, 256, 512, 2048};
^^^^
```
## Verification
**Before**: NXT_MISALIGN delta_mod shows 1024B pattern (1026, 1028, 1030...)
**After**: NXT_MISALIGN delta_mod shows random values (227, 994, 195...)
→ No more 1024B alignment pattern = stride upgrade successful ✓
## Additional Safety Layers (Defense in Depth)
1. **Validation Logic Fix** (tiny_nextptr.h:100)
- Changed stride check to use `tiny_block_stride_for_class()` (includes header)
- Was using `g_tiny_class_sizes[]` (raw size without header)
2. **TLS SLL Purge** (hakmem_tiny_lazy_init.inc.h:83-87)
- Clear TLS SLL on lazy class initialization
- Prevents stale blocks from previous runs
3. **Pre-Carve Geometry Validation** (hakmem_tiny_refill_p0.inc.h:273-297)
- Validates slab capacity matches current stride before carving
- Reinitializes if geometry is stale (e.g., after stride upgrade)
4. **LRU Stride Validation** (hakmem_super_registry.c:369-458)
- Validates cached SuperSlabs have compatible stride
- Evicts incompatible SuperSlabs immediately
5. **Shared Pool Geometry Fix** (hakmem_shared_pool.c:722-733)
- Reinitializes slab geometry on acquisition if capacity mismatches
6. **Legacy Backend Validation** (ss_legacy_backend_box.c:138-155)
- Validates geometry before allocation in legacy path
## Impact
- Eliminates 100% of 1024B-pattern alignment errors
- Fixes crashes in 1024B workloads (bench_random_mixed 1024B now stable)
- Establishes multiple validation layers to prevent future stride issues
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 22:55:17 +09:00
|
|
|
|
|
|
|
|
// Incompatible SuperSlab - evict immediately
|
|
|
|
|
SuperSlab* next = curr->lru_next;
|
|
|
|
|
ss_lru_remove(curr);
|
|
|
|
|
g_ss_lru_cache.total_count--;
|
|
|
|
|
size_t ss_size = (size_t)1 << curr->lg_size;
|
|
|
|
|
g_ss_lru_cache.total_memory_mb -= (ss_size / (1024 * 1024));
|
|
|
|
|
|
|
|
|
|
// Track evictions for observability
|
|
|
|
|
static _Atomic uint64_t g_incompatible_evictions = 0;
|
|
|
|
|
atomic_fetch_add(&g_incompatible_evictions, 1);
|
|
|
|
|
|
|
|
|
|
// Release memory
|
|
|
|
|
munmap(curr, ss_size);
|
|
|
|
|
|
|
|
|
|
curr = next;
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
}
|
|
|
|
|
|
2025-11-14 06:49:32 +09:00
|
|
|
uint32_t cache_count_miss = g_ss_lru_cache.total_count;
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
2025-11-14 06:49:32 +09:00
|
|
|
|
|
|
|
|
// Debug logging for LRU POP (miss)
|
|
|
|
|
if (dbg == 1) {
|
|
|
|
|
fprintf(stderr, "[LRU_POP] class=%d (miss) (cache_size=%u/%u)\n",
|
|
|
|
|
size_class, cache_count_miss, g_ss_lru_cache.max_cached);
|
|
|
|
|
}
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
return NULL; // No matching SuperSlab in cache
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Add SuperSlab to LRU cache
|
|
|
|
|
int hak_ss_lru_push(SuperSlab* ss) {
|
|
|
|
|
if (!ss || !g_ss_lru_initialized) {
|
|
|
|
|
hak_ss_lru_init();
|
|
|
|
|
}
|
|
|
|
|
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate lazy-init TLS overhead)
|
2025-11-28 01:48:02 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-14 06:49:32 +09:00
|
|
|
static __thread int dbg = -1;
|
|
|
|
|
if (__builtin_expect(dbg == -1, 0)) {
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
dbg = HAK_ENV_SS_LRU_DEBUG();
|
2025-11-14 06:49:32 +09:00
|
|
|
}
|
2025-11-28 01:48:02 +09:00
|
|
|
#else
|
|
|
|
|
static const int dbg = 0;
|
|
|
|
|
#endif
|
2025-11-14 06:49:32 +09:00
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
pthread_mutex_lock(&g_super_reg_lock);
|
|
|
|
|
|
|
|
|
|
// Check if we should cache or evict immediately
|
|
|
|
|
size_t ss_size = (size_t)1 << ss->lg_size;
|
|
|
|
|
uint64_t ss_mb = ss_size / (1024 * 1024);
|
|
|
|
|
|
|
|
|
|
// If adding this would exceed limits, evict first
|
|
|
|
|
while (g_ss_lru_cache.total_count >= g_ss_lru_cache.max_cached ||
|
|
|
|
|
g_ss_lru_cache.total_memory_mb + ss_mb > g_ss_lru_cache.max_memory_mb) {
|
|
|
|
|
if (!ss_lru_evict_one()) {
|
|
|
|
|
// Cache is empty but still can't fit - don't cache
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Add to cache
|
|
|
|
|
ss->last_used_ns = hak_now_ns();
|
|
|
|
|
ss->generation = g_ss_lru_cache.generation++;
|
|
|
|
|
ss_lru_insert_head(ss);
|
|
|
|
|
g_ss_lru_cache.total_count++;
|
|
|
|
|
g_ss_lru_cache.total_memory_mb += ss_mb;
|
|
|
|
|
|
2025-11-14 06:49:32 +09:00
|
|
|
uint32_t cache_count_after = g_ss_lru_cache.total_count;
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
|
2025-11-14 06:49:32 +09:00
|
|
|
// Debug logging for LRU PUSH
|
|
|
|
|
if (dbg == 1) {
|
|
|
|
|
fprintf(stderr, "[LRU_PUSH] ss=%p size=%zu KB (cache_size=%u/%u)\n",
|
|
|
|
|
(void*)ss, ss_size / 1024, cache_count_after, g_ss_lru_cache.max_cached);
|
|
|
|
|
}
|
|
|
|
|
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
static int push_log_count = 0;
|
|
|
|
|
if (push_log_count < 10) {
|
2025-11-13 16:33:03 +09:00
|
|
|
fprintf(stderr, "[SS_LRU_PUSH] Cached ss=%p size=%zu (cache_count=%u)\n",
|
2025-11-14 06:49:32 +09:00
|
|
|
ss, ss_size, cache_count_after);
|
Phase 9: SuperSlab Lazy Deallocation + mincore removal
Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance
Implementation:
1. mincore removal (100% elimination)
- Deleted: hakmem_internal.h hak_is_memory_readable() syscall
- Deleted: tiny_free_fast_v2.inc.h safety checks
- Alternative: Internal metadata (Registry + Header magic validation)
- Result: 841 mincore calls → 0 calls ✅
2. SuperSlab Lazy Deallocation
- Added LRU Cache Manager (470 lines in hakmem_super_registry.c)
- Extended SuperSlab: last_used_ns, generation, lru_prev/next
- Deallocation policy: Count/Memory/TTL based eviction
- Environment variables:
* HAKMEM_SUPERSLAB_MAX_CACHED=256 (default)
* HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default)
* HAKMEM_SUPERSLAB_TTL_SEC=60 (default)
3. Integration
- superslab_allocate: Try LRU cache first before mmap
- superslab_free: Push to LRU cache instead of immediate munmap
- Lazy deallocation: Defer munmap until cache limits exceeded
Performance Results (100K iterations, 256B allocations):
Before (Phase 7-8):
- Performance: 2.76M ops/s
- Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841)
After (Phase 9):
- Performance: 9.71M ops/s (+251%) 🏆
- Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%)
Key Achievements:
- ✅ mincore: 100% elimination (841 → 0)
- ✅ mmap: -30% reduction (1,250 → 877)
- ✅ munmap: -35% reduction (1,321 → 852)
- ✅ Total syscalls: -49% reduction (3,412 → 1,729)
- ✅ Performance: +251% improvement (2.76M → 9.71M ops/s)
System malloc comparison:
- HAKMEM: 9.71M ops/s
- System malloc: 90.04M ops/s
- Achievement: 10.8% (target: 93%)
Next optimization:
- Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap)
- Pre-warm LRU cache
- Adaptive LRU sizing
- Per-class LRU cache
Production ready with recommended settings:
export HAKMEM_SUPERSLAB_MAX_CACHED=256
export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512
./bench_random_mixed_hakmem
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
|
|
|
push_log_count++;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-13 14:45:43 +09:00
|
|
|
// ============================================================================
|
|
|
|
|
// Phase 11: SuperSlab Prewarm - Eliminate mmap/munmap bottleneck
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
// Prewarm specific size class with count SuperSlabs
|
|
|
|
|
void hak_ss_prewarm_class(int size_class, uint32_t count) {
|
|
|
|
|
if (size_class < 0 || size_class >= TINY_NUM_CLASSES) {
|
|
|
|
|
fprintf(stderr, "[SS_PREWARM] Invalid size_class=%d (valid: 0-%d)\n",
|
|
|
|
|
size_class, TINY_NUM_CLASSES - 1);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate lazy-init static overhead)
|
2025-11-28 01:48:57 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-14 06:49:32 +09:00
|
|
|
static int dbg = -1;
|
|
|
|
|
if (__builtin_expect(dbg == -1, 0)) {
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
dbg = HAK_ENV_SS_PREWARM_DEBUG();
|
2025-11-14 06:49:32 +09:00
|
|
|
}
|
2025-11-28 01:48:57 +09:00
|
|
|
#else
|
|
|
|
|
static const int dbg = 0;
|
|
|
|
|
#endif
|
2025-11-14 06:49:32 +09:00
|
|
|
|
2025-11-13 14:45:43 +09:00
|
|
|
// Ensure LRU cache is initialized
|
|
|
|
|
if (!g_ss_lru_initialized) {
|
|
|
|
|
hak_ss_lru_init();
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-02 19:43:23 +09:00
|
|
|
// Phase 11+: Use static array to avoid malloc() during init (causes recursion)
|
|
|
|
|
// Cap at 512 as defined in SS_COLD_START_MAX_COUNT
|
|
|
|
|
#define SS_PREWARM_MAX_BATCH 512
|
|
|
|
|
static SuperSlab* slabs[SS_PREWARM_MAX_BATCH];
|
|
|
|
|
if (count > SS_PREWARM_MAX_BATCH) {
|
|
|
|
|
count = SS_PREWARM_MAX_BATCH;
|
2025-11-13 14:45:43 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Enable prewarm bypass to prevent LRU cache from being used during allocation
|
|
|
|
|
atomic_store_explicit(&g_ss_prewarm_bypass, 1, memory_order_release);
|
|
|
|
|
|
|
|
|
|
uint32_t allocated = 0;
|
|
|
|
|
for (uint32_t i = 0; i < count; i++) {
|
|
|
|
|
// Allocate a SuperSlab for this class
|
|
|
|
|
SuperSlab* ss = superslab_allocate((uint8_t)size_class);
|
|
|
|
|
if (!ss) {
|
|
|
|
|
break; // Stop on OOM
|
|
|
|
|
}
|
|
|
|
|
slabs[allocated++] = ss;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Disable prewarm bypass
|
|
|
|
|
atomic_store_explicit(&g_ss_prewarm_bypass, 0, memory_order_release);
|
|
|
|
|
|
|
|
|
|
// Now push all allocated SuperSlabs to LRU cache
|
|
|
|
|
uint32_t cached = 0;
|
|
|
|
|
for (uint32_t i = 0; i < allocated; i++) {
|
|
|
|
|
int pushed = hak_ss_lru_push(slabs[i]);
|
|
|
|
|
if (pushed) {
|
|
|
|
|
cached++;
|
|
|
|
|
} else {
|
|
|
|
|
// LRU cache full - free remaining SuperSlabs
|
|
|
|
|
for (uint32_t j = i; j < allocated; j++) {
|
|
|
|
|
superslab_free(slabs[j]);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-02 19:43:23 +09:00
|
|
|
// Note: slabs is static array, no free() needed
|
2025-11-13 14:45:43 +09:00
|
|
|
|
2025-11-14 06:49:32 +09:00
|
|
|
// Debug logging for PREWARM
|
|
|
|
|
if (dbg == 1) {
|
|
|
|
|
fprintf(stderr, "[PREWARM] Class %d: allocated=%u cached=%u\n",
|
|
|
|
|
size_class, allocated, cached);
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-13 14:45:43 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
fprintf(stderr, "[SS_PREWARM] Class %d: allocated=%u cached=%u\n",
|
|
|
|
|
size_class, allocated, cached);
|
|
|
|
|
#else
|
|
|
|
|
(void)cached; // Suppress unused warning
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Prewarm all classes (counts[i] = number of SuperSlabs for class i)
|
|
|
|
|
void hak_ss_prewarm_all(const uint32_t counts[TINY_NUM_CLASSES]) {
|
|
|
|
|
if (!counts) return;
|
|
|
|
|
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
if (counts[cls] > 0) {
|
|
|
|
|
hak_ss_prewarm_class(cls, counts[cls]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Prewarm: Allocate SuperSlabs at startup and add to LRU cache
|
2025-12-02 19:43:23 +09:00
|
|
|
// Phase 11+: Cold Start Box enables prewarm by default (1 SuperSlab/class)
|
2025-11-13 14:45:43 +09:00
|
|
|
void hak_ss_prewarm_init(void) {
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate lazy-init static overhead)
|
2025-11-28 01:48:57 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-14 06:49:32 +09:00
|
|
|
static int dbg = -1;
|
|
|
|
|
if (__builtin_expect(dbg == -1, 0)) {
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
dbg = HAK_ENV_SS_PREWARM_DEBUG();
|
2025-11-14 06:49:32 +09:00
|
|
|
}
|
2025-11-28 01:48:57 +09:00
|
|
|
#else
|
|
|
|
|
static const int dbg = 0;
|
|
|
|
|
#endif
|
2025-11-14 06:49:32 +09:00
|
|
|
|
2025-12-02 19:43:23 +09:00
|
|
|
// Phase 11+: Get default from Cold Start Box (enables prewarm by default)
|
|
|
|
|
// Can be disabled via HAKMEM_SS_PREWARM_DISABLE=1 or HAKMEM_SS_PREWARM_COUNT=0
|
|
|
|
|
int cold_start_count = ss_cold_start_get_count();
|
|
|
|
|
ss_cold_start_log_config(); // Log configuration for diagnostics
|
|
|
|
|
|
|
|
|
|
if (cold_start_count == 0) {
|
|
|
|
|
// Prewarm explicitly disabled
|
2025-11-13 14:45:43 +09:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate legacy config syscall overhead)
|
2025-12-02 19:43:23 +09:00
|
|
|
// Check for legacy ENV override (HAKMEM_PREWARM_SUPERSLABS)
|
|
|
|
|
// This takes precedence over Cold Start Box default
|
Priority-2: ENV Cache - SuperSlab Registry/LRU/Prewarm getenv() 置換
変更内容:
- hakmem_env_cache.h: 7つの新ENV変数を追加
(SUPER_REG_DEBUG, SUPERSLAB_MAX_CACHED, SUPERSLAB_MAX_MEMORY_MB,
SUPERSLAB_TTL_SEC, SS_LRU_DEBUG, SS_PREWARM_DEBUG, PREWARM_SUPERSLABS)
- hakmem_super_registry.c: 11箇所の getenv() を置換
(Registry debug, LRU config, LRU debug x3, Prewarm debug x2, Prewarm config)
効果: SuperSlab管理層からも syscall を排除 (ENV変数数: 30→37)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 20:30:29 +09:00
|
|
|
int env_val = HAK_ENV_PREWARM_SUPERSLABS();
|
|
|
|
|
long global = (env_val != 0) ? env_val : cold_start_count; // Default from Cold Start Box
|
|
|
|
|
|
|
|
|
|
if (env_val != 0) {
|
|
|
|
|
// Legacy ENV override active
|
|
|
|
|
global = env_val;
|
|
|
|
|
if (global == 0) {
|
|
|
|
|
// Legacy disable via HAKMEM_PREWARM_SUPERSLABS=0
|
|
|
|
|
return;
|
2025-12-02 19:43:23 +09:00
|
|
|
}
|
2025-11-13 14:45:43 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Cap at reasonable limit (avoid OOM on typo like "10000")
|
|
|
|
|
if (global > 512) {
|
|
|
|
|
fprintf(stderr, "[SS_PREWARM] WARNING: Capping prewarm count from %ld to 512 per class\n", global);
|
|
|
|
|
global = 512;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
uint32_t prewarm_count = (uint32_t)global;
|
|
|
|
|
|
|
|
|
|
// Expand LRU cache capacity to hold prewarmed SuperSlabs
|
|
|
|
|
uint32_t needed = prewarm_count * TINY_NUM_CLASSES;
|
|
|
|
|
|
|
|
|
|
pthread_mutex_lock(&g_super_reg_lock);
|
|
|
|
|
if (needed > g_ss_lru_cache.max_cached) {
|
|
|
|
|
g_ss_lru_cache.max_cached = needed;
|
|
|
|
|
// Expand memory limit (1 SuperSlab = 1MB or 2MB)
|
|
|
|
|
// Conservative estimate: 2MB per SuperSlab
|
|
|
|
|
uint64_t needed_mb = (uint64_t)needed * 2;
|
|
|
|
|
if (needed_mb > g_ss_lru_cache.max_memory_mb) {
|
|
|
|
|
g_ss_lru_cache.max_memory_mb = needed_mb;
|
|
|
|
|
}
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
fprintf(stderr, "[SS_PREWARM] Expanded LRU cache: max_cached=%u max_memory_mb=%llu\n",
|
|
|
|
|
g_ss_lru_cache.max_cached, (unsigned long long)g_ss_lru_cache.max_memory_mb);
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
|
|
|
|
|
// Prewarm all classes uniformly
|
|
|
|
|
uint32_t counts[TINY_NUM_CLASSES];
|
|
|
|
|
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
|
|
|
|
|
counts[i] = prewarm_count;
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-14 06:49:32 +09:00
|
|
|
// Debug logging for PREWARM initialization
|
|
|
|
|
if (dbg == 1) {
|
|
|
|
|
fprintf(stderr, "[PREWARM] Allocating %u SuperSlabs for classes 0-%d (total=%u)\n",
|
|
|
|
|
prewarm_count, TINY_NUM_CLASSES - 1, needed);
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-13 14:45:43 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
fprintf(stderr, "[SS_PREWARM] Starting prewarm: %u SuperSlabs per class (%u total)\n",
|
|
|
|
|
prewarm_count, needed);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
hak_ss_prewarm_all(counts);
|
|
|
|
|
|
2025-11-14 06:49:32 +09:00
|
|
|
// Debug logging for PREWARM completion
|
|
|
|
|
if (dbg == 1) {
|
|
|
|
|
fprintf(stderr, "[PREWARM] Complete: %u SuperSlabs cached\n", g_ss_lru_cache.total_count);
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-13 14:45:43 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
fprintf(stderr, "[SS_PREWARM] Prewarm complete (cache_count=%u)\n", g_ss_lru_cache.total_count);
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
// Debug: Get registry statistics
|
|
|
|
|
void hak_super_registry_stats(SuperRegStats* stats) {
|
|
|
|
|
if (!stats) return;
|
|
|
|
|
|
2025-12-07 03:12:27 +09:00
|
|
|
int eff_size = super_reg_effective_size();
|
|
|
|
|
int eff_mask = super_reg_effective_mask();
|
|
|
|
|
SuperRegEntry* reg = reg_entries();
|
|
|
|
|
|
|
|
|
|
stats->total_slots = eff_size;
|
2025-11-05 12:31:14 +09:00
|
|
|
stats->used_slots = 0;
|
|
|
|
|
stats->max_probe_depth = 0;
|
2025-12-07 03:12:27 +09:00
|
|
|
if (!reg || eff_size <= 0) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
|
|
|
pthread_mutex_lock(&g_super_reg_lock);
|
|
|
|
|
|
|
|
|
|
// Count used slots
|
2025-12-07 03:12:27 +09:00
|
|
|
for (int i = 0; i < eff_size; i++) {
|
|
|
|
|
if (atomic_load_explicit(®[i].base, memory_order_acquire) != 0) {
|
2025-11-05 12:31:14 +09:00
|
|
|
stats->used_slots++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Calculate max probe depth
|
2025-12-07 03:12:27 +09:00
|
|
|
for (int i = 0; i < eff_size; i++) {
|
|
|
|
|
if (atomic_load_explicit(®[i].base, memory_order_acquire) != 0) {
|
|
|
|
|
uintptr_t base = atomic_load_explicit(®[i].base, memory_order_acquire);
|
|
|
|
|
int lg = reg[i].lg_size; // Phase 8.3: Use stored lg_size
|
2025-11-05 12:31:14 +09:00
|
|
|
int h = hak_super_hash(base, lg);
|
|
|
|
|
|
|
|
|
|
// Find actual probe depth for this entry
|
|
|
|
|
for (int j = 0; j < SUPER_MAX_PROBE; j++) {
|
2025-12-07 03:12:27 +09:00
|
|
|
int idx = (h + j) & eff_mask;
|
|
|
|
|
if (atomic_load_explicit(®[idx].base, memory_order_acquire) == base && reg[idx].lg_size == lg) {
|
2025-11-05 12:31:14 +09:00
|
|
|
if (j > stats->max_probe_depth) {
|
|
|
|
|
stats->max_probe_depth = j;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pthread_mutex_unlock(&g_super_reg_lock);
|
|
|
|
|
}
|