Files
hakmem/core/hakmem_tiny_superslab.c

1066 lines
38 KiB
C
Raw Normal View History

// hakmem_tiny_superslab.c - SuperSlab allocator implementation (Phase 6.22)
// Purpose: 2MB aligned slab allocation with fast pointer→slab lookup
// License: MIT
// Date: 2025-10-24
#include "hakmem_tiny_superslab.h"
#include "hakmem_super_registry.h" // Phase 1: Registry integration
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
#include "hakmem_tiny.h" // For g_tiny_class_sizes and tiny_self_u32
#include <sys/mman.h>
#include <sys/resource.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h> // getenv, atoi
#include <pthread.h>
#include <unistd.h>
#include <sys/resource.h> // getrlimit for OOM diagnostics
#include <sys/mman.h>
#include "hakmem_internal.h" // HAKMEM_LOG for release-silent logging
// Forced SuperSlab size as a log2 value. -1 means "not forced": superslab_allocate()
// then asks hak_tiny_superslab_next_lg(). Set from HAKMEM_TINY_SS_FORCE_LG when the
// value lies within [SUPERSLAB_LG_MIN, SUPERSLAB_LG_MAX].
static int g_ss_force_lg = -1;
// One-shot "populate" request. Env parsing in superslab_allocate() stores 1 when a
// precharge target > 0 is configured or HAKMEM_TINY_SS_POPULATE_ONCE is non-zero.
// The next superslab_allocate() call that goes to the OS swaps it back to 0 and
// passes the old value as the `populate` argument of ss_os_acquire().
static _Atomic int g_ss_populate_once = 0;
// ============================================================================
// Global Statistics
// ============================================================================
// Held by the ss_stats_*() helpers while they update the plain (non-atomic)
// counters: g_superslabs_allocated, g_bytes_allocated, g_ss_alloc_by_class,
// g_superslabs_reused and g_superslabs_cached.
static pthread_mutex_t g_superslab_lock = PTHREAD_MUTEX_INITIALIZER;
uint64_t g_superslabs_allocated = 0; // Non-static for debugging; +1 per ss_stats_os_alloc()
uint64_t g_superslabs_freed = 0; // Phase 7.6: Non-static for test access
uint64_t g_bytes_allocated = 0; // Non-static for debugging; += ss_size per ss_stats_os_alloc()
CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消 **問題:** - Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走) - System/mimalloc は 4T で 33.52M ops/s 正常動作 - SS OFF + Remote OFF でも 4T で SEGV **根本原因: (Task agent ultrathink 調査結果)** ``` CRASH: mov (%r15),%r13 R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS) ``` Worker スレッドの TLS 変数が未初期化: - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし - pthread_create() で生成されたスレッドでゼロ初期化されない - NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV **修正内容:** 全 TLS 配列に明示的初期化子 `= {0}` を追加: 1. **core/hakmem_tiny.c:** - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}` - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}` - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}` - `g_tls_bcur[TINY_NUM_CLASSES] = {0}` - `g_tls_bend[TINY_NUM_CLASSES] = {0}` 2. **core/tiny_fastcache.c:** - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}` 3. **core/hakmem_tiny_magazine.c:** - `g_tls_mags[TINY_NUM_CLASSES] = {0}` 4. **core/tiny_sticky.c:** - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}` **効果:** ``` Before: 1T: 2.09M ✅ | 4T: SEGV 💀 After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消) ``` **テスト:** ```bash # 1 thread: 完走 ./larson_hakmem 2 8 128 1024 1 12345 1 → Throughput = 2,407,597 ops/s ✅ # 4 threads: 完走(以前は SEGV) ./larson_hakmem 2 8 128 1024 1 12345 4 → Throughput = 4,192,155 ops/s ✅ ``` **調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
// ============================================================================
// Phase 2a: Dynamic Expansion - Global per-class SuperSlabHeads
// ============================================================================
SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS] = {NULL};
CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消 **問題:** - Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走) - System/mimalloc は 4T で 33.52M ops/s 正常動作 - SS OFF + Remote OFF でも 4T で SEGV **根本原因: (Task agent ultrathink 調査結果)** ``` CRASH: mov (%r15),%r13 R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS) ``` Worker スレッドの TLS 変数が未初期化: - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし - pthread_create() で生成されたスレッドでゼロ初期化されない - NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV **修正内容:** 全 TLS 配列に明示的初期化子 `= {0}` を追加: 1. **core/hakmem_tiny.c:** - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}` - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}` - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}` - `g_tls_bcur[TINY_NUM_CLASSES] = {0}` - `g_tls_bend[TINY_NUM_CLASSES] = {0}` 2. **core/tiny_fastcache.c:** - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}` - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}` 3. **core/hakmem_tiny_magazine.c:** - `g_tls_mags[TINY_NUM_CLASSES] = {0}` 4. **core/tiny_sticky.c:** - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}` - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}` **効果:** ``` Before: 1T: 2.09M ✅ | 4T: SEGV 💀 After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消) ``` **テスト:** ```bash # 1 thread: 完走 ./larson_hakmem 2 8 128 1024 1 12345 1 → Throughput = 2,407,597 ops/s ✅ # 4 threads: 完走(以前は SEGV) ./larson_hakmem 2 8 128 1024 1 12345 4 → Throughput = 4,192,155 ops/s ✅ ``` **調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00
// Debug counters
_Atomic uint64_t g_ss_active_dec_calls = 0;
_Atomic uint64_t g_hak_tiny_free_calls = 0;
_Atomic uint64_t g_ss_remote_push_calls = 0;
// Free path instrumentation (lightweight, for OOM/route diagnosis)
_Atomic uint64_t g_free_ss_enter = 0; // hak_tiny_free_superslab() entries
_Atomic uint64_t g_free_local_box_calls = 0; // same-thread freelist pushes
_Atomic uint64_t g_free_remote_box_calls = 0; // cross-thread remote pushes
// Per-class counters for gating/metrics (Tiny classes = 8)
// g_ss_alloc_by_class is bumped in ss_stats_os_alloc() under g_superslab_lock.
uint64_t g_ss_alloc_by_class[8] = {0};
uint64_t g_ss_freed_by_class[8] = {0};
// Intrusive singly-linked list node for the per-class SuperSlab cache. The node is
// written into the first bytes of the cached mapping itself (see the pointer casts
// in ss_cache_precharge() and ss_cache_push()), so the list needs no extra storage.
typedef struct SuperslabCacheEntry {
struct SuperslabCacheEntry* next;
} SuperslabCacheEntry;
// Per-class cache state, indexed by size class. In the code visible here the list
// head and count of class i are only touched while holding g_ss_cache_lock[i].
static SuperslabCacheEntry* g_ss_cache_head[8] = {0}; // LIFO list head
static size_t g_ss_cache_count[8] = {0}; // entries currently on the list
static size_t g_ss_cache_cap[8] = {0}; // max entries; 0 = no limit is enforced
static size_t g_ss_precharge_target[8] = {0}; // entries to pre-map; 0 = no precharge
static _Atomic int g_ss_precharge_done[8] = {0}; // 1 once ss_cache_precharge() has run for the class
static int g_ss_cache_enabled = 0; // set to 1 by env parsing when any cap or target is > 0
static pthread_once_t g_ss_cache_once = PTHREAD_ONCE_INIT; // guards ss_cache_global_init()
static pthread_mutex_t g_ss_cache_lock[8]; // initialized lazily via ss_cache_ensure_init()
// Cache statistics, updated while holding the matching g_ss_cache_lock[class].
uint64_t g_ss_cache_hits[8] = {0}; // ss_cache_pop() returned an entry
uint64_t g_ss_cache_misses[8] = {0}; // ss_cache_pop() found the list empty
uint64_t g_ss_cache_puts[8] = {0}; // ss_cache_push() stored an entry
uint64_t g_ss_cache_drops[8] = {0}; // ss_cache_push() refused because the cap was reached
uint64_t g_ss_cache_precharged[8] = {0}; // entries mapped by ss_cache_precharge()
// Process-wide totals, bumped under g_superslab_lock by ss_stats_cache_reuse() /
// ss_stats_cache_store().
uint64_t g_superslabs_reused = 0;
uint64_t g_superslabs_cached = 0;
// pthread_once() callback: give each of the 8 per-class cache mutexes its
// default attributes.
static void ss_cache_global_init(void) {
    pthread_mutex_t* const last = g_ss_cache_lock + 8;
    for (pthread_mutex_t* lock = g_ss_cache_lock; lock != last; ++lock) {
        pthread_mutex_init(lock, NULL);
    }
}
// Make sure ss_cache_global_init() has run before any cache mutex is used.
// pthread_once() guarantees the initializer runs a single time even when
// several threads get here together.
static inline void ss_cache_ensure_init(void) {
    (void)pthread_once(&g_ss_cache_once, ss_cache_global_init);
}
// Forward declarations for the OS-acquire and cache helpers defined further down
// in this file, so the stats helpers and superslab_allocate() can reference them.
static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate);
static void ss_cache_precharge(uint8_t size_class, size_t ss_size, uintptr_t ss_mask);
static SuperslabCacheEntry* ss_cache_pop(uint8_t size_class);
static int ss_cache_push(uint8_t size_class, SuperSlab* ss);
// Account for one SuperSlab freshly obtained from the OS: bumps the global
// SuperSlab and byte totals and, for class indices below 8, the per-class
// counter. All updates happen under g_superslab_lock.
static inline void ss_stats_os_alloc(uint8_t size_class, size_t ss_size) {
    const int has_class_slot = (size_class < 8);

    pthread_mutex_lock(&g_superslab_lock);
    g_superslabs_allocated += 1;
    g_bytes_allocated += ss_size;
    if (has_class_slot) {
        g_ss_alloc_by_class[size_class] += 1;
    }
    pthread_mutex_unlock(&g_superslab_lock);
}
// Count one SuperSlab reuse in g_superslabs_reused (under g_superslab_lock).
static inline void ss_stats_cache_reuse(void) {
    pthread_mutex_lock(&g_superslab_lock);
    g_superslabs_reused += 1;
    pthread_mutex_unlock(&g_superslab_lock);
}
// Count one SuperSlab store in g_superslabs_cached (under g_superslab_lock).
static inline void ss_stats_cache_store(void) {
    pthread_mutex_lock(&g_superslab_lock);
    g_superslabs_cached += 1;
    pthread_mutex_unlock(&g_superslab_lock);
}
// ============================================================================
// Phase 8.3: ACE (Adaptive Cache Engine) State
// ============================================================================
SuperSlabACEState g_ss_ace[TINY_NUM_CLASSES_SS] = {{0}};
// Phase 8.3: hak_now_ns() is now defined in hakmem_tiny_superslab.h as static inline
// ============================================================================
// Diagnostics
// ============================================================================
static void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err) {
static int logged = 0;
if (logged) return;
logged = 1;
// CRITICAL FIX: Increment lock depth FIRST before any LIBC calls
// fopen/fclose/getrlimit/fprintf all may call malloc internally
// Must bypass HAKMEM wrapper to avoid header mismatch crash
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++; // Force wrapper to use __libc_malloc
struct rlimit rl = {0};
if (getrlimit(RLIMIT_AS, &rl) != 0) {
rl.rlim_cur = RLIM_INFINITY;
rl.rlim_max = RLIM_INFINITY;
}
unsigned long vm_size_kb = 0;
unsigned long vm_rss_kb = 0;
FILE* status = fopen("/proc/self/status", "r");
if (status) {
char line[256];
while (fgets(line, sizeof(line), status)) {
if (strncmp(line, "VmSize:", 7) == 0) {
(void)sscanf(line + 7, "%lu", &vm_size_kb);
} else if (strncmp(line, "VmRSS:", 6) == 0) {
(void)sscanf(line + 6, "%lu", &vm_rss_kb);
}
}
fclose(status);
}
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
// CRITICAL FIX: Do NOT decrement lock_depth yet!
// fprintf() below may call malloc for buffering
char rl_cur_buf[32];
char rl_max_buf[32];
if (rl.rlim_cur == RLIM_INFINITY) {
strcpy(rl_cur_buf, "inf");
} else {
snprintf(rl_cur_buf, sizeof(rl_cur_buf), "%llu", (unsigned long long)rl.rlim_cur);
}
if (rl.rlim_max == RLIM_INFINITY) {
strcpy(rl_max_buf, "inf");
} else {
snprintf(rl_max_buf, sizeof(rl_max_buf), "%llu", (unsigned long long)rl.rlim_max);
}
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr,
"[SS OOM] mmap failed: err=%d ss_size=%zu alloc_size=%zu "
"alloc=%llu freed=%llu bytes=%llu "
"RLIMIT_AS(cur=%s max=%s) VmSize=%lu kB VmRSS=%lu kB\n",
err,
ss_size,
alloc_size,
(unsigned long long)g_superslabs_allocated,
(unsigned long long)g_superslabs_freed,
(unsigned long long)g_bytes_allocated,
rl_cur_buf,
rl_max_buf,
vm_size_kb,
vm_rss_kb);
#endif
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
g_hakmem_lock_depth--; // Now safe to restore (all libc calls complete)
}
// Global counters for debugging (non-static for external access)
// g_ss_mmap_count: +1 for every mmap() in ss_os_acquire() that succeeds, including
// an exact-size mapping that is then discarded because it came back misaligned.
_Atomic uint64_t g_ss_mmap_count = 0;
// Not updated anywhere in the part of the file visible here.
_Atomic uint64_t g_final_fallback_mmap_count = 0;
/**
 * Map one ss_size-byte, ss_size-aligned anonymous read/write region from the OS.
 *
 * Strategy:
 *   1. Where MAP_ALIGNED_SUPER exists, try a single exact-size mmap() and keep
 *      it if the returned address is already aligned.
 *   2. Otherwise (or when step 1 failed or came back misaligned) map
 *      2 * ss_size, pick the aligned ss_size window inside that mapping and
 *      unmap the slack in front of and behind it.
 *
 * The trailing slack is always unmapped. Merely madvise()-ing it, as was done
 * for populate requests before, left up to ss_size bytes of address space
 * mapped for ever on each call, since nothing else knows about that range.
 *
 * @param size_class  Tiny size class; used for statistics and debug logging only.
 * @param ss_size     Region size in bytes (superslab_allocate() passes 1 << lg).
 * @param ss_mask     ss_size - 1; used to test and compute alignment.
 * @param populate    Non-zero to add MAP_POPULATE where the platform has it.
 * @return Aligned base address of the region, or NULL when mmap() failed
 *         (the failure is reported once via log_superslab_oom_once()).
 */
static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate) {
    (void)populate;  // only consulted when MAP_POPULATE is available

#ifdef MAP_ALIGNED_SUPER
    {
        int super_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER;
#ifdef MAP_POPULATE
        if (populate) {
            super_flags |= MAP_POPULATE;
        }
#endif
        void* exact = mmap(NULL, ss_size,
                           PROT_READ | PROT_WRITE,
                           super_flags,
                           -1, 0);
        if (exact == MAP_FAILED) {
            log_superslab_oom_once(ss_size, ss_size, errno);
        } else {
            atomic_fetch_add(&g_ss_mmap_count, 1);
            if (((uintptr_t)exact & ss_mask) == 0) {
                ss_stats_os_alloc(size_class, ss_size);
                return exact;
            }
            // Misaligned: give it back and fall through to over-allocation.
            munmap(exact, ss_size);
        }
    }
#endif

    // Over-allocate so that an aligned ss_size window is guaranteed to exist.
    const size_t alloc_size = ss_size * 2;
    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
#ifdef MAP_POPULATE
    if (populate) {
        flags |= MAP_POPULATE;
    }
#endif
    void* raw = mmap(NULL, alloc_size,
                     PROT_READ | PROT_WRITE,
                     flags,
                     -1, 0);
    if (raw == MAP_FAILED) {
        // Read errno before any other call can overwrite it.
        log_superslab_oom_once(ss_size, alloc_size, errno);
        return NULL;
    }

    const uint64_t count = atomic_fetch_add(&g_ss_mmap_count, 1) + 1;
#if !HAKMEM_BUILD_RELEASE
    // Only the first few mappings are logged; the counter is atomic because
    // several threads can be acquiring SuperSlabs at the same time.
    static _Atomic int log_count = 0;
    if (atomic_load_explicit(&log_count, memory_order_relaxed) < 10) {
        atomic_fetch_add_explicit(&log_count, 1, memory_order_relaxed);
        fprintf(stderr, "[SUPERSLAB_MMAP] #%lu: class=%d size=%zu (total SuperSlab mmaps so far)\n",
                (unsigned long)count, size_class, ss_size);
    }
#else
    (void)count;
#endif

    // Round up to the next ss_size boundary inside the raw mapping.
    const uintptr_t raw_addr = (uintptr_t)raw;
    const uintptr_t aligned_addr = (raw_addr + ss_mask) & ~ss_mask;
    void* ptr = (void*)aligned_addr;

    const size_t prefix_size = (size_t)(aligned_addr - raw_addr);
    if (prefix_size > 0) {
        munmap(raw, prefix_size);
    }

    const size_t suffix_size = alloc_size - prefix_size - ss_size;
    if (suffix_size > 0) {
        munmap((char*)ptr + ss_size, suffix_size);
    }

    ss_stats_os_alloc(size_class, ss_size);
    return ptr;
}
/**
 * Fill the cache of one size class with pre-mapped (populate-requested)
 * SuperSlabs, once per class.
 *
 * Does nothing when the cache is disabled, the class index is >= 8, the class
 * has no precharge target, or the class was already precharged. The number of
 * entries aimed for is the precharge target, clamped to the cache cap when a
 * cap is set. A failing ss_os_acquire() ends the loop early; the class is
 * still marked as done so the attempt is not repeated.
 *
 * NOTE(review): list entries do not record the ss_size they were mapped with,
 * so a later ss_cache_pop() cannot check that the size still matches. This
 * presumably relies on callers never varying ss_size for a class while the
 * cache is enabled - confirm.
 *
 * @param size_class  Tiny size class whose cache is filled.
 * @param ss_size     Size in bytes of each region to map.
 * @param ss_mask     ss_size - 1, forwarded to ss_os_acquire().
 */
static void ss_cache_precharge(uint8_t size_class, size_t ss_size, uintptr_t ss_mask) {
    if (!g_ss_cache_enabled || size_class >= 8) return;
    if (g_ss_precharge_target[size_class] == 0) return;
    if (atomic_load_explicit(&g_ss_precharge_done[size_class], memory_order_acquire)) return;

    ss_cache_ensure_init();
    pthread_mutex_t* lock = &g_ss_cache_lock[size_class];
    pthread_mutex_lock(lock);

    // Re-check under the lock: another thread may have completed the precharge
    // between the unlocked test above and this thread acquiring the mutex.
    // Without this, the late thread would map replacements for every entry
    // that had been popped in the meantime.
    if (atomic_load_explicit(&g_ss_precharge_done[size_class], memory_order_acquire)) {
        pthread_mutex_unlock(lock);
        return;
    }

    size_t want = g_ss_precharge_target[size_class];
    const size_t limit = g_ss_cache_cap[size_class];
    if (limit != 0 && want > limit) {
        want = limit;
    }

    while (g_ss_cache_count[size_class] < want) {
        void* region = ss_os_acquire(size_class, ss_size, ss_mask, 1);
        if (!region) {
            break;
        }
        // The list node lives in the first bytes of the mapping itself.
        SuperslabCacheEntry* node = (SuperslabCacheEntry*)region;
        node->next = g_ss_cache_head[size_class];
        g_ss_cache_head[size_class] = node;
        g_ss_cache_count[size_class]++;
        g_ss_cache_precharged[size_class]++;
    }

    atomic_store_explicit(&g_ss_precharge_done[size_class], 1, memory_order_release);
    pthread_mutex_unlock(lock);
}
/**
 * Take the most recently cached region of a size class off its list.
 *
 * Records a hit or a miss in the per-class statistics. The returned node has
 * its `next` link cleared.
 *
 * @param size_class  Tiny size class to pop from.
 * @return The cached region, or NULL when the cache is disabled, the class
 *         index is >= 8, or the list is empty.
 */
static SuperslabCacheEntry* ss_cache_pop(uint8_t size_class) {
    if (!g_ss_cache_enabled || size_class >= 8) {
        return NULL;
    }
    ss_cache_ensure_init();

    pthread_mutex_t* lock = &g_ss_cache_lock[size_class];
    pthread_mutex_lock(lock);

    SuperslabCacheEntry* top = g_ss_cache_head[size_class];
    if (top == NULL) {
        g_ss_cache_misses[size_class] += 1;
    } else {
        g_ss_cache_head[size_class] = top->next;
        top->next = NULL;
        // Guard against underflow should count and list ever disagree.
        if (g_ss_cache_count[size_class] != 0) {
            g_ss_cache_count[size_class] -= 1;
        }
        g_ss_cache_hits[size_class] += 1;
    }

    pthread_mutex_unlock(lock);
    return top;
}
/**
 * Offer a SuperSlab to the cache of its size class.
 *
 * The SuperSlab's own memory is reused as the list node, so its first bytes
 * are overwritten with the `next` link when it is accepted.
 *
 * @param size_class  Tiny size class the SuperSlab belongs to.
 * @param ss          SuperSlab to store.
 * @return 1 when the SuperSlab was put on the list; 0 when the cache is
 *         disabled, the class index is >= 8, or a non-zero cap is already
 *         reached (counted as a drop).
 */
static int ss_cache_push(uint8_t size_class, SuperSlab* ss) {
    if (!g_ss_cache_enabled || size_class >= 8) {
        return 0;
    }
    ss_cache_ensure_init();

    pthread_mutex_t* lock = &g_ss_cache_lock[size_class];
    int stored = 0;

    pthread_mutex_lock(lock);
    const size_t limit = g_ss_cache_cap[size_class];
    const int is_full = (limit != 0 && g_ss_cache_count[size_class] >= limit);
    if (is_full) {
        g_ss_cache_drops[size_class] += 1;
    } else {
        SuperslabCacheEntry* node = (SuperslabCacheEntry*)ss;
        node->next = g_ss_cache_head[size_class];
        g_ss_cache_head[size_class] = node;
        g_ss_cache_count[size_class] += 1;
        g_ss_cache_puts[size_class] += 1;
        stored = 1;
    }
    pthread_mutex_unlock(lock);

    return stored;
}
// ============================================================================
// SuperSlab Allocation (2MB aligned)
// ============================================================================
SuperSlab* superslab_allocate(uint8_t size_class) {
// Optional fault injection for testing: HAKMEM_TINY_SS_FAULT_RATE=N → 1/N で失敗
static int fault_rate = -1; // -1=unparsed, 0=disabled, >0=rate
static __thread unsigned long fault_tick = 0;
if (__builtin_expect(fault_rate == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_SS_FAULT_RATE");
if (e && *e) {
int v = atoi(e); if (v < 0) v = 0; fault_rate = v;
} else {
fault_rate = 0;
}
}
if (fault_rate > 0) {
unsigned long t = ++fault_tick;
if ((t % (unsigned long)fault_rate) == 0ul) {
return NULL; // simulate OOM
}
}
// Optional env clamp for SuperSlab size
static int env_parsed = 0;
static uint8_t g_ss_min_lg_env = SUPERSLAB_LG_DEFAULT; // Start with default (2MB)
static uint8_t g_ss_max_lg_env = SUPERSLAB_LG_MAX;
if (!env_parsed) {
char* maxmb = getenv("HAKMEM_TINY_SS_MAX_MB");
if (maxmb) {
int m = atoi(maxmb); if (m == 1) g_ss_max_lg_env = 20; else if (m == 2) g_ss_max_lg_env = 21;
}
char* minmb = getenv("HAKMEM_TINY_SS_MIN_MB");
if (minmb) {
int m = atoi(minmb); if (m == 1) g_ss_min_lg_env = 20; else if (m == 2) g_ss_min_lg_env = 21;
}
if (g_ss_min_lg_env > g_ss_max_lg_env) g_ss_min_lg_env = g_ss_max_lg_env;
const char* force_lg_env = getenv("HAKMEM_TINY_SS_FORCE_LG");
if (force_lg_env && *force_lg_env) {
int v = atoi(force_lg_env);
if (v >= SUPERSLAB_LG_MIN && v <= SUPERSLAB_LG_MAX) {
g_ss_force_lg = v;
g_ss_min_lg_env = g_ss_max_lg_env = v;
}
}
size_t precharge_default = 0;
const char* precharge_env = getenv("HAKMEM_TINY_SS_PRECHARGE");
if (precharge_env && *precharge_env) {
long v = atol(precharge_env);
if (v < 0) v = 0;
precharge_default = (size_t)v;
if (v > 0) {
atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed);
}
}
size_t cache_default = 0;
const char* cache_env = getenv("HAKMEM_TINY_SS_CACHE");
if (cache_env && *cache_env) {
long v = atol(cache_env);
if (v < 0) v = 0;
cache_default = (size_t)v;
}
for (int i = 0; i < 8; i++) {
g_ss_cache_cap[i] = cache_default;
g_ss_precharge_target[i] = precharge_default;
}
for (int i = 0; i < 8; i++) {
char name[64];
snprintf(name, sizeof(name), "HAKMEM_TINY_SS_CACHE_C%d", i);
char* cap_env = getenv(name);
if (cap_env && *cap_env) {
long v = atol(cap_env);
if (v < 0) v = 0;
g_ss_cache_cap[i] = (size_t)v;
}
snprintf(name, sizeof(name), "HAKMEM_TINY_SS_PRECHARGE_C%d", i);
char* pre_env = getenv(name);
if (pre_env && *pre_env) {
long v = atol(pre_env);
if (v < 0) v = 0;
g_ss_precharge_target[i] = (size_t)v;
if (v > 0) {
atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed);
}
}
if (g_ss_cache_cap[i] > 0 || g_ss_precharge_target[i] > 0) {
g_ss_cache_enabled = 1;
}
}
const char* populate_env = getenv("HAKMEM_TINY_SS_POPULATE_ONCE");
if (populate_env && atoi(populate_env) != 0) {
atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed);
}
env_parsed = 1;
}
uint8_t lg = (g_ss_force_lg >= 0) ? (uint8_t)g_ss_force_lg : hak_tiny_superslab_next_lg(size_class);
if (lg < g_ss_min_lg_env) lg = g_ss_min_lg_env;
if (lg > g_ss_max_lg_env) lg = g_ss_max_lg_env;
size_t ss_size = (size_t)1 << lg; // 2^20 = 1MB, 2^21 = 2MB
uintptr_t ss_mask = ss_size - 1;
int from_cache = 0;
void* ptr = NULL;
if (g_ss_cache_enabled && size_class < 8) {
ss_cache_precharge(size_class, ss_size, ss_mask);
SuperslabCacheEntry* cached = ss_cache_pop(size_class);
if (cached) {
ptr = (void*)cached;
from_cache = 1;
}
}
if (!ptr) {
int populate = atomic_exchange_explicit(&g_ss_populate_once, 0, memory_order_acq_rel);
ptr = ss_os_acquire(size_class, ss_size, ss_mask, populate);
if (!ptr) {
return NULL;
}
}
// Initialize SuperSlab header (Phase 1 Quick Win: removed memset for lazy init)
SuperSlab* ss = (SuperSlab*)ptr;
ss->magic = SUPERSLAB_MAGIC;
ss->size_class = size_class;
ss->active_slabs = 0;
ss->lg_size = lg; // Phase 8.3: Use ACE-determined lg_size (20=1MB, 21=2MB)
ss->slab_bitmap = 0;
ss->nonempty_mask = 0; // Phase 6-2.1: ChatGPT Pro P0 - init nonempty mask
ss->partial_epoch = 0;
ss->publish_hint = 0xFF;
// Initialize atomics explicitly
atomic_store_explicit(&ss->total_active_blocks, 0, memory_order_relaxed);
atomic_store_explicit(&ss->refcount, 0, memory_order_relaxed);
atomic_store_explicit(&ss->listed, 0, memory_order_relaxed);
ss->partial_next = NULL;
// Initialize all slab metadata (only up to max slabs for this size)
int max_slabs = (int)(ss_size / SLAB_SIZE);
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure ## Major Additions ### 1. Box I: Integrity Verification System (NEW - 703 lines) - Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines) - Purpose: Unified integrity checking across all HAKMEM subsystems - Features: * 4-level integrity checking (0-4, compile-time controlled) * Priority 1: TLS array bounds validation * Priority 2: Freelist pointer validation * Priority 3: TLS canary monitoring * Priority ALPHA: Slab metadata invariant checking (5 invariants) * Atomic statistics tracking (thread-safe) * Beautiful BOX_BOUNDARY design pattern ### 2. Box E: SuperSlab Expansion System (COMPLETE) - Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c - Purpose: Safe SuperSlab expansion with TLS state guarantee - Features: * Immediate slab 0 binding after expansion * TLS state snapshot and restoration * Design by Contract (pre/post-conditions, invariants) * Thread-safe with mutex protection ### 3. Comprehensive Integrity Checking System - File: core/hakmem_tiny_integrity.h (NEW) - Unified validation functions for all allocator subsystems - Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe) - Pointer range validation (null-page, kernel-space) ### 4. P0 Bug Investigation - Root Cause Identified **Bug**: SEGV at iteration 28440 (deterministic with seed 42) **Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning) **Location**: TLS SLL (Single-Linked List) cache layer **Root Cause**: Race condition or use-after-free in TLS list management (class 0) **Detection**: Box I successfully caught invalid pointer at exact crash point ### 5. 
Defensive Improvements - Defensive memset in SuperSlab allocation (all metadata arrays) - Enhanced pointer validation with pattern detection - BOX_BOUNDARY markers throughout codebase (beautiful modular design) - 5 metadata invariant checks in allocation/free/refill paths ## Integration Points - Modified 13 files with Box I/E integration - Added 10+ BOX_BOUNDARY markers - 5 critical integrity check points in P0 refill path ## Test Results (100K iterations) - Baseline: 7.22M ops/s - Hotpath ON: 8.98M ops/s (+24% improvement ✓) - P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition) - Root cause: Identified but not yet fixed (requires deeper investigation) ## Performance - Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0) - Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4) - Beautiful modular design maintains clean separation of concerns ## Known Issues - P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0) - Cause: Use-after-free or race in remote free draining - Next step: Valgrind investigation to pinpoint exact corruption location ## Code Quality - Total new code: ~1400 lines (Box I + Box E + integrity system) - Design: Beautiful Box Theory with clear boundaries - Modularity: Complete separation of concerns - Documentation: Comprehensive inline comments and BOX_BOUNDARY markers 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
// DEFENSIVE FIX: Zero all slab metadata arrays to prevent ANY uninitialized pointers
// This catches the 0xa2a2a2a2a2a2a2a2 pattern bug (ASan/debug fill pattern)
// Even though mmap should return zeroed pages, sanitizers may fill with debug patterns
memset(ss->slabs, 0, max_slabs * sizeof(TinySlabMeta));
memset(ss->remote_heads, 0, max_slabs * sizeof(uintptr_t));
memset(ss->remote_counts, 0, max_slabs * sizeof(uint32_t));
memset(ss->slab_listed, 0, max_slabs * sizeof(uint32_t));
for (int i = 0; i < max_slabs; i++) {
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure ## Major Additions ### 1. Box I: Integrity Verification System (NEW - 703 lines) - Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines) - Purpose: Unified integrity checking across all HAKMEM subsystems - Features: * 4-level integrity checking (0-4, compile-time controlled) * Priority 1: TLS array bounds validation * Priority 2: Freelist pointer validation * Priority 3: TLS canary monitoring * Priority ALPHA: Slab metadata invariant checking (5 invariants) * Atomic statistics tracking (thread-safe) * Beautiful BOX_BOUNDARY design pattern ### 2. Box E: SuperSlab Expansion System (COMPLETE) - Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c - Purpose: Safe SuperSlab expansion with TLS state guarantee - Features: * Immediate slab 0 binding after expansion * TLS state snapshot and restoration * Design by Contract (pre/post-conditions, invariants) * Thread-safe with mutex protection ### 3. Comprehensive Integrity Checking System - File: core/hakmem_tiny_integrity.h (NEW) - Unified validation functions for all allocator subsystems - Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe) - Pointer range validation (null-page, kernel-space) ### 4. P0 Bug Investigation - Root Cause Identified **Bug**: SEGV at iteration 28440 (deterministic with seed 42) **Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning) **Location**: TLS SLL (Single-Linked List) cache layer **Root Cause**: Race condition or use-after-free in TLS list management (class 0) **Detection**: Box I successfully caught invalid pointer at exact crash point ### 5. 
Defensive Improvements - Defensive memset in SuperSlab allocation (all metadata arrays) - Enhanced pointer validation with pattern detection - BOX_BOUNDARY markers throughout codebase (beautiful modular design) - 5 metadata invariant checks in allocation/free/refill paths ## Integration Points - Modified 13 files with Box I/E integration - Added 10+ BOX_BOUNDARY markers - 5 critical integrity check points in P0 refill path ## Test Results (100K iterations) - Baseline: 7.22M ops/s - Hotpath ON: 8.98M ops/s (+24% improvement ✓) - P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition) - Root cause: Identified but not yet fixed (requires deeper investigation) ## Performance - Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0) - Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4) - Beautiful modular design maintains clean separation of concerns ## Known Issues - P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0) - Cause: Use-after-free or race in remote free draining - Next step: Valgrind investigation to pinpoint exact corruption location ## Code Quality - Total new code: ~1400 lines (Box I + Box E + integrity system) - Design: Beautiful Box Theory with clear boundaries - Modularity: Complete separation of concerns - Documentation: Comprehensive inline comments and BOX_BOUNDARY markers 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
ss->slabs[i].freelist = NULL; // Explicit NULL (redundant after memset, but clear intent)
ss->slabs[i].used = 0;
ss->slabs[i].capacity = 0;
ss->slabs[i].owner_tid = 0;
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure ## Major Additions ### 1. Box I: Integrity Verification System (NEW - 703 lines) - Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines) - Purpose: Unified integrity checking across all HAKMEM subsystems - Features: * 4-level integrity checking (0-4, compile-time controlled) * Priority 1: TLS array bounds validation * Priority 2: Freelist pointer validation * Priority 3: TLS canary monitoring * Priority ALPHA: Slab metadata invariant checking (5 invariants) * Atomic statistics tracking (thread-safe) * Beautiful BOX_BOUNDARY design pattern ### 2. Box E: SuperSlab Expansion System (COMPLETE) - Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c - Purpose: Safe SuperSlab expansion with TLS state guarantee - Features: * Immediate slab 0 binding after expansion * TLS state snapshot and restoration * Design by Contract (pre/post-conditions, invariants) * Thread-safe with mutex protection ### 3. Comprehensive Integrity Checking System - File: core/hakmem_tiny_integrity.h (NEW) - Unified validation functions for all allocator subsystems - Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe) - Pointer range validation (null-page, kernel-space) ### 4. P0 Bug Investigation - Root Cause Identified **Bug**: SEGV at iteration 28440 (deterministic with seed 42) **Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning) **Location**: TLS SLL (Single-Linked List) cache layer **Root Cause**: Race condition or use-after-free in TLS list management (class 0) **Detection**: Box I successfully caught invalid pointer at exact crash point ### 5. 
Defensive Improvements - Defensive memset in SuperSlab allocation (all metadata arrays) - Enhanced pointer validation with pattern detection - BOX_BOUNDARY markers throughout codebase (beautiful modular design) - 5 metadata invariant checks in allocation/free/refill paths ## Integration Points - Modified 13 files with Box I/E integration - Added 10+ BOX_BOUNDARY markers - 5 critical integrity check points in P0 refill path ## Test Results (100K iterations) - Baseline: 7.22M ops/s - Hotpath ON: 8.98M ops/s (+24% improvement ✓) - P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition) - Root cause: Identified but not yet fixed (requires deeper investigation) ## Performance - Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0) - Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4) - Beautiful modular design maintains clean separation of concerns ## Known Issues - P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0) - Cause: Use-after-free or race in remote free draining - Next step: Valgrind investigation to pinpoint exact corruption location ## Code Quality - Total new code: ~1400 lines (Box I + Box E + integrity system) - Design: Beautiful Box Theory with clear boundaries - Modularity: Complete separation of concerns - Documentation: Comprehensive inline comments and BOX_BOUNDARY markers 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
// Initialize remote queue atomics (memset already zeroed, but use proper atomic init)
atomic_store_explicit(&ss->remote_heads[i], 0, memory_order_relaxed);
atomic_store_explicit(&ss->remote_counts[i], 0, memory_order_relaxed);
atomic_store_explicit(&ss->slab_listed[i], 0, memory_order_relaxed);
}
if (from_cache) {
ss_stats_cache_reuse();
}
// Phase 8.3: Update ACE current_lg to match allocated size
g_ss_ace[size_class].current_lg = lg;
// Phase 1: Register SuperSlab in global registry for fast lookup
// CRITICAL: Register AFTER full initialization (ss structure is ready)
uintptr_t base = (uintptr_t)ss;
if (!hak_super_register(base, ss)) {
// Registry full - this is a fatal error
fprintf(stderr, "HAKMEM FATAL: SuperSlab registry full, cannot register %p\n", ss);
// Still return ss to avoid memory leak, but lookups may fail
}
return ss;
}
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
// ============================================================================
// Phase 2a: Dynamic Expansion - Chunk Management Functions
// ============================================================================
// Initialize SuperSlabHead for a class
SuperSlabHead* init_superslab_head(int class_idx) {
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
return NULL;
}
// Allocate SuperSlabHead structure
SuperSlabHead* head = (SuperSlabHead*)calloc(1, sizeof(SuperSlabHead));
if (!head) {
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate SuperSlabHead for class %d\n", class_idx);
g_hakmem_lock_depth--;
return NULL;
}
head->class_idx = (uint8_t)class_idx;
atomic_store_explicit(&head->total_chunks, 0, memory_order_relaxed);
head->first_chunk = NULL;
head->current_chunk = NULL;
pthread_mutex_init(&head->expansion_lock, NULL);
// Allocate initial chunk(s)
// Hot classes (1, 4, 6) get 2 initial chunks to reduce contention
int initial_chunks = 1;
// Phase 2a: Start with 1 chunk for all classes (expansion will handle growth)
// This reduces startup memory overhead while still allowing unlimited growth
initial_chunks = 1;
for (int i = 0; i < initial_chunks; i++) {
if (expand_superslab_head(head) < 0) {
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate initial chunk %d for class %d\n",
i, class_idx);
g_hakmem_lock_depth--;
// Cleanup on failure
SuperSlab* chunk = head->first_chunk;
while (chunk) {
SuperSlab* next = chunk->next_chunk;
superslab_free(chunk);
chunk = next;
}
pthread_mutex_destroy(&head->expansion_lock);
free(head);
return NULL;
}
}
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
#if !HAKMEM_BUILD_RELEASE
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
fprintf(stderr, "[HAKMEM] Initialized SuperSlabHead for class %d: %zu initial chunks\n",
class_idx, atomic_load_explicit(&head->total_chunks, memory_order_relaxed));
#endif
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
g_hakmem_lock_depth--;
return head;
}
// Expand SuperSlabHead by allocating and linking a new chunk
int expand_superslab_head(SuperSlabHead* head) {
if (!head) {
return -1;
}
// Allocate new chunk via existing superslab_allocate
SuperSlab* new_chunk = superslab_allocate(head->class_idx);
if (!new_chunk) {
refactor: Guard SuperSlab expansion debug logs + Update CURRENT_TASK ## Changes ### 1. Debug Log Cleanup (Release Build Optimization) **Files Modified:** - `core/tiny_superslab_alloc.inc.h:183-234` - `core/hakmem_tiny_superslab.c:567-618` **Problem:** - SuperSlab expansion logs flooded output (268+ lines per benchmark run) - Massive I/O overhead masked true performance in benchmarks - Production builds should not spam stderr **Solution:** - Guard all expansion logs with `#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)` - Debug builds: Logs enabled by default - Release builds: Logs disabled (clean output) - Can re-enable with `-DHAKMEM_SUPERSLAB_VERBOSE` for debugging **Guarded Messages:** - "SuperSlab chunk exhausted for class X, expanding..." - "Successfully expanded SuperSlabHead for class X" - "CRITICAL: Failed to expand SuperSlabHead..." (OOM) - "Expanded SuperSlabHead for class X: N chunks now" **Impact:** - Release builds: Clean benchmark output (no log spam) - Debug builds: Full visibility into expansion behavior - Performance: No I/O overhead in production benchmarks ### 2. CURRENT_TASK.md Update **New Focus:** ACE Investigation for Mid-Large Performance Recovery **Context:** - ✅ 100% stability achieved (commit 616070cf7) - ✅ Tiny Hot Path: **First time beating BOTH System and mimalloc** (+48.5% vs System) - 🔴 Critical issue: Mid-Large MT collapsed (-88% vs System) - Root cause: ACE disabled → all allocations go to mmap (slow) **Next Task:** Task Agent to investigate ACE mechanism (Ultrathink mode): 1. Why is ACE disabled? 2. How does ACE improve Mid-Large performance? 3. Can we re-enable ACE to recover +171% advantage? 4. 
Implementation plan and risk assessment **Benchmark Results:** Comprehensive results saved to: `benchmarks/results/comprehensive_20251108_214317/` --- ## Testing Verified clean build output: ```bash make clean && make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 larson_hakmem ./larson_hakmem 1 1 128 1024 1 12345 1 # No expansion log spam in release build ``` 🎉 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 22:02:09 +09:00
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate new chunk for class %d (system OOM)\n",
head->class_idx);
g_hakmem_lock_depth--;
refactor: Guard SuperSlab expansion debug logs + Update CURRENT_TASK ## Changes ### 1. Debug Log Cleanup (Release Build Optimization) **Files Modified:** - `core/tiny_superslab_alloc.inc.h:183-234` - `core/hakmem_tiny_superslab.c:567-618` **Problem:** - SuperSlab expansion logs flooded output (268+ lines per benchmark run) - Massive I/O overhead masked true performance in benchmarks - Production builds should not spam stderr **Solution:** - Guard all expansion logs with `#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)` - Debug builds: Logs enabled by default - Release builds: Logs disabled (clean output) - Can re-enable with `-DHAKMEM_SUPERSLAB_VERBOSE` for debugging **Guarded Messages:** - "SuperSlab chunk exhausted for class X, expanding..." - "Successfully expanded SuperSlabHead for class X" - "CRITICAL: Failed to expand SuperSlabHead..." (OOM) - "Expanded SuperSlabHead for class X: N chunks now" **Impact:** - Release builds: Clean benchmark output (no log spam) - Debug builds: Full visibility into expansion behavior - Performance: No I/O overhead in production benchmarks ### 2. CURRENT_TASK.md Update **New Focus:** ACE Investigation for Mid-Large Performance Recovery **Context:** - ✅ 100% stability achieved (commit 616070cf7) - ✅ Tiny Hot Path: **First time beating BOTH System and mimalloc** (+48.5% vs System) - 🔴 Critical issue: Mid-Large MT collapsed (-88% vs System) - Root cause: ACE disabled → all allocations go to mmap (slow) **Next Task:** Task Agent to investigate ACE mechanism (Ultrathink mode): 1. Why is ACE disabled? 2. How does ACE improve Mid-Large performance? 3. Can we re-enable ACE to recover +171% advantage? 4. 
Implementation plan and risk assessment **Benchmark Results:** Comprehensive results saved to: `benchmarks/results/comprehensive_20251108_214317/` --- ## Testing Verified clean build output: ```bash make clean && make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 larson_hakmem ./larson_hakmem 1 1 128 1024 1 12345 1 # No expansion log spam in release build ``` 🎉 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 22:02:09 +09:00
#endif
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
return -1; // True OOM (system out of memory)
}
// CRITICAL FIX: Initialize slab 0 so bitmap != 0x00000000
// Phase 2a chunks must have at least one usable slab after allocation
size_t block_size = g_tiny_class_sizes[head->class_idx];
// Use pthread_self() directly since tiny_self_u32() is static inline in hakmem_tiny.c
uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self();
superslab_init_slab(new_chunk, 0, block_size, owner_tid);
// Initialize the next_chunk link to NULL
new_chunk->next_chunk = NULL;
// Thread-safe linking
pthread_mutex_lock(&head->expansion_lock);
if (head->current_chunk) {
// Find the tail of the list (optimization: could cache tail pointer)
SuperSlab* tail = head->current_chunk;
while (tail->next_chunk) {
tail = tail->next_chunk;
}
tail->next_chunk = new_chunk;
} else {
// First chunk
head->first_chunk = new_chunk;
}
// Update current chunk to new chunk (for fast allocation)
head->current_chunk = new_chunk;
// Increment total chunks atomically
size_t old_count = atomic_fetch_add_explicit(&head->total_chunks, 1, memory_order_relaxed);
size_t new_count = old_count + 1;
pthread_mutex_unlock(&head->expansion_lock);
refactor: Guard SuperSlab expansion debug logs + Update CURRENT_TASK ## Changes ### 1. Debug Log Cleanup (Release Build Optimization) **Files Modified:** - `core/tiny_superslab_alloc.inc.h:183-234` - `core/hakmem_tiny_superslab.c:567-618` **Problem:** - SuperSlab expansion logs flooded output (268+ lines per benchmark run) - Massive I/O overhead masked true performance in benchmarks - Production builds should not spam stderr **Solution:** - Guard all expansion logs with `#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)` - Debug builds: Logs enabled by default - Release builds: Logs disabled (clean output) - Can re-enable with `-DHAKMEM_SUPERSLAB_VERBOSE` for debugging **Guarded Messages:** - "SuperSlab chunk exhausted for class X, expanding..." - "Successfully expanded SuperSlabHead for class X" - "CRITICAL: Failed to expand SuperSlabHead..." (OOM) - "Expanded SuperSlabHead for class X: N chunks now" **Impact:** - Release builds: Clean benchmark output (no log spam) - Debug builds: Full visibility into expansion behavior - Performance: No I/O overhead in production benchmarks ### 2. CURRENT_TASK.md Update **New Focus:** ACE Investigation for Mid-Large Performance Recovery **Context:** - ✅ 100% stability achieved (commit 616070cf7) - ✅ Tiny Hot Path: **First time beating BOTH System and mimalloc** (+48.5% vs System) - 🔴 Critical issue: Mid-Large MT collapsed (-88% vs System) - Root cause: ACE disabled → all allocations go to mmap (slow) **Next Task:** Task Agent to investigate ACE mechanism (Ultrathink mode): 1. Why is ACE disabled? 2. How does ACE improve Mid-Large performance? 3. Can we re-enable ACE to recover +171% advantage? 4. 
Implementation plan and risk assessment **Benchmark Results:** Comprehensive results saved to: `benchmarks/results/comprehensive_20251108_214317/` --- ## Testing Verified clean build output: ```bash make clean && make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 larson_hakmem ./larson_hakmem 1 1 128 1024 1 12345 1 # No expansion log spam in release build ``` 🎉 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 22:02:09 +09:00
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] Expanded SuperSlabHead for class %d: %zu chunks now (bitmap=0x%08x)\n",
head->class_idx, new_count, new_chunk->slab_bitmap);
g_hakmem_lock_depth--;
refactor: Guard SuperSlab expansion debug logs + Update CURRENT_TASK ## Changes ### 1. Debug Log Cleanup (Release Build Optimization) **Files Modified:** - `core/tiny_superslab_alloc.inc.h:183-234` - `core/hakmem_tiny_superslab.c:567-618` **Problem:** - SuperSlab expansion logs flooded output (268+ lines per benchmark run) - Massive I/O overhead masked true performance in benchmarks - Production builds should not spam stderr **Solution:** - Guard all expansion logs with `#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)` - Debug builds: Logs enabled by default - Release builds: Logs disabled (clean output) - Can re-enable with `-DHAKMEM_SUPERSLAB_VERBOSE` for debugging **Guarded Messages:** - "SuperSlab chunk exhausted for class X, expanding..." - "Successfully expanded SuperSlabHead for class X" - "CRITICAL: Failed to expand SuperSlabHead..." (OOM) - "Expanded SuperSlabHead for class X: N chunks now" **Impact:** - Release builds: Clean benchmark output (no log spam) - Debug builds: Full visibility into expansion behavior - Performance: No I/O overhead in production benchmarks ### 2. CURRENT_TASK.md Update **New Focus:** ACE Investigation for Mid-Large Performance Recovery **Context:** - ✅ 100% stability achieved (commit 616070cf7) - ✅ Tiny Hot Path: **First time beating BOTH System and mimalloc** (+48.5% vs System) - 🔴 Critical issue: Mid-Large MT collapsed (-88% vs System) - Root cause: ACE disabled → all allocations go to mmap (slow) **Next Task:** Task Agent to investigate ACE mechanism (Ultrathink mode): 1. Why is ACE disabled? 2. How does ACE improve Mid-Large performance? 3. Can we re-enable ACE to recover +171% advantage? 4. 
Implementation plan and risk assessment **Benchmark Results:** Comprehensive results saved to: `benchmarks/results/comprehensive_20251108_214317/` --- ## Testing Verified clean build output: ```bash make clean && make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 larson_hakmem ./larson_hakmem 1 1 128 1024 1 12345 1 # No expansion log spam in release build ``` 🎉 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 22:02:09 +09:00
#endif
feat: Phase 7 + Phase 2 - Massive performance & stability improvements Performance Achievements: - Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed) - Single-thread: +24% (2.71M → 3.36M ops/s Larson) - 4T stability: 0% → 95% (19/20 success rate) - Overall: 91.3% of System malloc average (target was 40-55%) ✓ Phase 7 (Tasks 1-3): Core Optimizations - Task 1: Header validation removal (Region-ID direct lookup) - Task 2: Aggressive inline (TLS cache access optimization) - Task 3: Pre-warm TLS cache (eliminate cold-start penalty) Result: +180-280% improvement, 85-146% of System malloc Critical Bug Fixes: - Fix 64B allocation crash (size-to-class +1 for header) - Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11) - Remove malloc fallback (30% → 50% stability) Phase 2a: SuperSlab Dynamic Expansion (CRITICAL) - Implement mimalloc-style chunk linking - Unlimited slab expansion (no more OOM at 32 slabs) - Fix chunk initialization bug (bitmap=0x00000001 after expansion) Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h Result: 50% → 95% stability (19/20 4T success) Phase 2b: TLS Cache Adaptive Sizing - Dynamic capacity: 16-2048 slots based on usage - High-water mark tracking + exponential growth/shrink - Expected: +3-10% performance, -30-50% memory Files: core/tiny_adaptive_sizing.c/h (new) Phase 2c: BigCache Dynamic Hash Table - Migrate from fixed 256×8 array to dynamic hash table - Auto-resize: 256 → 512 → 1024 → 65,536 buckets - Improved hash function (FNV-1a) + collision chaining Files: core/hakmem_bigcache.c/h Expected: +10-20% cache hit rate Design Flaws Analysis: - Identified 6 components with fixed-capacity bottlenecks - SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM) - Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters) Documentation: - 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md) - Implementation guides, test results, production readiness - Bug fix reports, root cause analysis Build System: - Makefile: phase7 
targets, PREWARM_TLS flag - Auto dependency generation (-MMD -MP) for .inc files Known Issues: - 4T stability: 19/20 (95%) - investigating 1 failure for 100% - L2.5 Pool dynamic sharding: design only (needs 2-3 days integration) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
return 0;
}
// Locate the SuperSlab chunk of size class `class_idx` whose address range
// contains `ptr`.
//
// Returns the matching chunk, or NULL when `ptr` is NULL, `class_idx` is out
// of range, the class has no SuperSlabHead, or no chunk in the list covers
// the address.
SuperSlab* find_chunk_for_ptr(void* ptr, int class_idx) {
    if (ptr == NULL) {
        return NULL;
    }
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return NULL;
    }

    SuperSlabHead* list_head = g_superslab_heads[class_idx];
    if (list_head == NULL) {
        return NULL;
    }

    const uintptr_t addr = (uintptr_t)ptr;

    // Linear scan of the singly linked chunk list.
    for (SuperSlab* cur = list_head->first_chunk; cur != NULL; cur = cur->next_chunk) {
        // The chunk header sits at the start of the mapping; its extent is
        // taken from the chunk's own lg_size rather than a global constant.
        const uintptr_t lo = (uintptr_t)cur;
        const size_t span = (size_t)1 << cur->lg_size;
        const uintptr_t hi = lo + span;
        if (addr >= lo && addr < hi) {
            return cur;
        }
    }

    return NULL;  // address is not inside any chunk of this class
}
// ============================================================================
// SuperSlab Deallocation
// ============================================================================
// Release a SuperSlab: unregister it, invalidate its header, then either park
// it in the SuperSlab cache or return its memory to the OS.
//
// Does nothing when `ss` is NULL or its magic does not match SUPERSLAB_MAGIC.
//
// Ordering: unregister -> clear magic -> cache/munmap. Unregistering first
// keeps new registry lookups from finding a SuperSlab that is going away.
void superslab_free(SuperSlab* ss) {
    if (!ss || ss->magic != SUPERSLAB_MAGIC) {
        return;  // Invalid SuperSlab
    }

    // Phase 1: Unregister SuperSlab from registry FIRST
    uintptr_t base = (uintptr_t)ss;
    hak_super_unregister(base);

    // Memory fence to ensure unregister is visible before magic clear
    atomic_thread_fence(memory_order_release);

    // Clear magic to prevent use-after-free (after unregister)
    ss->magic = 0;

    // Snapshot every header field needed after munmap(). The header lives
    // inside the mapping, so touching `ss` after the unmap is a
    // use-after-unmap.
    size_t ss_size = (size_t)1 << ss->lg_size;  // actual size (1MB or 2MB)
    int size_class = (int)ss->size_class;

    // Prefer recycling through the cache; only unmap when the cache refuses.
    int cached = ss_cache_push(ss->size_class, ss);
    if (cached) {
        ss_stats_cache_store();
        return;
    }

    fprintf(stderr, "[DEBUG ss_os_release] Freeing SuperSlab ss=%p class=%d size=%zu active=%u\n",
            (void*)ss, size_class, ss_size,
            atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed));

    munmap(ss, ss_size);
    // `ss` is dangling from here on; use only the snapshots taken above.

    // Update statistics for actual release to OS
    unsigned long long freed_total;
    pthread_mutex_lock(&g_superslab_lock);
    g_superslabs_freed++;
    if (size_class >= 0 && size_class < 8) {
        g_ss_freed_by_class[size_class]++;
    }
    g_bytes_allocated -= ss_size;
    // Read the counter while the lock is still held so the log line below
    // reports the value produced by this call.
    freed_total = (unsigned long long)g_superslabs_freed;
    pthread_mutex_unlock(&g_superslab_lock);

    fprintf(stderr, "[DEBUG ss_os_release] g_superslabs_freed now = %llu\n",
            freed_total);
}
// ============================================================================
// Slab Initialization within SuperSlab
// ============================================================================
// Initialize the metadata of slab `slab_idx` inside `ss` and mark it active.
//
// Parameters:
//   ss         - owning SuperSlab (ignored if NULL)
//   slab_idx   - slab index; must be in [0, ss_slabs_capacity(ss))
//   block_size - stride of one block in bytes (total block size); must be > 0
//   owner_tid  - owning thread id; only the low 16 bits are stored
//
// Returns silently, leaving the slab untouched, when any argument is invalid.
void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_t owner_tid) {
    if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) {
        return;
    }
    if (block_size == 0) {
        return;  // a zero stride would divide by zero below
    }

    // Calculate capacity using canonical tiny_slab_base_for() layout:
    //  - slab_data_start(ss, slab_idx) = SuperSlab base + slab_idx * SLAB_SIZE
    //  - tiny_slab_base_for(ss, 0)     = SuperSlab base + SUPERSLAB_SLAB0_DATA_OFFSET
    //  - tiny_slab_base_for(ss, i>0)   = slab_data_start (no gap)
    //
    // Phase 6-2.5: Use constants from hakmem_tiny_superslab_constants.h
    size_t usable_size = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE;

    // Phase E1-CORRECT: block_size is already the stride (from g_tiny_class_sizes),
    // so it is used as-is with no per-class adjustment.
    size_t stride = block_size;
    int capacity = (int)(usable_size / stride);

#if !HAKMEM_BUILD_RELEASE
    // Diagnostic: verify capacity for class 7 slab 0 (one-shot). The whole
    // check is compiled out of release builds, where it printed nothing anyway.
    if (ss->size_class == 7 && slab_idx == 0) {
        static _Atomic int g_cap_log_printed = 0;
        if (atomic_load(&g_cap_log_printed) == 0 &&
            atomic_exchange(&g_cap_log_printed, 1) == 0) {
            fprintf(stderr, "[SUPERSLAB_INIT] class 7 slab 0: usable_size=%zu stride=%zu capacity=%d\n",
                    usable_size, stride, capacity);
            fprintf(stderr, "[SUPERSLAB_INIT] Expected: 63488 / 1024 = 62 blocks\n");
            if (capacity != 62) {
                fprintf(stderr, "[SUPERSLAB_INIT] WARNING: capacity=%d (expected 62!)\n", capacity);
            }
        }
    }
#endif

    // Phase 6.24: Lazy freelist initialization.
    // No freelist is built here; a NULL freelist means linear allocation mode.
    TinySlabMeta* meta = &ss->slabs[slab_idx];
    meta->freelist = NULL;
    meta->used = 0;
    meta->capacity = (uint16_t)capacity;
    meta->carved = 0;                       // monotonic carve progress starts at zero
    meta->owner_tid = (uint16_t)owner_tid;  // truncated to 16 bits

    // Mark slab as active
    superslab_activate_slab(ss, slab_idx);
}
// ============================================================================
// Slab Bitmap Management
// ============================================================================
// Set the bitmap bit for `slab_idx` and bump the active-slab counter.
// No-op when `ss` is NULL, the index is out of range, or the slab is
// already marked active.
void superslab_activate_slab(SuperSlab* ss, int slab_idx) {
    if (ss == NULL) {
        return;
    }
    if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) {
        return;
    }

    const uint32_t bit = 1u << slab_idx;
    if ((ss->slab_bitmap & bit) != 0) {
        return;  // already active: keep the counter untouched
    }

    ss->slab_bitmap |= bit;
    ss->active_slabs++;
}
// Clear the bitmap bit for `slab_idx` and decrement the active-slab counter.
// No-op when `ss` is NULL, the index is out of range, or the slab is not
// currently marked active.
void superslab_deactivate_slab(SuperSlab* ss, int slab_idx) {
    if (ss == NULL) {
        return;
    }
    if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) {
        return;
    }

    const uint32_t bit = 1u << slab_idx;
    if ((ss->slab_bitmap & bit) == 0) {
        return;  // not active: nothing to undo
    }

    ss->slab_bitmap &= ~bit;
    ss->active_slabs--;
}
// Return the lowest slab index whose bitmap bit is clear, or -1 when `ss`
// is NULL or every slab is in use.
int superslab_find_free_slab(SuperSlab* ss) {
    if (ss == NULL) {
        return -1;
    }

    const int limit = ss_slabs_capacity(ss);

    // Fast reject: the active counter already says the SuperSlab is full.
    if ((int)ss->active_slabs >= limit) {
        return -1;
    }

    // Scan upward for the first zero bit.
    for (int idx = 0; idx < limit; idx++) {
        const uint32_t bit = 1u << idx;
        if (!(ss->slab_bitmap & bit)) {
            return idx;
        }
    }

    return -1;
}
// ============================================================================
// Statistics / Debugging
// ============================================================================
// Dump one SuperSlab's header fields and the metadata of every active slab
// to stdout. Prints "Invalid SuperSlab" when `ss` is NULL or its magic does
// not match SUPERSLAB_MAGIC.
void superslab_print_stats(SuperSlab* ss) {
    if (ss == NULL || ss->magic != SUPERSLAB_MAGIC) {
        printf("Invalid SuperSlab\n");
        return;
    }

    printf("=== SuperSlab Stats ===\n");
    printf("Address: %p\n", (void*)ss);
    printf("Size class: %u\n", ss->size_class);
    printf("Active slabs: %u / %d\n", ss->active_slabs, ss_slabs_capacity(ss));
    printf("Bitmap: 0x%08X\n", ss->slab_bitmap);
    printf("\nPer-slab details:\n");

    const int slab_total = ss_slabs_capacity(ss);
    for (int idx = 0; idx < slab_total; idx++) {
        // Only slabs whose bitmap bit is set are reported.
        if ((ss->slab_bitmap & (1u << idx)) == 0) {
            continue;
        }
        TinySlabMeta* m = &ss->slabs[idx];
        printf(" Slab %2d: used=%u/%u freelist=%p owner=%u\n",
               idx, m->used, m->capacity, m->freelist, m->owner_tid);
    }

    printf("\n");
}
// Global statistics: print the process-wide SuperSlab counters to stdout.
//
// The counters are read and printed while holding g_superslab_lock so the
// four lines describe one consistent snapshot.
//
// Values are widened to unsigned long long and printed with %llu, matching
// the way superslab_free() logs g_superslabs_freed; this keeps the format
// specifiers correct regardless of how wide `unsigned long` is.
void superslab_print_global_stats(void) {
    pthread_mutex_lock(&g_superslab_lock);
    printf("=== Global SuperSlab Stats ===\n");
    printf("SuperSlabs allocated: %llu\n",
           (unsigned long long)g_superslabs_allocated);
    printf("SuperSlabs freed: %llu\n",
           (unsigned long long)g_superslabs_freed);
    printf("SuperSlabs active: %llu\n",
           (unsigned long long)(g_superslabs_allocated - g_superslabs_freed));
    printf("Total bytes allocated: %llu MB\n",
           (unsigned long long)(g_bytes_allocated / (1024 * 1024)));
    pthread_mutex_unlock(&g_superslab_lock);
}
// ============================================================================
// Phase 8.3: ACE Statistics / Debugging
// ============================================================================
// Print one table row of ACE state per size class to stdout.
//
// The Curr/Targ columns show (1 << current_lg) and (1 << target_lg) in MB.
void superslab_ace_print_stats(void) {
    printf("=== ACE (Adaptive Cache Engine) Stats ===\n");

    // Row labels. NOTE(review): the class-7 diagnostic in
    // superslab_init_slab() expects 1024-byte blocks, so these labels look
    // stale; confirm against the real size-class table.
    static const char* const class_names[] = {"8B", "16B", "24B", "32B", "40B", "48B", "56B", "64B"};
    const size_t name_count = sizeof class_names / sizeof class_names[0];

    printf("Class Curr Targ Hot Allocs Refills Spills LiveBlks\n");
    printf("--------------------------------------------------------------\n");

    for (int i = 0; i < TINY_NUM_CLASSES_SS; i++) {
        SuperSlabACEState* c = &g_ss_ace[i];
        // Fall back to "?" instead of reading past the label table if
        // TINY_NUM_CLASSES_SS ever exceeds the number of labels.
        const char* label = ((size_t)i < name_count) ? class_names[i] : "?";
        printf("%-6s %2uMB %2uMB %4u %7u %8u %7u %9u\n",
               label,
               (1u << c->current_lg) / (1024 * 1024),
               (1u << c->target_lg) / (1024 * 1024),
               c->hot_score,
               c->alloc_count,
               c->refill_count,
               c->spill_count,
               c->live_blocks);
    }
    printf("\n");
}
// ============================================================================
// Phase 8.3: ACE Tick Function (Promotion/Demotion Logic)
// ============================================================================
// ACE timing parameters, expressed in nanoseconds.
// ACE_TICK_NS is the minimum interval between two ticks of one class;
// ACE_COOLDOWN_NS is the minimum interval between two target-size switches.
#define ACE_TICK_NS (150ULL * 1000 * 1000) // 150ms tick interval
#define ACE_COOLDOWN_NS (800ULL * 1000 * 1000) // 0.8s cooldown (anti-oscillation)
// Simplified thresholds for refill activity
// Both macros read g_ss_ace[k].refill_count directly, so `k` must be a valid class index.
#define HI_REFILL(k) (g_ss_ace[k].refill_count > 64) // High refill rate
#define MID_REFILL(k) (g_ss_ace[k].refill_count > 16) // Medium refill rate
// Object sizes per class (for capacity calculation)
// Must match TINY size classes: 8, 16, 24, 32, 40, 48, 56, 64 bytes
// NOTE(review): the class-7 diagnostic in superslab_init_slab() expects
// 1024-byte blocks, which contradicts the 64-byte entry here. If this table
// is stale, the ACE capacity/utilization math is off -- confirm against
// the real size-class table.
static const int g_tiny_obj_sizes[TINY_NUM_CLASSES_SS] = {8, 16, 24, 32, 40, 48, 56, 64};
// Periodic ACE step for size class `k` at timestamp `now` (nanoseconds).
//
// Rate-limited to one run per ACE_TICK_NS per class. Each run refreshes
// hot_score, possibly switches target_lg between 20 (1MB) and 21 (2MB)
// subject to an ACE_COOLDOWN_NS cooldown, and then decays the activity
// counters to a quarter of their value.
void hak_tiny_superslab_ace_tick(int k, uint64_t now) {
    if (k < 0 || k >= TINY_NUM_CLASSES_SS) return;

    SuperSlabACEState* state = &g_ss_ace[k];

    // Rate limiting: skip until ACE_TICK_NS has elapsed since the last tick.
    if (now - state->last_tick_ns < ACE_TICK_NS) return;

    // Block capacity of a 1MB and a 2MB SuperSlab for this class.
    int bytes_per_obj = g_tiny_obj_sizes[k];
    double blocks_in_1mb = (double)((1U << 20) / bytes_per_obj);
    double blocks_in_2mb = (double)((1U << 21) / bytes_per_obj);

    // Hotness score: 60% live blocks + 40% refill count, clamped to [0, 1000].
    double score = 0.6 * (double)state->live_blocks + 0.4 * (double)state->refill_count;
    if (score < 0) score = 0;
    if (score > 1000) score = 1000;
    state->hot_score = (uint16_t)score;

    // Per-class timestamp of the last target change (anti-oscillation).
    static uint64_t last_switch_ns[TINY_NUM_CLASSES_SS] = {0};
    int cooled_down = (now - last_switch_ns[k] >= ACE_COOLDOWN_NS);
    if (cooled_down) {
        int on_small = (state->current_lg <= 20);
        if (on_small) {
            // Promotion 1MB -> 2MB: live > 75% of capacity AND high refill rate.
            int promote = (state->live_blocks > 0.75 * blocks_in_1mb) && HI_REFILL(k);
            if (promote) {
                state->target_lg = 21;
                last_switch_ns[k] = now;
            }
        } else {
            // Demotion 2MB -> 1MB: live < 35% of capacity AND low refill rate.
            int demote = (state->live_blocks < 0.35 * blocks_in_2mb) && !MID_REFILL(k);
            if (demote) {
                state->target_lg = 20;
                last_switch_ns[k] = now;
            }
        }
    }

    // Decay the activity counters (each keeps a quarter of its value).
    // live_blocks is maintained by alloc/free and is not decayed here.
    state->alloc_count = state->alloc_count / 4;
    state->refill_count = state->refill_count / 4;
    state->spill_count = state->spill_count / 4;

    state->last_tick_ns = now;
}
// ============================================================================
// Phase 8.4: ACE Observer (Registry-based, zero hot-path overhead)
// ============================================================================
// Global debug flag (set once at initialization)
// Written by hak_tiny_superslab_ace_observe_all() from the HAKMEM_ACE_DEBUG
// environment variable; when non-zero, ace_observe_and_decide() logs its
// per-class decisions to stderr.
static int g_ace_debug = 0;
// Registry-based observation: scan all SuperSlabs for usage stats
static void ace_observe_and_decide(int k) {
if (k < 0 || k >= TINY_NUM_CLASSES_SS) return;
SuperSlabACEState* c = &g_ss_ace[k];
// Scan Registry to count SuperSlabs and total live blocks
int ss_count = 0;
uint32_t total_live = 0;
for (int i = 0; i < SUPER_REG_SIZE; i++) {
SuperRegEntry* e = &g_super_reg[i];
// Atomic read (thread-safe)
uintptr_t base = atomic_load_explicit(
(_Atomic uintptr_t*)&e->base,
memory_order_acquire);
if (base == 0) continue; // Empty slot
// Phase 8.4: Safety check - skip if ss pointer is invalid
if (!e->ss) continue;
if (e->ss->size_class != k) continue; // Wrong class
ss_count++;
// Phase 8.4: Scan all slabs to count used blocks (zero hot-path overhead)
uint32_t ss_live = 0;
int cap_scan = ss_slabs_capacity(e->ss);
for (int slab_idx = 0; slab_idx < cap_scan; slab_idx++) {
TinySlabMeta* meta = &e->ss->slabs[slab_idx];
// Relaxed read is OK (stats only, no hot-path impact)
ss_live += meta->used;
}
total_live += ss_live;
}
// Calculate utilization
int obj_size = g_tiny_obj_sizes[k];
uint8_t current_lg = atomic_load_explicit(
(_Atomic uint8_t*)&c->current_lg,
memory_order_relaxed);
uint32_t capacity = (ss_count > 0) ? ss_count * ((1U << current_lg) / obj_size) : 1;
double util = (double)total_live / capacity;
// Update hot_score (for debugging/visualization)
c->hot_score = (uint16_t)(util * 1000);
if (c->hot_score > 1000) c->hot_score = 1000;
// Promotion/Demotion decision
uint8_t new_target = current_lg;
if (current_lg <= 20) {
// Promotion: 1MB → 2MB
if (util > 0.75) {
new_target = 21;
}
} else {
// Demotion: 2MB → 1MB
if (util < 0.35) {
new_target = 20;
}
}
// Debug output (if enabled)
if (g_ace_debug && ss_count > 0) {
fprintf(stderr, "[ACE] Class %d (%dB): ss=%d live=%u cap=%u util=%.2f%% lg=%d->%d hot=%d\n",
k, obj_size, ss_count, total_live, capacity, util * 100.0,
current_lg, new_target, c->hot_score);
}
// Atomic write (thread-safe)
if (new_target != current_lg) {
atomic_store_explicit(
(_Atomic uint8_t*)&c->target_lg,
new_target,
memory_order_release);
if (g_ace_debug) {
fprintf(stderr, "[ACE] *** Class %d: SIZE CHANGE %dMB -> %dMB (util=%.2f%%)\n",
k, 1 << (current_lg - 20), 1 << (new_target - 20), util * 100.0);
}
}
}
// Called from Learner thread (background observation)
void hak_tiny_superslab_ace_observe_all(void) {
// Initialize debug flag once
static int initialized = 0;
if (!initialized) {
const char* ace_debug = getenv("HAKMEM_ACE_DEBUG");
g_ace_debug = (ace_debug && atoi(ace_debug) != 0) ? 1 : 0;
initialized = 1;
}
for (int k = 0; k < TINY_NUM_CLASSES_SS; k++) {
ace_observe_and_decide(k);
}
}