feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓

Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc

Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)

Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)

Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)

Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate

Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)

Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis

Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files

Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@ -101,10 +101,30 @@ static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
|
||||
blk, offset % blk);
|
||||
tiny_failfast_abort_ptr("alloc_pop_misalign", ss, slab_idx, block, "freelist_head_corrupt");
|
||||
}
|
||||
|
||||
size_t index = offset / blk;
|
||||
if (index >= meta->capacity) {
|
||||
fprintf(stderr, "[ALLOC_CORRUPT] Freelist head out of bounds! block=%p index=%zu cap=%u\n",
|
||||
block, index, meta->capacity);
|
||||
tiny_failfast_abort_ptr("alloc_pop_oob", ss, slab_idx, block, "freelist_head_oob");
|
||||
}
|
||||
}
|
||||
|
||||
meta->freelist = *(void**)block; // Pop from freelist
|
||||
meta->used++;
|
||||
|
||||
if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
|
||||
if (__builtin_expect(meta->used > meta->capacity, 0)) {
|
||||
fprintf(stderr, "[ALLOC_CORRUPT] meta->used overflow on freelist alloc: used=%u cap=%u cls=%u slab=%d\n",
|
||||
meta->used, meta->capacity, ss->size_class, slab_idx);
|
||||
tiny_failfast_abort_ptr("alloc_used_overflow",
|
||||
ss,
|
||||
slab_idx,
|
||||
block,
|
||||
"freelist_used_over_capacity");
|
||||
}
|
||||
}
|
||||
|
||||
tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0);
|
||||
tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0);
|
||||
return block;
|
||||
@ -119,6 +139,72 @@ static SuperSlab* superslab_refill(int class_idx) {
|
||||
g_superslab_refill_calls_dbg[class_idx]++;
|
||||
#endif
|
||||
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
|
||||
|
||||
// ============================================================================
|
||||
// Phase 2a: Dynamic Expansion - Initialize SuperSlabHead if needed
|
||||
// ============================================================================
|
||||
extern SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS];
|
||||
extern SuperSlabHead* init_superslab_head(int class_idx);
|
||||
extern int expand_superslab_head(SuperSlabHead* head);
|
||||
|
||||
SuperSlabHead* head = g_superslab_heads[class_idx];
|
||||
if (!head) {
|
||||
// First-time initialization for this class
|
||||
head = init_superslab_head(class_idx);
|
||||
if (!head) {
|
||||
extern __thread int g_hakmem_lock_depth;
|
||||
g_hakmem_lock_depth++;
|
||||
fprintf(stderr, "[DEBUG] superslab_refill: Failed to init SuperSlabHead for class %d\n", class_idx);
|
||||
g_hakmem_lock_depth--;
|
||||
return NULL; // Critical failure
|
||||
}
|
||||
g_superslab_heads[class_idx] = head;
|
||||
}
|
||||
|
||||
// Try current chunk first (fast path)
|
||||
SuperSlab* current_chunk = head->current_chunk;
|
||||
if (current_chunk) {
|
||||
// Check if current chunk has available slabs
|
||||
int chunk_cap = ss_slabs_capacity(current_chunk);
|
||||
if (current_chunk->slab_bitmap != 0x00000000) {
|
||||
// Current chunk has free slabs, use normal refill logic below
|
||||
// (Will be handled by existing code that checks tls->ss)
|
||||
if (tls->ss != current_chunk) {
|
||||
// Update TLS to point to current chunk
|
||||
tls->ss = current_chunk;
|
||||
}
|
||||
} else {
|
||||
// Current chunk exhausted (bitmap = 0x00000000), try to expand
|
||||
extern __thread int g_hakmem_lock_depth;
|
||||
g_hakmem_lock_depth++;
|
||||
fprintf(stderr, "[HAKMEM] SuperSlab chunk exhausted for class %d (bitmap=0x00000000), expanding...\n", class_idx);
|
||||
g_hakmem_lock_depth--;
|
||||
|
||||
// Try to expand by allocating a new chunk
|
||||
if (expand_superslab_head(head) < 0) {
|
||||
g_hakmem_lock_depth++;
|
||||
fprintf(stderr, "[HAKMEM] CRITICAL: Failed to expand SuperSlabHead for class %d (system OOM)\n", class_idx);
|
||||
g_hakmem_lock_depth--;
|
||||
return NULL; // True system OOM
|
||||
}
|
||||
|
||||
// Update current_chunk and tls->ss to point to new chunk
|
||||
current_chunk = head->current_chunk;
|
||||
tls->ss = current_chunk;
|
||||
|
||||
// Verify new chunk has free slabs
|
||||
if (!current_chunk || current_chunk->slab_bitmap == 0x00000000) {
|
||||
g_hakmem_lock_depth++;
|
||||
fprintf(stderr, "[HAKMEM] CRITICAL: New chunk still has no free slabs for class %d\n", class_idx);
|
||||
g_hakmem_lock_depth--;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Continue with existing refill logic
|
||||
// ============================================================================
|
||||
static int g_ss_adopt_en = -1; // env: HAKMEM_TINY_SS_ADOPT=1; default auto-on if remote seen
|
||||
if (g_ss_adopt_en == -1) {
|
||||
char* e = getenv("HAKMEM_TINY_SS_ADOPT");
|
||||
@ -388,6 +474,12 @@ static SuperSlab* superslab_refill(int class_idx) {
|
||||
if (!g_superslab_refill_debug_once) {
|
||||
g_superslab_refill_debug_once = 1;
|
||||
int err = errno;
|
||||
|
||||
// CRITICAL FIX (BUG #11): Protect fprintf() with lock_depth
|
||||
// fprintf() can call malloc for buffering → must use libc malloc
|
||||
extern __thread int g_hakmem_lock_depth;
|
||||
g_hakmem_lock_depth++;
|
||||
|
||||
fprintf(stderr,
|
||||
"[DEBUG] superslab_refill returned NULL (OOM) detail: class=%d prev_ss=%p active=%u bitmap=0x%08x prev_meta=%p used=%u cap=%u slab_idx=%u reused_freelist=%d free_idx=%d errno=%d\n",
|
||||
class_idx,
|
||||
@ -401,6 +493,8 @@ static SuperSlab* superslab_refill(int class_idx) {
|
||||
reused_slabs,
|
||||
free_idx_attempted,
|
||||
err);
|
||||
|
||||
g_hakmem_lock_depth--;
|
||||
}
|
||||
// Clear errno to avoid confusion in fallback paths
|
||||
errno = 0;
|
||||
|
||||
Reference in New Issue
Block a user