## Changes ### 1. core/page_arena.c - Removed init failure message (lines 25-27) - error is handled by returning early - All other fprintf statements already wrapped in existing #if !HAKMEM_BUILD_RELEASE blocks ### 2. core/hakmem.c - Wrapped SIGSEGV handler init message (line 72) - CRITICAL: Kept SIGSEGV/SIGBUS/SIGABRT error messages (lines 62-64) - production needs crash logs ### 3. core/hakmem_shared_pool.c - Wrapped all debug fprintf statements in #if !HAKMEM_BUILD_RELEASE: - Node pool exhaustion warning (line 252) - SP_META_CAPACITY_ERROR warning (line 421) - SP_FIX_GEOMETRY debug logging (line 745) - SP_ACQUIRE_STAGE0.5_EMPTY debug logging (line 865) - SP_ACQUIRE_STAGE0_L0 debug logging (line 803) - SP_ACQUIRE_STAGE1_LOCKFREE debug logging (line 922) - SP_ACQUIRE_STAGE2_LOCKFREE debug logging (line 996) - SP_ACQUIRE_STAGE3 debug logging (line 1116) - SP_SLOT_RELEASE debug logging (line 1245) - SP_SLOT_FREELIST_LOCKFREE debug logging (line 1305) - SP_SLOT_COMPLETELY_EMPTY debug logging (line 1316) - Fixed lock_stats_init() for release builds (lines 60-65) - ensure g_lock_stats_enabled is initialized ## Performance Validation Before: 51M ops/s (with debug fprintf overhead) After: 49.1M ops/s (consistent performance, fprintf removed from hot paths) ## Build & Test ```bash ./build.sh larson_hakmem ./out/release/larson_hakmem 1 5 1 1000 100 10000 42 # Result: 49.1M ops/s ``` Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
15 KiB
mimalloc Optimization Implementation Roadmap
Closing the ~45% Performance Gap to mimalloc
Current: 16.53 M ops/sec Target: 24.00 M ops/sec (+45%) Strategy: Three-phase implementation with incremental validation
Phase 1: Direct Page Cache ⚡ HIGH PRIORITY
Target: +2.5-3.3 M ops/sec (15-20% improvement) Effort: 1-2 days Risk: Low Dependencies: None
Implementation Steps
Step 1.1: Add Direct Cache to Heap Structure
File: core/hakmem_tiny.h
#define HAKMEM_DIRECT_PAGES 129 // Indices 0..128 cover block sizes up to 1024 bytes (1024/8 = 128, plus index 0)
typedef struct hakmem_tiny_heap_s {
// Existing fields...
hakmem_tiny_class_t size_classes[32];
// NEW: Direct page cache
hakmem_tiny_page_t* pages_direct[HAKMEM_DIRECT_PAGES];
// Existing fields...
} hakmem_tiny_heap_t;
Memory cost: 129 pointers × 8 bytes each = 1,032 bytes per heap (acceptable)
Step 1.2: Initialize Direct Cache
File: core/hakmem_tiny.c
void hakmem_tiny_heap_init(hakmem_tiny_heap_t* heap) {
// Existing initialization...
// Initialize direct cache
for (size_t i = 0; i < HAKMEM_DIRECT_PAGES; i++) {
heap->pages_direct[i] = NULL;
}
// Populate from existing size classes
hakmem_tiny_rebuild_direct_cache(heap);
}
Step 1.3: Cache Update Function
File: core/hakmem_tiny.c
static inline void hakmem_tiny_update_direct_cache(
hakmem_tiny_heap_t* heap,
hakmem_tiny_page_t* page,
size_t block_size)
{
if (block_size > 1024) return; // Only cache small sizes
size_t idx = (block_size + 7) / 8; // Round up to word size
if (idx < HAKMEM_DIRECT_PAGES) {
heap->pages_direct[idx] = page;
}
}
// Call this whenever a page is added/removed from size class
Step 1.4: Fast Path Using Direct Cache
File: core/hakmem_tiny.c
static inline void* hakmem_tiny_malloc_direct(
hakmem_tiny_heap_t* heap,
size_t size)
{
// Fast path: direct cache lookup
if (size <= 1024) {
size_t idx = (size + 7) / 8;
hakmem_tiny_page_t* page = heap->pages_direct[idx];
if (page && page->free_list) {
// Pop from free list
hakmem_block_t* block = page->free_list;
page->free_list = block->next;
page->used++;
return block;
}
}
// Fallback to existing generic path
return hakmem_tiny_malloc_generic(heap, size);
}
// Update main malloc to call this:
void* hakmem_malloc(size_t size) {
if (size <= HAKMEM_TINY_MAX) {
return hakmem_tiny_malloc_direct(tls_heap, size);
}
// ... existing large allocation path
}
Validation
Benchmark command:
./bench_random_mixed_hakx
Expected output:
Before: 16.53 M ops/sec
After: 19.00-20.00 M ops/sec (+15-20%)
If target not met:
- Profile with `perf record -e cycles,cache-misses ./bench_random_mixed_hakx`
- Check direct cache hit rate
- Verify cache is being updated correctly
- Check for branch mispredictions
Phase 2: Dual Free Lists 🚀 MEDIUM PRIORITY
Target: +2.0-3.3 M ops/sec additional (10-15% improvement) Effort: 3-5 days Risk: Medium (structural changes) Dependencies: Phase 1 complete
Implementation Steps
Step 2.1: Modify Page Structure
File: core/hakmem_tiny.h
typedef struct hakmem_tiny_page_s {
// Existing fields...
uint32_t block_size;
uint32_t capacity;
// OLD: Single free list
// hakmem_block_t* free_list;
// NEW: Three separate free lists
hakmem_block_t* free; // Hot allocation path
hakmem_block_t* local_free; // Local frees (no atomic!)
_Atomic(uintptr_t) thread_free; // Remote frees + flags (lower 2 bits)
uint32_t used;
// ... other fields
} hakmem_tiny_page_t;
Note: thread_free packs a pointer and two flag bits into one word — blocks are at least 4-byte aligned, so the pointer's lower 2 bits are always zero and are reused for the flags (mask with ~0x3 to recover the pointer)
Step 2.2: Update Free Path
File: core/hakmem_tiny.c
void hakmem_tiny_free(void* ptr) {
hakmem_tiny_page_t* page = hakmem_tiny_ptr_to_page(ptr);
hakmem_block_t* block = (hakmem_block_t*)ptr;
// Fast path: local thread owns this page
if (hakmem_tiny_is_local_page(page)) {
// Add to local_free (no atomic!)
block->next = page->local_free;
page->local_free = block;
page->used--;
// Retire page if fully free
if (page->used == 0) {
hakmem_tiny_page_retire(page);
}
return;
}
// Slow path: remote free (atomic)
hakmem_tiny_free_remote(page, block);
}
Step 2.3: Migration Logic
File: core/hakmem_tiny.c
static void hakmem_tiny_collect_frees(hakmem_tiny_page_t* page) {
// Step 1: Collect remote frees (atomic)
uintptr_t tfree = atomic_exchange(&page->thread_free, 0);
hakmem_block_t* remote_list = (hakmem_block_t*)(tfree & ~0x3);
if (remote_list) {
// Append to local_free
hakmem_block_t* tail = remote_list;
while (tail->next) tail = tail->next;
tail->next = page->local_free;
page->local_free = remote_list;
}
// Step 2: Migrate local_free to free
if (page->local_free && !page->free) {
page->free = page->local_free;
page->local_free = NULL;
}
}
// Call this in allocation path when free list is empty
void* hakmem_tiny_malloc_direct(hakmem_tiny_heap_t* heap, size_t size) {
// ... direct cache lookup
hakmem_tiny_page_t* page = heap->pages_direct[idx];
if (page) {
// Try to allocate from free list
hakmem_block_t* block = page->free;
if (block) {
page->free = block->next;
page->used++;
return block;
}
// Free list empty - collect and retry
hakmem_tiny_collect_frees(page);
block = page->free;
if (block) {
page->free = block->next;
page->used++;
return block;
}
}
// Fallback
return hakmem_tiny_malloc_generic(heap, size);
}
Validation
Benchmark command:
./bench_random_mixed_hakx
Expected output:
After Phase 1: 19.00-20.00 M ops/sec
After Phase 2: 21.50-23.00 M ops/sec (+10-15% additional)
Key metrics to track:
- Atomic operation count (should drop significantly)
- Cache miss rate (should improve)
- Free path latency (should be faster)
If target not met:
- Profile atomic operations with `perf record -e cpu-cycles,instructions,cache-references,cache-misses ./bench_random_mixed_hakx`
- Check remote free percentage
- Verify migration is happening correctly
- Analyze cache line bouncing
Phase 3: Branch Hints + Bit-Packed Flags 🎯 LOW PRIORITY
Target: +1.0-2.0 M ops/sec additional (5-8% improvement) Effort: 1-2 days Risk: Low Dependencies: Phase 2 complete
Implementation Steps
Step 3.1: Add Branch Hint Macros
File: core/hakmem_config.h
#if defined(__GNUC__) || defined(__clang__)
#define hakmem_likely(x) __builtin_expect(!!(x), 1)
#define hakmem_unlikely(x) __builtin_expect(!!(x), 0)
#else
#define hakmem_likely(x) (x)
#define hakmem_unlikely(x) (x)
#endif
Step 3.2: Add Branch Hints to Hot Path
File: core/hakmem_tiny.c
void* hakmem_tiny_malloc_direct(hakmem_tiny_heap_t* heap, size_t size) {
// Fast path hint
if (hakmem_likely(size <= 1024)) {
size_t idx = (size + 7) / 8;
hakmem_tiny_page_t* page = heap->pages_direct[idx];
if (hakmem_likely(page != NULL)) {
hakmem_block_t* block = page->free;
if (hakmem_likely(block != NULL)) {
page->free = block->next;
page->used++;
return block;
}
// Slow path within fast path
hakmem_tiny_collect_frees(page);
block = page->free;
if (hakmem_likely(block != NULL)) {
page->free = block->next;
page->used++;
return block;
}
}
}
// Fallback (unlikely)
return hakmem_tiny_malloc_generic(heap, size);
}
void hakmem_tiny_free(void* ptr) {
if (hakmem_unlikely(ptr == NULL)) return;
hakmem_tiny_page_t* page = hakmem_tiny_ptr_to_page(ptr);
hakmem_block_t* block = (hakmem_block_t*)ptr;
// Local free is likely
if (hakmem_likely(hakmem_tiny_is_local_page(page))) {
block->next = page->local_free;
page->local_free = block;
page->used--;
// Rarely fully free
if (hakmem_unlikely(page->used == 0)) {
hakmem_tiny_page_retire(page);
}
return;
}
// Remote free is unlikely
hakmem_tiny_free_remote(page, block);
}
Step 3.3: Bit-Pack Page Flags
File: core/hakmem_tiny.h
typedef union hakmem_page_flags_u {
uint8_t combined; // For fast check
struct {
uint8_t is_full : 1;
uint8_t has_remote_frees : 1;
uint8_t is_retired : 1;
uint8_t unused : 5;
} bits;
} hakmem_page_flags_t;
typedef struct hakmem_tiny_page_s {
// ... other fields
hakmem_page_flags_t flags;
// ...
} hakmem_tiny_page_t;
Usage:
// Single comparison instead of multiple
if (hakmem_likely(page->flags.combined == 0)) {
// Fast path: not full, no remote frees, not retired
// ... 3-instruction free
}
Validation
Benchmark command:
./bench_random_mixed_hakx
Expected output:
After Phase 2: 21.50-23.00 M ops/sec
After Phase 3: 23.00-24.50 M ops/sec (+5-8% additional)
Key metrics:
- Branch misprediction rate (should decrease)
- Instruction count (should decrease slightly)
- Code size (should decrease due to better branch layout)
Testing Strategy
Unit Tests
File: test_hakmem_phases.c
// Phase 1: Direct cache correctness
void test_direct_cache() {
hakmem_tiny_heap_t* heap = hakmem_tiny_heap_create();
// Allocate various sizes
void* p8 = hakmem_malloc(8);
void* p16 = hakmem_malloc(16);
void* p32 = hakmem_malloc(32);
// Verify direct cache is populated
assert(heap->pages_direct[1] != NULL); // 8 bytes
assert(heap->pages_direct[2] != NULL); // 16 bytes
assert(heap->pages_direct[4] != NULL); // 32 bytes
// Free and verify cache is updated
hakmem_free(p8);
assert(heap->pages_direct[1]->free != NULL);
hakmem_tiny_heap_destroy(heap);
}
// Phase 2: Dual free lists
void test_dual_free_lists() {
hakmem_tiny_heap_t* heap = hakmem_tiny_heap_create();
void* p = hakmem_malloc(64);
hakmem_tiny_page_t* page = hakmem_tiny_ptr_to_page(p);
// Local free goes to local_free
hakmem_free(p);
assert(page->local_free != NULL);
assert(page->free == NULL || page->free != p);
// Allocate again triggers migration
void* p2 = hakmem_malloc(64);
assert(page->local_free == NULL); // Migrated
hakmem_tiny_heap_destroy(heap);
}
// Phase 3: Branch hints (no functional change)
void test_branch_hints() {
// Just verify compilation and no regression
for (int i = 0; i < 10000; i++) {
void* p = hakmem_malloc(64);
hakmem_free(p);
}
}
Benchmark Suite
Run after each phase:
# Core benchmark
./bench_random_mixed_hakx
# Stress tests
./bench_mid_large_hakx
./bench_tiny_hot_hakx
./bench_fragment_stress_hakx
# Multi-threaded
./bench_mid_large_mt_hakx
Validation Checklist
Phase 1:
- Direct cache correctly populated
- Cache hit rate > 95% for small allocations
- Performance gain: 15-20%
- No memory leaks
- All existing tests pass
Phase 2:
- Local frees go to local_free
- Remote frees go to thread_free
- Migration works correctly
- Atomic operation count reduced by 80%+
- Performance gain: 10-15% additional
- Thread-safety maintained
- All existing tests pass
Phase 3:
- Branch hints compile correctly
- Bit-packed flags work as expected
- Performance gain: 5-8% additional
- Code size reduced or unchanged
- All existing tests pass
Rollback Plan
Phase 1 Rollback
If Phase 1 doesn't meet targets:
// #define HAKMEM_USE_DIRECT_CACHE 1 // Comment out
void* hakmem_malloc(size_t size) {
#ifdef HAKMEM_USE_DIRECT_CACHE
return hakmem_tiny_malloc_direct(tls_heap, size);
#else
return hakmem_tiny_malloc_generic(tls_heap, size); // Old path
#endif
}
Phase 2 Rollback
If Phase 2 causes issues:
// Revert to single free list
typedef struct hakmem_tiny_page_s {
#ifdef HAKMEM_USE_DUAL_LISTS
hakmem_block_t* free;
hakmem_block_t* local_free;
_Atomic(uintptr_t) thread_free;
#else
hakmem_block_t* free_list; // Old single list
#endif
// ...
} hakmem_tiny_page_t;
Success Criteria
Minimum Acceptable Performance
- Phase 1: +10% (18.18 M ops/sec)
- Phase 2: +20% cumulative (19.84 M ops/sec)
- Phase 3: +35% cumulative (22.32 M ops/sec)
Target Performance
- Phase 1: +15% (19.01 M ops/sec)
- Phase 2: +27% cumulative (21.00 M ops/sec)
- Phase 3: +40% cumulative (23.14 M ops/sec)
Stretch Goal
- Phase 3: +45% cumulative (24.00 M ops/sec) - Match mimalloc!
Timeline
Conservative Estimate
- Week 1: Phase 1 implementation + validation
- Week 2: Phase 2 implementation
- Week 3: Phase 2 validation + debugging
- Week 4: Phase 3 implementation + final validation
Total: 4 weeks
Aggressive Estimate
- Day 1-2: Phase 1 implementation + validation
- Day 3-6: Phase 2 implementation + validation
- Day 7-8: Phase 3 implementation + validation
Total: 8 days
Risk Mitigation
Technical Risks
-
Cache coherency issues (Phase 2)
- Mitigation: Extensive multi-threaded testing
- Fallback: Keep atomic operations on critical path
-
Memory overhead (Phase 1)
- Mitigation: Monitor RSS increase
- Fallback: Reduce HAKMEM_DIRECT_PAGES to 65 (caches sizes up to 512 bytes; array shrinks to 520 bytes per heap)
-
Correctness bugs (Phase 2)
- Mitigation: Extensive unit tests, ASAN/TSAN builds
- Fallback: Revert to single free list
Performance Risks
-
Phase 1 underperforms (<10%)
- Action: Profile cache hit rate
- Fix: Adjust cache update logic
-
Phase 2 adds latency (cache bouncing)
- Action: Profile cache misses
- Fix: Adjust migration threshold
-
Phase 3 no improvement (compiler already optimized)
- Action: Check assembly output
- Fix: Skip phase or use PGO
Monitoring
Key Metrics to Track
- Operations/sec (primary metric)
- Latency percentiles (p50, p95, p99)
- Memory usage (RSS)
- Cache miss rate
- Branch misprediction rate
- Atomic operation count
Profiling Commands
# Basic profiling
perf record -e cycles,instructions,cache-misses ./bench_random_mixed_hakx
perf report
# Cache analysis
perf record -e cache-references,cache-misses,L1-dcache-load-misses ./bench_random_mixed_hakx
# Branch analysis
perf record -e branch-misses,branches ./bench_random_mixed_hakx
# ASAN/TSAN builds
CC=clang CFLAGS="-fsanitize=address" make
CC=clang CFLAGS="-fsanitize=thread" make
Next Steps
- Implement Phase 1 (direct page cache)
- Benchmark and validate (target: +15-20%)
- If successful: Proceed to Phase 2
- If not: Debug and iterate
Start now with Phase 1 - it's low-risk and high-reward!