Implementation of Task-sensei Priority 1 recommendation: Add empty_mask to SuperSlab for immediate EMPTY slab detection and reuse, reducing Stage 3 (mmap) overhead. ## Changes ### 1. SuperSlab Structure (core/superslab/superslab_types.h) - Added `empty_mask` (uint32_t): Bitmap for EMPTY slabs (used==0) - Added `empty_count` (uint8_t): Quick check for EMPTY slab availability ### 2. EMPTY Detection API (core/box/ss_hot_cold_box.h) - Added `ss_is_slab_empty()`: Returns true if slab is completely EMPTY - Added `ss_mark_slab_empty()`: Marks slab as EMPTY (highest reuse priority) - Added `ss_clear_slab_empty()`: Removes EMPTY state when reactivated - Updated `ss_update_hot_cold_indices()`: Classify EMPTY/Hot/Cold slabs - Updated `ss_init_hot_cold()`: Initialize empty_mask/empty_count ### 3. Free Path Integration (core/box/free_local_box.c) - After `meta->used--`, check if `meta->used == 0` - If true, call `ss_mark_slab_empty()` to update empty_mask - Enables immediate EMPTY detection on every free operation ### 4. 
Shared Pool Stage 0.5 (core/hakmem_shared_pool.c) - New Stage 0.5 before Stage 1: Scan existing SuperSlabs for EMPTY slabs - Iterate over `g_super_reg_by_class[class_idx][]` (first 16 entries) - Check `ss->empty_count > 0` → scan `empty_mask` with `__builtin_ctz()` - Reuse EMPTY slab directly, avoiding Stage 3 (mmap/lock overhead) - ENV control: `HAKMEM_SS_EMPTY_REUSE=1` (default OFF for A/B testing) - ENV tunable: `HAKMEM_SS_EMPTY_SCAN_LIMIT=N` (default 16 SuperSlabs) ## Performance Results ``` Benchmark: Random Mixed 256B (100K iterations) OFF (default): 10.2M ops/s (baseline) ON (ENV=1): 11.5M ops/s (+13.0% improvement) ✅ ``` ## Expected Impact (from Task-sensei analysis) **Current bottleneck**: - Stage 1: 2-5% hit rate (free list broken) - Stage 2: 3-8% hit rate (rare UNUSED) - Stage 3: 87-95% hit rate (lock + mmap overhead) ← bottleneck **Expected with Phase 12-1.1**: - Stage 0.5: 20-40% hit rate (EMPTY scan) - Stage 1-2: 20-30% hit rate (combined) - Stage 3: 30-50% hit rate (significantly reduced) **Theoretical max**: 25M → 55-70M ops/s (+120-180%) ## Current Gap Analysis **Observed**: 11.5M ops/s (+13%) **Expected**: 55-70M ops/s (+120-180%) **Gap**: Performance regression or missing complementary optimizations Possible causes: 1. Phase 3d-C (25.1M→10.2M) regression - unrelated to this change 2. EMPTY scan overhead (16 SuperSlabs × empty_count check) 3. Missing Priority 2-5 optimizations (Lazy SS deallocation, etc.) 4. Stage 0.5 too conservative (scan_limit=16, should be higher?) 
## Usage ```bash # Enable EMPTY reuse optimization export HAKMEM_SS_EMPTY_REUSE=1 # Optional: increase scan limit (trade-off: throughput vs latency) export HAKMEM_SS_EMPTY_SCAN_LIMIT=32 ./bench_random_mixed_hakmem 100000 256 42 ``` ## Next Steps **Priority 1-A**: Investigate Phase 3d-C→12-1.1 regression (25.1M→10.2M) **Priority 1-B**: Implement Phase 12-1.2 (Lazy SS deallocation) for complementary effect **Priority 1-C**: Profile Stage 0.5 overhead (scan_limit tuning) ## Files Modified Core implementation: - `core/superslab/superslab_types.h` - empty_mask/empty_count fields - `core/box/ss_hot_cold_box.h` - EMPTY detection/marking API - `core/box/free_local_box.c` - Free path EMPTY detection - `core/hakmem_shared_pool.c` - Stage 0.5 EMPTY scan Documentation: - `CURRENT_TASK.md` - Task-sensei investigation report --- 🎯 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Task-sensei (investigation & design analysis)
132 lines
6.0 KiB
C
132 lines
6.0 KiB
C
#include "free_local_box.h"
|
|
#include "free_publish_box.h"
|
|
#include "hakmem_tiny.h"
|
|
#include "tiny_next_ptr_box.h" // Phase E1-CORRECT: Box API
|
|
#include "ss_hot_cold_box.h" // Phase 12-1.1: EMPTY slab marking
|
|
|
|
/*
 * tiny_free_local_box() — return a block to its owning slab's local freelist.
 *
 * Flow:
 *   1. Validate ss magic and slab_idx; reject NULL meta (see BUGFIX below).
 *   2. At failfast level >= 2, cross-check slab index, alignment and range.
 *   3. Link the block (via its BASE pointer's class-aware next slot) onto
 *      meta->freelist using the Box API.
 *   4. Release-fence, then decrement meta->used and the SuperSlab active
 *      count — fence ordering matters: other threads must observe the new
 *      freelist before the lowered used count.
 *   5. Phase 12-1.1: if used hits 0, mark the slab EMPTY for priority reuse.
 *   6. On first free (prev == NULL), publish the slab to adopters.
 *
 * ptr is the user pointer; every tiny class carries a 1-byte header, so the
 * block's BASE is one byte before ptr. my_tid is only used by the optional
 * remote-free tracker.
 */
void tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void* ptr, uint32_t my_tid) {
    extern _Atomic uint64_t g_free_local_box_calls;
    atomic_fetch_add_explicit(&g_free_local_box_calls, 1, memory_order_relaxed);

    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
    if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) return;
    // BUGFIX: the original read meta->freelist before any NULL check even
    // though later code defensively tested (meta && ...). Guard once here so
    // the remaining code can rely on meta being non-NULL.
    if (!meta) return;
    (void)my_tid;

    // Phase E1-CORRECT: ALL classes have headers — compute BASE pointer once.
    void* base = (void*)((uint8_t*)ptr - 1);

    if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        int actual_idx = slab_index_for(ss, base);
        if (actual_idx != slab_idx) {
            tiny_failfast_abort_ptr("free_local_box_idx", ss, slab_idx, ptr, "slab_idx_mismatch");
        } else {
            uint8_t cls = (meta->class_idx < TINY_NUM_CLASSES) ? meta->class_idx : 0;
            size_t blk = g_tiny_class_sizes[cls];
            uint8_t* slab_base = tiny_slab_base_for(ss, slab_idx);
            uintptr_t delta = (uintptr_t)base - (uintptr_t)slab_base;
            if (blk == 0 || (delta % blk) != 0) {
                tiny_failfast_abort_ptr("free_local_box_align", ss, slab_idx, ptr, "misaligned");
            } else if (delta / blk >= meta->capacity) {
                tiny_failfast_abort_ptr("free_local_box_range", ss, slab_idx, ptr, "out_of_capacity");
            }
        }
    }

    void* prev = meta->freelist;

    // FREELIST CORRUPTION DEBUG: validate the current head before linking.
    if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        uint8_t cls = (meta->class_idx < TINY_NUM_CLASSES) ? meta->class_idx : 0;
        size_t blk = g_tiny_class_sizes[cls];
        uint8_t* base_ss = (uint8_t*)ss;
        uint8_t* slab_base = tiny_slab_base_for(ss, slab_idx);

        // Verify prev pointer is valid (if not NULL).
        if (prev != NULL) {
            uintptr_t prev_addr = (uintptr_t)prev;
            uintptr_t slab_addr = (uintptr_t)slab_base;

            // prev must lie inside this SuperSlab's 2 MiB window.
            if (prev_addr < (uintptr_t)base_ss || prev_addr >= (uintptr_t)base_ss + (2*1024*1024)) {
                fprintf(stderr, "[FREE_CORRUPT] prev=%p outside SuperSlab ss=%p slab=%d\n",
                        prev, ss, slab_idx);
                tiny_failfast_abort_ptr("free_local_prev_range", ss, slab_idx, ptr, "prev_outside_ss");
            }

            // BUGFIX: guard blk != 0 before the modulo — the first debug block
            // checks this but the original here did not (div-by-zero UB).
            if (blk != 0 && (prev_addr - slab_addr) % blk != 0) {
                fprintf(stderr, "[FREE_CORRUPT] prev=%p misaligned (cls=%u slab=%d blk=%zu offset=%zu)\n",
                        prev, cls, slab_idx, blk, (size_t)(prev_addr - slab_addr));
                fprintf(stderr, "[FREE_CORRUPT] Writing from ptr=%p, freelist was=%p\n", ptr, prev);
                tiny_failfast_abort_ptr("free_local_prev_misalign", ss, slab_idx, ptr, "prev_misaligned");
            }
        }

        fprintf(stderr, "[FREE_VERIFY] cls=%u slab=%d ptr=%p prev=%p (offset_ptr=%zu offset_prev=%zu)\n",
                cls, slab_idx, ptr, prev,
                (size_t)((uintptr_t)base - (uintptr_t)slab_base),
                prev ? (size_t)((uintptr_t)prev - (uintptr_t)slab_base) : 0);
    }

    // Use the per-slab class for freelist linkage.
    uint8_t cls = (meta->class_idx < TINY_NUM_CLASSES) ? meta->class_idx : 0;
    tiny_next_write(cls, ptr, prev); // Phase E1-CORRECT: Box API with shared pool
    meta->freelist = ptr;

    // FREELIST CORRUPTION DEBUG: confirm the next-pointer write landed.
    if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        void* readback = tiny_next_read(cls, ptr); // Phase E1-CORRECT: Box API
        if (readback != prev) {
            fprintf(stderr, "[FREE_CORRUPT] Wrote prev=%p to ptr=%p but read back %p!\n",
                    prev, ptr, readback);
            fprintf(stderr, "[FREE_CORRUPT] Memory corruption detected during freelist push\n");
            tiny_failfast_abort_ptr("free_local_readback", ss, slab_idx, ptr, "write_corrupted");
        }
    }

    tiny_failfast_log("free_local_box", cls, ss, meta, ptr, prev);

    // BUGFIX (pre-existing): release fence so other threads never observe the
    // decremented used count without also seeing the updated freelist.
    atomic_thread_fence(memory_order_release);

    // Optional freelist mask update on first push (ENV-gated).
    do {
        static int g_mask_en = -1;
        if (__builtin_expect(g_mask_en == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
            g_mask_en = (e && *e && *e != '0') ? 1 : 0;
        }
        // BUGFIX: shifting 1u by >= 32 is UB; the mask only covers slabs 0-31.
        if (__builtin_expect(g_mask_en, 0) && prev == NULL && slab_idx < 32) {
            uint32_t bit = (1u << slab_idx);
            atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release);
        }
    } while (0);

    // Track local free (debug helpers may be no-op).
    tiny_remote_track_on_local_free(ss, slab_idx, ptr, "local_free", my_tid);

    meta->used--;
    ss_active_dec_one(ss);

    // Phase 12-1.1: EMPTY slab detection (immediate reuse optimization).
    if (meta->used == 0) {
        // Slab became EMPTY → mark for highest-priority reuse.
        ss_mark_slab_empty(ss, slab_idx);

        // DEBUG LOGGING — track when used reaches 0 (ENV-gated).
        static int dbg = -1;
        if (__builtin_expect(dbg == -1, 0)) {
            const char* e = getenv("HAKMEM_SS_FREE_DEBUG");
            dbg = (e && *e && *e != '0') ? 1 : 0;
        }
        if (dbg == 1) {
            fprintf(stderr, "[FREE_LOCAL_BOX] EMPTY detected: cls=%u ss=%p slab=%d empty_mask=0x%x empty_count=%u\n",
                    cls, (void*)ss, slab_idx, ss->empty_mask, ss->empty_count);
        }
    }

    if (prev == NULL) {
        // First-free → advertise slab to adopters using the per-slab class.
        tiny_free_publish_first_free((int)cls, ss, slab_idx);
    }
}
|