diff --git a/core/box/tls_sll_drain_box.h b/core/box/tls_sll_drain_box.h index a8a2da7b..52e79c4c 100644 --- a/core/box/tls_sll_drain_box.h +++ b/core/box/tls_sll_drain_box.h @@ -146,23 +146,35 @@ static inline uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size) { // Resolve SuperSlab/Slab (like slow path does) SuperSlab* ss = hak_super_lookup(base); if (!ss || ss->magic != SUPERSLAB_MAGIC) { - // Invalid SuperSlab - skip this block + // CRITICAL FIX: Don't leak pointers when SuperSlab lookup fails! + // Problem: Pointer was popped from TLS SLL but not returned anywhere → leak + potential double-alloc + // Solution: Push back to TLS SLL (will retry on next drain cycle) if (g_debug) { - fprintf(stderr, "[TLS_SLL_DRAIN] SKIP: class=%d base=%p (invalid SuperSlab)\n", + fprintf(stderr, "[TLS_SLL_DRAIN] PUSHBACK: class=%d base=%p (invalid SuperSlab, will retry)\n", class_idx, base); } - continue; + // Push back to TLS SLL head (retry later when SS becomes valid) + extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES]; + tiny_next_write(class_idx, base, g_tls_sll[class_idx].head); + g_tls_sll[class_idx].head = base; + g_tls_sll[class_idx].count++; // Restore count + break; // Stop draining this class for now (avoid infinite retry loop) } // Get slab index int slab_idx = slab_index_for(ss, base); if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { - // Invalid slab index - skip this block + // CRITICAL FIX: Don't leak pointers when slab index is invalid! if (g_debug) { - fprintf(stderr, "[TLS_SLL_DRAIN] SKIP: class=%d base=%p (invalid slab_idx=%d)\n", + fprintf(stderr, "[TLS_SLL_DRAIN] PUSHBACK: class=%d base=%p (invalid slab_idx=%d, will retry)\n", class_idx, base, slab_idx); } - continue; + // Push back to TLS SLL head (retry later) + extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES]; + tiny_next_write(class_idx, base, g_tls_sll[class_idx].head); + g_tls_sll[class_idx].head = base; + g_tls_sll[class_idx].count++; + break; } // Get slab metadata diff --git a/docs/analysis/LARSON_DOUBLE_FREE_INVESTIGATION.md b/docs/analysis/LARSON_DOUBLE_FREE_INVESTIGATION.md new file mode 100644 index 00000000..12a280f9 --- /dev/null +++ b/docs/analysis/LARSON_DOUBLE_FREE_INVESTIGATION.md @@ -0,0 +1,143 @@ +# Larson Double-Free Investigation Report + +## Date: 2025-11-27 + +## Summary + +Larson benchmark crashes with TLS_SLL_PUSH_DUP error (double-free detection). Investigation reveals potential metadata inconsistency causing same pointer to be allocated twice without proper free. + +## Symptoms + +``` +[TLS_SLL_PUSH_DUP] cls=1 ptr=0x76e109240430 +last_push_from=hak_tiny_free_fast_v2 +last_pop_from=(null) ← Never popped from TLS SLL! +where=hak_tiny_free_fast_v2 +``` + +**Key Observation**: Pointer was pushed to TLS SLL but never popped, yet being freed again. + +## Root Cause Analysis + +### Eliminated Hypotheses + +1. ❌ **Larson Benchmark Bug**: ChatGPT analyzed larson.cpp - no double-free logic found +2. ❌ **Cross-Thread Free**: LARSON_FIX=1 doesn't prevent the crash +3. ❌ **Stale Header**: Fixed freelist header write (commit e4868bf23) but crash persists + +### Current Leading Hypothesis: Metadata Inconsistency + +**Scenario**: +1. User: `free(P)` → P pushed to TLS SLL (count++) +2. **Without pop**: P somehow reallocated from slab freelist or carve +3. User: `p2 = malloc()` → Returns P (same address!) +4. User: `free(p2)` → Tries to push P to TLS SLL again +5. Duplicate detection: P already in TLS SLL → ABORT + +**This requires**: +- `meta->used` count mismatch +- P in both TLS SLL AND slab freelist simultaneously +- Synchronization failure between TLS SLL and slab metadata + +## Evidence + +### TLS SLL Pop Logic (Suspicious) +File: `core/box/tls_sll_box.h:570-572` +```c +if (g_tls_sll[class_idx].count > 0) { + g_tls_sll[class_idx].count--; // Conditional decrement! +} +``` +If count somehow becomes 0, head is updated but count doesn't decrement! + +### TLS Drain Leak (Memory Leak Bug) +File: `core/box/tls_sll_drain_box.h:148-154` +```c +SuperSlab* ss = hak_super_lookup(base); +if (!ss || ss->magic != SUPERSLAB_MAGIC) { + fprintf(stderr, "[TLS_SLL_DRAIN] SKIP: ...\n"); + continue; // ← Pointer DROPPED without returning to freelist! +} +``` +**Critical**: If SuperSlab lookup fails, pointer is popped but never returned → memory leak. + +## Fixes Implemented + +### 1. Freelist Header Write (commit e4868bf23) +File: `core/tiny_superslab_alloc.inc.h:159-169` + +**Problem**: Freelist allocation path didn't write headers +```c +// OLD (buggy) +return block; // Returns BASE without header + +// NEW (fixed) +void* user = tiny_region_id_write_header(block, meta->class_idx); +return user; +``` + +**Impact**: Prevents stale headers, but doesn't fix double-free. + +### 2. Abort on Duplicate (commit e4868bf23) +File: `core/box/tls_sll_box.h:381` + +**Change**: `return true` → `abort()` for diagnostic backtrace + +**Impact**: Enables precise root cause identification. + +## Next Steps + +### Priority 1: Verify Metadata Consistency + +Add assertions to check: +1. Pointer is in ONLY ONE location at a time: + - TLS SLL + - Slab freelist + - Not both! + +2. `meta->used` count matches reality + +### Priority 2: Fix TLS Drain Leak + +File: `core/box/tls_sll_drain_box.h:154` + +**Current (buggy)**: +```c +if (!ss || ss->magic != SUPERSLAB_MAGIC) { + continue; // ← LEAK! +} +``` + +**Proposed Fix**: +```c +if (!ss || ss->magic != SUPERSLAB_MAGIC) { + // Option A: Push back to TLS SLL (retry later) + tls_sll_push(class_idx, base, UINT32_MAX); + break; // Stop draining this class for now + + // Option B: Force to remote queue (if lookup fails, assume remote) + // (requires more analysis) +} +``` + +### Priority 3: Enhanced Tracing + +Add debug logging to track pointer lifecycle: +1. Malloc: "P allocated from source X" +2. Free: "P freed to TLS SLL" +3. Pop: "P popped from TLS SLL" +4. Drain: "P drained to freelist" +5. Re-alloc: "P reallocated from freelist" + +ENV: `HAKMEM_TINY_PTR_TRACE=
` to track specific pointer. + +## Crash Rate + +**Before fixes**: 47% (14/30 runs crash) +**After header fix**: 100% (still crashes, just faster detection) + +## References + +- Commit: e4868bf23 "Larson crash investigation: Add freelist header write + abort()" +- Previous investigation: Task agent Phase 2 (identified TLS_SLL_PUSH_DUP pattern) +- Larson benchmark analysis: ChatGPT confirmed no user-code bug