288 lines
9.0 KiB
Markdown
288 lines
9.0 KiB
Markdown
|
|
# Larson Race Condition Diagnostic Patch
|
||
|
|
|
||
|
|
**Purpose**: Confirm the freelist race condition hypothesis before implementing full fix
|
||
|
|
|
||
|
|
## Quick Diagnostic (5 minutes)
|
||
|
|
|
||
|
|
Add logging to detect concurrent freelist access:
|
||
|
|
|
||
|
|
```bash
|
||
|
|
# Edit core/front/tiny_unified_cache.c
|
||
|
|
```
|
||
|
|
|
||
|
|
### Patch: Add Thread ID Logging
|
||
|
|
|
||
|
|
```diff
|
||
|
|
--- a/core/front/tiny_unified_cache.c
|
||
|
|
+++ b/core/front/tiny_unified_cache.c
|
||
|
|
@@ -8,6 +8,7 @@
|
||
|
|
#include "../box/pagefault_telemetry_box.h" // Phase 24: Box PageFaultTelemetry (Tiny page touch stats)
|
||
|
|
#include <stdlib.h>
|
||
|
|
#include <string.h>
|
||
|
|
+#include <pthread.h>
|
||
|
|
|
||
|
|
// Phase 23-E: Forward declarations
|
||
|
|
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c
|
||
|
|
@@ -166,8 +167,22 @@ void* unified_cache_refill(int class_idx) {
|
||
|
|
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
|
||
|
|
|
||
|
|
while (produced < room) {
|
||
|
|
if (m->freelist) {
|
||
|
|
+ // DIAGNOSTIC: Log thread + freelist state
|
||
|
|
+ static _Atomic uint64_t g_diag_count = 0;
|
||
|
|
+ uint64_t diag_n = atomic_fetch_add_explicit(&g_diag_count, 1, memory_order_relaxed);
|
||
|
|
+ if (diag_n < 100) { // First 100 pops only
|
||
|
|
+ fprintf(stderr, "[FREELIST_POP] T%lu cls=%d ss=%p slab=%d freelist=%p owner=%u\n",
|
||
|
|
+ (unsigned long)pthread_self(),
|
||
|
|
+ class_idx,
|
||
|
|
+ (void*)tls->ss,
|
||
|
|
+ tls->slab_idx,
|
||
|
|
+ m->freelist,
|
||
|
|
+ (unsigned)m->owner_tid_low);
|
||
|
|
+ fflush(stderr);
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
// Freelist pop
|
||
|
|
void* p = m->freelist;
|
||
|
|
m->freelist = tiny_next_read(class_idx, p);
|
||
|
|
```
|
||
|
|
|
||
|
|
### Build and Run
|
||
|
|
|
||
|
|
```bash
|
||
|
|
./build.sh larson_hakmem 2>&1 | tail -5
|
||
|
|
|
||
|
|
# Run with 4 threads (known to crash)
|
||
|
|
./out/release/larson_hakmem 4 4 500 10000 1000 12345 1 2>&1 | tee larson_diag.log
|
||
|
|
|
||
|
|
# Analyze results
|
||
|
|
grep FREELIST_POP larson_diag.log | head -50
|
||
|
|
```
|
||
|
|
|
||
|
|
### Expected Output (Race Confirmed)
|
||
|
|
|
||
|
|
If race exists, you'll see:
|
||
|
|
```
|
||
|
|
[FREELIST_POP] T140737353857856 cls=6 ss=0x76f899260800 slab=3 freelist=0x76f899261000 owner=42
|
||
|
|
[FREELIST_POP] T140737345465088 cls=6 ss=0x76f899260800 slab=3 freelist=0x76f899261000 owner=42
|
||
|
|
^^^^ SAME SS+SLAB+FREELIST ^^^^
|
||
|
|
```
|
||
|
|
|
||
|
|
**Key Evidence**:
|
||
|
|
- Different thread IDs (T140737353857856 vs T140737345465088)
|
||
|
|
- SAME SuperSlab pointer (`ss=0x76f899260800`)
|
||
|
|
- SAME slab index (`slab=3`)
|
||
|
|
- SAME freelist head (`freelist=0x76f899261000`)
|
||
|
|
- → **RACE CONFIRMED**: Two threads popping from same freelist simultaneously!
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
## Quick Workaround (30 minutes)
|
||
|
|
|
||
|
|
Force thread affinity by rejecting cross-thread access:
|
||
|
|
|
||
|
|
```diff
|
||
|
|
--- a/core/front/tiny_unified_cache.c
|
||
|
|
+++ b/core/front/tiny_unified_cache.c
|
||
|
|
@@ -137,6 +137,21 @@ void* unified_cache_refill(int class_idx) {
|
||
|
|
void* unified_cache_refill(int class_idx) {
|
||
|
|
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
|
||
|
|
|
||
|
|
+ // WORKAROUND: Ensure slab ownership (thread affinity)
|
||
|
|
+ if (tls->meta) {
|
||
|
|
+ uint8_t my_tid_low = (uint8_t)pthread_self();
|
||
|
|
+
|
||
|
|
+ // If slab has no owner, claim it
|
||
|
|
+ if (tls->meta->owner_tid_low == 0) {
|
||
|
|
+ tls->meta->owner_tid_low = my_tid_low;
|
||
|
|
+ }
|
||
|
|
+ // If slab owned by different thread, force refill to get new slab
|
||
|
|
+ else if (tls->meta->owner_tid_low != my_tid_low) {
|
||
|
|
+ tls->ss = NULL; // Trigger superslab_refill
|
||
|
|
+ }
|
||
|
|
+ }
|
||
|
|
+
|
||
|
|
// Step 1: Ensure SuperSlab available
|
||
|
|
if (!tls->ss) {
|
||
|
|
if (!superslab_refill(class_idx)) return NULL;
|
||
|
|
```
|
||
|
|
|
||
|
|
### Test Workaround
|
||
|
|
|
||
|
|
```bash
|
||
|
|
./build.sh larson_hakmem 2>&1 | tail -5
|
||
|
|
|
||
|
|
# Test with 4, 8, 10 threads
|
||
|
|
for threads in 4 8 10; do
|
||
|
|
echo "Testing $threads threads..."
|
||
|
|
timeout 30 ./out/release/larson_hakmem $threads $threads 500 10000 1000 12345 1
|
||
|
|
echo "Exit code: $?"
|
||
|
|
done
|
||
|
|
```
|
||
|
|
|
||
|
|
**Expected**: Larson should complete without SEGV (may be slower due to more refills)
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
## Proper Fix Preview (Option 1: Atomic Freelist)
|
||
|
|
|
||
|
|
### Step 1: Update TinySlabMeta
|
||
|
|
|
||
|
|
```diff
|
||
|
|
--- a/core/superslab/superslab_types.h
|
||
|
|
+++ b/core/superslab/superslab_types.h
|
||
|
|
@@ -10,8 +10,8 @@
|
||
|
|
// TinySlabMeta: per-slab metadata embedded in SuperSlab
|
||
|
|
typedef struct TinySlabMeta {
|
||
|
|
- void* freelist; // NULL = bump-only, non-NULL = freelist head
|
||
|
|
- uint16_t used; // blocks currently allocated from this slab
|
||
|
|
+ _Atomic uintptr_t freelist; // Atomic freelist head (was: void*)
|
||
|
|
+ _Atomic uint16_t used; // Atomic used count (was: uint16_t)
|
||
|
|
uint16_t capacity; // total blocks this slab can hold
|
||
|
|
uint8_t class_idx; // owning tiny class (Phase 12: per-slab)
|
||
|
|
uint8_t carved; // carve/owner flags
|
||
|
|
```
|
||
|
|
|
||
|
|
### Step 2: Update Freelist Operations
|
||
|
|
|
||
|
|
```diff
|
||
|
|
--- a/core/front/tiny_unified_cache.c
|
||
|
|
+++ b/core/front/tiny_unified_cache.c
|
||
|
|
@@ -168,9 +168,20 @@ void* unified_cache_refill(int class_idx) {
|
||
|
|
|
||
|
|
while (produced < room) {
|
||
|
|
- if (m->freelist) {
|
||
|
|
- void* p = m->freelist;
|
||
|
|
- m->freelist = tiny_next_read(class_idx, p);
|
||
|
|
+ // Atomic freelist pop (lock-free)
|
||
|
|
+ void* p = (void*)atomic_load_explicit(&m->freelist, memory_order_acquire);
|
||
|
|
+ while (p != NULL) {
|
||
|
|
+ void* next = tiny_next_read(class_idx, p);
|
||
|
|
+
|
||
|
|
+ // CAS: Only succeed if freelist unchanged
|
||
|
|
+ if (atomic_compare_exchange_weak_explicit(
|
||
|
|
+ &m->freelist, &p, (uintptr_t)next,
|
||
|
|
+ memory_order_release, memory_order_acquire)) {
|
||
|
|
+ // Successfully popped block
|
||
|
|
+ break;
|
||
|
|
+ }
|
||
|
|
+ // CAS failed → p was updated to current value, retry
|
||
|
|
+ }
|
||
|
|
+ if (p) {
|
||
|
|
|
||
|
|
// PageFaultTelemetry: record page touch for this BASE
|
||
|
|
pagefault_telemetry_touch(class_idx, p);
|
||
|
|
@@ -180,7 +191,7 @@ void* unified_cache_refill(int class_idx) {
|
||
|
|
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
|
||
|
|
#endif
|
||
|
|
|
||
|
|
- m->used++;
|
||
|
|
+ atomic_fetch_add_explicit(&m->used, 1, memory_order_relaxed);
|
||
|
|
out[produced++] = p;
|
||
|
|
|
||
|
|
} else if (m->carved < m->capacity) {
|
||
|
|
```
|
||
|
|
|
||
|
|
### Step 3: Update All Access Sites
|
||
|
|
|
||
|
|
**Files requiring atomic conversion** (estimated 20 high-priority sites):
|
||
|
|
1. `core/front/tiny_unified_cache.c` - freelist pop (DONE above)
|
||
|
|
2. `core/tiny_superslab_free.inc.h` - freelist push (same-thread free)
|
||
|
|
3. `core/tiny_superslab_alloc.inc.h` - freelist allocation
|
||
|
|
4. `core/box/carve_push_box.c` - batch operations
|
||
|
|
5. `core/slab_handle.h` - freelist traversal
|
||
|
|
|
||
|
|
**Grep pattern to find sites**:
|
||
|
|
```bash
|
||
|
|
grep -rn "->freelist" core/ --include="*.c" --include="*.h" | grep -v "\.d:" | grep -v "//" | wc -l
|
||
|
|
# Result: 87 sites (audit required)
|
||
|
|
```
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
## Testing Checklist
|
||
|
|
|
||
|
|
### Phase 1: Basic Functionality
|
||
|
|
- [ ] Single-threaded: `bench_random_mixed_hakmem 10000 256 42`
|
||
|
|
- [ ] C7 specific: `bench_random_mixed_hakmem 10000 1024 42`
|
||
|
|
- [ ] Fixed size: `bench_fixed_size_hakmem 10000 1024 128`
|
||
|
|
|
||
|
|
### Phase 2: Multi-Threading
|
||
|
|
- [ ] 2 threads: `larson_hakmem 2 2 100 1000 100 12345 1`
|
||
|
|
- [ ] 4 threads: `larson_hakmem 4 4 500 10000 1000 12345 1`
|
||
|
|
- [ ] 8 threads: `larson_hakmem 8 8 500 10000 1000 12345 1`
|
||
|
|
- [ ] 10 threads: `larson_hakmem 10 10 500 10000 1000 12345 1` (original params)
|
||
|
|
|
||
|
|
### Phase 3: Stress Test
|
||
|
|
```bash
|
||
|
|
# 100 iterations with random parameters
|
||
|
|
for i in {1..100}; do
|
||
|
|
threads=$((RANDOM % 16 + 2))
|
||
|
|
./out/release/larson_hakmem $threads $threads 500 10000 1000 $RANDOM 1 || {
|
||
|
|
echo "FAILED at iteration $i with $threads threads"
|
||
|
|
exit 1
|
||
|
|
}
|
||
|
|
done
|
||
|
|
echo "✅ All 100 iterations passed"
|
||
|
|
```
|
||
|
|
|
||
|
|
### Phase 4: Performance Regression
|
||
|
|
```bash
|
||
|
|
# Before fix
|
||
|
|
./out/release/larson_hakmem 2 2 100 1000 100 12345 1 | grep "Throughput ="
|
||
|
|
# Expected: ~24.6M ops/s
|
||
|
|
|
||
|
|
# After fix (should be similar, lock-free CAS is fast)
|
||
|
|
./out/release/larson_hakmem 2 2 100 1000 100 12345 1 | grep "Throughput ="
|
||
|
|
# Target: >= 20M ops/s (< 20% regression acceptable)
|
||
|
|
```
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
## Timeline Estimate
|
||
|
|
|
||
|
|
| Task | Time | Priority |
|
||
|
|
|------|------|----------|
|
||
|
|
| Apply diagnostic patch | 5 min | P0 |
|
||
|
|
| Verify race with logs | 10 min | P0 |
|
||
|
|
| Apply workaround patch | 30 min | P1 |
|
||
|
|
| Test workaround | 30 min | P1 |
|
||
|
|
| Implement atomic fix | 2-3 hrs | P2 |
|
||
|
|
| Audit all access sites | 3-4 hrs | P2 |
|
||
|
|
| Comprehensive testing | 1 hr | P2 |
|
||
|
|
| **Total (Full Fix)** | **7-9 hrs** | - |
|
||
|
|
| **Total (Workaround Only)** | **1-2 hrs** | - |
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
## Decision Matrix
|
||
|
|
|
||
|
|
### Use Workaround If:
|
||
|
|
- Need Larson working ASAP (< 2 hours)
|
||
|
|
- Can tolerate slight performance regression (~10-15%)
|
||
|
|
- Want minimal code changes (< 20 lines)
|
||
|
|
|
||
|
|
### Use Atomic Fix If:
|
||
|
|
- Need production-quality solution
|
||
|
|
- Performance is critical (lock-free = optimal)
|
||
|
|
- Have time for thorough audit (7-9 hours)
|
||
|
|
|
||
|
|
### Use Per-Slab Mutex If:
|
||
|
|
- Want guaranteed correctness
|
||
|
|
- Performance less critical than safety
|
||
|
|
- Prefer simple, auditable code
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
## Recommendation
|
||
|
|
|
||
|
|
**Immediate (Today)**: Apply workaround patch to unblock Larson testing
|
||
|
|
**Short-term (This Week)**: Implement atomic fix with careful audit
|
||
|
|
**Long-term (Next Release)**: Consider architectural fix (slab affinity) for optimal performance
|
||
|
|
|
||
|
|
---
|
||
|
|
|
||
|
|
## Contact for Questions
|
||
|
|
|
||
|
|
See `LARSON_CRASH_ROOT_CAUSE_REPORT.md` for detailed analysis.
|