From 66a29783a467ac42e4b8d0d485609214c009d43f Mon Sep 17 00:00:00 2001
From: "Moe Charm (CI)"
Date: Fri, 21 Nov 2025 05:33:17 +0900
Subject: [PATCH] Phase 19-1: Quick Prune (Frontend SLIM mode) - Experimental
 implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Implementation

Added `HAKMEM_TINY_FRONT_SLIM=1` ENV gate to skip FastCache + SFC layers,
going straight to SLL (Singly-Linked List) for direct backend access.

### Code Changes

**File**: `core/tiny_alloc_fast.inc.h` (lines 201-230)

Added early return gate in `tiny_alloc_fast_pop()`:

```c
// Phase 19-1: Quick Prune (Frontend SLIM mode)
static __thread int g_front_slim_checked = 0;
static __thread int g_front_slim_enabled = 0;

if (!g_front_slim_checked) {
    const char* e = getenv("HAKMEM_TINY_FRONT_SLIM");
    g_front_slim_enabled = (e && *e && *e != '0') ? 1 : 0;
    g_front_slim_checked = 1;
}

if (g_front_slim_enabled) {
    // Skip FastCache + SFC, go straight to SLL
    extern int g_tls_sll_enable;
    if (g_tls_sll_enable) {
        void* base = NULL;
        if (tls_sll_pop(class_idx, &base)) {
            g_front_sll_hit[class_idx]++;
            return base;  // SLL hit (SLIM fast path)
        }
    }
    return NULL;  // SLL miss → caller refills
}
// else: Existing FC → SFC → SLL cascade (unchanged)
```

### Design Rationale

**Goal**: Skip unused frontend layers to reduce branch misprediction overhead
**Strategy**: Based on ChatGPT-sensei analysis showing FC/SFC hit rates near 0%
**Expected**: 22M → 27-30M ops/s (+22-36%)

**Features**:
- ✅ A/B testable via ENV (instant rollback: ENV=0)
- ✅ Existing code unchanged (backward compatible)
- ✅ TLS-cached enable check (amortized overhead)

---

## Performance Results

### Benchmark: Random Mixed 256B (1M iterations)

```
Baseline   (SLIM OFF): 23.2M, 23.7M, 23.2M ops/s (avg: 23.4M)
Phase 19-1 (SLIM ON):  22.8M, 22.8M, 23.7M ops/s (avg: 23.1M)

Difference: -1.3% (within noise, no improvement)
⚠️ Expected: +22-36% ← NOT achieved
```

### Stability Testing

- ✅ 100K short run: No SEGV, no crashes
- ✅ 1M iterations: Stable performance across 3 runs
- ✅ Functional correctness: All allocations successful

---

## Analysis: Why Quick Prune Failed

### Hypothesis 1: FC/SFC Overhead Already Minimal
- FC/SFC checks are branch-predicted (miss path well-optimized)
- Skipping these layers provides negligible cycle savings
- Premise of "0% hit rate" may not reflect actual benefit of having layers

### Hypothesis 2: ENV Check Overhead Cancels Gains
- One-time costs (`getenv()` call, `g_front_slim_checked` TLS init) are
  amortized over 1M iterations, so they cannot explain the result by themselves
- The per-allocation SLIM gate check (TLS load + branch) remains on every call
- Cost of that per-allocation gate check ≈ cost of the FC/SFC checks it skips

### Hypothesis 3: Incorrect Premise
- The "FC/SFC hit rate 0%" premise from the pre-work analysis may be wrong
- Layers may provide cache locality benefits even with low hit rate
- Removing layers disrupts cache line prefetching

---

## Conclusion & Next Steps

**Phase 19-1 Status**: ❌ Experimental - No performance improvement

**Key Learnings**:
1. Frontend layer pruning alone is insufficient
2. Branch prediction in existing code is already effective
3. Structural change (not just pruning) needed for significant gains

**Recommendation**: Proceed to Phase 19-2 (Front-V2 tcache single-layer)
- Phase 19-1 approach (pruning) = failed
- Phase 19-2 approach (structural redesign) = recommended
- Expected: 31ns → 15ns via tcache-style single TLS magazine

---

## ENV Usage

```bash
# Enable SLIM mode (experimental, no gain observed)
export HAKMEM_TINY_FRONT_SLIM=1
./bench_random_mixed_hakmem 1000000 256 42

# Disable SLIM mode (default, recommended)
unset HAKMEM_TINY_FRONT_SLIM
./bench_random_mixed_hakmem 1000000 256 42
```

---

## Files Modified

- `core/tiny_alloc_fast.inc.h` - Added Phase 19-1 Quick Prune gate

## Investigation Report

Task-sensei analysis documented entry point (`tiny_alloc_fast_pop()` line 176),
identified skip targets (FC: lines 208-220, SFC: lines 222-250), and confirmed
SLL as primary fast path (88-99% hit rate from prior analysis).
--- 📝 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Co-Authored-By: Task-sensei (tiny_alloc_fast.inc.h structure analysis) Co-Authored-By: ChatGPT (Phase 19 strategy design) --- core/tiny_alloc_fast.inc.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/core/tiny_alloc_fast.inc.h b/core/tiny_alloc_fast.inc.h index 75e6a653..2e7b17a6 100644 --- a/core/tiny_alloc_fast.inc.h +++ b/core/tiny_alloc_fast.inc.h @@ -198,6 +198,37 @@ static inline void* tiny_alloc_fast_pop(int class_idx) { } return NULL; #else + // ========== Phase 19-1: Quick Prune (Frontend SLIM mode) ========== + // ENV: HAKMEM_TINY_FRONT_SLIM=1 + // Goal: Skip FastCache + SFC layers, go straight to SLL (88-99% hit rate) + // Expected: 22M → 27-30M ops/s (+22-36%) + static __thread int g_front_slim_checked = 0; + static __thread int g_front_slim_enabled = 0; + + if (__builtin_expect(!g_front_slim_checked, 0)) { + const char* e = getenv("HAKMEM_TINY_FRONT_SLIM"); + g_front_slim_enabled = (e && *e && *e != '0') ? 1 : 0; + g_front_slim_checked = 1; + } + + // SLIM MODE: Skip FastCache + SFC, go straight to SLL + if (__builtin_expect(g_front_slim_enabled, 0)) { + // Box Boundary: TLS SLL freelist pop (only layer in SLIM mode) + extern int g_tls_sll_enable; + if (__builtin_expect(g_tls_sll_enable, 1)) { + void* base = NULL; + if (tls_sll_pop(class_idx, &base)) { + // Front Gate: SLL hit (SLIM fast path - 3 instructions) + extern unsigned long long g_front_sll_hit[]; + g_front_sll_hit[class_idx]++; + return base; + } + } + // SLIM mode miss → return NULL (caller refills) + return NULL; + } + // ========== End Phase 19-1: Quick Prune ========== + // Phase 7 Task 3: Profiling overhead removed in release builds // In release mode, compiler can completely eliminate profiling code #if !HAKMEM_BUILD_RELEASE