Phase 6-7: Dual Free Lists (Phase 2) - Mixed results

Implementation: Separate the alloc and free paths to reduce cache line bouncing (mimalloc's strategy).

Changes:
1. Added g_tiny_fast_free_head[] - separate free staging area
2. Modified tiny_fast_alloc() - lazy migration from free_head
3. Modified tiny_fast_free() - push to free_head (separate cache line)
4. Modified tiny_fast_drain() - drain from free_head

Key design (inspired by mimalloc):
- alloc_head: Hot allocation path (g_tiny_fast_cache)
- free_head: Local frees staging (g_tiny_fast_free_head)
- Migration: Pointer swap when alloc_head is empty (zero-cost batching)
- Benefit: alloc/free touch different cache lines → reduce bouncing

Results (Larson 2s 8-128B 1024):
- Phase 3 baseline: ST 0.474M, MT 1.712M ops/s
- Phase 2: ST 0.600M, MT 1.624M ops/s
- Change: **+27% ST, -5% MT** ⚠️

Analysis - Mixed results:

✅ Single-thread: +27% improvement
- Better cache locality (alloc/free separated)
- No contention, pure memory access pattern win

❌ Multi-thread: -5% regression (expected +30-50%)
- Migration logic overhead (extra branches)
- Dual arrays increase TLS size → more cache misses?
- Pointer swap cost on migration path
- May not help in Larson's specific access pattern

Comparison to system malloc:
- Current: 1.624M ops/s (MT)
- System: ~7.2M ops/s (MT)
- **Gap: Still 4.4x slower**

Key insights:
1. mimalloc's dual free lists help with *cross-thread* frees
2. Larson may be mostly *same-thread* frees → less benefit
3. Migration overhead > cache line bouncing reduction
4. ST improvement shows memory locality matters
5. Need to profile actual malloc/free patterns in Larson

Why mimalloc succeeds but HAKMEM doesn't:
- mimalloc has a sophisticated remote free queue (lock-free MPSC)
- HAKMEM's simple dual lists don't handle cross-thread frees well
- Larson's workload may differ from mimalloc's target benchmarks

Next considerations:
- Verify Larson's same-thread vs cross-thread free ratio
- Consider combining all 3 phases (may have synergy)
- Profile with actual counters (malloc vs free hotspots)
- May need a fundamentally different approach
2025-11-05 05:35:06 +00:00
parent e3514e7fa9
commit 3429ed4457
2 changed files with 61 additions and 13 deletions
--- a/core/tiny_fastcache.h
+++ b/core/tiny_fastcache.h
@ -36,6 +36,12 @@ extern __thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
 // Initialized flag
 extern __thread int g_tiny_fast_initialized;

+// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
+// Separate free staging area to reduce cache line bouncing
+
+extern __thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
+extern __thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];
+
 // ========== Size to Class Mapping ==========
 // Inline size-to-class for fast path (O(1) lookup table)

@ -89,7 +95,7 @@ static inline void* tiny_fast_alloc(size_t size) {
    int cls = tiny_fast_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) return NULL;  // Not tiny (rare)

-    // Step 2: Pop from TLS cache (2-3 instructions)
+    // Step 2: Pop from alloc_head (hot allocation path)
    void* ptr = g_tiny_fast_cache[cls];
    if (__builtin_expect(ptr != NULL, 1)) {
        // Fast path: Pop head, decrement count
@ -98,6 +104,25 @@ static inline void* tiny_fast_alloc(size_t size) {
        return ptr;
    }

+    // ========================================================================
+    // Phase 6-7: Step 2.5: Lazy Migration from free_head (Phase 2)
+    // If alloc_head empty but free_head has blocks, migrate with pointer swap
+    // This is mimalloc's key optimization: batched migration, zero overhead
+    // ========================================================================
+    if (__builtin_expect(g_tiny_fast_free_head[cls] != NULL, 0)) {
+        // Migrate entire free_head → alloc_head (pointer swap, instant!)
+        g_tiny_fast_cache[cls] = g_tiny_fast_free_head[cls];
+        g_tiny_fast_count[cls] = g_tiny_fast_free_count[cls];
+        g_tiny_fast_free_head[cls] = NULL;
+        g_tiny_fast_free_count[cls] = 0;
+
+        // Now pop one from newly migrated list
+        ptr = g_tiny_fast_cache[cls];
+        g_tiny_fast_cache[cls] = *(void**)ptr;
+        g_tiny_fast_count[cls]--;
+        return ptr;
+    }
+
    // Step 3: Slow path - refill from Magazine/SuperSlab
    return tiny_fast_refill(cls);
 }
@ -109,16 +134,22 @@ static inline void tiny_fast_free(void* ptr, size_t size) {
    int cls = tiny_fast_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) return;  // Not tiny (error)

-    // Step 2: Check capacity
-    if (__builtin_expect(g_tiny_fast_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
-        // Cache full - drain to Magazine/SuperSlab
+    // ========================================================================
+    // Phase 6-7: Push to free_head (Phase 2)
+    // Separate free staging area reduces cache line contention with alloc_head
+    // mimalloc's key insight: alloc/free touch different cache lines
+    // ========================================================================
+
+    // Step 2: Check free_head capacity
+    if (__builtin_expect(g_tiny_fast_free_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
+        // Free cache full - drain to Magazine/SuperSlab
        tiny_fast_drain(cls);
    }

-    // Step 3: Push to TLS cache (2 instructions)
-    *(void**)ptr = g_tiny_fast_cache[cls];
-    g_tiny_fast_cache[cls] = ptr;
-    g_tiny_fast_count[cls]++;
+    // Step 3: Push to free_head (separate cache line from alloc_head!)
+    *(void**)ptr = g_tiny_fast_free_head[cls];
+    g_tiny_fast_free_head[cls] = ptr;
+    g_tiny_fast_free_count[cls]++;
 }

 // ========== Initialization ==========
@ -128,5 +159,10 @@ static inline void tiny_fast_init(void) {

    memset(g_tiny_fast_cache, 0, sizeof(g_tiny_fast_cache));
    memset(g_tiny_fast_count, 0, sizeof(g_tiny_fast_count));
+
+    // Phase 6-7: Initialize dual free lists (Phase 2)
+    memset(g_tiny_fast_free_head, 0, sizeof(g_tiny_fast_free_head));
+    memset(g_tiny_fast_free_count, 0, sizeof(g_tiny_fast_free_count));
+
    g_tiny_fast_initialized = 1;
 }