Phase 6-7: Dual Free Lists (Phase 2) - Mixed results

Implementation:
Separate alloc/free paths to reduce cache line bouncing (mimalloc's strategy).

Changes:
1. Added g_tiny_fast_free_head[] - separate free staging area
2. Modified tiny_fast_alloc() - lazy migration from free_head
3. Modified tiny_fast_free() - push to free_head (separate cache line)
4. Modified tiny_fast_drain() - drain from free_head

Key design (inspired by mimalloc; see the condensed sketch after this list):
- alloc_head: Hot allocation path (g_tiny_fast_cache)
- free_head: Local frees staging (g_tiny_fast_free_head)
- Migration: Pointer swap when alloc_head empty (zero-cost batching)
- Benefit: alloc/free touch different cache lines → reduce bouncing
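A condensed sketch of this design, consolidating the TLS declarations and the migration branch from the diff below. The standalone helper name `tiny_fast_alloc_migrate()` is for illustration only; in the commit this logic is inlined inside `tiny_fast_alloc()`, and the `TINY_FAST_CLASS_COUNT` value shown is an assumption so the sketch compiles on its own:

```c
#include <stdint.h>

// Defined in the existing Tiny headers; placeholder value for a standalone sketch.
#ifndef TINY_FAST_CLASS_COUNT
#define TINY_FAST_CLASS_COUNT 8
#endif

// alloc_head: hot allocation path
__thread void*    g_tiny_fast_cache[TINY_FAST_CLASS_COUNT];
__thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
// free_head: local free staging area (kept apart from alloc_head)
__thread void*    g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
__thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];

// Lazy migration: when alloc_head is empty, adopt the entire staged free list
// with a pointer swap instead of moving blocks one at a time, then pop one.
static inline void* tiny_fast_alloc_migrate(int cls) {
    if (g_tiny_fast_free_head[cls] == NULL) return NULL;   // nothing staged
    g_tiny_fast_cache[cls]      = g_tiny_fast_free_head[cls];
    g_tiny_fast_count[cls]      = g_tiny_fast_free_count[cls];
    g_tiny_fast_free_head[cls]  = NULL;
    g_tiny_fast_free_count[cls] = 0;

    void* ptr = g_tiny_fast_cache[cls];       // pop head for the caller
    g_tiny_fast_cache[cls] = *(void**)ptr;    // first word of block = next link
    g_tiny_fast_count[cls]--;
    return ptr;
}
```

The matching free path (see the diff) pushes onto g_tiny_fast_free_head instead of g_tiny_fast_cache, which is what keeps the allocation and free paths off each other's data.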

Results (Larson 2s 8-128B 1024):
- Phase 3 baseline: ST 0.474M, MT 1.712M ops/s
- Phase 2: ST 0.600M, MT 1.624M ops/s
- Change: **+27% ST, -5% MT** ⚠️
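  (Derivation: 0.600 / 0.474 ≈ 1.27 → +27% ST; 1.624 / 1.712 ≈ 0.95 → -5% MT.)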

Analysis - Mixed results:
- Single-thread: +27% improvement
  - Better cache locality (alloc/free separated)
  - No contention, pure memory-access-pattern win

- Multi-thread: -5% regression (expected +30-50%)
  - Migration logic overhead (extra branches)
  - Dual arrays increase TLS size → more cache misses?
  - Pointer-swap cost on the migration path
  - May not help in Larson's specific access pattern

Comparison to system malloc:
- Current: 1.624M ops/s (MT)
- System: ~7.2M ops/s (MT)
- **Gap: Still 4.4x slower**

Key insights:
1. mimalloc's dual free lists help with *cross-thread* frees
2. Larson may be mostly *same-thread* frees → less benefit
3. Migration overhead > cache line bouncing reduction
4. ST improvement shows memory locality matters
5. Need to profile actual malloc/free patterns in Larson

Why mimalloc succeeds but HAKMEM doesn't:
- mimalloc has a sophisticated remote free queue (lock-free MPSC); see the illustrative sketch after this list
- HAKMEM's simple dual lists don't handle cross-thread frees well
- Larson's workload may differ from mimalloc's target benchmarks
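For contrast, the general shape of a mimalloc-style remote free queue is roughly the following. This is a hedged illustration, not mimalloc's actual code and not part of this commit; the type and function names are made up:

```c
#include <stdatomic.h>
#include <stddef.h>

// Lock-free MPSC push: many producer threads push freed blocks onto the owning
// heap's remote list; only the owner thread ever pops/drains it.
typedef struct remote_free_list {
    _Atomic(void*) head;    // singly linked via the first word of each block
} remote_free_list_t;

static void remote_free_push(remote_free_list_t* rf, void* block) {
    void* old = atomic_load_explicit(&rf->head, memory_order_relaxed);
    do {
        *(void**)block = old;                        // link block -> current head
    } while (!atomic_compare_exchange_weak_explicit(
                 &rf->head, &old, block,
                 memory_order_release, memory_order_relaxed));
}

// Owner thread adopts the whole chain with a single atomic exchange.
static void* remote_free_take_all(remote_free_list_t* rf) {
    return atomic_exchange_explicit(&rf->head, NULL, memory_order_acquire);
}
```

The key difference from the plain `__thread` dual lists above: a remote thread can legally push onto this shared queue, whereas it can never touch another thread's TLS free_head, so cross-thread frees are exactly the case the Phase 2 design cannot capture.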

Next considerations:
- Verify Larson's same-thread vs cross-thread free ratio (see the instrumentation sketch after this list)
- Consider combining all 3 phases (may have synergy)
- Profile with actual counters (malloc vs free hotspots)
- May need fundamentally different approach
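A minimal sketch of how the first point could be measured, assuming the allocator can recover the owning thread from block metadata; `block_owner_tid()` and the counter names are hypothetical, not existing HAKMEM APIs:

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

// Hypothetical instrumentation: classify each free as same-thread or
// cross-thread. block_owner_tid() stands in for a metadata lookup that HAKMEM
// would have to implement (e.g. read the owner id stored in the slab header).
extern pthread_t block_owner_tid(void* ptr);

static _Atomic uint64_t g_same_thread_frees;
static _Atomic uint64_t g_cross_thread_frees;

static inline void count_free_origin(void* ptr) {
    if (pthread_equal(block_owner_tid(ptr), pthread_self()))
        atomic_fetch_add_explicit(&g_same_thread_frees, 1, memory_order_relaxed);
    else
        atomic_fetch_add_explicit(&g_cross_thread_frees, 1, memory_order_relaxed);
}
```

Dumping the two counters at exit would confirm or refute insight #2 (that Larson's frees are mostly same-thread) before investing in a remote-free queue.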

Author: Claude
Date: 2025-11-05 05:35:06 +00:00
Commit: 3429ed4457 (parent: e3514e7fa9)
2 changed files with 61 additions and 13 deletions

File 1 of 2 (TLS fast-cache implementation):

```diff
@@ -14,6 +14,13 @@ __thread void* g_tiny_fast_cache[TINY_FAST_CLASS_COUNT];
 __thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
 __thread int g_tiny_fast_initialized = 0;
 
+// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
+// Inspired by mimalloc's local/remote split design
+// Separate alloc/free paths to reduce cache line bouncing
+__thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];      // Free staging area
+__thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];  // Free count
+
 // ========== External References ==========
 // External references to existing Tiny infrastructure (from hakmem_tiny.c)
@@ -108,7 +115,12 @@ void tiny_fast_drain(int class_idx) {
     g_tiny_fast_drain_count++;
 
-    // Drain half of the cache to Magazine/SuperSlab
+    // ========================================================================
+    // Phase 6-7: Drain from free_head (Phase 2)
+    // Since frees go to free_head, drain from there when capacity exceeded
+    // ========================================================================
+    // Drain half of the free_head to Magazine/SuperSlab
     // TODO: For now, we just reduce the count limit
     // In a full implementation, we'd push blocks back to Magazine freelist
@@ -116,12 +128,12 @@ void tiny_fast_drain(int class_idx) {
     // A full implementation would return blocks to SuperSlab freelist
     uint32_t target = TINY_FAST_CACHE_CAP / 2;
-    while (g_tiny_fast_count[class_idx] > target) {
-        void* ptr = g_tiny_fast_cache[class_idx];
+    while (g_tiny_fast_free_count[class_idx] > target) {
+        void* ptr = g_tiny_fast_free_head[class_idx];
         if (!ptr) break;
-        g_tiny_fast_cache[class_idx] = *(void**)ptr;
-        g_tiny_fast_count[class_idx]--;
+        g_tiny_fast_free_head[class_idx] = *(void**)ptr;
+        g_tiny_fast_free_count[class_idx]--;
         // TODO: Return to Magazine/SuperSlab
         // For now, we'll just re-push it (no-op, but prevents loss)
```

File 2 of 2 (inline fast-path header):

```diff
@@ -36,6 +36,12 @@ extern __thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
 // Initialized flag
 extern __thread int g_tiny_fast_initialized;
 
+// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
+// Separate free staging area to reduce cache line bouncing
+extern __thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
+extern __thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];
+
 // ========== Size to Class Mapping ==========
 // Inline size-to-class for fast path (O(1) lookup table)
@@ -89,7 +95,7 @@ static inline void* tiny_fast_alloc(size_t size) {
     int cls = tiny_fast_size_to_class(size);
     if (__builtin_expect(cls < 0, 0)) return NULL;  // Not tiny (rare)
 
-    // Step 2: Pop from TLS cache (2-3 instructions)
+    // Step 2: Pop from alloc_head (hot allocation path)
     void* ptr = g_tiny_fast_cache[cls];
     if (__builtin_expect(ptr != NULL, 1)) {
         // Fast path: Pop head, decrement count
@@ -98,6 +104,25 @@ static inline void* tiny_fast_alloc(size_t size) {
         return ptr;
     }
 
+    // ========================================================================
+    // Phase 6-7: Step 2.5: Lazy Migration from free_head (Phase 2)
+    // If alloc_head empty but free_head has blocks, migrate with pointer swap
+    // This is mimalloc's key optimization: batched migration, zero overhead
+    // ========================================================================
+    if (__builtin_expect(g_tiny_fast_free_head[cls] != NULL, 0)) {
+        // Migrate entire free_head → alloc_head (pointer swap, instant!)
+        g_tiny_fast_cache[cls] = g_tiny_fast_free_head[cls];
+        g_tiny_fast_count[cls] = g_tiny_fast_free_count[cls];
+        g_tiny_fast_free_head[cls] = NULL;
+        g_tiny_fast_free_count[cls] = 0;
+
+        // Now pop one from newly migrated list
+        ptr = g_tiny_fast_cache[cls];
+        g_tiny_fast_cache[cls] = *(void**)ptr;
+        g_tiny_fast_count[cls]--;
+        return ptr;
+    }
+
     // Step 3: Slow path - refill from Magazine/SuperSlab
     return tiny_fast_refill(cls);
 }
@@ -109,16 +134,22 @@ static inline void tiny_fast_free(void* ptr, size_t size) {
     int cls = tiny_fast_size_to_class(size);
     if (__builtin_expect(cls < 0, 0)) return;  // Not tiny (error)
 
-    // Step 2: Check capacity
-    if (__builtin_expect(g_tiny_fast_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
-        // Cache full - drain to Magazine/SuperSlab
+    // ========================================================================
+    // Phase 6-7: Push to free_head (Phase 2)
+    // Separate free staging area reduces cache line contention with alloc_head
+    // mimalloc's key insight: alloc/free touch different cache lines
+    // ========================================================================
+    // Step 2: Check free_head capacity
+    if (__builtin_expect(g_tiny_fast_free_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
+        // Free cache full - drain to Magazine/SuperSlab
        tiny_fast_drain(cls);
     }
 
-    // Step 3: Push to TLS cache (2 instructions)
-    *(void**)ptr = g_tiny_fast_cache[cls];
-    g_tiny_fast_cache[cls] = ptr;
-    g_tiny_fast_count[cls]++;
+    // Step 3: Push to free_head (separate cache line from alloc_head!)
+    *(void**)ptr = g_tiny_fast_free_head[cls];
+    g_tiny_fast_free_head[cls] = ptr;
+    g_tiny_fast_free_count[cls]++;
 }
 
 // ========== Initialization ==========
@@ -128,5 +159,10 @@ static inline void tiny_fast_init(void) {
     memset(g_tiny_fast_cache, 0, sizeof(g_tiny_fast_cache));
     memset(g_tiny_fast_count, 0, sizeof(g_tiny_fast_count));
 
+    // Phase 6-7: Initialize dual free lists (Phase 2)
+    memset(g_tiny_fast_free_head, 0, sizeof(g_tiny_fast_free_head));
+    memset(g_tiny_fast_free_count, 0, sizeof(g_tiny_fast_free_count));
+
     g_tiny_fast_initialized = 1;
 }
```