// tiny_fastcache.h - Ultra-Simple Tiny Fast Path (System tcache style)
// Phase 6-3: Bypass Magazine/SuperSlab for Tiny allocations (<=128B)
// Goal: 3-4 instruction fast path, 70-80% of System tcache performance
#pragma once
#include <stdint.h>
#include <stddef.h>
#include <string.h>
// ========== Configuration ==========
// Enable Tiny Fast Path (default: ON for Phase 6-3)
#ifndef HAKMEM_TINY_FAST_PATH
#define HAKMEM_TINY_FAST_PATH 1
#endif
// Tiny size classes: 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128 (11 classes in use)
// Per-class arrays below are sized to TINY_FAST_CLASS_COUNT (16), leaving spare slots
#define TINY_FAST_CLASS_COUNT 16
// Fast cache capacity per class (default: 64 slots, like System tcache)
#ifndef TINY_FAST_CACHE_CAP
#define TINY_FAST_CACHE_CAP 64
#endif
// Tiny size threshold (<=128B goes to fast path)
#define TINY_FAST_THRESHOLD 128
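// Illustrative note: the #ifndef guards above allow build-time overrides, e.g.
//   cc -O2 -DTINY_FAST_CACHE_CAP=128 -c tiny_fastcache.c
// (128 here is a hypothetical tuning value; HAKMEM_TINY_FAST_PATH is presumably
//  tested with #if at the call sites, which live outside this header)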
// ========== TLS Cache (System tcache style) ==========
// Per-thread fast cache: array of freelist heads (defined in tiny_fastcache.c)
extern __thread void* g_tiny_fast_cache[TINY_FAST_CLASS_COUNT];
// Per-thread cache counts (for capacity management)
extern __thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
// Initialized flag
extern __thread int g_tiny_fast_initialized;
// ============================================================================
// Phase 6-7 commit note (2025-11-05): Dual Free Lists (Phase 2) - Mixed results
//
// Implementation: separate alloc/free paths to reduce cache line bouncing
// (mimalloc's strategy).
//
// Changes:
//   1. Added g_tiny_fast_free_head[] - separate free staging area
//   2. tiny_fast_alloc()  - lazy migration from free_head
//   3. tiny_fast_free()   - push to free_head (separate cache line)
//   4. tiny_fast_drain()  - drain from free_head
//
// Key design (inspired by mimalloc):
//   - alloc_head: hot allocation path (g_tiny_fast_cache)
//   - free_head:  staging area for local frees (g_tiny_fast_free_head)
//   - Migration:  pointer swap when alloc_head is empty (zero-cost batching)
//   - Benefit:    alloc and free touch different cache lines, reducing bouncing
//
// Results (Larson, 2s, 8-128B, 1024 objects):
//   - Phase 3 baseline: ST 0.474M, MT 1.712M ops/s
//   - Phase 2:          ST 0.600M, MT 1.624M ops/s
//   - Change:           +27% ST, -5% MT
//
// Analysis (mixed results):
//   + Single-thread +27%: better cache locality with alloc/free separated;
//     no contention, so the memory access pattern win dominates.
//   - Multi-thread -5% (expected +30-50%): the migration logic adds branches,
//     the dual arrays enlarge the TLS footprint (more cache misses?), the
//     pointer swap costs on the migration path, and the scheme may not match
//     Larson's specific access pattern.
//
// Comparison to system malloc (MT): 1.624M vs ~7.2M ops/s - still 4.4x slower.
//
// Key insights:
//   1. mimalloc's dual free lists mainly help with cross-thread frees.
//   2. Larson may be mostly same-thread frees, so the benefit is small.
//   3. Migration overhead outweighs the cache line bouncing reduction here.
//   4. The ST improvement shows memory locality matters.
//   5. Need to profile the actual malloc/free patterns in Larson.
//
// Why mimalloc succeeds where HAKMEM does not (yet): mimalloc has a
// sophisticated remote free queue (lock-free MPSC); HAKMEM's simple dual
// lists do not handle cross-thread frees well, and Larson's workload may
// differ from mimalloc's target benchmarks.
//
// Next considerations:
//   - Verify Larson's same-thread vs cross-thread free ratio.
//   - Consider combining all 3 phases (possible synergy).
//   - Profile with actual counters (malloc vs free hotspots).
//   - May need a fundamentally different approach.
// ============================================================================
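// ----------------------------------------------------------------------------
// Illustrative sketch (NOT part of the HAKMEM API): the note above contrasts
// this header's plain thread-local lists with mimalloc's lock-free MPSC
// remote-free queue for cross-thread frees. A minimal push of that style,
// using GCC __atomic builtins, might look like the following; `remote_head`
// is a hypothetical shared (per-heap) list head, not a variable defined here.
// ----------------------------------------------------------------------------
static inline void tiny_remote_free_push_sketch(void** remote_head, void* block) {
    // Multi-producer push: link the block to the current head, then CAS it in.
    void* old_head = __atomic_load_n(remote_head, __ATOMIC_RELAXED);
    do {
        *(void**)block = old_head; // first word of the block links to the old head
    } while (!__atomic_compare_exchange_n(remote_head, &old_head, block,
                                          1 /* weak */,
                                          __ATOMIC_RELEASE, __ATOMIC_RELAXED));
    // The owning thread (single consumer) would later detach the whole list with
    // one __atomic_exchange_n(remote_head, NULL, ...) and splice it into its
    // local free_head - the batching the note above attributes to mimalloc.
}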
// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
// Separate free staging area to reduce cache line bouncing
extern __thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
extern __thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];
// ========== Size to Class Mapping ==========
// Inline size-to-class for fast path (O(1) lookup table)
static inline int tiny_fast_size_to_class(size_t size) {
    // Optimized: Lookup table for O(1) mapping (vs 11-branch linear search)
    // Indexed by the request size rounded up to a multiple of 8: idx = (size + 7) >> 3
    // Class mapping: 0:16B, 1:24B, 2:32B, 3:40B, 4:48B, 5:56B, 6:64B, 7:80B, 8:96B, 9:112B, 10:128B
    static const int8_t size_to_class_lut[17] = {
        0,   // idx 0:  size 0        → 16B (class 0)
        0,   // idx 1:  size 1-8      → 16B (class 0)
        0,   // idx 2:  size 9-16     → 16B (class 0)
        1,   // idx 3:  size 17-24    → 24B (class 1)
        2,   // idx 4:  size 25-32    → 32B (class 2)
        3,   // idx 5:  size 33-40    → 40B (class 3)
        4,   // idx 6:  size 41-48    → 48B (class 4)
        5,   // idx 7:  size 49-56    → 56B (class 5)
        6,   // idx 8:  size 57-64    → 64B (class 6)
        7,   // idx 9:  size 65-72    → 80B (class 7)
        7,   // idx 10: size 73-80    → 80B (class 7)
        8,   // idx 11: size 81-88    → 96B (class 8)
        8,   // idx 12: size 89-96    → 96B (class 8)
        9,   // idx 13: size 97-104   → 112B (class 9)
        9,   // idx 14: size 105-112  → 112B (class 9)
        10,  // idx 15: size 113-120  → 128B (class 10)
        10   // idx 16: size 121-128  → 128B (class 10)
    };
    if (__builtin_expect(size > TINY_FAST_THRESHOLD, 0)) return -1; // Not tiny
    // Fast path: round up to the next 8B boundary and look up the class directly
    unsigned int idx = (unsigned int)((size + 7) >> 3); // 0..16 after the guard above
    return size_to_class_lut[idx];
}
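// Illustrative boundary self-check for the table above (could equally live in a
// test file); returns 1 when the mapping matches the documented size classes.
static inline int tiny_fast_size_to_class_selfcheck(void) {
    return tiny_fast_size_to_class(1)   == 0   // smallest request → 16B class
        && tiny_fast_size_to_class(16)  == 0   // exact 16B boundary stays in class 0
        && tiny_fast_size_to_class(17)  == 1   // first size that needs 24B
        && tiny_fast_size_to_class(64)  == 6
        && tiny_fast_size_to_class(65)  == 7   // 65-80B round up to the 80B class
        && tiny_fast_size_to_class(128) == 10  // largest tiny size
        && tiny_fast_size_to_class(129) == -1; // above TINY_FAST_THRESHOLD → not tiny
}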
// ========== Forward Declarations ==========
// Slow path: refill from Magazine/SuperSlab (implemented in tiny_fastcache.c)
void* tiny_fast_refill(int class_idx);
void tiny_fast_drain(int class_idx);
// ========== Fast Path: Alloc (3-4 instructions!) ==========
static inline void* tiny_fast_alloc(size_t size) {
    // Step 1: Size to class (1-2 instructions, branch predictor friendly)
    int cls = tiny_fast_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) return NULL; // Not tiny (rare)
    // Step 2: Pop from alloc_head (hot allocation path)
    void* ptr = g_tiny_fast_cache[cls];
    if (__builtin_expect(ptr != NULL, 1)) {
        // Fast path: Pop head, decrement count
        g_tiny_fast_cache[cls] = *(void**)ptr;
        g_tiny_fast_count[cls]--;
        return ptr;
    }
    // ========================================================================
    // Phase 6-7: Step 2.5: Lazy Migration from free_head (Phase 2)
    // If alloc_head is empty but free_head has blocks, migrate with a pointer swap
    // This is mimalloc's key optimization: batched migration, zero overhead
    // ========================================================================
    if (__builtin_expect(g_tiny_fast_free_head[cls] != NULL, 0)) {
        // Migrate entire free_head → alloc_head (pointer swap, instant!)
        g_tiny_fast_cache[cls] = g_tiny_fast_free_head[cls];
        g_tiny_fast_count[cls] = g_tiny_fast_free_count[cls];
        g_tiny_fast_free_head[cls] = NULL;
        g_tiny_fast_free_count[cls] = 0;
        // Now pop one from the newly migrated list
        ptr = g_tiny_fast_cache[cls];
        g_tiny_fast_cache[cls] = *(void**)ptr;
        g_tiny_fast_count[cls]--;
        return ptr;
    }
    // Step 3: Slow path - refill from Magazine/SuperSlab
    return tiny_fast_refill(cls);
}
// ========== Fast Path: Free (2-3 instructions!) ==========
static inline void tiny_fast_free(void* ptr, size_t size) {
    // Step 1: Size to class
    int cls = tiny_fast_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) return; // Not tiny (error)
    // ========================================================================
    // Phase 6-7: Push to free_head (Phase 2)
    // Separate free staging area reduces cache line contention with alloc_head
    // mimalloc's key insight: alloc/free touch different cache lines
    // ========================================================================
    // Step 2: Check free_head capacity
    if (__builtin_expect(g_tiny_fast_free_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
        // Free cache full - drain to Magazine/SuperSlab
        tiny_fast_drain(cls);
    }
    // Step 3: Push to free_head (separate cache line from alloc_head!)
    *(void**)ptr = g_tiny_fast_free_head[cls];
    g_tiny_fast_free_head[cls] = ptr;
    g_tiny_fast_free_count[cls]++;
}
// ========== Initialization ==========
static inline void tiny_fast_init(void) {
    if (g_tiny_fast_initialized) return;
    memset(g_tiny_fast_cache, 0, sizeof(g_tiny_fast_cache));
    memset(g_tiny_fast_count, 0, sizeof(g_tiny_fast_count));
    // Phase 6-7: Initialize dual free lists (Phase 2)
    memset(g_tiny_fast_free_head, 0, sizeof(g_tiny_fast_free_head));
    memset(g_tiny_fast_free_count, 0, sizeof(g_tiny_fast_free_count));
    g_tiny_fast_initialized = 1;
}
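// ----------------------------------------------------------------------------
// Minimal usage sketch (illustrative only, not part of the public API).
// The fast path is size-keyed, so the caller is assumed to know the request
// size at free time; a NULL return from tiny_fast_alloc() (non-tiny size or a
// failed refill) means the caller should fall back to its regular slow path.
// ----------------------------------------------------------------------------
static inline void* tiny_fast_example_alloc(size_t n) {
    tiny_fast_init();              // idempotent per-thread setup
    return tiny_fast_alloc(n);     // NULL → caller falls back to the slow path
}
static inline void tiny_fast_example_free(void* p, size_t n) {
    if (p != NULL) tiny_fast_free(p, n);   // n must be the original request size
}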