diff --git a/core/tiny_fastcache.c b/core/tiny_fastcache.c
index 0d06a522..813201e8 100644
--- a/core/tiny_fastcache.c
+++ b/core/tiny_fastcache.c
@@ -14,6 +14,13 @@ __thread void* g_tiny_fast_cache[TINY_FAST_CLASS_COUNT];
 __thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
 __thread int g_tiny_fast_initialized = 0;
 
+// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
+// Inspired by mimalloc's local/remote split design
+// Separate alloc/free paths to reduce cache line bouncing
+
+__thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];     // Free staging area
+__thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT]; // Free count
+
 // ========== External References ==========
 // External references to existing Tiny infrastructure (from hakmem_tiny.c)
 
@@ -108,7 +115,12 @@ void tiny_fast_drain(int class_idx) {
 
     g_tiny_fast_drain_count++;
 
-    // Drain half of the cache to Magazine/SuperSlab
+    // ========================================================================
+    // Phase 6-7: Drain from free_head (Phase 2)
+    // Since frees go to free_head, drain from there when capacity exceeded
+    // ========================================================================
+
+    // Drain half of the free_head to Magazine/SuperSlab
     // TODO: For now, we just reduce the count limit
     // In a full implementation, we'd push blocks back to Magazine freelist
 
@@ -116,12 +128,12 @@ void tiny_fast_drain(int class_idx) {
     // A full implementation would return blocks to SuperSlab freelist
     uint32_t target = TINY_FAST_CACHE_CAP / 2;
 
-    while (g_tiny_fast_count[class_idx] > target) {
-        void* ptr = g_tiny_fast_cache[class_idx];
+    while (g_tiny_fast_free_count[class_idx] > target) {
+        void* ptr = g_tiny_fast_free_head[class_idx];
         if (!ptr) break;
 
-        g_tiny_fast_cache[class_idx] = *(void**)ptr;
-        g_tiny_fast_count[class_idx]--;
+        g_tiny_fast_free_head[class_idx] = *(void**)ptr;
+        g_tiny_fast_free_count[class_idx]--;
 
         // TODO: Return to Magazine/SuperSlab
         // For now, we'll just re-push it (no-op, but prevents loss)
diff --git a/core/tiny_fastcache.h b/core/tiny_fastcache.h
index 24970398..8e21768a 100644
--- a/core/tiny_fastcache.h
+++ b/core/tiny_fastcache.h
@@ -36,6 +36,12 @@ extern __thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
 // Initialized flag
 extern __thread int g_tiny_fast_initialized;
 
+// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
+// Separate free staging area to reduce cache line bouncing
+
+extern __thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
+extern __thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];
+
 // ========== Size to Class Mapping ==========
 // Inline size-to-class for fast path (O(1) lookup table)
 
@@ -89,7 +95,7 @@ static inline void* tiny_fast_alloc(size_t size) {
     int cls = tiny_fast_size_to_class(size);
     if (__builtin_expect(cls < 0, 0)) return NULL;  // Not tiny (rare)
 
-    // Step 2: Pop from TLS cache (2-3 instructions)
+    // Step 2: Pop from alloc_head (hot allocation path)
     void* ptr = g_tiny_fast_cache[cls];
     if (__builtin_expect(ptr != NULL, 1)) {
         // Fast path: Pop head, decrement count
@@ -98,6 +104,25 @@
         g_tiny_fast_cache[cls] = *(void**)ptr;
         g_tiny_fast_count[cls]--;
         return ptr;
     }
 
+    // ========================================================================
+    // Phase 6-7: Step 2.5: Lazy Migration from free_head (Phase 2)
+    // If alloc_head empty but free_head has blocks, migrate with pointer swap
+    // This is mimalloc's key optimization: batched migration, zero overhead
+    // ========================================================================
+    if (__builtin_expect(g_tiny_fast_free_head[cls] != NULL, 0)) {
+        // Migrate entire free_head → alloc_head (pointer swap, instant!)
+        g_tiny_fast_cache[cls] = g_tiny_fast_free_head[cls];
+        g_tiny_fast_count[cls] = g_tiny_fast_free_count[cls];
+        g_tiny_fast_free_head[cls] = NULL;
+        g_tiny_fast_free_count[cls] = 0;
+
+        // Now pop one from newly migrated list
+        ptr = g_tiny_fast_cache[cls];
+        g_tiny_fast_cache[cls] = *(void**)ptr;
+        g_tiny_fast_count[cls]--;
+        return ptr;
+    }
+
     // Step 3: Slow path - refill from Magazine/SuperSlab
     return tiny_fast_refill(cls);
 }
 
@@ -109,16 +134,22 @@ static inline void tiny_fast_free(void* ptr, size_t size) {
     int cls = tiny_fast_size_to_class(size);
     if (__builtin_expect(cls < 0, 0)) return;  // Not tiny (error)
 
-    // Step 2: Check capacity
-    if (__builtin_expect(g_tiny_fast_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
-        // Cache full - drain to Magazine/SuperSlab
+    // ========================================================================
+    // Phase 6-7: Push to free_head (Phase 2)
+    // Separate free staging area reduces cache line contention with alloc_head
+    // mimalloc's key insight: alloc/free touch different cache lines
+    // ========================================================================
+
+    // Step 2: Check free_head capacity
+    if (__builtin_expect(g_tiny_fast_free_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
+        // Free cache full - drain to Magazine/SuperSlab
         tiny_fast_drain(cls);
     }
 
-    // Step 3: Push to TLS cache (2 instructions)
-    *(void**)ptr = g_tiny_fast_cache[cls];
-    g_tiny_fast_cache[cls] = ptr;
-    g_tiny_fast_count[cls]++;
+    // Step 3: Push to free_head (separate cache line from alloc_head!)
+    *(void**)ptr = g_tiny_fast_free_head[cls];
+    g_tiny_fast_free_head[cls] = ptr;
+    g_tiny_fast_free_count[cls]++;
 }
 
 // ========== Initialization ==========
@@ -128,5 +159,10 @@ static inline void tiny_fast_init(void) {
     memset(g_tiny_fast_cache, 0, sizeof(g_tiny_fast_cache));
     memset(g_tiny_fast_count, 0, sizeof(g_tiny_fast_count));
+
+    // Phase 6-7: Initialize dual free lists (Phase 2)
+    memset(g_tiny_fast_free_head, 0, sizeof(g_tiny_fast_free_head));
+    memset(g_tiny_fast_free_count, 0, sizeof(g_tiny_fast_free_count));
+
     g_tiny_fast_initialized = 1;
 }
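Note for reviewers: the patch splits each thread-local size class into an alloc list (g_tiny_fast_cache, popped only by the allocation fast path) and a free staging list (g_tiny_fast_free_head, pushed only by the free path), and the allocator adopts the entire staged list with one pointer swap when the alloc list runs dry. The sketch below is a minimal, single-size-class illustration of that mechanism, not code from this repository: the demo_* names are hypothetical, malloc()/free() stand in for the Magazine/SuperSlab backing store, the alloc-side counter is omitted, and the drain policy is reduced to releasing a single block instead of draining half the list.

// Minimal standalone sketch of the dual free-list idea (illustrative only).
// Assumptions: one size class, 32-byte blocks, malloc()/free() as backing store.
// The demo_* identifiers are hypothetical and do not exist in tiny_fastcache.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_BLOCK_SIZE 32   // must be >= sizeof(void*) to hold the freelist link
#define DEMO_CAP        64   // capacity of the free staging list

static __thread void*    demo_alloc_head;  // popped by the allocation fast path
static __thread void*    demo_free_head;   // pushed by the free fast path
static __thread uint32_t demo_free_count;

static void* demo_alloc(void) {
    void* p = demo_alloc_head;
    if (p) {                               // fast path: pop the alloc list
        demo_alloc_head = *(void**)p;
        return p;
    }
    if (demo_free_head) {                  // lazy migration: adopt the staged list
        demo_alloc_head = demo_free_head;  // single pointer swap, no per-block work
        demo_free_head  = NULL;
        demo_free_count = 0;
        p = demo_alloc_head;               // then pop one block as usual
        demo_alloc_head = *(void**)p;
        return p;
    }
    return malloc(DEMO_BLOCK_SIZE);        // slow path: refill from backing allocator
}

static void demo_free(void* p) {
    if (demo_free_count >= DEMO_CAP) {     // "drain": release one block to backing store
        void* victim = demo_free_head;
        demo_free_head = *(void**)victim;
        demo_free_count--;
        free(victim);
    }
    *(void**)p = demo_free_head;           // push onto the free staging list
    demo_free_head = p;
    demo_free_count++;
}

int main(void) {
    void* a = demo_alloc();
    demo_free(a);
    void* b = demo_alloc();                // served by migrating the staged free list
    printf("block reused: %s\n", (a == b) ? "yes" : "no");
    free(b);
    return 0;
}

Because ordinary frees only write demo_free_head and hot allocations only read demo_alloc_head, the two paths touch separate data, and the cost of returning blocks to the allocation path is paid once per migrated batch rather than once per free, which is the effect the patch is after.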