Files
hakmem/core/tiny_fastcache.c
Claude 3429ed4457 Phase 6-7: Dual Free Lists (Phase 2) - Mixed results
Implementation:
Separate alloc/free paths to reduce cache line bouncing (mimalloc's strategy).

Changes:
1. Added g_tiny_fast_free_head[] - separate free staging area
2. Modified tiny_fast_alloc() - lazy migration from free_head
3. Modified tiny_fast_free() - push to free_head (separate cache line)
4. Modified tiny_fast_drain() - drain from free_head

Key design (inspired by mimalloc; sketched below):
- alloc_head: Hot allocation path (g_tiny_fast_cache)
- free_head: Staging area for local frees (g_tiny_fast_free_head)
- Migration: Pointer swap when alloc_head is empty (zero-cost batching)
- Benefit: alloc/free touch different cache lines → less bouncing
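
A minimal sketch of the two paths, assuming only the TLS arrays this commit adds
(simplified; the real tiny_fast_alloc/tiny_fast_free also enforce the cache cap
and fall back to refill/drain):

    // Sketch only, not the committed code.
    static inline void* sketch_fast_alloc(int cls) {
        void* p = g_tiny_fast_cache[cls];            // alloc_head: hot path
        if (!p) {
            // Lazy migration: adopt the whole free_head list with one
            // pointer swap instead of moving blocks one at a time.
            p = g_tiny_fast_free_head[cls];
            if (!p) return NULL;                     // both empty -> refill
            g_tiny_fast_free_head[cls]  = NULL;
            g_tiny_fast_count[cls]      = g_tiny_fast_free_count[cls];
            g_tiny_fast_free_count[cls] = 0;
        }
        g_tiny_fast_cache[cls] = *(void**)p;         // pop one block
        g_tiny_fast_count[cls]--;
        return p;
    }

    static inline void sketch_fast_free(int cls, void* p) {
        // Push onto free_head, a different cache line than alloc_head,
        // so the alloc and free paths stop bouncing the same line.
        *(void**)p = g_tiny_fast_free_head[cls];
        g_tiny_fast_free_head[cls] = p;
        g_tiny_fast_free_count[cls]++;
    }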

Results (Larson 2s 8-128B 1024):
- Phase 3 baseline: ST 0.474M, MT 1.712M ops/s
- Phase 2: ST 0.600M, MT 1.624M ops/s
- Change: **+27% ST, -5% MT** ⚠️

Analysis - Mixed results:
Single-thread: +27% improvement
   - Better cache locality (alloc/free separated)
   - No contention, pure memory access pattern win

Multi-thread: -5% regression (expected +30-50%)
   - Migration logic overhead (extra branches)
   - Dual arrays increase TLS size → more cache misses?
   - Pointer swap cost on migration path
   - May not help in Larson's specific access pattern

Comparison to system malloc:
- Current: 1.624M ops/s (MT)
- System: ~7.2M ops/s (MT)
- **Gap: Still 4.4x slower**

Key insights:
1. mimalloc's dual free lists help with *cross-thread* frees
2. Larson may be mostly *same-thread* frees → less benefit
3. Migration overhead > cache line bouncing reduction
4. ST improvement shows memory locality matters
5. Need to profile actual malloc/free patterns in Larson

Why mimalloc succeeds but HAKMEM doesn't:
- mimalloc has a sophisticated remote free queue (lock-free MPSC; sketched below)
- HAKMEM's simple dual lists don't handle cross-thread frees well
- Larson's workload may differ from mimalloc's target benchmarks
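
For reference, a minimal sketch of what a mimalloc-style lock-free MPSC
remote-free stack looks like (illustrative C11 atomics; not mimalloc's or
HAKMEM's actual code):

    #include <stdatomic.h>

    typedef struct { _Atomic(void*) head; } remote_stack;

    // Producers (any thread) push freed blocks with a CAS loop.
    static void remote_free_push(remote_stack* rs, void* block) {
        void* old = atomic_load_explicit(&rs->head, memory_order_relaxed);
        do {
            *(void**)block = old;  // link block in front of the current head
        } while (!atomic_compare_exchange_weak_explicit(
                     &rs->head, &old, block,
                     memory_order_release, memory_order_relaxed));
    }

    // The single consumer (owning thread) detaches the whole list at once
    // and then walks it without further synchronization.
    static void* remote_free_take_all(remote_stack* rs) {
        return atomic_exchange_explicit(&rs->head, NULL, memory_order_acquire);
    }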

Next considerations:
- Verify Larson's same-thread vs cross-thread free ratio (see sketch below)
- Consider combining all 3 phases (may have synergy)
- Profile with actual counters (malloc vs free hotspots)
- May need a fundamentally different approach
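
A hypothetical debug harness for the first item (nothing here exists in
HAKMEM): tag each allocation with its owner thread in a shadow table and
classify frees. Counts are approximate under hash collisions, and the
pthread_self() cast assumes an integral pthread_t as on glibc:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stddef.h>

    #define SHADOW_SLOTS (1u << 20)                  // power of two, ~8 MiB
    static _Atomic uintptr_t g_owner[SHADOW_SLOTS];  // block hash -> owner tid
    static _Atomic uint64_t g_same_frees, g_cross_frees;

    static inline size_t shadow_slot(void* p) {
        // Assumes blocks are at least 16-byte aligned (smallest tiny class).
        return ((uintptr_t)p >> 4) & (SHADOW_SLOTS - 1);
    }

    static void shadow_tag_alloc(void* p) {          // call from alloc path
        atomic_store(&g_owner[shadow_slot(p)], (uintptr_t)pthread_self());
    }

    static void shadow_classify_free(void* p) {      // call from free path
        uintptr_t owner = atomic_load(&g_owner[shadow_slot(p)]);
        if (owner == (uintptr_t)pthread_self())
            atomic_fetch_add(&g_same_frees, 1);
        else
            atomic_fetch_add(&g_cross_frees, 1);
    }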
2025-11-05 05:35:06 +00:00

// tiny_fastcache.c - Slow path for Tiny Fast Cache (refill/drain)
// Phase 6-3: Refill from Magazine/SuperSlab when fast cache misses

#include "tiny_fastcache.h"
#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include <stdio.h>
#include <stdlib.h>

// ========== TLS Cache Definitions ==========
// (Declared as extern in tiny_fastcache.h)
__thread void* g_tiny_fast_cache[TINY_FAST_CLASS_COUNT];
__thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
__thread int g_tiny_fast_initialized = 0;

// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
// Inspired by mimalloc's local/remote split design:
// separate alloc/free paths to reduce cache line bouncing.
__thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];     // Free staging area
__thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT]; // Free count

// ========== External References ==========
// Existing Tiny infrastructure (from hakmem_tiny.c)
extern __thread void* g_tls_sll_head[];
extern __thread uint32_t g_tls_sll_count[];
extern int g_use_superslab;

// From hakmem_tiny.c
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);

// ========== Batch Refill Configuration ==========
// How many blocks to refill per miss (batch amortization)
#ifndef TINY_FAST_REFILL_BATCH
#define TINY_FAST_REFILL_BATCH 16
#endif

// ========== Debug Counters ==========
static __thread uint64_t g_tiny_fast_refill_count = 0;
static __thread uint64_t g_tiny_fast_drain_count = 0;

// Forward declaration for atexit registration
void tiny_fast_print_stats(void);
// ========== Slow Path: Refill from Magazine/SuperSlab ==========
void* tiny_fast_refill(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_FAST_CLASS_COUNT) {
        return NULL;
    }
    g_tiny_fast_refill_count++;

    // Register stats printer on first refill (once per thread)
    static __thread int stats_registered = 0;
    if (!stats_registered) {
        atexit(tiny_fast_print_stats);
        stats_registered = 1;
    }
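    // NOTE: atexit() handlers run on the thread that calls exit(), so the
    // __thread counters printed there are that thread's values only.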
    // ========================================================================
    // Phase 6-6: Batch Refill Optimization (Phase 3)
    // Inspired by mimalloc's page-based refill and glibc's tcache batch refill
    //
    // OLD: 16 individual allocations + 16 individual pushes (16 × 100 cycles = 1,600 cycles)
    // NEW: Batch allocate + link in one pass (~200 cycles, -87% cost)
    // ========================================================================

    // Get size from class mapping
    static const size_t class_sizes[] = {16, 24, 32, 40, 48, 56, 64, 80,
                                         96, 112, 128, 144, 160, 176, 192, 256};
    size_t size = (class_idx < 16) ? class_sizes[class_idx] : 16;

    // Step 1: Batch allocate into temporary array
    void* batch[TINY_FAST_REFILL_BATCH];
    int count = 0;
    extern void* hak_tiny_alloc(size_t size);
    for (int i = 0; i < TINY_FAST_REFILL_BATCH; i++) {
        void* ptr = hak_tiny_alloc(size);
        if (!ptr) break; // OOM or allocation failed
        batch[count++] = ptr;
    }
    if (count == 0) return NULL; // Complete failure

    // Step 2: Link all blocks into the freelist in one pass (batch linking)
    // This is the key optimization: N individual pushes → 1 batch link
    for (int i = 0; i < count - 1; i++) {
        *(void**)batch[i] = batch[i + 1];
    }
    *(void**)batch[count - 1] = NULL; // Terminate list

    // Step 3: Attach batch to cache head
    g_tiny_fast_cache[class_idx] = batch[0];
    g_tiny_fast_count[class_idx] = count;

    // Step 4: Pop one block for the caller
    void* result = g_tiny_fast_cache[class_idx];
    g_tiny_fast_cache[class_idx] = *(void**)result;
    g_tiny_fast_count[class_idx]--;
    return result;
}
// ========== Slow Path: Drain to Magazine/SuperSlab ==========
void tiny_fast_drain(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_FAST_CLASS_COUNT) {
        return;
    }
    g_tiny_fast_drain_count++;

    // ========================================================================
    // Phase 6-7: Drain from free_head (Phase 2)
    // Since frees go to free_head, drain from there when capacity is exceeded.
    // ========================================================================

    // Drain half of the free_head toward Magazine/SuperSlab.
    // NOTE: This is a stub: popped blocks are simply dropped, so they are
    // leaked until process exit rather than returned to the Magazine
    // freelist. Temporary, for testing only.
    uint32_t target = TINY_FAST_CACHE_CAP / 2;
    while (g_tiny_fast_free_count[class_idx] > target) {
        void* ptr = g_tiny_fast_free_head[class_idx];
        if (!ptr) break;
        g_tiny_fast_free_head[class_idx] = *(void**)ptr;
        g_tiny_fast_free_count[class_idx]--;
        // TODO: Return the block to Magazine/SuperSlab; in production,
        // call hak_tiny_free_slow(ptr, class_idx) here instead of dropping it.
    }
}
// ========== Debug Stats ==========
void tiny_fast_print_stats(void) {
    static const char* env = NULL;
    static int checked = 0;
    if (!checked) {
        env = getenv("HAKMEM_TINY_FAST_STATS");
        checked = 1;
    }
    if (env && *env && *env != '0') {
        fprintf(stderr, "[TINY_FAST] refills=%lu drains=%lu\n",
                (unsigned long)g_tiny_fast_refill_count,
                (unsigned long)g_tiny_fast_drain_count);
    }
}