hakmem/core/tiny_fastcache.c
Commit 494205435b by Claude: Add debug counters for refill analysis - Surprising discovery
Implementation:
- Register tiny_fast_print_stats() via atexit() on first refill
- Forward declaration for function ordering
- Enable with HAKMEM_TINY_FAST_STATS=1

Usage:
```bash
HAKMEM_TINY_FAST_STATS=1 ./larson_hakmem 2 8 128 1024 1 12345 4
```
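
With the flag set, tiny_fast_print_stats() reports the counters at exit via the fprintf in the code below; the output looks like this (values taken from the run reported here):

```
[TINY_FAST] refills=1285 drains=0
```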

Results (threads=4, throughput=1.377M ops/s):
- refills = 1,285 per thread
- drains = 0 (cache never full)
- Total ops = 2.754M (2 s × 1.377M ops/s)
- Refill allocations = 20,560 (1,285 refills × batch of 16)
- **Refill rate: 0.75%** (20,560 / 2,754,000 ops)
- **Cache hit rate: 99.25%** (the remaining allocations)

Analysis:
Contrary to expectations, refill cost is NOT the bottleneck:
- Current refill cost: 1,285 × 1,600 cycles = 2.056M cycles
- Even if batched down to 200 cycles each: saves only 1,285 × 1,400 = 1.799M cycles
- But refills are only 0.75% of operations!

The true bottleneck must be one of:
1. The fast path itself (99.25% of allocations) - see the sketch after this list
   - malloc() entry overhead despite reordering
   - size_to_class mapping (even a LUT has cost)
   - TLS cache access pattern
2. The free() path (not optimized yet)
3. Cross-thread synchronization (22.8% of cycles in profiling)
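
For orientation, the fast path in item 1 is roughly one size-to-class LUT load plus a TLS freelist pop. A minimal sketch under that assumption - the helper name and LUT are illustrative (the LUT values are derived from the class_sizes table in this file), not the actual hakmem fast path:

```c
// Sketch of the hot path being measured: one LUT load + one TLS pop.
// Maps granule (size+7)/8 -> class index, per the class_sizes table
// {16,24,32,40,48,56,64,80,96,112,128,144,160,176,192,256} below.
static const int8_t g_size_class_lut[33] = {
    -1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9, 9, 10, 10,
    11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15
};

static inline void* tiny_fast_alloc_sketch(size_t size) {
    if (size == 0 || size > 256) return NULL;        // outside tiny range
    int c = g_size_class_lut[(size + 7) >> 3];       // one LUT load
    void* ptr = g_tiny_fast_cache[c];                // TLS freelist head
    if (!ptr) return tiny_fast_refill(c);            // miss: ~0.75% of calls
    g_tiny_fast_cache[c] = *(void**)ptr;             // pop the head
    g_tiny_fast_count[c]--;
    return ptr;
}
```

Even at a 99.25% hit rate, every one of those loads and stores sits on the critical path, which is why the fast path itself is now the prime suspect.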

Key insight:
Phase 1 (entry point optimization) and Phase 3 (batch refill)
won't help much, because:
- Entry point: the fast path already hits 99.25%
- Batch refill: only affects 0.75% of operations

Next steps:
1. Add malloc/free counters to identify which path is slower (see the sketch after this list)
2. Consider Phase 2 (Dual Free Lists) for locality
3. Investigate free() path optimization
4. May need to profile TLS cache access patterns
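
A minimal sketch for step 1, reusing the TLS-counter pattern already in this file (counter names are hypothetical):

```c
// Hypothetical per-thread op counters for next step 1, following the
// same TLS pattern as g_tiny_fast_refill_count below.
static __thread uint64_t g_tiny_fast_alloc_ops = 0;
static __thread uint64_t g_tiny_fast_free_ops  = 0;

// Bump g_tiny_fast_alloc_ops in the alloc fast path and
// g_tiny_fast_free_ops in the free path, then extend
// tiny_fast_print_stats() to report all four counters.
```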

Related: mimalloc research shows dual free lists reduce cache
line bouncing - this may be more important than refill cost.
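
For reference, the dual-free-list idea: allocation pops from one list while frees push onto a second; the two are swapped when the pop side runs dry, so the alloc-side and free-side cache lines stop contending. In mimalloc the free side also absorbs cross-thread frees via an atomic list, omitted in this single-threaded sketch (names hypothetical):

```c
// Dual free lists per size class (sketch of the mimalloc idea).
static __thread void* g_alloc_head[TINY_FAST_CLASS_COUNT]; // pop side
static __thread void* g_free_head[TINY_FAST_CLASS_COUNT];  // push side

static inline void* dual_list_alloc(int c) {
    void* ptr = g_alloc_head[c];
    if (!ptr) {                       // pop side empty: swap in free side
        g_alloc_head[c] = g_free_head[c];
        g_free_head[c]  = NULL;
        ptr = g_alloc_head[c];
        if (!ptr) return NULL;        // both empty: fall back to refill
    }
    g_alloc_head[c] = *(void**)ptr;
    return ptr;
}

static inline void dual_list_free(int c, void* ptr) {
    *(void**)ptr = g_free_head[c];    // push onto the free side only
    g_free_head[c] = ptr;
}
```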
2025-11-05 05:19:32 +00:00

tiny_fastcache.c (147 lines, 4.8 KiB):

```c
// tiny_fastcache.c - Slow path for Tiny Fast Cache (refill/drain)
// Phase 6-3: Refill from Magazine/SuperSlab when fast cache misses

#include "tiny_fastcache.h"
#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include <stdio.h>
#include <stdlib.h>

// ========== TLS Cache Definitions ==========
// (Declared as extern in tiny_fastcache.h)
__thread void* g_tiny_fast_cache[TINY_FAST_CLASS_COUNT];
__thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
__thread int g_tiny_fast_initialized = 0;

// ========== External References ==========
// External references to existing Tiny infrastructure (from hakmem_tiny.c)
extern __thread void* g_tls_sll_head[];
extern __thread uint32_t g_tls_sll_count[];
extern int g_use_superslab;

// From hakmem_tiny.c
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);

// ========== Batch Refill Configuration ==========
// How many blocks to refill per miss (batch amortization)
#ifndef TINY_FAST_REFILL_BATCH
#define TINY_FAST_REFILL_BATCH 16
#endif

// ========== Debug Counters ==========
static __thread uint64_t g_tiny_fast_refill_count = 0;
static __thread uint64_t g_tiny_fast_drain_count = 0;

// Forward declaration for atexit registration
void tiny_fast_print_stats(void);

// ========== Slow Path: Refill from Magazine/SuperSlab ==========
void* tiny_fast_refill(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_FAST_CLASS_COUNT) {
        return NULL;
    }
    g_tiny_fast_refill_count++;

    // Register stats printer on first refill (once per thread)
    static __thread int stats_registered = 0;
    if (!stats_registered) {
        atexit(tiny_fast_print_stats);
        stats_registered = 1;
    }

    // Try to batch-refill from existing Magazine/SuperSlab infrastructure:
    // allocate TINY_FAST_REFILL_BATCH blocks and push them to the fast cache.
    int refilled = 0;

    // Get size from g_tiny_class_sizes array (defined in hakmem_tiny.h)
    // For now, use a simple size mapping (16, 24, 32, 40, 48, 56, 64, 80...)
    static const size_t class_sizes[] = {16, 24, 32, 40, 48, 56, 64, 80,
                                         96, 112, 128, 144, 160, 176, 192, 256};
    size_t size = (class_idx < 16) ? class_sizes[class_idx] : 16;

    // Phase 6-3 Fix #2: use the proven Box Refactor path (hak_tiny_alloc)
    // instead of hak_tiny_alloc_slow.
    // OLD: void* ptr = hak_tiny_alloc_slow(size, class_idx); // OOM!
    // NEW: proven Box Refactor allocation (works at 4.19M ops/s)
    extern void* hak_tiny_alloc(size_t size);

    // Batch allocation: try to get multiple blocks at once
    for (int i = 0; i < TINY_FAST_REFILL_BATCH; i++) {
        void* ptr = hak_tiny_alloc(size);
        if (!ptr) break; // OOM or failed

        // Push to fast cache (refilling)
        if (g_tiny_fast_count[class_idx] < TINY_FAST_CACHE_CAP) {
            *(void**)ptr = g_tiny_fast_cache[class_idx];
            g_tiny_fast_cache[class_idx] = ptr;
            g_tiny_fast_count[class_idx]++;
            refilled++;
        } else {
            // Cache full (shouldn't happen during a refill); the freshly
            // allocated block is currently leaked here.
            // TODO: implement tiny_fast_free_to_magazine(ptr, class_idx)
            break;
        }
    }

    // Now pop one block for the caller
    void* result = g_tiny_fast_cache[class_idx];
    if (result) {
        g_tiny_fast_cache[class_idx] = *(void**)result;
        g_tiny_fast_count[class_idx]--;
    }
    return result;
}

// ========== Slow Path: Drain to Magazine/SuperSlab ==========
void tiny_fast_drain(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_FAST_CLASS_COUNT) {
        return;
    }
    g_tiny_fast_drain_count++;

    // Drain half of the cache. A full implementation would push the
    // blocks back to the Magazine/SuperSlab freelist; for now they are
    // simply popped and dropped (temporary, for testing).
    uint32_t target = TINY_FAST_CACHE_CAP / 2;
    while (g_tiny_fast_count[class_idx] > target) {
        void* ptr = g_tiny_fast_cache[class_idx];
        if (!ptr) break;
        g_tiny_fast_cache[class_idx] = *(void**)ptr;
        g_tiny_fast_count[class_idx]--;
        // TODO: return ptr to the Magazine/SuperSlab, e.g. via
        // hak_tiny_free_slow(ptr, class_idx); until then the block leaks.
    }
}

// ========== Debug Stats ==========
void tiny_fast_print_stats(void) {
    static const char* env = NULL;
    static int checked = 0;
    if (!checked) {
        env = getenv("HAKMEM_TINY_FAST_STATS");
        checked = 1;
    }
    if (env && *env && *env != '0') {
        fprintf(stderr, "[TINY_FAST] refills=%lu drains=%lu\n",
                (unsigned long)g_tiny_fast_refill_count,
                (unsigned long)g_tiny_fast_drain_count);
    }
}
```