// tiny_fastcache.h - Ultra-Simple Tiny Fast Path (System tcache style)
// Phase 6-3: Bypass Magazine/SuperSlab for Tiny allocations (<=128B)
// Goal: 3-4 instruction fast path, 70-80% of System tcache performance
#pragma once

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdlib.h>                 // For getenv()
#include "box/tiny_next_ptr_box.h"  // Box API: Next pointer read/write

// ========== Configuration ==========

// Enable Tiny Fast Path (default: ON for Phase 6-3)
#ifndef HAKMEM_TINY_FAST_PATH
#define HAKMEM_TINY_FAST_PATH 1
#endif

// Tiny class slot count (classes in use: 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128B = 11 of the 16 slots)
#define TINY_FAST_CLASS_COUNT 16

// Fast cache capacity per class (default: 64 slots, like System tcache)
#ifndef TINY_FAST_CACHE_CAP
#define TINY_FAST_CACHE_CAP 64
#endif

// Tiny size threshold (<=128B goes to fast path)
#define TINY_FAST_THRESHOLD 128

// ========== TLS Cache (System tcache style) ==========

// Per-thread fast cache: array of freelist heads (defined in tiny_fastcache.c)
extern __thread void* g_tiny_fast_cache[TINY_FAST_CLASS_COUNT];
// Per-thread cache counts (for capacity management)
extern __thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT];
// Initialized flag
extern __thread int g_tiny_fast_initialized;

// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
// Separate free staging area to reduce cache line bouncing
extern __thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT];
extern __thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT];

// ========== RDTSC Profiling (Phase 6-8) ==========
// Extern declarations for inline functions to access profiling counters
extern __thread uint64_t g_tiny_malloc_count;
extern __thread uint64_t g_tiny_malloc_cycles;
extern __thread uint64_t g_tiny_free_count;
extern __thread uint64_t g_tiny_free_cycles;
extern __thread uint64_t g_tiny_refill_cycles;
extern __thread uint64_t g_tiny_migration_count;
extern __thread uint64_t g_tiny_migration_cycles;

#ifdef __x86_64__
static inline uint64_t tiny_fast_rdtsc(void) {
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}
#else
static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
#endif

// Profiling switch (defined in tiny_fastcache.c; -1 = not yet checked)
extern int g_profile_enabled;

static inline int tiny_fast_profile_enabled(void) {
    if (__builtin_expect(g_profile_enabled == -1, 0)) {
        const char* env = getenv("HAKMEM_TINY_PROFILE");
        g_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
    }
    return g_profile_enabled;
}

// ========== Size to Class Mapping ==========
// Inline size-to-class for fast path (O(1) lookup table)
static inline int tiny_fast_size_to_class(size_t size) {
    // Optimized: Lookup table for O(1) mapping (vs 11-branch linear search)
    // Table indexed by ((size + 7) >> 3), i.e. the size rounded up to 8-byte units, for sizes 0-128
    // Class mapping: 0:16B, 1:24B, 2:32B, 3:40B, 4:48B, 5:56B, 6:64B, 7:80B, 8:96B, 9:112B, 10:128B
    static const int8_t size_to_class_lut[17] = {
        0,  // size 0        → 16B  (class 0)
        0,  // sizes 1-8     → 16B  (class 0)
        0,  // sizes 9-16    → 16B  (class 0)
        1,  // sizes 17-24   → 24B  (class 1)
        2,  // sizes 25-32   → 32B  (class 2)
        3,  // sizes 33-40   → 40B  (class 3)
        4,  // sizes 41-48   → 48B  (class 4)
        5,  // sizes 49-56   → 56B  (class 5)
        6,  // sizes 57-64   → 64B  (class 6)
        7,  // sizes 65-72   → 80B  (class 7)
        7,  // sizes 73-80   → 80B  (class 7)
        8,  // sizes 81-88   → 96B  (class 8)
        8,  // sizes 89-96   → 96B  (class 8)
        9,  // sizes 97-104  → 112B (class 9)
        9,  // sizes 105-112 → 112B (class 9)
        10, // sizes 113-120 → 128B (class 10)
        10  // sizes 121-128 → 128B (class 10)
    };
    if (__builtin_expect(size > TINY_FAST_THRESHOLD, 0)) return -1;  // Not tiny

    // Fast path: Direct lookup (1-2 instructions!)
    return size_to_class_lut[(size + 7) >> 3];
}
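// The table above implies the following mappings. A minimal sanity-check
// sketch (not part of the build; the assert() calls assume <assert.h> in a
// hypothetical unit test):
//
//   assert(tiny_fast_size_to_class(1)   == 0);   // rounds up to 16B
//   assert(tiny_fast_size_to_class(24)  == 1);   // exact 24B class
//   assert(tiny_fast_size_to_class(65)  == 7);   // rounds up to 80B
//   assert(tiny_fast_size_to_class(128) == 10);  // largest tiny class
//   assert(tiny_fast_size_to_class(129) == -1);  // not tiny → caller falls back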
// ========== Forward Declarations ==========
// Slow path: refill from Magazine/SuperSlab (implemented in tiny_fastcache.c)
void* tiny_fast_refill(int class_idx);
void tiny_fast_drain(int class_idx);

// ========== Fast Path: Alloc (3-4 instructions!) ==========
static inline void* tiny_fast_alloc(size_t size) {
    uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;

    // Step 1: Size to class (1-2 instructions, branch predictor friendly)
    int cls = tiny_fast_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) return NULL;  // Not tiny (rare)

    // Step 2: Pop from alloc_head (hot allocation path)
    void* ptr = g_tiny_fast_cache[cls];
    if (__builtin_expect(ptr != NULL, 1)) {
        // Fast path: Pop head, decrement count
        g_tiny_fast_cache[cls] = tiny_next_read(cls, ptr);
        g_tiny_fast_count[cls]--;
        if (start) { g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start); g_tiny_malloc_count++; }
        return ptr;
    }

    // ========================================================================
    // Phase 6-7: Step 2.5: Lazy Migration from free_head (Phase 2)
    // If alloc_head is empty but free_head has blocks, migrate with a pointer swap.
    // This is mimalloc's key optimization: batched migration at near-zero cost.
    // ========================================================================
    if (__builtin_expect(g_tiny_fast_free_head[cls] != NULL, 0)) {
        uint64_t mig_start = start ? tiny_fast_rdtsc() : 0;

        // Migrate entire free_head → alloc_head (pointer swap, instant!)
        g_tiny_fast_cache[cls] = g_tiny_fast_free_head[cls];
        g_tiny_fast_count[cls] = g_tiny_fast_free_count[cls];
        g_tiny_fast_free_head[cls] = NULL;
        g_tiny_fast_free_count[cls] = 0;

        // Now pop one from the newly migrated list
        ptr = g_tiny_fast_cache[cls];
        g_tiny_fast_cache[cls] = tiny_next_read(cls, ptr);
        g_tiny_fast_count[cls]--;

        if (mig_start) { g_tiny_migration_cycles += (tiny_fast_rdtsc() - mig_start); g_tiny_migration_count++; }
        if (start) { g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start); g_tiny_malloc_count++; }
        return ptr;
    }

    // Step 3: Slow path - refill from Magazine/SuperSlab
    ptr = tiny_fast_refill(cls);
    if (start) { g_tiny_malloc_cycles += (tiny_fast_rdtsc() - start); g_tiny_malloc_count++; }
    return ptr;
}
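// For orientation, a rough sketch of what the out-of-line slow path might do.
// The real tiny_fast_refill() lives in tiny_fastcache.c and pulls blocks from
// the Magazine/SuperSlab backend; backend_alloc_block() below is a
// hypothetical placeholder for that backend call, and the half-capacity batch
// size is only an assumption. The sketch illustrates how refilled blocks get
// chained through the Box next-pointer API:
//
//   void* tiny_fast_refill(int class_idx) {
//       int batch = TINY_FAST_CACHE_CAP / 2;            // assumed batch size
//       for (int i = 0; i < batch; i++) {
//           void* blk = backend_alloc_block(class_idx); // hypothetical backend call
//           if (!blk) break;
//           tiny_next_write(class_idx, blk, g_tiny_fast_cache[class_idx]);
//           g_tiny_fast_cache[class_idx] = blk;
//           g_tiny_fast_count[class_idx]++;
//       }
//       void* out = g_tiny_fast_cache[class_idx];
//       if (!out) return NULL;                          // backend exhausted
//       g_tiny_fast_cache[class_idx] = tiny_next_read(class_idx, out);
//       g_tiny_fast_count[class_idx]--;
//       return out;                                     // hand one block to the caller
//   }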
// ========== Fast Path: Free (2-3 instructions!) ==========
static inline void tiny_fast_free(void* ptr, size_t size) {
    uint64_t start = tiny_fast_profile_enabled() ? tiny_fast_rdtsc() : 0;

    // Step 1: Size to class
    int cls = tiny_fast_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) return;  // Not tiny (error)

    // ========================================================================
    // Phase 6-7: Push to free_head (Phase 2)
    // Separate free staging area reduces cache line contention with alloc_head.
    // mimalloc's key insight: alloc and free touch different cache lines.
    // ========================================================================

    // Step 2: Check free_head capacity
    if (__builtin_expect(g_tiny_fast_free_count[cls] >= TINY_FAST_CACHE_CAP, 0)) {
        // Free cache full - drain to Magazine/SuperSlab
        tiny_fast_drain(cls);
    }

    // Step 3: Push to free_head (separate cache line from alloc_head!)
    tiny_next_write(cls, ptr, g_tiny_fast_free_head[cls]);
    g_tiny_fast_free_head[cls] = ptr;
    g_tiny_fast_free_count[cls]++;

    if (start) { g_tiny_free_cycles += (tiny_fast_rdtsc() - start); g_tiny_free_count++; }
}

// ========== Initialization ==========
static inline void tiny_fast_init(void) {
    if (g_tiny_fast_initialized) return;
    memset(g_tiny_fast_cache, 0, sizeof(g_tiny_fast_cache));
    memset(g_tiny_fast_count, 0, sizeof(g_tiny_fast_count));
    // Phase 6-7: Initialize dual free lists (Phase 2)
    memset(g_tiny_fast_free_head, 0, sizeof(g_tiny_fast_free_head));
    memset(g_tiny_fast_free_count, 0, sizeof(g_tiny_fast_free_count));
    g_tiny_fast_initialized = 1;
}
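// Usage sketch for the API above. Only the names declared in this header are
// real; how the embedding allocator falls back for non-tiny sizes or failed
// refills is up to the caller:
//
//   tiny_fast_init();                   // once per thread, before first use
//
//   void* p = tiny_fast_alloc(48);      // 48B request → class 4 freelist
//   if (p) {
//       /* ... use p ... */
//       tiny_fast_free(p, 48);          // must pass the same size as the alloc
//   } else {
//       // size > 128B or backend refill failed: take the normal allocator path
//   }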