// tls_ss_hint_box.h - TLS SuperSlab Hint Cache for Headerless Mode // // BOX THEORY: // ----------- // Mission: Cache recently-used SuperSlab references in TLS to accelerate // ptr→SuperSlab resolution in Headerless mode, avoiding expensive // hash table lookups on the critical free() path. // // Design: Provides O(1) lookup for hot SuperSlabs (L1 cache hit, 2-5 cycles) // Falls back to global registry on miss (fail-safe, no data loss) // No ownership, no remote queues, pure read-only cache // FIFO eviction policy with configurable cache size (4 slots) // // Invariants: // - hint.base <= ptr < hint.end implies hint.ss is valid // - Miss is always safe (triggers fallback to hak_super_lookup) // - TLS data survives only within thread lifetime // - Cache entries are invalidated implicitly by FIFO rotation // - Magic number check (SUPERSLAB_MAGIC) validates all pointers // // Boundary: // - Input: raw user pointer (void* ptr) from free() path // - Output: SuperSlab* or NULL (miss triggers fallback) // - Does NOT determine class_idx (that's slab_index_for's job) // - Does NOT perform ownership validation (that's SuperSlab's job) // // Performance: // - Cache hit: 2-5 cycles (L1 cache hit, 4 pointer comparisons) // - Cache miss: fallback to hak_super_lookup (10-50 cycles) // - Expected hit rate: 85-95% for single-threaded workloads // - Expected hit rate: 70-85% for multi-threaded workloads // // Thread Safety: // - TLS storage: no sharing, no synchronization required // - Read-only cache: never modifies SuperSlab state // - Stale entries: caught by magic number check #ifndef TLS_SS_HINT_BOX_H #define TLS_SS_HINT_BOX_H #include #include #include #include "hakmem_build_flags.h" // Forward declaration struct SuperSlab; // Cache entry for a single SuperSlab hint // Size: 24 bytes (cache-friendly, fits in 1 cache line with metadata) typedef struct { void* base; // SuperSlab base address (aligned to 1MB or 2MB) void* end; // base + superslab_size (for range check) struct SuperSlab* ss; // Cached SuperSlab pointer } TlsSsHintEntry; // TLS hint cache configuration // - 4 slots provide good hit rate without excessive overhead // - Larger caches (8, 16) show diminishing returns in benchmarks // - Smaller caches (2) may thrash on workloads with 3+ active SuperSlabs #define TLS_SS_HINT_SLOTS 4 // Thread-local SuperSlab hint cache // Total size: 24*4 + 16 = 112 bytes per thread (negligible overhead) typedef struct { TlsSsHintEntry entries[TLS_SS_HINT_SLOTS]; // Cache entries uint32_t count; // Number of valid entries (0 to TLS_SS_HINT_SLOTS) uint32_t next_slot; // Next slot for FIFO rotation (wraps at TLS_SS_HINT_SLOTS) // Statistics (optional, for profiling builds) // Disabled in HAKMEM_BUILD_RELEASE to save 16 bytes per thread #if !HAKMEM_BUILD_RELEASE uint64_t hits; // Cache hit count uint64_t misses; // Cache miss count #endif } TlsSsHintCache; // Thread-local storage instance // Initialized to zero by TLS semantics, formal init in tls_ss_hint_init() extern __thread TlsSsHintCache g_tls_ss_hint; // ============================================================================ // API FUNCTIONS // ============================================================================ /** * @brief Initialize TLS hint cache for current thread * * Call once per thread, typically in thread-local initialization path. * Safe to call multiple times (idempotent). * * Thread Safety: TLS, no synchronization required * Performance: ~10 cycles (negligible one-time cost) */ static inline void tls_ss_hint_init(void) { // Zero-initialization by TLS, but explicit init for clarity g_tls_ss_hint.count = 0; g_tls_ss_hint.next_slot = 0; #if !HAKMEM_BUILD_RELEASE g_tls_ss_hint.hits = 0; g_tls_ss_hint.misses = 0; #endif // Clear all entries (paranoid, but cache-friendly loop) for (int i = 0; i < TLS_SS_HINT_SLOTS; i++) { g_tls_ss_hint.entries[i].base = NULL; g_tls_ss_hint.entries[i].end = NULL; g_tls_ss_hint.entries[i].ss = NULL; } } /** * @brief Update hint cache with a SuperSlab reference * * Called on paths where we know the SuperSlab for a given address range: * - After successful tiny_alloc (cache the allocated-from SuperSlab) * - After superslab refill (cache the newly bound SuperSlab) * - After unified cache refill (cache the refilled SuperSlab) * * Duplicate detection: If the SuperSlab is already cached, no update occurs. * This prevents thrashing when repeatedly allocating from the same SuperSlab. * * @param ss SuperSlab to cache (must be non-NULL, SUPERSLAB_MAGIC validated by caller) * @param base SuperSlab base address (1MB or 2MB aligned) * @param size SuperSlab size in bytes (1MB or 2MB) * * Thread Safety: TLS, no synchronization required * Performance: ~15-20 cycles (duplicate check + FIFO rotation) */ static inline void tls_ss_hint_update(struct SuperSlab* ss, void* base, size_t size) { // Sanity check: reject invalid inputs if (__builtin_expect(!ss || !base || size == 0, 0)) { return; } // Duplicate detection: check if this SuperSlab is already cached // This prevents thrashing when allocating from the same SuperSlab repeatedly for (uint32_t i = 0; i < g_tls_ss_hint.count; i++) { if (g_tls_ss_hint.entries[i].ss == ss) { return; // Already cached, no update needed } } // Add to next slot (FIFO rotation) uint32_t slot = g_tls_ss_hint.next_slot; g_tls_ss_hint.entries[slot].base = base; g_tls_ss_hint.entries[slot].end = (char*)base + size; g_tls_ss_hint.entries[slot].ss = ss; // Advance to next slot (wrap at TLS_SS_HINT_SLOTS) g_tls_ss_hint.next_slot = (slot + 1) % TLS_SS_HINT_SLOTS; // Increment count until cache is full if (g_tls_ss_hint.count < TLS_SS_HINT_SLOTS) { g_tls_ss_hint.count++; } } /** * @brief Lookup SuperSlab for given pointer (fast path) * * Called on free() entry, before falling back to hak_super_lookup(). * Performs linear search over cached entries (4 iterations max). * * Cache hit: Returns true, sets *out_ss to cached SuperSlab pointer * Cache miss: Returns false, caller must use hak_super_lookup() * * @param ptr User pointer to lookup (arbitrary alignment) * @param out_ss Output: SuperSlab pointer if found (only valid if return true) * @return true if cache hit (out_ss is valid), false if miss * * Thread Safety: TLS, no synchronization required * Performance: 2-5 cycles (hit), 8-12 cycles (miss) * * NOTE: Caller MUST validate SUPERSLAB_MAGIC after successful lookup. * This Box does not perform magic validation to keep fast path minimal. */ static inline bool tls_ss_hint_lookup(void* ptr, struct SuperSlab** out_ss) { // Fast path: iterate over valid entries // Unrolling this loop (if count is small) is beneficial, but let compiler decide for (uint32_t i = 0; i < g_tls_ss_hint.count; i++) { TlsSsHintEntry* e = &g_tls_ss_hint.entries[i]; // Range check: base <= ptr < end // Note: end is exclusive (base + size), so use < not <= if (ptr >= e->base && ptr < e->end) { // Cache hit! *out_ss = e->ss; #if !HAKMEM_BUILD_RELEASE g_tls_ss_hint.hits++; #endif return true; } } // Cache miss: caller must fall back to hak_super_lookup() #if !HAKMEM_BUILD_RELEASE g_tls_ss_hint.misses++; #endif return false; } /** * @brief Clear all cached hints (for testing/reset) * * Use cases: * - Unit tests: Reset cache between test cases * - Debug: Force cache cold start for profiling * - Thread teardown: Optional cleanup (TLS auto-cleanup on thread exit) * * Thread Safety: TLS, no synchronization required * Performance: ~10 cycles */ static inline void tls_ss_hint_clear(void) { g_tls_ss_hint.count = 0; g_tls_ss_hint.next_slot = 0; #if !HAKMEM_BUILD_RELEASE // Preserve stats across clear (for cumulative profiling) // Uncomment to reset stats: // g_tls_ss_hint.hits = 0; // g_tls_ss_hint.misses = 0; #endif // Optional: zero out entries (paranoid, not required for correctness) for (int i = 0; i < TLS_SS_HINT_SLOTS; i++) { g_tls_ss_hint.entries[i].base = NULL; g_tls_ss_hint.entries[i].end = NULL; g_tls_ss_hint.entries[i].ss = NULL; } } /** * @brief Get cache statistics (for profiling builds) * * Returns hit/miss counters for performance analysis. * Only available in non-release builds (HAKMEM_BUILD_RELEASE=0). * * @param hits Output: Total cache hits * @param misses Output: Total cache misses * * Thread Safety: TLS, no synchronization required * Performance: ~5 cycles (two loads) */ #if !HAKMEM_BUILD_RELEASE static inline void tls_ss_hint_stats(uint64_t* hits, uint64_t* misses) { if (hits) *hits = g_tls_ss_hint.hits; if (misses) *misses = g_tls_ss_hint.misses; } #endif #endif // TLS_SS_HINT_BOX_H