257 lines
9.1 KiB
C
257 lines
9.1 KiB
C
|
|
// tls_ss_hint_box.h - TLS SuperSlab Hint Cache for Headerless Mode
|
||
|
|
//
|
||
|
|
// BOX THEORY:
|
||
|
|
// -----------
|
||
|
|
// Mission: Cache recently-used SuperSlab references in TLS to accelerate
|
||
|
|
// ptr→SuperSlab resolution in Headerless mode, avoiding expensive
|
||
|
|
// hash table lookups on the critical free() path.
|
||
|
|
//
|
||
|
|
// Design: Provides O(1) lookup for hot SuperSlabs (L1 cache hit, 2-5 cycles)
|
||
|
|
// Falls back to global registry on miss (fail-safe, no data loss)
|
||
|
|
// No ownership, no remote queues, pure read-only cache
|
||
|
|
// FIFO eviction policy with configurable cache size (4 slots)
|
||
|
|
//
|
||
|
|
// Invariants:
|
||
|
|
// - hint.base <= ptr < hint.end implies hint.ss is valid
|
||
|
|
// - Miss is always safe (triggers fallback to hak_super_lookup)
|
||
|
|
// - TLS data survives only within thread lifetime
|
||
|
|
// - Cache entries are invalidated implicitly by FIFO rotation
|
||
|
|
// - Magic number check (SUPERSLAB_MAGIC), performed by the caller after a hit, validates all pointers
|
||
|
|
//
|
||
|
|
// Boundary:
|
||
|
|
// - Input: raw user pointer (void* ptr) from free() path
|
||
|
|
// - Output: SuperSlab* or NULL (miss triggers fallback)
|
||
|
|
// - Does NOT determine class_idx (that's slab_index_for's job)
|
||
|
|
// - Does NOT perform ownership validation (that's SuperSlab's job)
|
||
|
|
//
|
||
|
|
// Performance:
|
||
|
|
// - Cache hit: 2-5 cycles (L1 cache hit, 4 pointer comparisons)
|
||
|
|
// - Cache miss: fallback to hak_super_lookup (10-50 cycles)
|
||
|
|
// - Expected hit rate: 85-95% for single-threaded workloads
|
||
|
|
// - Expected hit rate: 70-85% for multi-threaded workloads
|
||
|
|
//
|
||
|
|
// Thread Safety:
|
||
|
|
// - TLS storage: no sharing, no synchronization required
|
||
|
|
// - Read-only cache: never modifies SuperSlab state
|
||
|
|
// - Stale entries: caught by the caller's magic number check (this Box does not check it)
|
||
|
|
|
||
|
|
#ifndef TLS_SS_HINT_BOX_H
|
||
|
|
#define TLS_SS_HINT_BOX_H
|
||
|
|
|
||
|
|
#include <stdint.h>
|
||
|
|
#include <stdbool.h>
|
||
|
|
#include <stddef.h>
|
||
|
|
#include "hakmem_build_flags.h"
|
||
|
|
|
||
|
|
// Forward declaration
|
||
|
|
struct SuperSlab;
|
||
|
|
|
||
|
|
// Cache entry for a single SuperSlab hint
// Describes the half-open address range [base, end) of one SuperSlab together
// with the SuperSlab pointer that is handed back on a hit.
// Size: three pointers, i.e. 24 bytes on 64-bit targets (cache-friendly,
// fits in 1 cache line with metadata)
typedef struct {
    void* base;            // SuperSlab base address (aligned to 1MB or 2MB)
    void* end;             // base + superslab_size, exclusive (for range check)
    struct SuperSlab* ss;  // Cached SuperSlab pointer
} TlsSsHintEntry;
|
||
|
|
|
||
|
|
// TLS hint cache configuration
// - 4 slots provide good hit rate without excessive overhead
// - Larger caches (8, 16) show diminishing returns in benchmarks
// - Smaller caches (2) may thrash on workloads with 3+ active SuperSlabs
#define TLS_SS_HINT_SLOTS 4
|
||
|
|
|
||
|
|
// Thread-local SuperSlab hint cache
|
||
|
|
// Total size: 24*4 + 16 = 112 bytes per thread (negligible overhead)
|
||
|
|
typedef struct {
|
||
|
|
TlsSsHintEntry entries[TLS_SS_HINT_SLOTS]; // Cache entries
|
||
|
|
uint32_t count; // Number of valid entries (0 to TLS_SS_HINT_SLOTS)
|
||
|
|
uint32_t next_slot; // Next slot for FIFO rotation (wraps at TLS_SS_HINT_SLOTS)
|
||
|
|
|
||
|
|
// Statistics (optional, for profiling builds)
|
||
|
|
// Disabled in HAKMEM_BUILD_RELEASE to save 16 bytes per thread
|
||
|
|
#if !HAKMEM_BUILD_RELEASE
|
||
|
|
uint64_t hits; // Cache hit count
|
||
|
|
uint64_t misses; // Cache miss count
|
||
|
|
#endif
|
||
|
|
} TlsSsHintCache;
|
||
|
|
|
||
|
|
// Thread-local storage instance
|
||
|
|
// Initialized to zero by TLS semantics, formal init in tls_ss_hint_init()
|
||
|
|
extern __thread TlsSsHintCache g_tls_ss_hint;
|
||
|
|
|
||
|
|
// ============================================================================
|
||
|
|
// API FUNCTIONS
|
||
|
|
// ============================================================================
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @brief Initialize TLS hint cache for current thread
|
||
|
|
*
|
||
|
|
* Call once per thread, typically in thread-local initialization path.
|
||
|
|
* Safe to call multiple times (idempotent).
|
||
|
|
*
|
||
|
|
* Thread Safety: TLS, no synchronization required
|
||
|
|
* Performance: ~10 cycles (negligible one-time cost)
|
||
|
|
*/
|
||
|
|
static inline void tls_ss_hint_init(void) {
|
||
|
|
// Zero-initialization by TLS, but explicit init for clarity
|
||
|
|
g_tls_ss_hint.count = 0;
|
||
|
|
g_tls_ss_hint.next_slot = 0;
|
||
|
|
|
||
|
|
#if !HAKMEM_BUILD_RELEASE
|
||
|
|
g_tls_ss_hint.hits = 0;
|
||
|
|
g_tls_ss_hint.misses = 0;
|
||
|
|
#endif
|
||
|
|
|
||
|
|
// Clear all entries (paranoid, but cache-friendly loop)
|
||
|
|
for (int i = 0; i < TLS_SS_HINT_SLOTS; i++) {
|
||
|
|
g_tls_ss_hint.entries[i].base = NULL;
|
||
|
|
g_tls_ss_hint.entries[i].end = NULL;
|
||
|
|
g_tls_ss_hint.entries[i].ss = NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @brief Update hint cache with a SuperSlab reference
|
||
|
|
*
|
||
|
|
* Called on paths where we know the SuperSlab for a given address range:
|
||
|
|
* - After successful tiny_alloc (cache the allocated-from SuperSlab)
|
||
|
|
* - After superslab refill (cache the newly bound SuperSlab)
|
||
|
|
* - After unified cache refill (cache the refilled SuperSlab)
|
||
|
|
*
|
||
|
|
* Duplicate detection: If the SuperSlab is already cached, no update occurs.
|
||
|
|
* This prevents thrashing when repeatedly allocating from the same SuperSlab.
|
||
|
|
*
|
||
|
|
* @param ss SuperSlab to cache (must be non-NULL, SUPERSLAB_MAGIC validated by caller)
|
||
|
|
* @param base SuperSlab base address (1MB or 2MB aligned)
|
||
|
|
* @param size SuperSlab size in bytes (1MB or 2MB)
|
||
|
|
*
|
||
|
|
* Thread Safety: TLS, no synchronization required
|
||
|
|
* Performance: ~15-20 cycles (duplicate check + FIFO rotation)
|
||
|
|
*/
|
||
|
|
static inline void tls_ss_hint_update(struct SuperSlab* ss, void* base, size_t size) {
|
||
|
|
// Sanity check: reject invalid inputs
|
||
|
|
if (__builtin_expect(!ss || !base || size == 0, 0)) {
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Duplicate detection: check if this SuperSlab is already cached
|
||
|
|
// This prevents thrashing when allocating from the same SuperSlab repeatedly
|
||
|
|
for (uint32_t i = 0; i < g_tls_ss_hint.count; i++) {
|
||
|
|
if (g_tls_ss_hint.entries[i].ss == ss) {
|
||
|
|
return; // Already cached, no update needed
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Add to next slot (FIFO rotation)
|
||
|
|
uint32_t slot = g_tls_ss_hint.next_slot;
|
||
|
|
g_tls_ss_hint.entries[slot].base = base;
|
||
|
|
g_tls_ss_hint.entries[slot].end = (char*)base + size;
|
||
|
|
g_tls_ss_hint.entries[slot].ss = ss;
|
||
|
|
|
||
|
|
// Advance to next slot (wrap at TLS_SS_HINT_SLOTS)
|
||
|
|
g_tls_ss_hint.next_slot = (slot + 1) % TLS_SS_HINT_SLOTS;
|
||
|
|
|
||
|
|
// Increment count until cache is full
|
||
|
|
if (g_tls_ss_hint.count < TLS_SS_HINT_SLOTS) {
|
||
|
|
g_tls_ss_hint.count++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @brief Lookup SuperSlab for given pointer (fast path)
|
||
|
|
*
|
||
|
|
* Called on free() entry, before falling back to hak_super_lookup().
|
||
|
|
* Performs linear search over cached entries (4 iterations max).
|
||
|
|
*
|
||
|
|
* Cache hit: Returns true, sets *out_ss to cached SuperSlab pointer
|
||
|
|
* Cache miss: Returns false, caller must use hak_super_lookup()
|
||
|
|
*
|
||
|
|
* @param ptr User pointer to lookup (arbitrary alignment)
|
||
|
|
* @param out_ss Output: SuperSlab pointer if found (only valid if return true)
|
||
|
|
* @return true if cache hit (out_ss is valid), false if miss
|
||
|
|
*
|
||
|
|
* Thread Safety: TLS, no synchronization required
|
||
|
|
* Performance: 2-5 cycles (hit), 8-12 cycles (miss)
|
||
|
|
*
|
||
|
|
* NOTE: Caller MUST validate SUPERSLAB_MAGIC after successful lookup.
|
||
|
|
* This Box does not perform magic validation to keep fast path minimal.
|
||
|
|
*/
|
||
|
|
static inline bool tls_ss_hint_lookup(void* ptr, struct SuperSlab** out_ss) {
|
||
|
|
// Fast path: iterate over valid entries
|
||
|
|
// Unrolling this loop (if count is small) is beneficial, but let compiler decide
|
||
|
|
for (uint32_t i = 0; i < g_tls_ss_hint.count; i++) {
|
||
|
|
TlsSsHintEntry* e = &g_tls_ss_hint.entries[i];
|
||
|
|
|
||
|
|
// Range check: base <= ptr < end
|
||
|
|
// Note: end is exclusive (base + size), so use < not <=
|
||
|
|
if (ptr >= e->base && ptr < e->end) {
|
||
|
|
// Cache hit!
|
||
|
|
*out_ss = e->ss;
|
||
|
|
|
||
|
|
#if !HAKMEM_BUILD_RELEASE
|
||
|
|
g_tls_ss_hint.hits++;
|
||
|
|
#endif
|
||
|
|
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Cache miss: caller must fall back to hak_super_lookup()
|
||
|
|
#if !HAKMEM_BUILD_RELEASE
|
||
|
|
g_tls_ss_hint.misses++;
|
||
|
|
#endif
|
||
|
|
|
||
|
|
return false;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @brief Clear all cached hints (for testing/reset)
|
||
|
|
*
|
||
|
|
* Use cases:
|
||
|
|
* - Unit tests: Reset cache between test cases
|
||
|
|
* - Debug: Force cache cold start for profiling
|
||
|
|
* - Thread teardown: Optional cleanup (TLS auto-cleanup on thread exit)
|
||
|
|
*
|
||
|
|
* Thread Safety: TLS, no synchronization required
|
||
|
|
* Performance: ~10 cycles
|
||
|
|
*/
|
||
|
|
static inline void tls_ss_hint_clear(void) {
|
||
|
|
g_tls_ss_hint.count = 0;
|
||
|
|
g_tls_ss_hint.next_slot = 0;
|
||
|
|
|
||
|
|
#if !HAKMEM_BUILD_RELEASE
|
||
|
|
// Preserve stats across clear (for cumulative profiling)
|
||
|
|
// Uncomment to reset stats:
|
||
|
|
// g_tls_ss_hint.hits = 0;
|
||
|
|
// g_tls_ss_hint.misses = 0;
|
||
|
|
#endif
|
||
|
|
|
||
|
|
// Optional: zero out entries (paranoid, not required for correctness)
|
||
|
|
for (int i = 0; i < TLS_SS_HINT_SLOTS; i++) {
|
||
|
|
g_tls_ss_hint.entries[i].base = NULL;
|
||
|
|
g_tls_ss_hint.entries[i].end = NULL;
|
||
|
|
g_tls_ss_hint.entries[i].ss = NULL;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @brief Get cache statistics (for profiling builds)
|
||
|
|
*
|
||
|
|
* Returns hit/miss counters for performance analysis.
|
||
|
|
* Only available in non-release builds (HAKMEM_BUILD_RELEASE=0).
|
||
|
|
*
|
||
|
|
* @param hits Output: Total cache hits
|
||
|
|
* @param misses Output: Total cache misses
|
||
|
|
*
|
||
|
|
* Thread Safety: TLS, no synchronization required
|
||
|
|
* Performance: ~5 cycles (two loads)
|
||
|
|
*/
|
||
|
|
#if !HAKMEM_BUILD_RELEASE
|
||
|
|
static inline void tls_ss_hint_stats(uint64_t* hits, uint64_t* misses) {
|
||
|
|
if (hits) *hits = g_tls_ss_hint.hits;
|
||
|
|
if (misses) *misses = g_tls_ss_hint.misses;
|
||
|
|
}
|
||
|
|
#endif
|
||
|
|
|
||
|
|
#endif // TLS_SS_HINT_BOX_H
|