Major Features: - Debug counter infrastructure for Refill Stage tracking - Free Pipeline counters (ss_local, ss_remote, tls_sll) - Diagnostic counters for early return analysis - Unified larson.sh benchmark runner with profiles - Phase 6-3 regression analysis documentation Bug Fixes: - Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB) - Fix profile variable naming consistency - Add .gitignore patterns for large files Performance: - Phase 6-3: 4.79 M ops/s (has OOM risk) - With SuperSlab: 3.13 M ops/s (+19% improvement) This is a clean repository without large log files. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
296 lines
10 KiB
C
296 lines
10 KiB
C
// hakmem_bigcache.c - Big-Block Cache Implementation
|
||
// Purpose: Per-site ring cache for large allocations
|
||
//
|
||
// License: MIT
|
||
// Date: 2025-10-21
|
||
|
||
#include "hakmem_bigcache.h"
|
||
#include "hakmem_internal.h" // Phase 6.15 P0.1: For HAKMEM_LOG macro
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
#include <stdio.h>
|
||
#include <pthread.h>
|
||
|
||
// ============================================================================
|
||
// Data Structures (Box theory: the internal structure of the box)
|
||
// ============================================================================
|
||
|
||
// One entry of the [site][class] cache table.
// The type carries a 64-byte alignment attribute, so its size is padded up to
// a multiple of 64 bytes.
typedef struct __attribute__((aligned(64))) {
    void *ptr;            // Cached block (the user pointer, not the raw allocation)
    size_t actual_bytes;  // Real size of the cached block; try_get compares it against the request
    size_t class_bytes;   // Nominal size of the block's size class (kept for stats/debugging)
    uintptr_t site;       // Allocation-site key the block was stored under
    int valid;            // 1 while the slot holds a block, 0 when it is empty
    uint16_t freq;        // LFU hit counter (range 0-65535)
} BigCacheSlot;
|
||
|
||
// Phase 6.4 P2: O(1) Direct Table [site][class]
|
||
// Memory footprint: BIGCACHE_MAX_SITES × BIGCACHE_NUM_CLASSES slots; each BigCacheSlot is aligned (and therefore padded) to 64 bytes
|
||
// Slot table, addressed as g_cache[hash_site(site)][get_class_index(size)].
static BigCacheSlot g_cache[BIGCACHE_MAX_SITES][BIGCACHE_NUM_CLASSES];
|
||
// One mutex per site row; try_get and put hold a row's mutex while they touch its slots.
static pthread_mutex_t g_cache_locks[BIGCACHE_MAX_SITES];
|
||
|
||
// Statistics (for debugging/paper)
|
||
// Event counters; hak_bigcache_print_stats() reports them.
static struct {
    uint64_t hits;       // try_get calls that handed out a cached block
    uint64_t misses;     // try_get calls for a cacheable size that found no usable slot
    uint64_t puts;       // Blocks stored by put
    uint64_t evictions;  // Valid slots released through evict_slot
    uint64_t rejects;    // put calls that were refused
} g_stats;
|
||
|
||
// Nonzero once hak_bigcache_init() has run; hak_bigcache_shutdown() clears it again.
static int g_initialized = 0;
|
||
|
||
// Phase 6.11 P0-BigCache-2: LFU Hybrid - Decay tracking
|
||
static uint64_t g_put_count = 0; // Number of blocks stored so far; put triggers a decay pass whenever this reaches a multiple of LFU_DECAY_INTERVAL
|
||
#define LFU_DECAY_INTERVAL 1024 // Puts between decay passes; each pass halves the freq of every valid slot (keeps counters from saturating and lets them follow workload changes)
|
||
|
||
// ============================================================================
|
||
// Helper Functions (Box-internal implementation)
|
||
// ============================================================================
|
||
|
||
// Phase 6.11 P0-BigCache-3: FNV-1a hash function (better distribution than modulo)
|
||
// FNV-1a (Fowler-Noll-Vo) hash: fast, simple, excellent distribution
|
||
static inline int hash_site(uintptr_t site) {
|
||
uint32_t hash = 2166136261u; // FNV offset basis
|
||
uint8_t* bytes = (uint8_t*)&site;
|
||
|
||
// FNV-1a: XOR then multiply (better avalanche than FNV-1)
|
||
for (int i = 0; i < sizeof(uintptr_t); i++) {
|
||
hash ^= bytes[i];
|
||
hash *= 16777619u; // FNV prime
|
||
}
|
||
|
||
// Modulo to fit into BIGCACHE_MAX_SITES (256 sites)
|
||
return (int)(hash % BIGCACHE_MAX_SITES);
|
||
}
|
||
|
||
// Check if size is cacheable
|
||
static inline int is_cacheable(size_t size) {
|
||
return size >= BIGCACHE_MIN_SIZE;
|
||
}
|
||
|
||
// Phase 6.11: Finer-grained size-class selection (8 classes)
|
||
// Returns: 0-7 (class index) for O(1) table lookup
|
||
// Classes: 512KB, 1MB, 2MB, 3MB, 4MB, 6MB, 8MB, 16MB
|
||
static inline int get_class_index(size_t size) {
|
||
// Simple conditional approach (easier to maintain with non-power-of-2 classes)
|
||
if (size < BIGCACHE_CLASS_1MB) return 0; // 512KB-1MB
|
||
if (size < BIGCACHE_CLASS_2MB) return 1; // 1MB-2MB
|
||
if (size < BIGCACHE_CLASS_3MB) return 2; // 2MB-3MB (NEW: reduces fragmentation)
|
||
if (size < BIGCACHE_CLASS_4MB) return 3; // 3MB-4MB (NEW)
|
||
if (size < BIGCACHE_CLASS_6MB) return 4; // 4MB-6MB
|
||
if (size < BIGCACHE_CLASS_8MB) return 5; // 6MB-8MB (NEW)
|
||
if (size < BIGCACHE_CLASS_16MB) return 6; // 8MB-16MB
|
||
return 7; // 16MB+ (NEW: very large allocations)
|
||
}
|
||
|
||
// Get size class bytes from index
|
||
static inline size_t class_index_to_bytes(int class_idx) {
|
||
static const size_t class_sizes[BIGCACHE_NUM_CLASSES] = {
|
||
BIGCACHE_CLASS_512KB, // Phase 6.11: NEW class for 512KB-1MB
|
||
BIGCACHE_CLASS_1MB,
|
||
BIGCACHE_CLASS_2MB,
|
||
BIGCACHE_CLASS_3MB, // Phase 6.11: NEW class to reduce fragmentation (e.g., 2.1MB → 3MB instead of 4MB)
|
||
BIGCACHE_CLASS_4MB,
|
||
BIGCACHE_CLASS_6MB, // Phase 6.11: NEW class
|
||
BIGCACHE_CLASS_8MB,
|
||
BIGCACHE_CLASS_16MB // Phase 6.11: NEW class for very large allocations
|
||
};
|
||
return class_sizes[class_idx];
|
||
}
|
||
|
||
// Callback for actual freeing (set by hakmem.c)
|
||
// NULL until hak_bigcache_set_free_callback() installs a function; while it is
// NULL, evict_slot() only marks the slot empty and does not free the block.
static void (*g_free_callback)(void* ptr, size_t size) = NULL;
|
||
|
||
// Free a cached block (when evicting)
|
||
// Release the block held by `slot`, if any, and mark the slot empty.
// Does nothing for a slot that is already invalid. When a free callback is
// installed it receives the block together with its actual allocated size
// (not the class size). The slot's frequency is reset and the eviction
// counter is bumped.
static inline void evict_slot(BigCacheSlot* slot) {
    if (slot->valid) {
        void (*release)(void*, size_t) = g_free_callback;
        if (release != NULL) {
            release(slot->ptr, slot->actual_bytes);
        }

        slot->valid = 0;
        slot->freq = 0;
        g_stats.evictions++;
    }
}
|
||
|
||
// Phase 6.11 P0-BigCache-2: LFU Hybrid - Decay all frequencies
|
||
// Purpose: Prevent overflow + adapt to changing workload patterns
|
||
static inline void decay_frequencies(void) {
|
||
for (int site_idx = 0; site_idx < BIGCACHE_MAX_SITES; site_idx++) {
|
||
for (int class_idx = 0; class_idx < BIGCACHE_NUM_CLASSES; class_idx++) {
|
||
BigCacheSlot* slot = &g_cache[site_idx][class_idx];
|
||
if (slot->valid) {
|
||
slot->freq = slot->freq >> 1; // Halve frequency (shift right by 1)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// ============================================================================
|
||
// Public API (Box Interface)
|
||
// ============================================================================
|
||
|
||
void hak_bigcache_init(void) {
|
||
if (g_initialized) return;
|
||
|
||
memset(g_cache, 0, sizeof(g_cache));
|
||
memset(&g_stats, 0, sizeof(g_stats));
|
||
|
||
for (int i = 0; i < BIGCACHE_MAX_SITES; i++) {
|
||
pthread_mutex_init(&g_cache_locks[i], NULL);
|
||
}
|
||
|
||
g_initialized = 1;
|
||
|
||
HAKMEM_LOG("[BigCache] Initialized (P2: O(1) direct table, sites=%d, classes=%d)\n",
|
||
BIGCACHE_MAX_SITES, BIGCACHE_NUM_CLASSES);
|
||
HAKMEM_LOG("[BigCache] Size classes: 1MB, 2MB, 4MB, 8MB (P3: branchless)\n");
|
||
}
|
||
|
||
void hak_bigcache_shutdown(void) {
|
||
if (!g_initialized) return;
|
||
|
||
// Free all cached blocks (O(sites × classes) = 64 × 4 = 256 slots)
|
||
for (int site_idx = 0; site_idx < BIGCACHE_MAX_SITES; site_idx++) {
|
||
for (int class_idx = 0; class_idx < BIGCACHE_NUM_CLASSES; class_idx++) {
|
||
BigCacheSlot* slot = &g_cache[site_idx][class_idx];
|
||
if (slot->valid) {
|
||
evict_slot(slot);
|
||
}
|
||
}
|
||
}
|
||
|
||
hak_bigcache_print_stats();
|
||
|
||
g_initialized = 0;
|
||
}
|
||
|
||
// Phase 6.4 P2: O(1) get - Direct table lookup
|
||
int hak_bigcache_try_get(size_t size, uintptr_t site, void** out_ptr) {
|
||
if (!g_initialized) hak_bigcache_init();
|
||
if (!is_cacheable(size)) return 0;
|
||
|
||
// O(1) calculation: site_idx, class_idx
|
||
int site_idx = hash_site(site);
|
||
int class_idx = get_class_index(size); // P3: branchless
|
||
|
||
// O(1) lookup: table[site_idx][class_idx]
|
||
pthread_mutex_t* lock = &g_cache_locks[site_idx];
|
||
pthread_mutex_lock(lock);
|
||
BigCacheSlot* slot = &g_cache[site_idx][class_idx];
|
||
|
||
// Check: valid, matching site, AND sufficient size (Segfault fix!)
|
||
if (slot->valid && slot->site == site && slot->actual_bytes >= size) {
|
||
// Hit! Return and invalidate slot
|
||
*out_ptr = slot->ptr;
|
||
slot->valid = 0;
|
||
|
||
// Phase 6.11 P0-BigCache-2: LFU - increment frequency on hit (saturating at 65535)
|
||
if (slot->freq < 65535) slot->freq++;
|
||
|
||
g_stats.hits++;
|
||
pthread_mutex_unlock(lock);
|
||
return 1;
|
||
}
|
||
|
||
// Miss (invalid, wrong site, or undersized)
|
||
g_stats.misses++;
|
||
pthread_mutex_unlock(lock);
|
||
return 0;
|
||
}
|
||
|
||
// Phase 6.4 P2: O(1) put - Direct table insertion
|
||
int hak_bigcache_put(void* ptr, size_t actual_bytes, uintptr_t site) {
|
||
if (!g_initialized) hak_bigcache_init();
|
||
if (!is_cacheable(actual_bytes)) {
|
||
g_stats.rejects++;
|
||
return 0;
|
||
}
|
||
|
||
// O(1) calculation: site_idx, class_idx
|
||
int site_idx = hash_site(site);
|
||
int class_idx = get_class_index(actual_bytes); // P3: branchless
|
||
|
||
// O(1) lookup: table[site_idx][class_idx]
|
||
pthread_mutex_t* lock = &g_cache_locks[site_idx];
|
||
pthread_mutex_lock(lock);
|
||
BigCacheSlot* slot = &g_cache[site_idx][class_idx];
|
||
|
||
// Phase 6.11 P0-BigCache-2: LFU Hybrid Eviction
|
||
// Instead of evicting target slot directly, find coldest slot in same site
|
||
if (slot->valid) {
|
||
BigCacheSlot* coldest = slot;
|
||
uint16_t min_freq = slot->freq;
|
||
|
||
// Scan all class slots in same site (8 slots max)
|
||
for (int c = 0; c < BIGCACHE_NUM_CLASSES; c++) {
|
||
BigCacheSlot* candidate = &g_cache[site_idx][c];
|
||
if (!candidate->valid) {
|
||
// Invalid slot = coldest (freq=0, prefer reusing empty slots)
|
||
coldest = candidate;
|
||
break;
|
||
}
|
||
if (candidate->freq < min_freq) {
|
||
min_freq = candidate->freq;
|
||
coldest = candidate;
|
||
}
|
||
}
|
||
|
||
// Evict coldest slot (might be target slot, might be different)
|
||
evict_slot(coldest);
|
||
|
||
// If we evicted a different slot, use it instead of target slot
|
||
if (coldest != slot) {
|
||
slot = coldest;
|
||
class_idx = get_class_index(actual_bytes); // Recalculate class for new slot
|
||
}
|
||
}
|
||
|
||
// Store in cache (O(1) direct write)
|
||
slot->ptr = ptr;
|
||
slot->actual_bytes = actual_bytes; // Store actual size (Segfault fix!)
|
||
slot->class_bytes = class_index_to_bytes(class_idx); // For stats/debugging
|
||
slot->site = site;
|
||
slot->valid = 1;
|
||
slot->freq = 0; // Phase 6.11: Initialize frequency to 0 (will increment on first hit)
|
||
|
||
g_stats.puts++;
|
||
g_put_count++;
|
||
|
||
// Phase 6.11 P0-BigCache-2: Periodic decay (every 1024 puts)
|
||
if (g_put_count % LFU_DECAY_INTERVAL == 0) {
|
||
decay_frequencies();
|
||
}
|
||
|
||
pthread_mutex_unlock(lock);
|
||
return 1;
|
||
}
|
||
|
||
void hak_bigcache_print_stats(void) {
|
||
if (!g_initialized) return;
|
||
|
||
printf("\n========================================\n");
|
||
printf("BigCache Statistics\n");
|
||
printf("========================================\n");
|
||
printf("Hits: %lu\n", (unsigned long)g_stats.hits);
|
||
printf("Misses: %lu\n", (unsigned long)g_stats.misses);
|
||
printf("Puts: %lu\n", (unsigned long)g_stats.puts);
|
||
printf("Evictions: %lu\n", (unsigned long)g_stats.evictions);
|
||
printf("Rejects: %lu\n", (unsigned long)g_stats.rejects);
|
||
|
||
if (g_stats.hits + g_stats.misses > 0) {
|
||
double hit_rate = (double)g_stats.hits / (g_stats.hits + g_stats.misses) * 100.0;
|
||
printf("Hit Rate: %.1f%%\n", hit_rate);
|
||
}
|
||
|
||
printf("========================================\n");
|
||
}
|
||
|
||
// Install the function evict_slot() uses to free an evicted block.
// The callback is invoked with the block and its actual allocated size.
// Passing NULL removes the callback; evictions then only mark slots empty.
void hak_bigcache_set_free_callback(hak_bigcache_free_fn_t fn)
{
    g_free_callback = fn;
}
|