hakmem/core/tiny_fastcache.c
Moe Charm (CI) 9b0d746407 Phase 3d-B: TLS Cache Merge - Unified g_tls_sll[] structure (+12-18% expected)
Merge the separate g_tls_sll_head[] and g_tls_sll_count[] arrays into a unified
TinyTLSSLL struct to improve L1D cache locality. Expected performance gain:
+12-18% from reducing cache line splits (2 loads → 1 load per operation).

Changes:
- core/hakmem_tiny.h: Add TinyTLSSLL type (16B aligned, head+count+pad; sketched below)
- core/hakmem_tiny.c: Replace separate arrays with g_tls_sll[8]
- core/box/tls_sll_box.h: Update Box API (13 sites) for unified access
- Updated 32+ files: All g_tls_sll_head[i] → g_tls_sll[i].head
- Updated 32+ files: All g_tls_sll_count[i] → g_tls_sll[i].count
- core/hakmem_tiny_integrity.h: Unified canary guards
- core/box/integrity_box.c: Simplified canary validation
- Makefile: Added core/box/tiny_sizeclass_hist_box.o to link
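
For reference, a minimal sketch of the unified TLS entry implied by the description
above. The 16-byte alignment, head/count fields, and the [8] element count come from
this commit message; the padding field name/width and the exact declaration form are
illustrative, not copied from core/hakmem_tiny.h:

    #include <stdint.h>

    // Before (two parallel arrays; a class's head and count could sit on
    // different cache lines):
    //   extern __thread void*    g_tls_sll_head[8];
    //   extern __thread uint32_t g_tls_sll_count[8];

    // After (one 16B-aligned entry per class; head and count share one load):
    typedef struct TinyTLSSLL {
        void*    head;    // singly-linked freelist head
        uint32_t count;   // blocks currently on the list
        uint32_t _pad;    // keeps each entry at 16 bytes
    } __attribute__((aligned(16))) TinyTLSSLL;

    extern __thread TinyTLSSLL g_tls_sll[8];

    // Access-pattern change applied across the 32+ updated files:
    //   g_tls_sll_head[i]  -> g_tls_sll[i].head
    //   g_tls_sll_count[i] -> g_tls_sll[i].count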

Build: PASS (10K ops sanity test)
Warnings: Only pre-existing LTO type mismatches (unrelated)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-20 07:32:30 +09:00


// tiny_fastcache.c - Slow path for Tiny Fast Cache (refill/drain)
// Phase 6-3: Refill from Magazine/SuperSlab when fast cache misses
#include "tiny_fastcache.h"
#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include "box/tiny_next_ptr_box.h" // Phase E1-CORRECT: Box API
#include <stdio.h>
#include <stdlib.h>
// ========== TLS Cache Definitions ==========
// (Declared as extern in tiny_fastcache.h)
// CRITICAL FIX: Explicit initializers prevent SEGV from uninitialized TLS in worker threads
__thread void* g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0};
__thread uint32_t g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0};
__thread int g_tiny_fast_initialized = 0;
// ========== Phase 6-7: Dual Free Lists (Phase 2) ==========
// Inspired by mimalloc's local/remote split design
// Separate alloc/free paths to reduce cache line bouncing
__thread void* g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}; // Free staging area
__thread uint32_t g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}; // Free count
// ========== External References ==========
// External references to existing Tiny infrastructure (from hakmem_tiny.c)
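// Phase 3d-B: unified per-class head+count entries (replaces the old g_tls_sll_head[]/g_tls_sll_count[] arrays)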
extern __thread TinyTLSSLL g_tls_sll[];
extern int g_use_superslab;
// From hakmem_tiny.c
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
// ========== Batch Refill Configuration ==========
// How many blocks to refill per miss (batch amortization)
#ifndef TINY_FAST_REFILL_BATCH
#define TINY_FAST_REFILL_BATCH 16
#endif
// ========== Debug Counters ==========
static __thread uint64_t g_tiny_fast_refill_count = 0;
static __thread uint64_t g_tiny_fast_drain_count = 0;
// ========== RDTSC Cycle Profiling ==========
// Ultra-lightweight profiling using CPU Time-Stamp Counter (~10 cycles overhead)
#ifdef __x86_64__
static inline uint64_t rdtsc(void) {
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}
#else
static inline uint64_t rdtsc(void) { return 0; } // Fallback for non-x86
#endif
// Per-thread cycle counters (gated by HAKMEM_TINY_PROFILE env var)
// Declared as extern in tiny_fastcache.h for inline functions
__thread uint64_t g_tiny_malloc_count = 0;
__thread uint64_t g_tiny_malloc_cycles = 0;
__thread uint64_t g_tiny_free_count = 0;
__thread uint64_t g_tiny_free_cycles = 0;
__thread uint64_t g_tiny_refill_cycles = 0;
__thread uint64_t g_tiny_migration_count = 0;
__thread uint64_t g_tiny_migration_cycles = 0;
// Refill failure tracking
static __thread uint64_t g_refill_success_count = 0;
static __thread uint64_t g_refill_partial_count = 0; // Some blocks allocated
static __thread uint64_t g_refill_fail_count = 0; // Zero blocks allocated
static __thread uint64_t g_refill_total_blocks = 0; // Total blocks actually allocated
int g_profile_enabled = -1; // -1: uninitialized, 0: off, 1: on (extern in header)
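// Shared (non-__thread) flag: the first thread to call profile_enabled() caches the getenv() result for all threads.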
static inline int profile_enabled(void) {
    if (__builtin_expect(g_profile_enabled == -1, 0)) {
        const char* env = getenv("HAKMEM_TINY_PROFILE");
        g_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
    }
    return g_profile_enabled;
}
// Forward declarations for atexit registration
void tiny_fast_print_stats(void);
void tiny_fast_print_profile(void);
// ========== Slow Path: Refill from Magazine/SuperSlab ==========
void* tiny_fast_refill(int class_idx) {
    uint64_t start = profile_enabled() ? rdtsc() : 0;
    if (class_idx < 0 || class_idx >= TINY_FAST_CLASS_COUNT) {
        return NULL;
    }
    g_tiny_fast_refill_count++;
    // Register stats printer on first refill (once per thread)
    static __thread int stats_registered = 0;
    if (!stats_registered) {
        atexit(tiny_fast_print_stats);
        if (profile_enabled()) {
            atexit(tiny_fast_print_profile);
        }
        stats_registered = 1;
    }
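    // NOTE: atexit() handlers run at process exit in the exiting thread, so the
    // __thread counters printed there belong to that thread only (and each
    // per-thread registration adds another call to the same printer).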
    // ========================================================================
    // Phase 6-6: Batch Refill Optimization (Phase 3)
    // Inspired by mimalloc's page-based refill and glibc's tcache batch refill
    //
    // OLD: 16 individual allocations + 16 individual pushes (16 × 100 cycles = 1,600 cycles)
    // NEW: Batch allocate + link in one pass (~200 cycles, -87% cost)
    // ========================================================================
    // Get size from class mapping
    static const size_t class_sizes[] = {16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160, 176, 192, 256};
    size_t size = (class_idx < 16) ? class_sizes[class_idx] : 16;
    // Step 1: Batch allocate into temporary array
    void* batch[TINY_FAST_REFILL_BATCH];
    int count = 0;
    extern void* hak_tiny_alloc(size_t size);
    for (int i = 0; i < TINY_FAST_REFILL_BATCH; i++) {
        void* ptr = hak_tiny_alloc(size);
        if (!ptr) break; // OOM or allocation failed
        batch[count++] = ptr;
    }
    // Track refill results
    if (count == 0) {
        g_refill_fail_count++;
        return NULL; // Complete failure
    } else if (count < TINY_FAST_REFILL_BATCH) {
        g_refill_partial_count++;
    } else {
        g_refill_success_count++;
    }
    g_refill_total_blocks += count;
    // Step 2: Link all blocks into freelist in one pass (batch linking)
    // This is the key optimization: N individual pushes → 1 batch link
    for (int i = 0; i < count - 1; i++) {
        tiny_next_write(class_idx, batch[i], batch[i + 1]);
    }
    tiny_next_write(class_idx, batch[count - 1], NULL); // Terminate list
    // Step 3: Attach batch to cache head
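    // (Refill is entered only on a cache miss, so the per-class list is empty
    //  here and overwriting the head cannot drop blocks.)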
    g_tiny_fast_cache[class_idx] = batch[0];
    g_tiny_fast_count[class_idx] = count;
    // Step 4: Pop one for the caller
    void* result = g_tiny_fast_cache[class_idx];
    g_tiny_fast_cache[class_idx] = tiny_next_read(class_idx, result);
    g_tiny_fast_count[class_idx]--;
    // Profile: Record refill cycles
    if (start) {
        g_tiny_refill_cycles += (rdtsc() - start);
    }
    return result;
}
// ========== Slow Path: Drain to Magazine/SuperSlab ==========
void tiny_fast_drain(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_FAST_CLASS_COUNT) {
        return;
    }
    g_tiny_fast_drain_count++;
    // ========================================================================
    // Phase 6-7: Drain from free_head (Phase 2)
    // Frees go to free_head, so drain from there when capacity is exceeded
    // ========================================================================
    // Pop blocks from free_head until at most half the cache capacity remains.
    // NOTE: The popped blocks are currently dropped rather than returned to the
    // Magazine/SuperSlab freelist; a full implementation would hand each one
    // back via the slow-path free (e.g. hak_tiny_free_slow(ptr, class_idx)).
    uint32_t target = TINY_FAST_CACHE_CAP / 2;
    while (g_tiny_fast_free_count[class_idx] > target) {
        void* ptr = g_tiny_fast_free_head[class_idx];
        if (!ptr) break;
        g_tiny_fast_free_head[class_idx] = tiny_next_read(class_idx, ptr);
        g_tiny_fast_free_count[class_idx]--;
        // TODO: Return ptr to the Magazine/SuperSlab instead of dropping it.
    }
}
// ========== Debug Stats ==========
void tiny_fast_print_stats(void) {
    static const char* env = NULL;
    static int checked = 0;
    if (!checked) {
        env = getenv("HAKMEM_TINY_FAST_STATS");
        checked = 1;
    }
    if (env && *env && *env != '0') {
        fprintf(stderr, "[TINY_FAST] refills=%lu drains=%lu\n",
                (unsigned long)g_tiny_fast_refill_count,
                (unsigned long)g_tiny_fast_drain_count);
    }
}
// ========== RDTSC Cycle Profiling Output ==========
// External routing counters from hakmem.c
extern __thread uint64_t g_malloc_total_calls;
extern __thread uint64_t g_malloc_tiny_size_match;
extern __thread uint64_t g_malloc_fast_path_tried;
extern __thread uint64_t g_malloc_fast_path_null;
extern __thread uint64_t g_malloc_slow_path;
void tiny_fast_print_profile(void) {
#ifndef HAKMEM_FORCE_LIBC_ALLOC_BUILD
    if (!profile_enabled()) return;
    if (g_tiny_malloc_count == 0 && g_tiny_free_count == 0) return; // No data
    fprintf(stderr, "\n========== HAKMEM Tiny Fast Path Profile (RDTSC cycles) ==========\n");
    // Routing statistics first
    if (g_malloc_total_calls > 0) {
        fprintf(stderr, "\n[ROUTING]\n");
        fprintf(stderr, " Total malloc() calls: %lu\n", (unsigned long)g_malloc_total_calls);
        fprintf(stderr, " Size <= %d (tiny range): %lu (%.1f%%)\n",
                TINY_FAST_THRESHOLD,
                (unsigned long)g_malloc_tiny_size_match,
                100.0 * g_malloc_tiny_size_match / g_malloc_total_calls);
        fprintf(stderr, " Fast path tried: %lu (%.1f%%)\n",
                (unsigned long)g_malloc_fast_path_tried,
                100.0 * g_malloc_fast_path_tried / g_malloc_total_calls);
        fprintf(stderr, " Fast path returned NULL: %lu (%.1f%% of tried)\n",
                (unsigned long)g_malloc_fast_path_null,
                g_malloc_fast_path_tried > 0 ? 100.0 * g_malloc_fast_path_null / g_malloc_fast_path_tried : 0);
        fprintf(stderr, " Slow path entered: %lu (%.1f%%)\n\n",
                (unsigned long)g_malloc_slow_path,
                100.0 * g_malloc_slow_path / g_malloc_total_calls);
    }
    if (g_tiny_malloc_count > 0) {
        uint64_t avg_malloc = g_tiny_malloc_cycles / g_tiny_malloc_count;
        fprintf(stderr, "[MALLOC] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_malloc_count,
                (unsigned long)g_tiny_malloc_cycles,
                (unsigned long)avg_malloc);
    }
    if (g_tiny_free_count > 0) {
        uint64_t avg_free = g_tiny_free_cycles / g_tiny_free_count;
        fprintf(stderr, "[FREE] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_free_count,
                (unsigned long)g_tiny_free_cycles,
                (unsigned long)avg_free);
    }
    if (g_tiny_fast_refill_count > 0) {
        uint64_t avg_refill = g_tiny_refill_cycles / g_tiny_fast_refill_count;
        fprintf(stderr, "[REFILL] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_fast_refill_count,
                (unsigned long)g_tiny_refill_cycles,
                (unsigned long)avg_refill);
        // Refill success/failure breakdown
        fprintf(stderr, "[REFILL SUCCESS] count=%lu (%.1f%%) - full batch\n",
                (unsigned long)g_refill_success_count,
                100.0 * g_refill_success_count / g_tiny_fast_refill_count);
        fprintf(stderr, "[REFILL PARTIAL] count=%lu (%.1f%%) - some blocks\n",
                (unsigned long)g_refill_partial_count,
                100.0 * g_refill_partial_count / g_tiny_fast_refill_count);
        fprintf(stderr, "[REFILL FAIL] count=%lu (%.1f%%) - zero blocks\n",
                (unsigned long)g_refill_fail_count,
                100.0 * g_refill_fail_count / g_tiny_fast_refill_count);
        fprintf(stderr, "[REFILL AVG BLOCKS] %.1f per refill (target=%d)\n",
                (double)g_refill_total_blocks / g_tiny_fast_refill_count,
                TINY_FAST_REFILL_BATCH);
    }
    if (g_tiny_migration_count > 0) {
        uint64_t avg_migration = g_tiny_migration_cycles / g_tiny_migration_count;
        fprintf(stderr, "[MIGRATE] count=%lu, total_cycles=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_migration_count,
                (unsigned long)g_tiny_migration_cycles,
                (unsigned long)avg_migration);
    }
    fprintf(stderr, "===================================================================\n\n");
#endif // !HAKMEM_FORCE_LIBC_ALLOC_BUILD
}