hakmem/core/box/bench_fast_box.c
Commit cfa587c61d (Moe Charm (CI))
Phase 8-Step1-3: Unified Cache hot path optimization (config macro + prewarm + PGO init removal)
Goal: Reduce branches in Unified Cache hot paths (-2 branches per op)
Expected improvement: +2-3% in PGO mode

Changes:
1. Config Macro (Step 1):
   - Added TINY_FRONT_UNIFIED_CACHE_ENABLED macro to tiny_front_config_box.h
     (see the sketch after this list)
   - PGO mode: compile-time constant (1)
   - Normal mode: runtime call to unified_cache_enabled()
   - Replaced unified_cache_enabled() calls in 3 locations:
     * unified_cache_pop() line 142
     * unified_cache_push() line 182
     * unified_cache_pop_or_refill() line 228

2. Function Declaration Fix:
   - Changed unified_cache_enabled() from static inline to a non-static function
   - Implementation now lives in tiny_unified_cache.c (was static inline in the header)
   - Forward declaration added to tiny_front_config_box.h
   - Resolves the declaration conflict between the config box and the header

3. Prewarm (Step 2):
   - Added unified_cache_init() call to bench_fast_init()
   - Ensures cache is initialized before benchmark starts
   - Enables PGO builds to remove lazy init checks

4. Conditional Init Removal (Step 3):
   - Wrapped lazy init checks in #if !HAKMEM_TINY_FRONT_PGO
   - PGO builds assume prewarm → no init check needed (-1 branch)
   - Normal builds keep lazy init for safety
   - Applied to 3 functions: unified_cache_pop(), unified_cache_push(), unified_cache_pop_or_refill()
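
A minimal sketch of the pattern from steps 1 and 2 (illustrative only; the
exact header contents are an assumption, names follow this message):

    /* tiny_front_config_box.h */
    #if HAKMEM_TINY_FRONT_PGO
      /* PGO build: constant folds away the enabled-check branch */
      #define TINY_FRONT_UNIFIED_CACHE_ENABLED() 1
    #else
      /* Normal build: runtime check, implemented in tiny_unified_cache.c */
      int unified_cache_enabled(void);
      #define TINY_FRONT_UNIFIED_CACHE_ENABLED() unified_cache_enabled()
    #endif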

Performance Impact:
  PGO mode: -2 branches per operation (enabled check + init check)
  Normal mode: Same as before (runtime checks)

Branch Elimination (PGO):
  Before: if (!unified_cache_enabled()) + if (slots == NULL)
  After:  if (!1) [eliminated] + [init check removed]
  Result: -2 branches in alloc/free hot paths
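
In a PGO build the hot-path shape then becomes (a sketch; g_unified_cache and
its slots field are assumed names, not confirmed by this commit):

    static inline void* unified_cache_pop(int class_idx) {
        if (!TINY_FRONT_UNIFIED_CACHE_ENABLED())       /* folds to if (!1): dead code */
            return NULL;
    #if !HAKMEM_TINY_FRONT_PGO
        if (g_unified_cache[class_idx].slots == NULL)  /* lazy init, normal builds only */
            unified_cache_init();
    #endif
        /* ... pop from the per-class cache ... */
    }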

Files Modified:
  core/box/tiny_front_config_box.h        - Config macro + forward declaration
  core/front/tiny_unified_cache.h         - Config macro usage + PGO conditionals
  core/front/tiny_unified_cache.c         - unified_cache_enabled() implementation
  core/box/bench_fast_box.c               - Prewarm call in bench_fast_init()

Note: BenchFast mode has a pre-existing crash (not caused by these changes)

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 17:58:42 +09:00


// bench_fast_box.c - BenchFast Mode Implementation
// Purpose: Ultra-minimal Tiny alloc/free for structural ceiling measurement
// WARNING: Bypasses ALL safety mechanisms - benchmark only!
#include "bench_fast_box.h"
#include "../hakmem_tiny.h"
#include "../tiny_region_id.h"
#include "../box/tiny_next_ptr_box.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// External Tiny infrastructure (defined in hakmem_tiny.c)
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
extern int g_tls_sll_enable;
extern int hak_tiny_size_to_class(size_t size);
extern const size_t g_tiny_class_sizes[];

// Public API fallbacks (correct signatures from hakmem.h)
#include "../hakmem.h"

// Guard: Disable BenchFast during initialization to avoid recursion
// NOTE: Defined here and declared extern in bench_fast_box.h so that
// malloc/free wrappers can also see it and skip BenchFast during init.
__thread int bench_fast_init_in_progress = 0;

// BenchFast alloc - Minimal path (POP-ONLY, NO REFILL)
// Flow:
// 1. size → class_idx (inline table lookup)
// 2. TLS SLL pop (3-4 instructions)
// 3. Write header + return (2-3 instructions)
// NOTE: No refill! Pool must be preallocated via bench_fast_init()
void* bench_fast_alloc(size_t size) {
    // Guard: Avoid recursion during init phase
    if (__builtin_expect(bench_fast_init_in_progress, 0)) {
        // Initialization in progress - use normal allocator to avoid recursion
        return hak_alloc_at(size, "bench_fast_alloc_init");
    }

    // 1. Size → class_idx (inline, 1-2 instructions)
    int class_idx = hak_tiny_size_to_class(size);
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        fprintf(stderr, "[BENCH_FAST] Invalid size %zu (class %d out of range)\n",
                size, class_idx);
        return NULL; // Out of range
    }

    // 2. TLS SLL pop (3-4 instructions) - NO REFILL!
    void* base = NULL;
    void* head = g_tls_sll[class_idx].head;
    if (__builtin_expect(head != NULL, 1)) {
        // Read next pointer from header (header+1 = next ptr storage)
        void* next = tiny_next_read(class_idx, head);
        g_tls_sll[class_idx].head = next;
        g_tls_sll[class_idx].count--;
        base = head;
    }

    // 3. Pool exhausted - NO REFILL (benchmark failure)
    if (__builtin_expect(base == NULL, 0)) {
        fprintf(stderr, "[BENCH_FAST] Pool exhausted for C%d (size=%zu)\n",
                class_idx, size);
        fprintf(stderr, "[BENCH_FAST] Increase PREALLOC_COUNT or reduce iteration count\n");
        return NULL;
    }

    // 4. Write header + return USER pointer (2-3 instructions)
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
    tiny_region_id_write_header(base, class_idx); // Write 1-byte header (BASE first!)
    return (void*)((char*)base + 1);              // Return USER pointer
#else
    return base; // No header mode - return BASE directly
#endif
}

// BenchFast free - Minimal path (3-5 instructions)
// Flow:
// 1. Read header (1 instruction)
// 2. BASE pointer (ptr-1) (1 instruction)
// 3. TLS SLL push (2-3 instructions)
void bench_fast_free(void* ptr) {
    if (__builtin_expect(!ptr, 0)) return;

#ifdef HAKMEM_TINY_HEADER_CLASSIDX
    // 1. Read class_idx from header (1 instruction, 2-3 cycles)
    int class_idx = tiny_region_id_read_header(ptr);
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        // Invalid header - fallback to normal free
        hak_free_at(ptr, 0, "bench_fast_free");
        return;
    }

    // 2. Compute BASE pointer (1 instruction)
    void* base = (void*)((char*)ptr - 1);

    // 3. TLS SLL push (2-3 instructions) - ALWAYS push if class_idx valid
    // Fast path: Direct inline push (no Box API overhead, no capacity check)
    tiny_next_write(class_idx, base, g_tls_sll[class_idx].head);
    g_tls_sll[class_idx].head = base;
    g_tls_sll[class_idx].count++;
#else
    // Fallback to normal free (no header mode)
    hak_free_at(ptr, 0, "bench_fast_free");
#endif
}

// BenchFast init - Preallocate pool to avoid recursion
// Strategy:
// 1. Called BEFORE benchmark (normal allocator OK)
// 2. Allocates 50,000 blocks per class (C2-C7)
// 3. Frees them to populate TLS SLL
// 4. BenchFast mode just pops from pre-filled pool (no refill)
// Returns: Total blocks preallocated, or 0 if disabled
int bench_fast_init(void) {
    if (!bench_fast_enabled()) {
        fprintf(stderr, "[BENCH_FAST] HAKMEM_BENCH_FAST_MODE not set, skipping init\n");
        return 0;
    }

    // Set guard to prevent recursion during initialization
    bench_fast_init_in_progress = 1;

    // Phase 8-Step2: Prewarm Unified Cache (initialize before benchmark)
    // This enables PGO builds to remove lazy init checks in hot paths
#ifdef USE_HAKMEM
    extern void unified_cache_init(void);
    unified_cache_init();
    fprintf(stderr, "[BENCH_FAST] Unified Cache prewarmed\n");
#endif

    fprintf(stderr, "[BENCH_FAST] Starting preallocation...\n");
    int total = 0;
    const int PREALLOC_COUNT = 50000; // Per class (300,000 total for C2-C7)

    // Preallocate C2-C7 (32B-1024B, skip C0/C1 - too small, rarely used)
    for (int cls = 2; cls <= 7; cls++) {
        fprintf(stderr, "[BENCH_FAST] Preallocating C%d (%zu bytes): %d blocks...\n",
                cls, g_tiny_class_sizes[cls], PREALLOC_COUNT);
        for (int i = 0; i < PREALLOC_COUNT; i++) {
            // Use normal allocator (hak_alloc_at) - recursion safe here
            size_t size = g_tiny_class_sizes[cls];
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
            // Adjust for header: if class size is N, we need N-1 bytes of user data
            size = size - 1;
#endif
            void* ptr = hak_alloc_at(size, "bench_fast_init");
            if (!ptr) {
                fprintf(stderr, "[BENCH_FAST] Failed to preallocate C%d at %d/%d\n",
                        cls, i, PREALLOC_COUNT);
                fprintf(stderr, "[BENCH_FAST] Total preallocated: %d blocks\n", total);
                bench_fast_init_in_progress = 0; // Clear guard on the failure path too
                return total;
            }
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
            // Convert USER → BASE pointer
            void* base = (void*)((char*)ptr - 1);
            // Read and verify class from header
            int header_cls = tiny_region_id_read_header(ptr);
            if (header_cls != cls) {
                fprintf(stderr, "[BENCH_FAST] Header mismatch: expected C%d, got C%d\n",
                        cls, header_cls);
                // Free normally and continue
                hak_free_at(ptr, size, "bench_fast_init_mismatch");
                continue;
            }
            // Push directly to TLS SLL (bypass drain logic)
            // This ensures blocks stay in TLS pool for BenchFast mode
            tiny_next_write(cls, base, g_tls_sll[cls].head);
            g_tls_sll[cls].head = base;
            g_tls_sll[cls].count++;
#else
            // No header mode - use normal free
            free(ptr);
#endif
            total++;

            // Progress indicator every 10,000 blocks
            if ((i + 1) % 10000 == 0) {
                fprintf(stderr, "[BENCH_FAST] C%d: %d/%d blocks...\n",
                        cls, i + 1, PREALLOC_COUNT);
            }
        }
        fprintf(stderr, "[BENCH_FAST] C%d complete: %u blocks in TLS SLL\n",
                cls, g_tls_sll[cls].count);
    }

    fprintf(stderr, "[BENCH_FAST] Prealloc complete: %d total blocks\n", total);
    fprintf(stderr, "[BENCH_FAST] TLS SLL counts:\n");
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        if (g_tls_sll[cls].count > 0) {
            fprintf(stderr, "[BENCH_FAST] C%d: %u blocks\n", cls, g_tls_sll[cls].count);
        }
    }

    // Clear guard - initialization complete, BenchFast mode can now be used
    bench_fast_init_in_progress = 0;
    return total;
}

// BenchFast stats - Print remaining blocks per class
// Use after benchmark to verify pool wasn't exhausted
void bench_fast_stats(void) {
    if (!bench_fast_enabled()) {
        return;
    }
    fprintf(stderr, "[BENCH_FAST] Final TLS SLL counts:\n");
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        if (g_tls_sll[cls].count > 0) {
            fprintf(stderr, "[BENCH_FAST] C%d: %u blocks remaining\n",
                    cls, g_tls_sll[cls].count);
        }
    }
}
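
// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original file): how a benchmark
// harness is expected to drive the BenchFast API, following the comments
// above. The guard macro, loop shape, and iteration count are assumptions
// added for illustration only.
#ifdef BENCH_FAST_USAGE_SKETCH
int main(void) {
    // Preallocate the per-class pools before timing starts (returns 0 when
    // HAKMEM_BENCH_FAST_MODE is not set or preallocation produced nothing).
    if (bench_fast_init() == 0) return 1;

    for (int i = 0; i < 1000000; i++) {
        void* p = bench_fast_alloc(64);  // pop from the prewarmed TLS SLL
        if (!p) break;                   // pool exhausted: measurement invalid
        bench_fast_free(p);              // push back, keeping the pool full
    }

    bench_fast_stats();                  // verify the pool was not exhausted
    return 0;
}
#endif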