// bench_fast_box.c - BenchFast Mode Implementation // Purpose: Ultra-minimal Tiny alloc/free for structural ceiling measurement // WARNING: Bypasses ALL safety mechanisms - benchmark only! #include "bench_fast_box.h" #include "../hakmem_tiny.h" #include "../tiny_region_id.h" #include "../box/tiny_next_ptr_box.h" #include #include #include #include // External Tiny infrastructure (defined in hakmem_tiny.c) extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES]; extern int g_tls_sll_enable; extern int hak_tiny_size_to_class(size_t size); extern const size_t g_tiny_class_sizes[]; // Public API fallbacks (correct signatures from hakmem.h) #include "../hakmem.h" // Guard: Disable BenchFast during initialization to avoid recursion // Phase 8-TLS-Fix: Changed from __thread to atomic_int // Root Cause: pthread_once() creates new threads with fresh TLS (= 0), // breaking the guard. Atomic variable works across ALL threads. // Box Contract: Guard must protect entire process, not just calling thread. atomic_int g_bench_fast_init_in_progress = 0; // BenchFast alloc - Minimal path (POP-ONLY, NO REFILL) // Flow: // 1. size → class_idx (inline table lookup) // 2. TLS SLL pop (3-4 instructions) // 3. Write header + return (2-3 instructions) // NOTE: No refill! Pool must be preallocated via bench_fast_init() void* bench_fast_alloc(size_t size) { // Guard: Avoid recursion during init phase (atomic for cross-thread safety) if (__builtin_expect(atomic_load(&g_bench_fast_init_in_progress), 0)) { // Initialization in progress - use normal allocator to avoid recursion return hak_alloc_at(size, "bench_fast_alloc_init"); } // 1. Size → class_idx (inline, 1-2 instructions) int class_idx = hak_tiny_size_to_class(size); if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) { fprintf(stderr, "[BENCH_FAST] Invalid size %zu (class %d out of range)\n", size, class_idx); return NULL; // Out of range } // 2. TLS SLL pop (3-4 instructions) - NO REFILL! void* base = NULL; void* head = g_tls_sll[class_idx].head; if (__builtin_expect(head != NULL, 1)) { // Read next pointer from header (header+1 = next ptr storage) void* next = tiny_next_read(class_idx, head); g_tls_sll[class_idx].head = next; g_tls_sll[class_idx].count--; base = head; } // 3. Pool exhausted - NO REFILL (benchmark failure) if (__builtin_expect(base == NULL, 0)) { fprintf(stderr, "[BENCH_FAST] Pool exhausted for C%d (size=%zu)\n", class_idx, size); fprintf(stderr, "[BENCH_FAST] Increase PREALLOC_COUNT or reduce iteration count\n"); return NULL; } // 4. Write header + return USER pointer (2-3 instructions) // Phase 8-P3-Fix: Write header DIRECTLY (bypass tiny_region_id_write_header) // Reason: P3 optimization skips header writes by default (class_map mode) // But BenchFast REQUIRES headers for free routing (0xa0-0xa7 magic) // Contract: BenchFast always writes headers, regardless of P3 optimization #ifdef HAKMEM_TINY_HEADER_CLASSIDX *(uint8_t*)base = (uint8_t)(0xa0 | (class_idx & 0x0f)); // Direct header write return (void*)((char*)base + 1); // Return USER pointer #else return base; // No header mode - return BASE directly #endif } // BenchFast free - Minimal path (3-5 instructions) // Flow: // 1. Read header (1 instruction) // 2. BASE pointer (ptr-1) (1 instruction) // 3. TLS SLL push (2-3 instructions) void bench_fast_free(void* ptr) { if (__builtin_expect(!ptr, 0)) return; #ifdef HAKMEM_TINY_HEADER_CLASSIDX // 1. Read class_idx from header (1 instruction, 2-3 cycles) int class_idx = tiny_region_id_read_header(ptr); if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) { // Invalid header - fallback to normal free hak_free_at(ptr, 0, "bench_fast_free"); return; } // 2. Compute BASE pointer (1 instruction) void* base = (void*)((char*)ptr - 1); // 3. TLS SLL push (2-3 instructions) - ALWAYS push if class_idx valid // Fast path: Direct inline push (no Box API overhead, no capacity check) tiny_next_write(class_idx, base, g_tls_sll[class_idx].head); g_tls_sll[class_idx].head = base; g_tls_sll[class_idx].count++; #else // Fallback to normal free (no header mode) hak_free_at(ptr, 0, "bench_fast_free"); #endif } // BenchFast init - Preallocate pool to avoid recursion // Strategy: // 1. Called BEFORE benchmark (normal allocator OK) // 2. Allocates 50,000 blocks per class (C2-C7) // 3. Frees them to populate TLS SLL // 4. BenchFast mode just pops from pre-filled pool (no refill) // Returns: Total blocks preallocated, or 0 if disabled int bench_fast_init(void) { if (!bench_fast_enabled()) { fprintf(stderr, "[BENCH_FAST] HAKMEM_BENCH_FAST_MODE not set, skipping init\n"); return 0; } // Set guard to prevent recursion during initialization (atomic for cross-thread safety) atomic_store(&g_bench_fast_init_in_progress, 1); // Phase 8 Root Cause Fix: REMOVED unified_cache_init() call // Reason: BenchFast uses TLS SLL directly, NOT Unified Cache // The prewarm was a design misunderstanding - BenchFast has its own allocation strategy // Calling unified_cache_init() created 16KB mmap allocations that crashed when freed // in BenchFast mode (header misclassification bug) fprintf(stderr, "[BENCH_FAST] Starting preallocation...\n"); // Layer 0 Root Cause Fix: Limit prealloc to actual TLS SLL capacity // Problem: Old code preallocated 50,000 blocks/class, but TLS SLL capacity is 128 (adaptive sizing) // The "lost" blocks (beyond capacity) caused heap corruption // Analysis: sll_cap_for_class() returns "desired" capacity (2048), but adaptive sizing // limits actual capacity to 128 at runtime. We must use the actual limit. // Solution: Hard-code to 128 blocks/class (observed actual capacity from runtime output) extern const size_t g_tiny_class_sizes[]; int total = 0; const uint32_t ACTUAL_TLS_SLL_CAPACITY = 128; // Observed actual capacity (adaptive sizing limit) // Preallocate C2-C7 (32B-1024B, skip C0/C1 - too small, rarely used) for (int cls = 2; cls <= 7; cls++) { uint32_t capacity = ACTUAL_TLS_SLL_CAPACITY; fprintf(stderr, "[BENCH_FAST] Preallocating C%d (%zu bytes): %u blocks (actual TLS SLL capacity)...\n", cls, g_tiny_class_sizes[cls], capacity); for (int i = 0; i < (int)capacity; i++) { // Use normal allocator (hak_alloc_at) - recursion safe here size_t size = g_tiny_class_sizes[cls]; #ifdef HAKMEM_TINY_HEADER_CLASSIDX // Adjust for header: if class size is N, we need N-1 bytes of user data size = size - 1; #endif void* ptr = hak_alloc_at(size, "bench_fast_init"); if (!ptr) { fprintf(stderr, "[BENCH_FAST] Failed to preallocate C%d at %d/%u\n", cls, i, capacity); fprintf(stderr, "[BENCH_FAST] Total preallocated: %d blocks\n", total); return total; } #ifdef HAKMEM_TINY_HEADER_CLASSIDX // Convert USER → BASE pointer void* base = (void*)((char*)ptr - 1); // Read and verify class from header int header_cls = tiny_region_id_read_header(ptr); if (header_cls != cls) { fprintf(stderr, "[BENCH_FAST] Header mismatch: expected C%d, got C%d\n", cls, header_cls); // Free normally and continue hak_free_at(ptr, size, "bench_fast_init_mismatch"); continue; } // Push directly to TLS SLL (bypass drain logic) // This ensures blocks stay in TLS pool for BenchFast mode tiny_next_write(cls, base, g_tls_sll[cls].head); g_tls_sll[cls].head = base; g_tls_sll[cls].count++; #else // No header mode - use normal free free(ptr); #endif total++; // Progress indicator (only for large capacities) if (capacity >= 500 && (i + 1) % 500 == 0) { fprintf(stderr, "[BENCH_FAST] C%d: %d/%u blocks...\n", cls, i + 1, capacity); } } fprintf(stderr, "[BENCH_FAST] C%d complete: %u blocks in TLS SLL\n", cls, g_tls_sll[cls].count); } fprintf(stderr, "[BENCH_FAST] Prealloc complete: %d total blocks\n", total); fprintf(stderr, "[BENCH_FAST] TLS SLL counts:\n"); for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) { if (g_tls_sll[cls].count > 0) { fprintf(stderr, "[BENCH_FAST] C%d: %u blocks\n", cls, g_tls_sll[cls].count); } } // Clear guard - initialization complete, BenchFast mode can now be used atomic_store(&g_bench_fast_init_in_progress, 0); return total; } // BenchFast stats - Print remaining blocks per class // Use after benchmark to verify pool wasn't exhausted void bench_fast_stats(void) { if (!bench_fast_enabled()) { return; } fprintf(stderr, "[BENCH_FAST] Final TLS SLL counts:\n"); for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) { if (g_tls_sll[cls].count > 0) { fprintf(stderr, "[BENCH_FAST] C%d: %u blocks remaining\n", cls, g_tls_sll[cls].count); } } }