// hakmem/core/box/tiny_front_cold_box.h

// tiny_front_cold_box.h - Phase 4-Step2: Tiny Front Cold Path Box
// Purpose: Slow path allocation (refill, diagnostics, error handling)
// Contract: Called on cache miss, handles SuperSlab refill + diagnostics
// Performance: Optimized for correctness, not speed (noinline, cold)
//
// Design Principles (Box Pattern):
// 1. Single Responsibility: Cold path ONLY (refill, errors, diagnostics)
// 2. Clear Contract: Returns USER pointer or NULL, handles all edge cases
// 3. Observable: Debug logging, error reporting, telemetry
// 4. Safe: Full error checking, defensive programming
// 5. Testable: Isolated from hot path, easy to test edge cases
//
// Performance Impact:
//   - noinline: Keeps hot path small (better i-cache locality)
//   - cold attribute: Hints compiler to optimize for size, not speed
//   - Infrequent execution: Called only on cache miss (~1-5% of allocations)

#ifndef TINY_FRONT_COLD_BOX_H
#define TINY_FRONT_COLD_BOX_H

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include "../hakmem_build_flags.h"
#include "../hakmem_tiny_config.h"
#include "../tiny_region_id.h"
#include "../front/tiny_unified_cache.h"  // For TinyUnifiedCache, unified_cache_refill
#include "tiny_layout_box.h"  // For tiny_user_offset()

// ============================================================================
// Box 3: Tiny Cold Refill + Alloc
// ============================================================================

// tiny_cold_refill_and_alloc - slow-path allocation taken after a cache miss
//
// Asks unified_cache_refill() to refill the cache for `class_idx` and hands
// the first block it returns back to the caller as a USER pointer.
//
// CONTRACT:
//   class_idx     Tiny size class (0-7). The caller must have validated it:
//                 debug builds use it unchecked as an array index.
//   Returns       USER pointer on success, NULL if the refill produced a
//                 null BASE pointer (caller falls back to the normal path).
//   Precondition  The hot path detected a cache miss.
//   Postcondition Cache refilled (if possible), one object handed out.
//
// DESIGN:
//   - noinline + cold keep this code out of the hot path (smaller hot path,
//     better i-cache behaviour).
//   - Written for correctness rather than speed; per the design notes it is
//     expected to run on only ~1-5% of allocations, with the refill cost
//     amortized over the batch (e.g., 64 objects).
//
// DIAGNOSTICS:
//   - Debug builds (!HAKMEM_BUILD_RELEASE) log the first 10 refill failures
//     per class, per thread, to stderr. Release builds return NULL silently.
//
__attribute__((noinline, cold, unused))
static void* tiny_cold_refill_and_alloc(int class_idx) {
    // Batch refill; the result is the first BASE block, still wrapped.
    hak_base_ptr_t refilled = unified_cache_refill(class_idx);

    if (hak_base_is_null(refilled)) {
        // Nothing came back (SuperSlab allocation error, or cache disabled).
        #if !HAKMEM_BUILD_RELEASE
        // Per-thread, per-class log budget so a persistent failure cannot
        // flood stderr.
        static __thread uint64_t refill_fail_logged[TINY_NUM_CLASSES] = {0};
        uint64_t* logged = &refill_fail_logged[class_idx];
        if (*logged < 10) {
            fprintf(stderr, "[COLD_BOX] Refill failed: class_idx=%d\n", class_idx);
            fflush(stderr);
            *logged += 1;
        }
        #endif
        return NULL;
    }

    // Success. The block header was already written by unified_cache_refill(),
    // so only the BASE -> USER pointer conversion remains.
    #if HAKMEM_TINY_HEADER_CLASSIDX
    // Offset comes from the centralized layout API.
    const size_t header_skip = tiny_user_offset(class_idx);
    char* block_start = (char*)HAK_BASE_TO_RAW(refilled);
    return block_start + header_skip;  // USER pointer
    #else
    return HAK_BASE_TO_RAW(refilled);
    #endif
}

// ============================================================================
// Box 3b: Tiny Cold Drain + Free
// ============================================================================

// tiny_cold_drain_and_free - slow-path free taken when the cache is full
//
// Intended to drain the unified cache back to the SuperSlab in a batch and
// then free `base`. The batch drain is NOT implemented yet (see TODO below):
// the function currently rejects every free by returning 0, and the caller
// is expected to fall back to the normal free path.
//
// CONTRACT:
//   class_idx     Tiny size class (0-7). Used unchecked as an array index in
//                 debug builds.
//   base          BASE pointer (not USER) of the object being freed.
//                 Currently unused.
//   Returns       1 = success, 0 = failure. Always 0 for now.
//   Precondition  The hot path detected a full cache.
//
// DESIGN:
//   - noinline + cold keep this code out of the hot path.
//   - Planned: drain several objects per call to amortize the cost
//     (e.g., 32 objects); per the design notes this path should be hit on
//     only ~1-5% of frees.
//
// DIAGNOSTICS:
//   - Debug builds (!HAKMEM_BUILD_RELEASE) log the first 10 calls per class,
//     per thread, to stderr together with the cache's tail/head values.
//
__attribute__((noinline, cold, unused))
static int tiny_cold_drain_and_free(int class_idx, void* base) {
    // TODO: Implement batch drain logic

    (void)base;  // Not consumed until the drain logic exists.

    #if !HAKMEM_BUILD_RELEASE
    {
        // The cache is only inspected for the diagnostic message, so the
        // declaration and lookup live entirely inside the debug-only scope.
        extern __thread TinyUnifiedCache g_unified_cache[];
        static __thread uint64_t drain_logged[TINY_NUM_CLASSES] = {0};

        if (drain_logged[class_idx] < 10) {
            TinyUnifiedCache* full_cache = &g_unified_cache[class_idx];
            fprintf(stderr, "[COLD_BOX] Cache full, drain needed: class_idx=%d tail=%u head=%u\n",
                    class_idx, full_cache->tail, full_cache->head);
            fflush(stderr);
            drain_logged[class_idx]++;
        }
    }
    #endif

    // Reject the free; the caller handles it via the normal free path.
    return 0;
}

// ============================================================================
// Box 3c: Tiny Cold Error Reporting
// ============================================================================

// tiny_cold_report_error - emit a one-line error diagnostic (debug builds only)
//
// CONTRACT:
//   class_idx     Size class associated with the error; printed verbatim.
//   reason        NUL-terminated description; printed with %s, so it must
//                 not be NULL.
//   Returns       Nothing. Debug builds write the message to stderr and flush.
//   Precondition  An error was detected in the hot or cold path.
//   Postcondition Error logged in debug builds. In release builds
//                 (HAKMEM_BUILD_RELEASE) the body is empty and both
//                 arguments are ignored.
//
__attribute__((noinline, cold, unused))
static void tiny_cold_report_error(int class_idx, const char* reason) {
    #if HAKMEM_BUILD_RELEASE
    // Release: no logging; silence unused-parameter warnings.
    (void)class_idx;
    (void)reason;
    #else
    fprintf(stderr, "[COLD_BOX_ERROR] class_idx=%d reason=%s\n", class_idx, reason);
    fflush(stderr);
    #endif
}

// ============================================================================
// Performance Notes
// ============================================================================

// Cold path optimizations:
// 1. noinline: Reduces hot path code size → better i-cache
// 2. cold attribute: Compiler optimizes for size, not speed
// 3. Batch operations: Refill/drain multiple objects (amortize cost)
// 4. Defensive code: Full error checking (correctness > speed)
//
// Expected call frequency:
// - Refill: ~1-5% of allocations (depends on cache size)
// - Drain: ~1-5% of frees (depends on allocation pattern)
// - Error: <0.01% (only on actual errors)
//
// Impact on hot path:
// - Hot path stays small (~10-20 instructions)
// - Better i-cache locality (hot path doesn't include cold code)
// - CPU branch predictor learns hot path quickly

#endif // TINY_FRONT_COLD_BOX_H