// ============================================================================
// Box TLS-SLL API
// ============================================================================
#include "box/tls_sll_box.h"

// ============================================================================
// Step 3: Cold-path outline - Wrapper Context Handler
// ============================================================================
// Purpose: Handle allocations during wrapper calls (rare execution)
// Rationale: Avoid re-entrancy hazards with pthread locks during wrapper calls
// Step 3d: Force inline for readability without performance loss
// hak_tiny_alloc_wrapper: allocate one block of size class `class_idx` for a
// caller that is currently inside a wrapper call.
//
// Tiers are tried in order; the first hit is handed to HAK_RET_ALLOC (a macro
// defined elsewhere that is expected to return from this function):
//   1. the per-class magazine g_tls_mags[class_idx];
//   2. the active slabs (slot A, then slot B), after draining remote frees on
//      the chosen slab;
//   3. only when g_wrap_tiny_refill is set: a small refill (at most 16 blocks)
//      from g_tiny_pool.free_slabs[class_idx], guarded by
//      pthread_mutex_trylock so this path never waits on the class lock.
//
// Returns NULL when every tier misses, so the caller can fall back to the
// next allocator tier.
__attribute__((always_inline))
static inline void* hak_tiny_alloc_wrapper(int class_idx) {
    ROUTE_BEGIN(class_idx);

    // Tier 1: magazine pop (no locks, no refill).
    tiny_small_mags_init_once();
    if (__builtin_expect(class_idx > 3, 0)) tiny_mag_init_if_needed(class_idx);
    TinyTLSMag* mag = &g_tls_mags[class_idx];
    if (mag->top > 0) {
        void* p = mag->items[--mag->top].ptr;
        HAK_RET_ALLOC(class_idx, p);
    }

    // Tier 2: TLS active slabs (owner-only, lock-free). Prefer slot A and use
    // slot B only when A is missing or has no free blocks.
    TinySlab* tls = g_tls_active_slab_a[class_idx];
    if (!(tls && tls->free_count > 0)) tls = g_tls_active_slab_b[class_idx];
    if (tls && tls->free_count > 0) {
        tiny_remote_drain_owner(tls);
        // Re-check after the drain before searching for a free block.
        if (tls->free_count > 0) {
            int block_idx = hak_tiny_find_free_block(tls);
            if (block_idx >= 0) {
                hak_tiny_set_used(tls, block_idx);
                tls->free_count--;
                size_t bs = g_tiny_class_sizes[class_idx];
                void* p = (char*)tls->base + (block_idx * bs);
                HAK_RET_ALLOC(class_idx, p);
            }
        }
    }

    // Tier 3 (optional): attempt a limited refill under trylock (no remote
    // drain). If the lock is contended we simply report a miss.
    if (g_wrap_tiny_refill) {
        pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
        if (pthread_mutex_trylock(lock) == 0) {
            const int refill_max = 16;  // wrapper refill is small and quick
            void* ret = NULL;
            int stashed = 0;            // blocks this call pushed into the magazine

            TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
            if (slab && slab->free_count > 0) {
                int room = mag->cap - mag->top;
                if (room > refill_max) room = refill_max;
                if (room > slab->free_count) room = slab->free_count;
                if (room > 0) {
                    size_t bs = g_tiny_class_sizes[class_idx];
                    for (int i = 0; i < room; i++) {
                        int idx = hak_tiny_find_free_block(slab);
                        if (idx < 0) break;
                        hak_tiny_set_used(slab, idx);
                        slab->free_count--;
                        void* p = (char*)slab->base + (idx * bs);
                        if (i < room - 1) {
                            mag->items[mag->top].ptr = p;
                            mag->top++;
                            stashed++;
                        } else {
                            ret = p;  // return the last one directly
                        }
                    }
                    if (slab->free_count == 0) {
                        move_to_full_list(class_idx, slab);
                    }
                }
            }
            // Single unlock point: every path that took the lock ends here.
            pthread_mutex_unlock(lock);

            // The loop can stop early (no free block found) after it has
            // already stashed blocks in the magazine but before it reached the
            // iteration that sets `ret`. Serve the request from those blocks
            // instead of reporting a miss.
            if (ret == NULL && stashed > 0) {
                ret = mag->items[--mag->top].ptr;
            }
            if (ret) { HAK_RET_ALLOC(class_idx, ret); }
        }
    }
    return NULL;  // empty → fallback to next allocator tier
}


// hak_tiny_alloc: allocate `size` bytes from the Tiny allocator.
//
// The request is mapped to a size class and then offered to a sequence of
// front-end tiers (several of them selected at compile time); the first tier
// that yields a block hands it to HAK_RET_ALLOC (a macro defined elsewhere,
// expected to return from this function). If every front tier misses, the
// request goes to hak_tiny_alloc_slow().
//
// Returns NULL when:
//   - (non-release builds only) the call comes from inside a wrapper and
//     g_wrap_tiny_enabled is not set,
//   - hak_tiny_size_to_class() reports no class for `size` (negative index),
//   - the slow path also fails.
//
// When HAKMEM_TINY_PHASE6_ULTRA_SIMPLE or HAKMEM_TINY_PHASE6_METADATA is
// defined, the function returns from the corresponding Phase 6 allocator and
// nothing below that point is executed.
void* hak_tiny_alloc(size_t size) {
    // Lazy one-time initialisation. Both build flavours do the same thing; the
    // release variant only adds a branch-prediction hint.
#if !HAKMEM_BUILD_RELEASE
    if (!g_tiny_initialized) hak_tiny_init();
#else
    if (__builtin_expect(!g_tiny_initialized, 0)) {
        hak_tiny_init();
    }
#endif
    // Default (safe): avoid using Tiny during wrapper calls (TLS guard or function).
    // If HAKMEM_WRAP_TINY=1, allow Tiny even when called from wrapper.
    // NOTE(review): this guard is compiled only when HAKMEM_BUILD_RELEASE is 0;
    // release builds perform no wrapper check here - confirm that is intended.
#if !HAKMEM_BUILD_RELEASE
# if HAKMEM_WRAPPER_TLS_GUARD
    // Wrapper state read from the g_tls_in_wrapper flag.
    if (!g_wrap_tiny_enabled && __builtin_expect(g_tls_in_wrapper != 0, 0)) {
        // Log at most twice (plain static counter, not synchronised).
        static int log1 = 0;
        if (log1 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: in_wrapper\n"); log1++; }
        return NULL;
    }
# else
    // Wrapper state queried through hak_in_wrapper().
    extern int hak_in_wrapper(void);
    if (!g_wrap_tiny_enabled && __builtin_expect(hak_in_wrapper() != 0, 0)) {
        // Log at most twice (plain static counter, not synchronised).
        static int log2 = 0;
        if (log2 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: hak_in_wrapper\n"); log2++; }
        return NULL;
    }
# endif
#endif

    // ========================================================================
    // Cooperative stats polling (SIGUSR1 trigger safe point)
    hak_tiny_stats_poll();

    // ========================================================================
    // Phase 6-1.5: Ultra-Simple Fast Path (when enabled)
    // ========================================================================
    // Design: "Simple Front + Smart Back" - inspired by Mid-Large HAKX +171%
    // - 3-4 instruction fast path (Phase 6-1 style)
    // - Existing SuperSlab + ACE + Learning backend
    // Two variants:
    //   Phase 6-1.5: -DHAKMEM_TINY_PHASE6_ULTRA_SIMPLE=1 (alignment guessing)
    //   Phase 6-1.6: -DHAKMEM_TINY_PHASE6_METADATA=1 (metadata header)
    // Either variant returns unconditionally, so in those builds the rest of
    // this function is unreachable.
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
    return hak_tiny_alloc_ultra_simple(size);
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
    return hak_tiny_alloc_metadata(size);
#endif
    // ========================================================================

    // 1. Size → class index
    int class_idx = hak_tiny_size_to_class(size);
    if (class_idx < 0) {
        // Log at most twice (plain static counter, not synchronised).
        static int log3 = 0;
        if (log3 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: class_idx < 0 for size %zu\n", size); log3++; }
        return NULL;  // >1KB
    }
    // Route fingerprint begin (debug-only; no-op unless HAKMEM_ROUTE=1)
    ROUTE_BEGIN(class_idx);
    // Optional ALLOC_ENTER ring event. The HAKMEM_TINY_ALLOC_RING environment
    // variable is read once (on the first call that sees the -1 sentinel) and
    // enables the event when it is set, non-empty and does not start with '0'.
    do {
        static int g_alloc_ring = -1;
        if (__builtin_expect(g_alloc_ring == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_ALLOC_RING");
            g_alloc_ring = (e && *e && *e != '0') ? 1 : 0;
        }
        if (g_alloc_ring) {
            tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, (uint16_t)class_idx, (void*)(uintptr_t)size, 0);
        }
    } while (0);

#if HAKMEM_TINY_MINIMAL_FRONT
    // Minimal Front for hot tiny classes (bench-focused):
    // SLL direct pop → minimal refill → pop, bypassing other layers.
    if (__builtin_expect(class_idx <= 3, 1)) {
        void* head = NULL;
        if (tls_sll_pop(class_idx, &head)) {
            HAK_RET_ALLOC(class_idx, head);
        }
        // Refill a small batch directly from TLS-cached SuperSlab
        // (refill result is deliberately ignored; the pop below decides).
#if HAKMEM_TINY_P0_BATCH_REFILL
        (void)sll_refill_batch_from_ss(class_idx, 32);
#else
        (void)sll_refill_small_from_ss(class_idx, 32);
#endif
        if (tls_sll_pop(class_idx, &head)) {
            HAK_RET_ALLOC(class_idx, head);
        }
        // Fall through to slow path if still empty
    }
#endif

    // Ultra-Front: minimal per-class stack for hot tiny classes (opt-in)
    // Try ultra_pop → (optional) ultra_refill_small → ultra_pop before other layers
    if (__builtin_expect(g_ultra_simple && class_idx <= 3, 0)) {
        void* up = ultra_pop(class_idx);
        if (__builtin_expect(up == NULL, 0)) {
            (void)ultra_refill_small(class_idx);
            up = ultra_pop(class_idx);
        }
        if (__builtin_expect(up != NULL, 0)) {
            // NOTE(review): the last argument of the ALLOC_SUCCESS records in
            // this function (0xF0, 0..6) differs per tier and looks like a tag
            // for the tier that served the request - confirm against the ring
            // decoder.
            tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, up, 0xF0);
            HAK_RET_ALLOC(class_idx, up);
        }
    }

    // Front tiers below are skipped entirely when g_debug_fast0 is set; in that
    // case only a FRONT_BYPASS event is recorded before the slow path.
    if (__builtin_expect(!g_debug_fast0, 1)) {
#ifdef HAKMEM_TINY_BENCH_FASTPATH
        // Bench fast path: SLL pop → (optional) magazine pop → SLL refill + pop.
        if (__builtin_expect(class_idx <= HAKMEM_TINY_BENCH_TINY_CLASSES, 1)) {
            if (__builtin_expect(class_idx <= 3, 1)) {
                // One-time warm-up per class, tracked by g_tls_bench_warm_done:
                // pre-fill the SLL with the class-specific warm-up count.
                unsigned char* done = &g_tls_bench_warm_done[class_idx];
                if (__builtin_expect(*done == 0, 0)) {
                    int warm = (class_idx == 0) ? HAKMEM_TINY_BENCH_WARMUP8 :
                              (class_idx == 1) ? HAKMEM_TINY_BENCH_WARMUP16 :
                              (class_idx == 2) ? HAKMEM_TINY_BENCH_WARMUP32 :
                                                 HAKMEM_TINY_BENCH_WARMUP64;
#if HAKMEM_TINY_P0_BATCH_REFILL
                    if (warm > 0) (void)sll_refill_batch_from_ss(class_idx, warm);
#else
                    if (warm > 0) (void)sll_refill_small_from_ss(class_idx, warm);
#endif
                    *done = 1;
                }
            }
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
            // Magazines are only initialised when the magazine tier is compiled in.
            tiny_small_mags_init_once();
            if (class_idx > 3) tiny_mag_init_if_needed(class_idx);
#endif
            void* head = NULL;
            if (tls_sll_pop(class_idx, &head)) {
                tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 0);
                HAK_RET_ALLOC(class_idx, head);
            }
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
            // SLL miss: pop from the per-class magazine if it holds anything.
            TinyTLSMag* mag = &g_tls_mags[class_idx];
            int t = mag->top;
            if (__builtin_expect(t > 0, 1)) {
                void* p = mag->items[--t].ptr;
                mag->top = t;
                tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, p, 1);
                HAK_RET_ALLOC(class_idx, p);
            }
#endif
            // Class-specific refill count; every class other than 0, 1 and 2
            // uses the REFILL64 value.
            int bench_refill = (class_idx == 0) ? HAKMEM_TINY_BENCH_REFILL8 :
                               (class_idx == 1) ? HAKMEM_TINY_BENCH_REFILL16 :
                               (class_idx == 2) ? HAKMEM_TINY_BENCH_REFILL32 :
                                                  HAKMEM_TINY_BENCH_REFILL64;
            // The two preprocessor branches select the refill function only;
            // they share the single `if` body that follows.
#if HAKMEM_TINY_P0_BATCH_REFILL
            if (__builtin_expect(sll_refill_batch_from_ss(class_idx, bench_refill) > 0, 0)) {
#else
            if (__builtin_expect(sll_refill_small_from_ss(class_idx, bench_refill) > 0, 0)) {
#endif
                if (tls_sll_pop(class_idx, &head)) {
                    tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 2);
                    HAK_RET_ALLOC(class_idx, head);
                }
            }
            // fallthrough to slow path on miss
        }
#endif

        // TinyHotMag front: when the fast tier is exhausted, refill the cache
        // and then use it. Only entered for classes 0..2 with g_hotmag_enable
        // set and an empty g_fast_head[class_idx].
        if (__builtin_expect(g_hotmag_enable && class_idx <= 2 && g_fast_head[class_idx] == NULL, 0)) {
            hotmag_init_if_needed(class_idx);
            TinyHotMag* hm = &g_tls_hot_mag[class_idx];
            void* hotmag_ptr = hotmag_pop(class_idx);
            if (__builtin_expect(hotmag_ptr == NULL, 0)) {
                if (hotmag_try_refill(class_idx, hm) > 0) {
                    hotmag_ptr = hotmag_pop(class_idx);
                }
            }
            if (__builtin_expect(hotmag_ptr != NULL, 1)) {
                // NOTE(review): class_idx <= 2 inside this branch, so the
                // class-7 check below can never be true here.
                if (__builtin_expect(class_idx == 7, 0)) { *(void**)hotmag_ptr = NULL; }
                tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, hotmag_ptr, 3);
                HAK_RET_ALLOC(class_idx, hotmag_ptr);
            }
        }

        // Per-class hot pop: taken only when a hot allocator is registered in
        // g_hot_alloc_fn for this class. Only classes 0..3 have a dedicated pop
        // function; any other class leaves fast_hot NULL.
        if (g_hot_alloc_fn[class_idx] != NULL) {
            void* fast_hot = NULL;
            switch (class_idx) {
                case 0:
                    fast_hot = tiny_hot_pop_class0();
                    break;
                case 1:
                    fast_hot = tiny_hot_pop_class1();
                    break;
                case 2:
                    fast_hot = tiny_hot_pop_class2();
                    break;
                case 3:
                    fast_hot = tiny_hot_pop_class3();
                    break;
                default:
                    fast_hot = NULL;
                    break;
            }
            if (__builtin_expect(fast_hot != NULL, 1)) {
#if HAKMEM_BUILD_DEBUG
                g_tls_hit_count[class_idx]++;
#endif
                // NOTE(review): fast_hot is non-NULL only for classes 0..3 (see
                // the switch above), so the class-7 check below can never be
                // true here.
                if (__builtin_expect(class_idx == 7, 0)) { *(void**)fast_hot = NULL; }
                tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast_hot, 4);
                HAK_RET_ALLOC(class_idx, fast_hot);
            }
        }

        // Generic fast-tier pop, tried for every class.
        void* fast = tiny_fast_pop(class_idx);
        if (__builtin_expect(fast != NULL, 0)) {
#if HAKMEM_BUILD_DEBUG
            g_tls_hit_count[class_idx]++;
#endif
            // For class 7 the first pointer-sized word of the block is cleared
            // before it is handed out.
            // NOTE(review): presumably this wipes a stale freelist link -
            // confirm why only class 7 needs it.
            if (__builtin_expect(class_idx == 7, 0)) { *(void**)fast = NULL; }
            tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast, 5);
            HAK_RET_ALLOC(class_idx, fast);
        }
    } else {
        tiny_debug_ring_record(TINY_RING_EVENT_FRONT_BYPASS, (uint16_t)class_idx, NULL, 0);
    }

    // All front tiers missed (or were bypassed): take the slow path.
    void* slow_ptr = hak_tiny_alloc_slow(size, class_idx);
    if (slow_ptr) {
        tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, slow_ptr, 6);
        HAK_RET_ALLOC(class_idx, slow_ptr);  // Increment stats for slow path success
    }
    // Failure path: dump the TLS slab state for this class and record an
    // ALLOC_NULL event. slow_ptr is expected to be NULL when this point is
    // reached, assuming HAK_RET_ALLOC returned above on success.
    tiny_alloc_dump_tls_state(class_idx, "fail", &g_tls_slabs[class_idx]);
    tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_NULL, (uint16_t)class_idx, NULL, 0);
    return slow_ptr;
}