#ifndef HAKMEM_TINY_REFILL_P0_INC_H
#define HAKMEM_TINY_REFILL_P0_INC_H

#include <stdint.h>
#include <stdatomic.h>
#include <stdio.h>

// hakmem_tiny_refill_p0.inc.h
// P0: Batch refill implementation (sll_refill_batch_from_ss only).
// Phase 12: DO NOT alias or redefine sll_refill_small_from_ss here.
// NOTE: This file is active only when HAKMEM_TINY_P0_BATCH_REFILL=1.

#if HAKMEM_TINY_P0_BATCH_REFILL

#include "hakmem_tiny_integrity.h"
#include "tiny_box_geometry.h"          // Box 3: Geometry & Capacity Calculator
#include "tiny_refill_opt.h"
#include "tiny_fc_api.h"
#include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe()
#include "box/integrity_box.h"          // Box I: Integrity verification (Priority ALPHA)
#include "box/tiny_next_ptr_box.h"      // Box API: Next pointer read/write

// Debug counters (compile-time gated)
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_early_no_ss[];
extern unsigned long long g_rf_early_no_meta[];
extern unsigned long long g_rf_early_no_room[];
extern unsigned long long g_rf_early_want_zero[];
#endif

// P0 diagnostic logging is now permanently disabled (former ENV toggle removed).
static inline int p0_should_log(void) { return 0; }

// P0 batch refill entry point
static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
    // Phase E1-CORRECT: C7 now has headers, can use P0 batch refill
    HAK_CHECK_CLASS_IDX(class_idx, "sll_refill_batch_from_ss");
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        static _Atomic int g_p0_class_oob_log = 0;
        if (atomic_fetch_add_explicit(&g_p0_class_oob_log, 1, memory_order_relaxed) == 0) {
            fprintf(stderr, "[P0_CLASS_OOB] class_idx=%d max_take=%d\n", class_idx, max_take);
        }
        return 0;
    }

    if (!g_use_superslab || max_take <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        if (!g_use_superslab) g_rf_early_no_ss[class_idx]++;
#endif
        return 0;
    }

    TinyTLSSlab* tls = &g_tls_slabs[class_idx];

    // Phase 3c L1D Opt: Prefetch SuperSlab hot fields early
    if (tls->ss) {
        __builtin_prefetch(&tls->ss->slab_bitmap, 0, 3);
        __builtin_prefetch(&tls->ss->total_active_blocks, 0, 3);
    }

    uint32_t active_before = 0;
    if (tls->ss) {
        active_before = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
    }

    if (!tls->ss) {
        if (!superslab_refill(class_idx)) {
            return 0;
        }
    }

    TinySlabMeta* meta = tls->meta;
    if (!meta) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_meta[class_idx]++;
#endif
        return 0;
    }

    // Phase 3c L1D Opt: Prefetch SlabMeta hot fields (freelist, used, capacity)
    __builtin_prefetch(&meta->freelist, 0, 3);
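
    // Integrity (level >= 4): capture the slab metadata state at entry and
    // validate it at the "P0 refill entry" checkpoint, so corruption that
    // predates this refill is reported before any refill work mutates the slab.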
#if HAKMEM_INTEGRITY_LEVEL >= 4
    uint8_t* initial_slab_base = tls->slab_base ? tls->slab_base
                                                : tiny_slab_base_for(tls->ss, tls->slab_idx);
    SlabMetadataState meta_initial = integrity_capture_slab_metadata(meta, initial_slab_base, class_idx);
    INTEGRITY_CHECK_SLAB_METADATA(meta_initial, "P0 refill entry");
#endif

    // Optional: Direct-FC fast path (A/B, supports all classes).
    // Fixed defaults after ENV cleanup:
    //   - C5 priority: enabled
    //   - C7 only:     disabled
    //   - all classes: disabled
    do {
        const int g_direct_fc = 1;
        const int g_direct_fc_c7 = 0;
        const int g_direct_fc_all = 0;
        if (__builtin_expect(g_direct_fc_all ||
                             (g_direct_fc && class_idx == 5) ||
                             (g_direct_fc_c7 && class_idx == 7), 0)) {
            int room = tiny_fc_room(class_idx);
            if (room <= 0) return 0;

            uint32_t rmt = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx],
                                                memory_order_relaxed);
            const int g_drain_th = 64;
            if (rmt >= (uint32_t)g_drain_th) {
                _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, tls->meta);
            }

            void* out[128];
            // Clamp to the local staging buffer so a large FC room cannot overflow out[].
            if (room > (int)(sizeof(out) / sizeof(out[0]))) {
                room = (int)(sizeof(out) / sizeof(out[0]));
            }
            int produced = 0;
            TinySlabMeta* m = tls->meta;
            size_t bs = tiny_stride_for_class(class_idx);
            uint8_t* base = tls->slab_base ? tls->slab_base
                                           : tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
            while (produced < room) {
                if (m->freelist) {
                    void* p = m->freelist;
                    m->freelist = tiny_next_read(class_idx, p);
                    m->used++;
                    out[produced++] = p;
                } else if (m->carved < m->capacity) {
                    void* p = (void*)(base + ((size_t)m->carved * bs));
                    m->carved++;
                    m->used++;
                    out[produced++] = p;
                } else {
                    if (!superslab_refill(class_idx)) break;
                    tls = &g_tls_slabs[class_idx];
                    m = tls->meta;
                    base = tls->slab_base ? tls->slab_base
                                          : tiny_slab_base_for(tls->ss, tls->slab_idx);
                }
            }
            if (produced > 0) {
                ss_active_add(tls->ss, (uint32_t)produced);
                (void)tiny_fc_push_bulk(class_idx, out, produced);
                return produced;
            }
            // fallthrough to regular path
        }
    } while (0);

    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll[class_idx].count;
    if (room <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_room[class_idx]++;
#endif
        return 0;
    }

    uint32_t want = (uint32_t)max_take;
    if (want > (uint32_t)room) want = (uint32_t)room;
    if (want == 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_want_zero[class_idx]++;
#endif
        return 0;
    }

    size_t bs = tiny_stride_for_class(class_idx);
    int total_taken = 0;

    while (want > 0) {
        uintptr_t ss_base = 0;
        uintptr_t ss_limit = 0;
        if (tls->ss && tls->slab_idx >= 0) {
            uint8_t* slab_base = tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
            ss_base = (uintptr_t)slab_base;
            ss_limit = ss_base + tiny_usable_bytes_for_slab(tls->slab_idx);
        }
        if (tls->ss && tls->slab_idx >= 0) {
            uint32_t remote_count = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx],
                                                         memory_order_relaxed);
            if (remote_count > 0) {
                _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, meta);
            }
        }

        TinyRefillChain chain;
        uint32_t from_freelist = trc_pop_from_freelist(meta, class_idx, ss_base, ss_limit, bs, want, &chain);
        if (from_freelist > 0) {
            trc_splice_to_sll(class_idx, &chain,
                              &g_tls_sll[class_idx].head, &g_tls_sll[class_idx].count);
            ss_active_add(tls->ss, from_freelist);
            // Phase 1: Atomic increment for MT safety
            atomic_fetch_add_explicit(&meta->used, from_freelist, memory_order_relaxed);

            // Phase 3c L1D Opt: Prefetch next freelist entry after refill
            // Phase 1: Use atomic load for MT safety
            void* next_head = slab_freelist_load_relaxed(meta);
            if (next_head) {
                __builtin_prefetch(next_head, 0, 3);
            }

#if HAKMEM_DEBUG_COUNTERS
            extern unsigned long long g_rf_freelist_items[];
            g_rf_freelist_items[class_idx] += from_freelist;
#endif
            total_taken += from_freelist;
            want -= from_freelist;
            if (want == 0) break;
        }
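
        // Freelist pops did not fully satisfy `want`: fall back to linear
        // carving from the slab's uncarved region, attaching a fresh slab via
        // superslab_refill() once the current one is fully carved.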
        if (meta->carved >= meta->capacity) {
            if (!superslab_refill(class_idx)) break;
            tls = &g_tls_slabs[class_idx];
            meta = tls->meta;
            if (!meta) break;
            continue;
        }

        // NOTE: Pre-carve geometry validation removed (redundant).
        // Stride table is now correct in tiny_block_stride_for_class(),
        // and slab geometry is validated at allocation time by shared_pool.
        // Defense-in-depth validation adds overhead without benefit.

        uint32_t available = meta->capacity - meta->carved;
        uint32_t batch = want;
        if (batch > available) batch = available;
        if (batch == 0) break;

        uint8_t* slab_base = tls->slab_base ? tls->slab_base
                                            : tiny_slab_base_for(tls->ss, tls->slab_idx);
        TinyRefillChain carve;
        trc_linear_carve(slab_base, bs, meta, batch, class_idx, &carve);
        trc_splice_to_sll(class_idx, &carve,
                          &g_tls_sll[class_idx].head, &g_tls_sll[class_idx].count);
        ss_active_add(tls->ss, batch);

#if HAKMEM_DEBUG_COUNTERS
        extern unsigned long long g_rf_carve_items[];
        g_rf_carve_items[class_idx] += batch;
#endif
        total_taken += batch;
        want -= batch;
    }

#if HAKMEM_DEBUG_COUNTERS
    g_rf_hit_slab[class_idx]++;
#endif

    if (tls->ss && p0_should_log()) {
        uint32_t active_after = atomic_load_explicit(&tls->ss->total_active_blocks,
                                                     memory_order_relaxed);
        int32_t delta = (int32_t)active_after - (int32_t)active_before;
        fprintf(stderr, "[P0_COUNTER] cls=%d slab=%d taken=%d active_delta=%d\n",
                class_idx, tls->slab_idx, total_taken, delta);
    }

    return total_taken;
}

#endif  // HAKMEM_TINY_P0_BATCH_REFILL
#endif  // HAKMEM_TINY_REFILL_P0_INC_H