// hakmem_tiny_publish_box.inc
// Box: Publish/adopt instrumentation, bench mailboxes, and TLS target helpers.
// Extracted from hakmem_tiny.c to keep hot-path logic focused.

// TLS hint: last adopted SuperSlab/slab to avoid rescans
#include "tiny_sticky.h"
// Mailbox box
#include "box/mailbox_box.h"

// Publish pipeline counters (visibility)
unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_same_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0};

// Slab-ring counters (debug)
unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES] = {0};

// Slab entry encoding helpers (used by Bench/Slab-ring paths)
static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) {
    return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu);
}
static inline SuperSlab* slab_entry_ss(uintptr_t ent) {
    // SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base
    return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
}
static inline int slab_entry_idx(uintptr_t ent) {
    return (int)(ent & 0x3Fu);
}
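// Illustrative sketch (helper name is hypothetical, not used elsewhere): the
// encode/decode round-trip above only holds under the assumptions this box
// already relies on, namely that every SuperSlab base is aligned to at least
// SUPERSLAB_SIZE_MIN and that slab_idx fits in the low 6 bits (0..63).
static inline __attribute__((unused)) int slab_entry_roundtrip_ok(SuperSlab* ss, int slab_idx) {
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    return slab_entry_ss(ent) == ss && slab_entry_idx(ent) == slab_idx;
}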
// ----------------------------------------------------------------------------
// Bench Mode Publish Mailbox (BENCH_MAILBOX_WIDTH slots per class, round-robin)
// ----------------------------------------------------------------------------
static int g_bench_mode = -1; // env: HAKMEM_TINY_BENCH_MODE=1
static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES];
#ifndef BENCH_MAILBOX_WIDTH
#define BENCH_MAILBOX_WIDTH 16
#endif
static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH];

static inline int bench_mode_enabled(void) {
    if (__builtin_expect(g_bench_mode == -1, 0)) {
        const char* b = getenv("HAKMEM_TINY_BENCH_MODE");
        g_bench_mode = (b && atoi(b) != 0) ? 1 : 0;
    }
    return g_bench_mode;
}

static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!bench_mode_enabled()) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed);
    idx &= (BENCH_MAILBOX_WIDTH - 1);
    atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release);
}

static inline uintptr_t bench_pub_pop(int class_idx) {
    if (!bench_mode_enabled()) return (uintptr_t)0;
    for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) {
        uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
        if (ent) return ent;
    }
    return 0;
}
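// Illustrative sketch (hypothetical helper, compiled out of the hot path): a
// refill-side consumer pops one encoded entry and decodes it with the
// slab_entry_* helpers. With HAKMEM_TINY_BENCH_MODE unset both calls are no-ops.
static inline __attribute__((unused)) int bench_pub_try_adopt(int class_idx, SuperSlab** out_ss, int* out_idx) {
    uintptr_t ent = bench_pub_pop(class_idx);
    if (!ent) return 0;               // mailbox empty (or bench mode off)
    *out_ss  = slab_entry_ss(ent);    // recover SuperSlab base
    *out_idx = slab_entry_idx(ent);   // recover slab index (low 6 bits)
    return 1;
}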
// ----------------------------------------------------------------------------
// Slab-Granular Partial Publish/Adopt (encoded entries)
// ----------------------------------------------------------------------------
#ifndef SLAB_PARTIAL_RING
#define SLAB_PARTIAL_RING 128
#endif
static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING];
static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES];

// ----------------------------------------------------------------------------
// Refill-stage counters (per class)
// ----------------------------------------------------------------------------
unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES] = {0};
// Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0)
unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0};
// Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE)
unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0};
// Refill item source breakdown (freelist vs carve)
unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0};

static int g_rf_trace_en = -1;
static inline int rf_trace_enabled(void) {
    if (__builtin_expect(g_rf_trace_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_RF_TRACE");
        g_rf_trace_en = (e && atoi(e) != 0) ? 1 : 0;
    }
    return g_rf_trace_en;
}
static inline unsigned long long rf_now_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec;
}

// Publish-side counters (debug)
unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES] = {0};
// Free pipeline counters
unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0};
// Front Gate Breakdown (debug counters)
unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0};
// Free-side trigger counters
unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0};
// Adopt/Registry gate counters
unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_full[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_none = 0;

// ----------------------------------------------------------------------------
// Live Superslab cap (must-adopt-before-mmap support)
// ----------------------------------------------------------------------------
static int g_live_cap_env = -2; // -2=unparsed, -1=disabled, >=0=cap value
__thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0};
static inline int live_cap_for_class(int class_idx) {
    if (__builtin_expect(g_live_cap_env == -2, 0)) {
        const char* s = getenv("HAKMEM_SS_LIVE_CAP");
        if (!s) {
            g_live_cap_env = -1;
        } else {
            int v = atoi(s);
            g_live_cap_env = (v > 0) ? v : -1;
        }
    }
    (void)class_idx;
    return g_live_cap_env;
}
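// Illustrative sketch (hypothetical helper, not wired into refill): how a
// refill path could consult the live-SS cap before falling through to mmap.
// Assumes g_tls_live_ss[] is maintained where SuperSlabs are attached/retired;
// with HAKMEM_SS_LIVE_CAP unset the cap is -1 and this gate never blocks.
static inline __attribute__((unused)) int tiny_must_adopt_before_mmap(int class_idx) {
    int cap = live_cap_for_class(class_idx);
    return (cap > 0) && (g_tls_live_ss[class_idx] >= cap);
}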
// ----------------------------------------------------------------------------
// Hot Slot (global simple path)
// ----------------------------------------------------------------------------
static int g_hot_slot_en = -1; // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot)
static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES];

static inline int hot_slot_enabled(void) {
    if (__builtin_expect(g_hot_slot_en == -1, 0)) {
        const char* s = getenv("HAKMEM_HOT_SLOT");
        g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0;
    }
    return g_hot_slot_en || bench_mode_enabled();
}
static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!hot_slot_enabled()) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release);
}
static inline uintptr_t hot_slot_pop(int class_idx) {
    if (!hot_slot_enabled()) return (uintptr_t)0;
    return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel);
}

static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!ss) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
        uintptr_t expected = 0;
        if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent,
                                                    memory_order_release, memory_order_relaxed)) {
            g_slab_publish_dbg[class_idx]++;
            return;
        }
    }
    // Ring full: round-robin replace and try to requeue the displaced entry into another empty slot
    uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
    uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel);
    if (old) {
        for (int t = 0; t < 8; t++) {
            uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
            uintptr_t expected = 0;
            if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old,
                                                      memory_order_release, memory_order_relaxed)) {
                g_slab_requeue_dbg[class_idx]++;
                old = 0;
                break;
            }
        }
    }
    g_slab_publish_dbg[class_idx]++;
}

static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) {
    for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
        uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
        if (ent) return ent;
    }
    return 0;
}
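// Illustrative sketch (hypothetical helper, not wired into refill): an adopter
// that prefers the single hot slot and falls back to the slab-partial ring.
// Both sources return the same encoded (SuperSlab, slab_idx) entries.
static inline __attribute__((unused)) uintptr_t slab_candidate_pop(int class_idx) {
    uintptr_t ent = hot_slot_pop(class_idx);       // newest published slab, if any
    if (!ent) ent = slab_partial_adopt(class_idx); // otherwise scan the ring
    return ent;                                    // 0 means nothing to adopt
}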
void ss_partial_publish(int class_idx, SuperSlab* ss) {
    if (!ss) return;
    // Gate by listed flag to avoid repeated publishes of the same SS
    unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel);
    if (prev != 0u) return; // already listed

    // CRITICAL: Release ownership of all slabs so adopters can claim them!
    // Without this, ss_owner_try_acquire() will fail (requires owner_tid==0).
    // The publishing thread must stop using this SS after publishing.
    int cap_pub = ss_slabs_capacity(ss);
    for (int s = 0; s < cap_pub; s++) {
        // TODO Phase 3d-B: Add atomic accessor when implementing Hot/Cold split
        TinySlabMeta* meta = ss_slab_meta_ptr(ss, s);
        uint8_t prev_owner = __atomic_exchange_n(&meta->owner_tid_low, 0u, __ATOMIC_RELEASE);
        if (__builtin_expect(g_debug_remote_guard && prev_owner != 0u, 0)) {
            uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev_owner;
            tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE,
                                   (uint16_t)ss_slab_meta_class_idx_get(ss, s), meta, aux);
        }
    }

    // CRITICAL: Unbind current thread's TLS if it points to this SS!
    // Otherwise, the publishing thread will continue allocating from the published SS,
    // racing with adopters who acquire ownership.
    extern __thread TinyTLSSlab g_tls_slabs[];
    if (g_tls_slabs[class_idx].ss == ss) {
        g_tls_slabs[class_idx].ss = NULL;
        g_tls_slabs[class_idx].meta = NULL;
        g_tls_slabs[class_idx].slab_base = NULL;
        g_tls_slabs[class_idx].slab_idx = 0;
    }

    // Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1)
    int best = -1;
    uint32_t best_score = 0;
    for (int s = 0; s < cap_pub; s++) {
        TinySlabMeta* m = &ss->slabs[s];
        uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed);
        int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0);
        unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u;
        uint32_t score = rc + (m->freelist ? (1u << 30) : 0u) + (listed ? (1u << 29) : 0u) + (has_remote ? 1u : 0u);
        if (score > best_score) { best_score = score; best = s; }
    }
    if (best >= 0 && best < 256) {
        ss->publish_hint = (uint8_t)best;
        // Box: Ready push — provide slab-level candidate to adopters
        tiny_ready_push(class_idx, ss, best);
    } else {
        ss->publish_hint = 0xFF;
    }

    for (int i = 0; i < SS_PARTIAL_RING; i++) {
        SuperSlab* expected = NULL;
        if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss,
                                                    memory_order_release, memory_order_relaxed)) {
            g_ss_publish_dbg[class_idx]++;
            return; // published
        }
    }

    // Ring full: replace one entry in round-robin to avoid dropping supply
    uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed);
    idx %= SS_PARTIAL_RING;
    SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel);
    if (old) {
        // NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads
        // that just adopted from it. Draining without ownership checks causes freelist corruption.
        // The adopter will drain when needed (with proper ownership checks in tiny_refill.h).
        //
        // Previous code (UNSAFE):
        //   for (int s = 0; s < cap; s++) {
        //       ss_remote_drain_to_freelist(old, s);  // ← Race with concurrent adopter!
        //   }
        //
        // Keep listed=1 while in overflow so it stays eligible for adopt.
        // Push old into overflow stack (waiting box).
        SuperSlab* head;
        do {
            head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
            old->partial_next = head;
        } while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old,
                                                        memory_order_release, memory_order_relaxed));
    }
    g_ss_publish_dbg[class_idx]++;
}
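// Illustrative sketch (hypothetical helper, not used elsewhere): the `listed`
// flag is the publish gate above; an SS that is already in the ring or the
// overflow stack reports non-zero here and ss_partial_publish() skips it.
static inline __attribute__((unused)) int ss_partial_is_listed(SuperSlab* ss) {
    return ss && atomic_load_explicit(&ss->listed, memory_order_acquire) != 0u;
}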
SuperSlab* ss_partial_adopt(int class_idx) {
    for (int i = 0; i < SS_PARTIAL_RING; i++) {
        SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel);
        if (ss) {
            // Clear listed flag on adopt to allow future publish of this SS
            atomic_store_explicit(&ss->listed, 0u, memory_order_release);
            g_ss_adopt_dbg[class_idx]++;
            return ss;
        }
    }
    // Fallback: adopt from overflow stack (LIFO)
    while (1) {
        SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
        if (!head) break;
        SuperSlab* next = head->partial_next;
        if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next,
                                                  memory_order_acq_rel, memory_order_relaxed)) {
            atomic_store_explicit(&head->listed, 0u, memory_order_release);
            g_ss_adopt_dbg[class_idx]++;
            return head;
        }
    }
    return NULL;
}

static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) {
    // Canonical binding under Phase 12:
    // - Per-slab TinySlabMeta.class_idx defines class for this slab
    // - slab_idx is the owning slab index within ss
    // - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx)
    tls->ss = ss;
    tls->slab_idx = (uint8_t)slab_idx;
    tls->meta = &ss->slabs[slab_idx];
    tls->slab_base = tiny_slab_base_for(ss, slab_idx);
}

static inline uint32_t tiny_tls_default_refill(uint32_t cap) {
    if (cap == 0u) return 8u;
    uint32_t low = (cap >= 32u) ? (cap / 4u) : 8u;
    if (low < 4u) low = 4u;
    return low;
}
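// Worked example (assuming TINY_TLS_MAG_CAP >= 192): cap = 128 yields
// refill_low = 128/4 = 32 and, via the spill helper below, spill_high =
// 128 + 64 = 192; a small cap = 16 keeps the refill floor of 8.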
static inline uint32_t tiny_tls_default_spill(uint32_t cap) {
    if (cap == 0u) return 0u;
    uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u);
    if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP;
    if (spill < cap) spill = cap;
    return (uint32_t)spill;
}

static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) {
    atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release);
    atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed);
    atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed);
    atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
}

static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) {
    atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release);
    atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
}

static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) {
    uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire);
    if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) {
        return;
    }
    uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire);
    if (target_cap != 0u && tls->cap != target_cap) {
        tls->cap = target_cap;
        uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed);
        if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap);
        tls->refill_low = target_refill;
        uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed);
        if (target_spill < target_cap) target_spill = target_cap;
        tls->spill_high = target_spill;
    }
    uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire);
    if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) {
        g_tls_trim_seen[class_idx] = trim_epoch;
        if (tls->count > tls->cap) {
            tls_list_spill_excess(class_idx, tls);
        }
    }
    g_tls_param_seen[class_idx] = seq;
}
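// Usage sketch (hypothetical helper; combining producer and consumer in one
// function is purely illustrative): whichever component retunes TLS caps
// publishes targets and a trim epoch, and each owning thread later calls
// tiny_tls_refresh_params() on its own TinyTLSList to adopt them.
static inline __attribute__((unused)) void tiny_tls_retune_example(int class_idx, TinyTLSList* tls,
                                                                   uint32_t new_cap, uint64_t trim_epoch) {
    tiny_tls_publish_targets(class_idx, new_cap); // writer side: set cap/refill/spill targets, bump seq
    tiny_tls_request_trim(class_idx, trim_epoch); // writer side: request a trim at this epoch
    tiny_tls_refresh_params(class_idx, tls);      // reader side: pick up targets and spill excess
}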