// hakmem_tiny_publish_box.inc
// Box: Publish/adopt instrumentation, bench mailboxes, and TLS target helpers.
// Extracted from hakmem_tiny.c to keep hot-path logic focused.

// TLS hint: last adopted SuperSlab/slab to avoid rescans
#include "tiny_sticky.h"
// Mailbox box
#include "box/mailbox_box.h"

// Publish pipeline counters (visibility)
unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_same_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0};

// Slab-ring counters (debug)
unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES] = {0};

// Slab entry encoding helpers (used by Bench/Slab-ring paths)
static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) {
    return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu);
}
static inline SuperSlab* slab_entry_ss(uintptr_t ent) {
    // SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base
    return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
}
static inline int slab_entry_idx(uintptr_t ent) {
    return (int)(ent & 0x3Fu);
}
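// Illustrative sketch (helper name is hypothetical, not used elsewhere): the
// encode/decode round-trip above only holds under the assumptions this box
// already relies on, namely that every SuperSlab base is aligned to at least
// SUPERSLAB_SIZE_MIN and that slab_idx fits in the low 6 bits (0..63).
static inline __attribute__((unused)) int slab_entry_roundtrip_ok(SuperSlab* ss, int slab_idx) {
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    return slab_entry_ss(ent) == ss && slab_entry_idx(ent) == slab_idx;
}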
// ----------------------------------------------------------------------------
// Bench Mode Publish Mailbox (BENCH_MAILBOX_WIDTH slots per class, round-robin)
// ----------------------------------------------------------------------------
static int g_bench_mode = -1; // env: HAKMEM_TINY_BENCH_MODE=1
static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES];
#ifndef BENCH_MAILBOX_WIDTH
#define BENCH_MAILBOX_WIDTH 16
#endif
static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH];

static inline int bench_mode_enabled(void) {
    if (__builtin_expect(g_bench_mode == -1, 0)) {
        const char* b = getenv("HAKMEM_TINY_BENCH_MODE");
        g_bench_mode = (b && atoi(b) != 0) ? 1 : 0;
    }
    return g_bench_mode;
}

static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!bench_mode_enabled()) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed);
    idx &= (BENCH_MAILBOX_WIDTH - 1);
    atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release);
}

static inline uintptr_t bench_pub_pop(int class_idx) {
    if (!bench_mode_enabled()) return (uintptr_t)0;
    for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) {
        uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
        if (ent) return ent;
    }
    return 0;
}
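// Illustrative sketch (hypothetical helper, compiled out of the hot path): a
// refill-side consumer pops one encoded entry and decodes it with the
// slab_entry_* helpers. With HAKMEM_TINY_BENCH_MODE unset both calls are no-ops.
static inline __attribute__((unused)) int bench_pub_try_adopt(int class_idx, SuperSlab** out_ss, int* out_idx) {
    uintptr_t ent = bench_pub_pop(class_idx);
    if (!ent) return 0;               // mailbox empty (or bench mode off)
    *out_ss  = slab_entry_ss(ent);    // recover SuperSlab base
    *out_idx = slab_entry_idx(ent);   // recover slab index (low 6 bits)
    return 1;
}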
// ----------------------------------------------------------------------------
// Slab-Granular Partial Publish/Adopt (encoded entries)
// ----------------------------------------------------------------------------
#ifndef SLAB_PARTIAL_RING
#define SLAB_PARTIAL_RING 128
#endif
static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING];
static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES];

// ----------------------------------------------------------------------------
// Refill-stage counters (per class)
// ----------------------------------------------------------------------------
unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES] = {0};
// Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0)
unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0};
// Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE)
unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0};
// Refill item source breakdown (freelist vs carve)
unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0};

static int g_rf_trace_en = -1;
static inline int rf_trace_enabled(void) {
    if (__builtin_expect(g_rf_trace_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_RF_TRACE");
        g_rf_trace_en = (e && atoi(e) != 0) ? 1 : 0;
    }
    return g_rf_trace_en;
}
static inline unsigned long long rf_now_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec;
}

// Publish-side counters (debug)
unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES] = {0};
// Free pipeline counters
unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0};
// Front Gate Breakdown (debug counters)
unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0};
// Free-side trigger counters
unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0};
// Adopt/Registry gate counters
unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_full[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_none = 0;

// ----------------------------------------------------------------------------
// Live Superslab cap (must-adopt-before-mmap support)
// ----------------------------------------------------------------------------
static int g_live_cap_env = -2; // -2=unparsed, -1=disabled, >=0=cap value
__thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0};
static inline int live_cap_for_class(int class_idx) {
    if (__builtin_expect(g_live_cap_env == -2, 0)) {
        const char* s = getenv("HAKMEM_SS_LIVE_CAP");
        if (!s) {
            g_live_cap_env = -1;
        } else {
            int v = atoi(s);
            g_live_cap_env = (v > 0) ? v : -1;
        }
    }
    (void)class_idx;
    return g_live_cap_env;
}
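// Illustrative sketch (hypothetical helper, not wired into refill): how a
// refill path could consult the live-SS cap before falling through to mmap.
// Assumes g_tls_live_ss[] is maintained where SuperSlabs are attached/retired;
// with HAKMEM_SS_LIVE_CAP unset the cap is -1 and this gate never blocks.
static inline __attribute__((unused)) int tiny_must_adopt_before_mmap(int class_idx) {
    int cap = live_cap_for_class(class_idx);
    return (cap > 0) && (g_tls_live_ss[class_idx] >= cap);
}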
// ----------------------------------------------------------------------------
// Hot Slot (global simple path)
// ----------------------------------------------------------------------------
static int g_hot_slot_en = -1; // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot)
static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES];

static inline int hot_slot_enabled(void) {
    if (__builtin_expect(g_hot_slot_en == -1, 0)) {
        const char* s = getenv("HAKMEM_HOT_SLOT");
        g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0;
    }
    return g_hot_slot_en || bench_mode_enabled();
}
static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!hot_slot_enabled()) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release);
}
static inline uintptr_t hot_slot_pop(int class_idx) {
    if (!hot_slot_enabled()) return (uintptr_t)0;
    return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel);
}

static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!ss) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
        uintptr_t expected = 0;
        if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent,
                                                    memory_order_release, memory_order_relaxed)) {
            g_slab_publish_dbg[class_idx]++;
            return;
        }
    }
    // Ring full: round-robin replace and try to requeue the displaced entry into another empty slot
    uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
    uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel);
    if (old) {
        for (int t = 0; t < 8; t++) {
            uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
            uintptr_t expected = 0;
            if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old,
                                                      memory_order_release, memory_order_relaxed)) {
                g_slab_requeue_dbg[class_idx]++;
                old = 0;
                break;
            }
        }
    }
    g_slab_publish_dbg[class_idx]++;
}

static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) {
    for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
        uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
        if (ent) return ent;
    }
    return 0;
}
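// Illustrative sketch (hypothetical helper, not wired into refill): an adopter
// that prefers the single hot slot and falls back to the slab-partial ring.
// Both sources return the same encoded (SuperSlab, slab_idx) entries.
static inline __attribute__((unused)) uintptr_t slab_candidate_pop(int class_idx) {
    uintptr_t ent = hot_slot_pop(class_idx);       // newest published slab, if any
    if (!ent) ent = slab_partial_adopt(class_idx); // otherwise scan the ring
    return ent;                                    // 0 means nothing to adopt
}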
void ss_partial_publish(int class_idx, SuperSlab* ss) {
    if (!ss) return;
    // Gate by listed flag to avoid repeated publishes of the same SS
    unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel);
    if (prev != 0u) return; // already listed

    // CRITICAL: Release ownership of all slabs so adopters can claim them!
    // Without this, ss_owner_try_acquire() will fail (requires owner_tid==0).
    // The publishing thread must stop using this SS after publishing.
    int cap_pub = ss_slabs_capacity(ss);
    for (int s = 0; s < cap_pub; s++) {
        // TODO Phase 3d-B: Add atomic accessor when implementing Hot/Cold split
        TinySlabMeta* meta = ss_slab_meta_ptr(ss, s);
        uint8_t prev_owner = __atomic_exchange_n(&meta->owner_tid_low, 0u, __ATOMIC_RELEASE);
        if (__builtin_expect(g_debug_remote_guard && prev_owner != 0u, 0)) {
            uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev_owner;
            tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE,
                                   (uint16_t)ss_slab_meta_class_idx_get(ss, s), meta, aux);
        }
    }

    // CRITICAL: Unbind current thread's TLS if it points to this SS!
    // Otherwise, the publishing thread will continue allocating from the published SS,
    // racing with adopters who acquire ownership.
    extern __thread TinyTLSSlab g_tls_slabs[];
    if (g_tls_slabs[class_idx].ss == ss) {
        g_tls_slabs[class_idx].ss = NULL;
        g_tls_slabs[class_idx].meta = NULL;
        g_tls_slabs[class_idx].slab_base = NULL;
        g_tls_slabs[class_idx].slab_idx = 0;
    }

    // Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1)
    int best = -1;
    uint32_t best_score = 0;
    for (int s = 0; s < cap_pub; s++) {
        TinySlabMeta* m = &ss->slabs[s];
        uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed);
        int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0);
        unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u;
        uint32_t score = rc + (m->freelist ? (1u << 30) : 0u) + (listed ? (1u << 29) : 0u) + (has_remote ? 1u : 0u);
        if (score > best_score) { best_score = score; best = s; }
    }
    if (best >= 0 && best < 256) {
        ss->publish_hint = (uint8_t)best;
        // Box: Ready push — provide slab-level candidate to adopters
        tiny_ready_push(class_idx, ss, best);
    } else {
        ss->publish_hint = 0xFF;
    }

    for (int i = 0; i < SS_PARTIAL_RING; i++) {
        SuperSlab* expected = NULL;
        if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss,
                                                    memory_order_release, memory_order_relaxed)) {
            g_ss_publish_dbg[class_idx]++;
            return; // published
        }
    }

    // Ring full: replace one entry in round-robin to avoid dropping supply
    uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed);
    idx %= SS_PARTIAL_RING;
    SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel);
    if (old) {
        // NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads
        // that just adopted from it. Draining without ownership checks causes freelist corruption.
        // The adopter will drain when needed (with proper ownership checks in tiny_refill.h).
        //
        // Previous code (UNSAFE):
        //   for (int s = 0; s < cap; s++) {
        //       ss_remote_drain_to_freelist(old, s);  // ← Race with concurrent adopter!
        //   }
        //
        // Keep listed=1 while in overflow so it stays eligible for adopt.
        // Push old into overflow stack (waiting box).
        SuperSlab* head;
        do {
            head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
            old->partial_next = head;
        } while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old,
                                                        memory_order_release, memory_order_relaxed));
    }
    g_ss_publish_dbg[class_idx]++;
}
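// Illustrative sketch (hypothetical helper, not used elsewhere): the `listed`
// flag is the publish gate above; an SS that is already in the ring or the
// overflow stack reports non-zero here and ss_partial_publish() skips it.
static inline __attribute__((unused)) int ss_partial_is_listed(SuperSlab* ss) {
    return ss && atomic_load_explicit(&ss->listed, memory_order_acquire) != 0u;
}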
SuperSlab* ss_partial_adopt(int class_idx) {
    for (int i = 0; i < SS_PARTIAL_RING; i++) {
        SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel);
        if (ss) {
            // Clear listed flag on adopt to allow future publish of this SS
            atomic_store_explicit(&ss->listed, 0u, memory_order_release);
            g_ss_adopt_dbg[class_idx]++;
            return ss;
        }
    }
    // Fallback: adopt from overflow stack (LIFO)
    while (1) {
        SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
        if (!head) break;
        SuperSlab* next = head->partial_next;
        if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next,
                                                  memory_order_acq_rel, memory_order_relaxed)) {
            atomic_store_explicit(&head->listed, 0u, memory_order_release);
            g_ss_adopt_dbg[class_idx]++;
            return head;
        }
    }
    return NULL;
}

static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) {
    // Canonical binding under Phase 12:
    // - Per-slab TinySlabMeta.class_idx defines class for this slab
    // - slab_idx is the owning slab index within ss
    // - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx)
    tls->ss = ss;
    tls->slab_idx = (uint8_t)slab_idx;
    tls->meta = &ss->slabs[slab_idx];
    tls->slab_base = tiny_slab_base_for(ss, slab_idx);
}

static inline uint32_t tiny_tls_default_refill(uint32_t cap) {
    if (cap == 0u) return 8u;
    uint32_t low = (cap >= 32u) ? (cap / 4u) : 8u;
    if (low < 4u) low = 4u;
    return low;
}
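// Worked example (assuming TINY_TLS_MAG_CAP >= 192): cap = 128 yields
// refill_low = 128/4 = 32 and, via the spill helper below, spill_high =
// 128 + 64 = 192; a small cap = 16 keeps the refill floor of 8.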
static inline uint32_t tiny_tls_default_spill(uint32_t cap) {
    if (cap == 0u) return 0u;
    uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u);
    if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP;
    if (spill < cap) spill = cap;
    return (uint32_t)spill;
}

static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) {
    atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release);
    atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed);
    atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed);
    atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
}

static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) {
    atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release);
    atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
}

static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) {
    uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire);
    if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) {
        return;
    }
    uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire);
    if (target_cap != 0u && tls->cap != target_cap) {
        tls->cap = target_cap;
        uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed);
        if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap);
        tls->refill_low = target_refill;
        uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed);
        if (target_spill < target_cap) target_spill = target_cap;
        tls->spill_high = target_spill;
    }
    uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire);
    if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) {
        g_tls_trim_seen[class_idx] = trim_epoch;
        if (tls->count > tls->cap) {
            tls_list_spill_excess(class_idx, tls);
        }
    }
    g_tls_param_seen[class_idx] = seq;
}
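// Usage sketch (hypothetical helper; combining producer and consumer in one
// function is purely illustrative): whichever component retunes TLS caps
// publishes targets and a trim epoch, and each owning thread later calls
// tiny_tls_refresh_params() on its own TinyTLSList to adopt them.
static inline __attribute__((unused)) void tiny_tls_retune_example(int class_idx, TinyTLSList* tls,
                                                                   uint32_t new_cap, uint64_t trim_epoch) {
    tiny_tls_publish_targets(class_idx, new_cap); // writer side: set cap/refill/spill targets, bump seq
    tiny_tls_request_trim(class_idx, trim_epoch); // writer side: request a trim at this epoch
    tiny_tls_refresh_params(class_idx, tls);      // reader side: pick up targets and spill excess
}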