// hakmem_tiny_publish_box.inc
// Box: Publish/adopt instrumentation, bench mailboxes, and TLS target helpers.
// Extracted from hakmem_tiny.c to keep hot-path logic focused.
// TLS hint: last adopted SuperSlab/slab to avoid rescans
#include "tiny_sticky.h"
// Tiny Page Box: Tiny-Plus page pool for classes C5-C7 (a box staged in front of the Superslab/Warm Pool).
// When tiny_tls_bind_slab() binds a new TLS slab, it calls
// tiny_page_box_on_new_slab(class_idx, tls) so the Page Box can update its page pool.
#include "box/tiny_page_box.h"
// Mailbox box
#include "box/mailbox_box.h"
// Publish pipeline counters (visibility)
unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_same_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0};
// Slab-ring counters (debug)
unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES] = {0};
// Slab entry encoding helpers (used by Bench/Slab-ring paths)
static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) {
    return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu);
}
static inline SuperSlab* slab_entry_ss(uintptr_t ent) {
    // SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base
    return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
}
static inline int slab_entry_idx(uintptr_t ent) {
    return (int)(ent & 0x3Fu);
}
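// Illustrative self-check (not part of the hot path): the round-trip these
// encoders are expected to satisfy, assuming every SuperSlab is aligned to at
// least SUPERSLAB_SIZE_MIN and slab_idx fits in the low 6 bits (< 64).
static inline __attribute__((unused)) int slab_entry_roundtrip_ok(SuperSlab* ss, int slab_idx) {
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    return slab_entry_ss(ent) == ss && slab_entry_idx(ent) == slab_idx;
}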
// ----------------------------------------------------------------------------
// Bench Mode Publish Mailbox (single-slot per class)
// ----------------------------------------------------------------------------
static int g_bench_mode = -1; // env: HAKMEM_TINY_BENCH_MODE=1
static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES];
#ifndef BENCH_MAILBOX_WIDTH
#define BENCH_MAILBOX_WIDTH 16
#endif
static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH];
static inline int bench_mode_enabled(void) {
    if (__builtin_expect(g_bench_mode == -1, 0)) {
        const char* b = getenv("HAKMEM_TINY_BENCH_MODE");
        g_bench_mode = (b && atoi(b) != 0) ? 1 : 0;
    }
    return g_bench_mode;
}
static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!bench_mode_enabled()) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed);
    idx &= (BENCH_MAILBOX_WIDTH - 1);
    atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release);
}
static inline uintptr_t bench_pub_pop(int class_idx) {
    if (!bench_mode_enabled()) return (uintptr_t)0;
    for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) {
        uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
        if (ent) return ent;
    }
    return 0;
}
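// Usage sketch (illustrative helper, not used by the hot path): a consumer
// drains one published entry and decodes it with the helpers above. Note that
// the round-robin mask in bench_pub_push() assumes BENCH_MAILBOX_WIDTH is a
// power of two.
static inline __attribute__((unused)) int bench_pub_try_take(int class_idx, SuperSlab** out_ss, int* out_slab_idx) {
    uintptr_t ent = bench_pub_pop(class_idx);
    if (!ent) return 0;
    *out_ss = slab_entry_ss(ent);
    *out_slab_idx = slab_entry_idx(ent);
    return 1;
}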
// ----------------------------------------------------------------------------
// Slab-Granular Partial Publish/Adopt (encoded entries)
// ----------------------------------------------------------------------------
#ifndef SLAB_PARTIAL_RING
#define SLAB_PARTIAL_RING 128
#endif
static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING];
static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES];
// ----------------------------------------------------------------------------
// Refill-stage counters (per class)
// ----------------------------------------------------------------------------
unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES] = {0};
// Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0)
unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0};
// Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE)
unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0};
// Refill item source breakdown (freelist vs carve)
unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0};
static int g_rf_trace_en = -1;
static inline int rf_trace_enabled(void) {
    if (__builtin_expect(g_rf_trace_en == -1, 0)) {
        // Unified trace: HAKMEM_TINY_RF_TRACE or HAKMEM_TRACE=refill
        g_rf_trace_en = hak_trace_check("HAKMEM_TINY_RF_TRACE", "refill");
    }
    return g_rf_trace_en;
}
static inline unsigned long long rf_now_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec;
}
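// Intended timing pattern (sketch; the real call sites live in the refill path):
//   unsigned long long t0 = rf_trace_enabled() ? rf_now_ns() : 0ull;
//   ... run one refill stage ...
//   if (t0) g_rf_time_slab_ns[class_idx] += rf_now_ns() - t0;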
// Publish-side counters (debug)
unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES] = {0};
// Free pipeline counters
unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0};
// Front Gate Breakdown (debug counters)
unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0};
// Free-side trigger counters
unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0};
// Adopt/Registry gate counters
unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_full[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_none = 0;
// ----------------------------------------------------------------------------
// Live Superslab cap (must-adopt-before-mmap support)
// ----------------------------------------------------------------------------
static int g_live_cap_env = -2; // -2=unparsed, -1=disabled, >=0=cap value
__thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0};
static inline int live_cap_for_class(int class_idx) {
    if (__builtin_expect(g_live_cap_env == -2, 0)) {
        const char* s = getenv("HAKMEM_SS_LIVE_CAP");
        if (!s) g_live_cap_env = -1;
        else { int v = atoi(s); g_live_cap_env = (v > 0 ? v : -1); }
    }
    (void)class_idx;
    return g_live_cap_env;
}
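// Gate sketch (hypothetical caller): before mmap'ing a new SuperSlab, the
// refill path is expected to consult the live cap and try to adopt a published
// SS first; a return of -1 from live_cap_for_class() means the cap is disabled.
static inline __attribute__((unused)) int live_cap_would_block_mmap(int class_idx) {
    int cap = live_cap_for_class(class_idx);
    return (cap > 0) && (g_tls_live_ss[class_idx] >= cap);
}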
// ----------------------------------------------------------------------------
// Hot Slot (global simple path)
// ----------------------------------------------------------------------------
static int g_hot_slot_en = -1; // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot)
static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES];
static inline int hot_slot_enabled(void) {
    if (__builtin_expect(g_hot_slot_en == -1, 0)) {
        const char* s = getenv("HAKMEM_HOT_SLOT");
        g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0;
    }
    return g_hot_slot_en || bench_mode_enabled();
}
static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!hot_slot_enabled()) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release);
}
static inline uintptr_t hot_slot_pop(int class_idx) {
    if (!hot_slot_enabled()) return (uintptr_t)0;
    return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel);
}
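// Design note: the hot slot is a single latest-wins slot per class; a push
// simply overwrites the previous entry, trading completeness for recency. The
// wider buffered supply lives in the partial rings and mailboxes below.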
static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!ss) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
        uintptr_t expected = 0;
        if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent,
                                                    memory_order_release, memory_order_relaxed)) {
            g_slab_publish_dbg[class_idx]++;
            return;
        }
    }
    // Ring full: round-robin replace and try to requeue the displaced entry into another empty slot
    uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
    uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel);
    if (old) {
        for (int t = 0; t < 8; t++) {
            uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
            uintptr_t expected = 0;
            if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old,
                                                      memory_order_release, memory_order_relaxed)) {
                g_slab_requeue_dbg[class_idx]++;
                old = 0;
                break;
            }
        }
    }
    g_slab_publish_dbg[class_idx]++;
}
static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) {
    for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
        uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
        if (ent) return ent;
    }
    return 0;
}
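// Adopt-side sketch (hypothetical caller, assuming the refill path binds the
// decoded slab as its TLS slab):
//   uintptr_t ent = slab_partial_adopt(class_idx);
//   if (ent) tiny_tls_bind_slab(&g_tls_slabs[class_idx], slab_entry_ss(ent), slab_entry_idx(ent));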
void ss_partial_publish(int class_idx, SuperSlab* ss) {
    if (!ss) return;
    // Gate by listed flag to avoid repeated publishes of the same SS
    unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel);
    if (prev != 0u) return; // already listed
    // CRITICAL: Release ownership of all slabs so adopters can claim them!
    // Without this, ss_owner_try_acquire() will fail (requires owner_tid==0).
    // The publishing thread must stop using this SS after publishing.
    int cap_pub = ss_slabs_capacity(ss);
    for (int s = 0; s < cap_pub; s++) {
        // TODO Phase 3d-B: Add atomic accessor when implementing Hot/Cold split
        TinySlabMeta* meta = ss_slab_meta_ptr(ss, s);
        uint8_t prev_owner = __atomic_exchange_n(&meta->owner_tid_low, 0u, __ATOMIC_RELEASE);
        if (__builtin_expect(g_debug_remote_guard && prev_owner != 0u, 0)) {
            uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev_owner;
            tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE,
                                   (uint16_t)ss_slab_meta_class_idx_get(ss, s),
                                   meta,
                                   aux);
        }
    }
    // CRITICAL: Unbind the current thread's TLS if it points at this SS!
    // Otherwise the publishing thread would keep allocating from the published SS,
    // racing with adopters who acquire ownership.
    extern __thread TinyTLSSlab g_tls_slabs[];
    if (g_tls_slabs[class_idx].ss == ss) {
        g_tls_slabs[class_idx].ss = NULL;
        g_tls_slabs[class_idx].meta = NULL;
        g_tls_slabs[class_idx].slab_base = NULL;
        g_tls_slabs[class_idx].slab_idx = 0;
    }
    // Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1)
    int best = -1;
    uint32_t best_score = 0;
    for (int s = 0; s < cap_pub; s++) {
        TinySlabMeta* m = &ss->slabs[s];
        uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed);
        int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0);
        unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u;
        uint32_t score = rc
                       + (m->freelist ? (1u << 30) : 0u)
                       + (listed ? (1u << 29) : 0u)
                       + (has_remote ? 1u : 0u);
        if (score > best_score) { best_score = score; best = s; }
    }
    if (best >= 0 && best < 256) {
        ss->publish_hint = (uint8_t)best;
        // Box: Ready push (provide a slab-level candidate to adopters)
        tiny_ready_push(class_idx, ss, best);
    } else {
        ss->publish_hint = 0xFF;
    }
    for (int i = 0; i < SS_PARTIAL_RING; i++) {
        SuperSlab* expected = NULL;
        if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss,
                                                    memory_order_release, memory_order_relaxed)) {
            g_ss_publish_dbg[class_idx]++;
            return; // published
        }
    }
    // Ring full: replace one entry in round-robin to avoid dropping supply
    uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed);
    idx %= SS_PARTIAL_RING;
    SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel);
    if (old) {
        // NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads
        // that just adopted from it. Draining without ownership checks causes freelist corruption.
        // The adopter will drain when needed (with proper ownership checks in tiny_refill.h).
        //
        // Previous code (UNSAFE):
        //   for (int s = 0; s < cap; s++) {
        //       ss_remote_drain_to_freelist(old, s); // <- race with concurrent adopter!
        //   }
        // Keep listed=1 while in overflow so it stays eligible for adopt.
        // Push old onto the overflow stack (the standby box).
        SuperSlab* head;
        do {
            head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
            old->partial_next = head;
        } while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old,
                                                        memory_order_release, memory_order_relaxed));
    }
    g_ss_publish_dbg[class_idx]++;
}
SuperSlab* ss_partial_adopt(int class_idx) {
    for (int i = 0; i < SS_PARTIAL_RING; i++) {
        SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel);
        if (ss) {
            // Clear listed flag on adopt to allow future publish of this SS
            atomic_store_explicit(&ss->listed, 0u, memory_order_release);
            g_ss_adopt_dbg[class_idx]++;
            return ss;
        }
    }
    // Fallback: adopt from overflow stack (LIFO)
    while (1) {
        SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
        if (!head) break;
        SuperSlab* next = head->partial_next;
        if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next,
                                                  memory_order_acq_rel, memory_order_relaxed)) {
            atomic_store_explicit(&head->listed, 0u, memory_order_release);
            g_ss_adopt_dbg[class_idx]++;
            return head;
        }
    }
    return NULL;
}
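// Pairing sketch (hypothetical adopter): an adopter would typically honor the
// best-slab hint computed in ss_partial_publish() before scanning all slabs.
//   SuperSlab* ss = ss_partial_adopt(class_idx);
//   if (ss && ss->publish_hint != 0xFF) {
//       tiny_tls_bind_slab(&g_tls_slabs[class_idx], ss, ss->publish_hint);
//   }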
static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) {
    // Canonical binding under Phase 12:
    // - Per-slab TinySlabMeta.class_idx defines the class for this slab
    // - slab_idx is the owning slab index within ss
    // - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx)
    tls->ss = ss;
    tls->slab_idx = (uint8_t)slab_idx;
    tls->meta = &ss->slabs[slab_idx];
    tls->slab_base = tiny_slab_base_for(ss, slab_idx);
    // Also notify the Tiny Page Box of the new slab (valid classes only)
    int pb_class = tls->meta ? (int)tls->meta->class_idx : -1;
    tiny_page_box_on_new_slab(pb_class, tls);
}
static inline uint32_t tiny_tls_default_refill(uint32_t cap) {
    if (cap == 0u) return 8u;
    uint32_t low = (cap >= 32u) ? (cap / 4u) : 8u;
    if (low < 4u) low = 4u;
    return low;
}
static inline uint32_t tiny_tls_default_spill(uint32_t cap) {
    if (cap == 0u) return 0u;
    uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u);
    if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP;
    if (spill < cap) spill = cap;
    return (uint32_t)spill;
}
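// Worked example (follows directly from the formulas above, assuming
// TINY_TLS_MAG_CAP >= 96):
//   cap = 64 -> refill_low = 16 (cap/4),          spill_high = 96 (cap + cap/2)
//   cap = 16 -> refill_low = 8  (below-32 floor), spill_high = 24
//   cap = 0  -> refill_low = 8,                   spill_high = 0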
static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) {
    atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release);
    atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed);
    atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed);
    atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
}
static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) {
    atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release);
    atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
}
static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) {
    uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire);
    if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) {
        return;
    }
    uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire);
    if (target_cap != 0u && tls->cap != target_cap) {
        tls->cap = target_cap;
        uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed);
        if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap);
        tls->refill_low = target_refill;
        uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed);
        if (target_spill < target_cap) target_spill = target_cap;
        tls->spill_high = target_spill;
    }
    uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire);
    if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) {
        g_tls_trim_seen[class_idx] = trim_epoch;
        if (tls->count > tls->cap) {
            tls_list_spill_excess(class_idx, tls);
        }
    }
    g_tls_param_seen[class_idx] = seq;
}
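// Control-flow sketch (hypothetical controller thread): targets are published
// once and picked up lazily by each owning thread on its next visit.
//   tiny_tls_publish_targets(class_idx, new_cap);  // set cap/refill/spill, bump seq
//   tiny_tls_request_trim(class_idx, new_epoch);   // optionally ask TLS lists to spill
// Owning threads observe the change via tiny_tls_refresh_params(class_idx, tls)
// the next time they touch their TinyTLSList.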