hakmem/core/hakmem_tiny_globals_box.inc
Refactor: Extract 5 Box modules from hakmem_tiny.c (-52% size reduction)

Split hakmem_tiny.c (2081 lines) into focused modules for better maintainability.

## Changes

**hakmem_tiny.c**: 2081 → 995 lines (-1086 lines, -52% reduction)

## Extracted Modules (5 boxes)

1. **config_box** (211 lines)
   - Size class tables, integrity counters
   - Debug flags, benchmark macros
   - HAK_RET_ALLOC/HAK_STAT_FREE instrumentation
2. **publish_box** (419 lines)
   - Publish/Adopt counters and statistics
   - Bench mailbox, partial ring
   - Live cap/Hot slot management
   - TLS helper functions (tiny_tls_default_*)
3. **globals_box** (256 lines)
   - Global variable declarations (~70 variables)
   - TinyPool instance and initialization flag
   - TLS variables (g_tls_lists, g_fast_head, g_fast_count)
   - SuperSlab configuration (partial ring, empty reserves)
   - Adopt gate functions
4. **phase6_wrappers_box** (122 lines)
   - Phase 6 Box Theory wrapper layer
   - hak_tiny_alloc_fast_wrapper()
   - hak_tiny_free_fast_wrapper()
   - Diagnostic instrumentation
5. **ace_guard_box** (100 lines)
   - ACE Learning Layer (hkm_ace_set_drain_threshold)
   - FastCache API (tiny_fc_room, tiny_fc_push_bulk)
   - Tiny Guard debugging system (5 functions)

## Benefits

- **Readability**: Giant 2k file → focused 1k core + 5 coherent modules
- **Maintainability**: Each box has a clear responsibility and boundaries
- **Build**: All modules compile successfully ✅

## Technical Details

- Phase 1: ChatGPT extracted config_box + publish_box (-625 lines)
- Phase 2-4: Claude extracted globals_box + phase6_wrappers_box + ace_guard_box (-461 lines)
- All extractions use .inc files (same translation unit, preserves static/TLS linkage)
- Fixed Makefile: Added tiny_sizeclass_hist_box.o to OBJS_BASE and BENCH_HAKMEM_OBJS_BASE

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 01:16:45 +09:00
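A minimal sketch of the `.inc` approach the commit describes: each box is textually included into hakmem_tiny.c, so everything stays in one translation unit and `static`/`__thread` definitions keep their internal linkage. Apart from this file, the include names below are assumptions for illustration:

```c
/* hakmem_tiny.c (sketch): box files are included, not compiled separately */
#include "hakmem_tiny_config_box.inc"          /* assumed name */
#include "hakmem_tiny_publish_box.inc"         /* assumed name */
#include "hakmem_tiny_globals_box.inc"         /* this file */
#include "hakmem_tiny_phase6_wrappers_box.inc" /* assumed name */
#include "hakmem_tiny_ace_guard_box.inc"       /* assumed name */
```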
// ============================================================================
// Global State
// ============================================================================
// Global pool instance (extern declared in hakmem_tiny.h)
TinyPool g_tiny_pool;
int g_tiny_initialized = 0; // Not static (extern in header for inline access)
// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
//
// Results:
// Phase 1 (Push - deferred free): +1 instruction, zero benefit
// Phase 2 (Pull - background refill): +77 instructions, -3% performance
//
// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
// - TLS Magazine capacity: 2048 items
// - Benchmark working set: 100 items
// - Magazine hit rate: 100% after warmup
// - Slow path never executed!
//
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
// - glibc: ~40 instructions/op (5-7× faster)
// - Gap is architectural (bitmap vs free-list, research features)
//
// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
// Decision: Enable by default (proven production-ready)
static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!)
// Optional: allow limited trylock-based refill during wrapper calls
static int g_wrap_tiny_refill = 0;
// Remote-free drain controls
static int g_remote_drain_thresh = 32; // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)
// ACE Learning Layer: Per-class remote drain thresholds
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};
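// Illustrative sketch (not in the original file): the tryrate is typically
// applied as a 1/N sampling gate in front of the threshold check, so most
// frees skip the drain attempt entirely. The helper and counter names here
// are hypothetical:
//
//   static __thread uint32_t s_drain_tick = 0;
//   static inline int tiny_should_try_drain(int cls, int pending) {
//       if ((++s_drain_tick % (uint32_t)g_remote_drain_tryrate) != 0) return 0;
//       return pending >= g_remote_drain_thresh_per_class[cls];
//   }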
// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
// Old: XOR RNG sampling (10-15 ns overhead)
// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)
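// Illustrative sketch (assumption; the real batching lives in
// hakmem_tiny_stats.h): per-thread counters accumulate locally and are
// flushed to the shared atomics every N events, which is how per-event
// overhead drops from ~10-15 ns (XOR RNG sampling) to ~0.5 ns:
//
//   static __thread uint32_t s_local_allocs = 0;  // hypothetical
//   // on each alloc:
//   //   if ((++s_local_allocs & 255) == 0) stats_flush_to_global(256);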
// Step 2: Slab Registry (Hash Table)
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];
PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];
// Registry lock
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;
// Phase 6.14: Runtime toggle for Registry ON/OFF
// O(N) sequential access is faster than O(1) random access for small N (8-32 slabs)
// Reason: L1 cache hit rate 95%+ (sequential) vs 50-70% (random hash)
static int g_use_registry = 1; // Default ON for thread-safety
// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
static int g_tiny_refill_max = 64; // HAKMEM_TINY_REFILL_MAX (default 64)
static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)
// hakmem_tiny_tls_list.h already included at top
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
static int g_tls_list_enable = 0; // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static int g_fast_enable = 1;
static int g_fastcache_enable = 1; // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];
typedef void* (*TinyHotAllocFn)(void);
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
static __thread void* g_fast_head[TINY_NUM_CLASSES];
static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
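// Illustrative sketch (not in the original file): g_fast_head/g_fast_count
// form a per-class TLS free list threaded through the blocks themselves, so
// a fast-path hit is a couple of loads and stores with no locks or atomics
// ("cls" is a hypothetical class index):
//
//   void* p = g_fast_head[cls];
//   if (p) {
//       g_fast_head[cls] = *(void**)p;  // next pointer stored in the block
//       g_fast_count[cls]--;
//       return p;
//   }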
static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);
uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];
// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
// NEW: Per-thread active slabs (up to 2 per class)
static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];
static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
    TinySlab* cand = g_tls_active_slab_a[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    cand = g_tls_active_slab_b[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    return NULL;
}
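// Usage sketch (assumption, not in the original file): on free, this keeps
// the owner check on the fast path. If the pointer falls inside one of the
// two TLS-active slabs, the block can be returned locally without consulting
// the registry:
//
//   TinySlab* owner = tls_active_owner_for_ptr(cls, ptr);
//   if (owner) { /* push onto owner's local freelist, skip registry lookup */ }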
// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
// Runtime toggle (global, defined in hakmem_config.c). Default is ON for the Box Refactor line.
extern int g_use_superslab;
#if !HAKMEM_BUILD_RELEASE
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
    if (!ptr) return;
    // ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE (ptr is non-NULL here)
    void* base_ptr = (void*)((uint8_t*)ptr - 1);
    if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
            tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
        } else {
            int slab_idx = slab_index_for(ss, base_ptr);
            if (slab_idx < 0) {
                tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
            } else {
                // Fail-Fast: class vs SuperSlab size_class must be consistent.
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                if (meta->class_idx != (uint8_t)cls) {
                    tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
                }
                size_t blk = g_tiny_class_sizes[cls];
                uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
                uintptr_t delta = (uintptr_t)base_ptr - base;
                if (blk == 0 || (delta % blk) != 0) {
                    tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
                } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) {
                    tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
                }
            }
        }
    }
    if (!__builtin_expect(g_debug_remote_guard, 0)) return;
    if (!g_use_superslab) return;
    SuperSlab* ss = hak_super_lookup(ptr);
    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
    int slab_idx = slab_index_for(ss, base_ptr);
    if (slab_idx >= 0) {
        tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
    }
}
#else
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
#endif
// Debug counters for SuperSlab investigation
#if HAKMEM_DEBUG_COUNTERS
int g_superslab_alloc_count = 0;
int g_superslab_fail_count = 0;
int g_superslab_free_count = 0; // Phase 7.6: Track SuperSlab frees
int g_empty_superslab_count = 0; // Phase 7.6: Track empty SuperSlabs detected
int g_magazine_push_count = 0; // Phase 7.6: Track Magazine pushes
int g_tiny_free_with_slab_count = 0; // Phase 7.6: Track tiny_free_with_slab calls
#endif
// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
#define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default)
static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class
static int g_empty_counts[TINY_NUM_CLASSES] = {0}; // Count of empty SuperSlabs
static int g_empty_reserve = -1; // Env: HAKMEM_TINY_SS_RESERVE (default=1)
static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default
static uint32_t g_ss_partial_interval = 4;
static _Atomic uint32_t g_ss_partial_epoch = 0;
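// Illustrative sketch (not in the original file): the epoch counter gates how
// often partial release is attempted, roughly once every g_ss_partial_interval
// ticks:
//
//   uint32_t e = atomic_fetch_add_explicit(&g_ss_partial_epoch, 1,
//                                          memory_order_relaxed);
//   if (g_ss_partial_enable && (e % g_ss_partial_interval) == 0) {
//       /* try to release one partially-empty SuperSlab */
//   }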
// Phase 6.24: Unified TLS slab cache (Medium fix)
// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
// Phase E4: 64B alignment for L1 cache optimization
__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64)));
static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];
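// Illustrative sketch (assumption, not in the original file): a seqlock-style
// broadcast. A tuner bumps g_tls_param_seq[cls] after updating the targets;
// each thread compares its cached g_tls_param_seen[cls] and re-reads the
// targets only when the sequence has moved ("tls->cap" is a hypothetical
// field):
//
//   uint32_t seq = atomic_load_explicit(&g_tls_param_seq[cls],
//                                       memory_order_acquire);
//   if (seq != g_tls_param_seen[cls]) {
//       tls->cap = atomic_load_explicit(&g_tls_target_cap[cls],
//                                       memory_order_relaxed);
//       g_tls_param_seen[cls] = seq;
//   }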
// ----------------------------------------------------------------------------
// Per-class partial SuperSlab ring (publish/adopt)
// ----------------------------------------------------------------------------
// Small ring of partial SuperSlabs per class, plus a single overflow slot
#ifndef SS_PARTIAL_RING
#define SS_PARTIAL_RING 64
#endif
static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];
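// Illustrative sketch (not in the original file): publish typically takes a
// round-robin slot via g_ss_partial_rr and CASes the SuperSlab into an empty
// ring entry, falling back to the overflow slot on contention ("cls" and
// "ss" are hypothetical locals):
//
//   uint32_t i = atomic_fetch_add_explicit(&g_ss_partial_rr[cls], 1,
//                    memory_order_relaxed) % SS_PARTIAL_RING;
//   SuperSlab* expected = NULL;
//   if (!atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[cls][i],
//           &expected, ss, memory_order_release, memory_order_relaxed)) {
//       atomic_store_explicit(&g_ss_partial_over[cls], ss, memory_order_release);
//   }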
// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};
_Atomic int g_ss_remote_seen = 0; // becomes 1 when any remote free occurs
static int g_ss_adopt_env = -2; // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
static _Atomic int g_ss_adopt_runtime = 0; // 0=inactive, 1=active
static _Atomic int g_ss_adopt_log_once = 0;
static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
    if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
        fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
                reason ? reason : "unknown", class_idx);
    }
}
static inline void tiny_adopt_gate_parse_env(void) {
    if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
        const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
        if (!env || *env == '\0') {
            g_ss_adopt_env = 0; // auto
        } else if (*env == '0') {
            g_ss_adopt_env = -1; // forced OFF
            atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
        } else {
            g_ss_adopt_env = 1; // forced ON
            atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
            tiny_adopt_gate_log_activation("env", -1);
        }
    }
}
int tiny_adopt_gate_should_publish(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}
int tiny_adopt_gate_should_adopt(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}
void tiny_adopt_gate_on_remote_seen(int class_idx) {
    tiny_adopt_gate_parse_env();
    atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
    if (g_ss_adopt_env == -1) return;
    int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
    if (prev == 0) {
        tiny_adopt_gate_log_activation("remote", class_idx);
    }
}
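// Usage sketch (assumption, not in the original file): producers consult the
// gate before paying the publish cost, and the first observed remote free
// arms it process-wide unless the env forced it OFF:
//
//   if (tiny_adopt_gate_should_publish()) { /* push ss into partial ring */ }
//   ...
//   tiny_adopt_gate_on_remote_seen(cls);   // called from the remote-free path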
// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers