## Summary

- Fixed the setenv segfault in bench_profile.h (with help from ChatGPT) by switching to going through RTLD_NEXT
- Added core/box/pool_zero_mode_box.h: unified ZERO_MODE management via an env cache
- core/hakmem_pool.c: memset control keyed on the zero mode (FULL/header/off)
- A/B test result: +15.34% improvement with ZERO_MODE=header (1M iterations, C6-heavy)

## Files Modified

- core/box/pool_api.inc.h: include pool_zero_mode_box.h
- core/bench_profile.h: glibc setenv → malloc+putenv (avoids the segfault)
- core/hakmem_pool.c: zero-mode lookup and control logic
- core/box/pool_zero_mode_box.h (new): enum/getter
- CURRENT_TASK.md: recorded Phase ML1 results

## Test Results

| Iterations | ZERO_MODE=full | ZERO_MODE=header | Improvement |
|-----------|----------------|------------------|-------------|
| 10K       | 3.06 M ops/s   | 3.17 M ops/s     | +3.65%      |
| 1M        | 23.71 M ops/s  | 27.34 M ops/s    | **+15.34%** |

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
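As a rough illustration of the env-cached zero-mode box described above, the sketch below shows one plausible shape for the enum/getter and the memset branch in the pool allocation path. The actual pool_zero_mode_box.h and hakmem_pool.c are not reproduced here; the type and function names, the header-size parameter, and the exact environment-variable spelling are assumptions for illustration only.

```c
#include <stdlib.h>
#include <string.h>

/* Hypothetical sketch only - names are assumed, not taken from the real header. */
typedef enum {
    POOL_ZERO_FULL = 0,   /* clear the whole block on allocation (safest, slowest)       */
    POOL_ZERO_HEADER,     /* clear only the block header (+15.34% in the 1M A/B run)     */
    POOL_ZERO_OFF         /* no clearing; the caller takes responsibility                */
} PoolZeroMode;

/* Parse the environment once and cache the result, so the hot path
 * only pays for an integer comparison. */
static inline PoolZeroMode pool_zero_mode(void) {
    static int cached = -1;  /* -1 = not parsed yet */
    if (cached < 0) {
        const char* env = getenv("ZERO_MODE");  /* spelled as in the A/B table above; may differ in the real header */
        if (env && strcmp(env, "header") == 0)   cached = POOL_ZERO_HEADER;
        else if (env && strcmp(env, "off") == 0) cached = POOL_ZERO_OFF;
        else                                     cached = POOL_ZERO_FULL;
    }
    return (PoolZeroMode)cached;
}

/* How the allocation path might branch on the mode (illustrative only). */
static inline void pool_zero_block(void* block, size_t block_size, size_t header_size) {
    switch (pool_zero_mode()) {
        case POOL_ZERO_FULL:   memset(block, 0, block_size);  break;
        case POOL_ZERO_HEADER: memset(block, 0, header_size); break;
        case POOL_ZERO_OFF:    /* skip */                     break;
    }
}
```

The env-cache framing in the summary is what makes the header-only variant pay off: the getenv/strcmp cost is incurred once rather than on every allocation, leaving only the smaller memset on the hot path.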
// ============================================================================
// Global State
// ============================================================================

// Global pool instance (extern declared in hakmem_tiny.h)
TinyPool g_tiny_pool;
int g_tiny_initialized = 0; // Not static (extern in header for inline access)
// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
//
// Results:
// Phase 1 (Push - deferred free): +1 instruction, zero benefit
// Phase 2 (Pull - background refill): +77 instructions, -3% performance
//
// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
// - TLS Magazine capacity: 2048 items
// - Benchmark working set: 100 items
// - Magazine hit rate: 100% after warmup
// - Slow path never executed!
//
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
// - glibc: ~40 instructions/op (5-7× faster)
// - Gap is architectural (bitmap vs free-list, research features)
//
// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
// Decision: Enable by default (proven production-ready)
static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!)
// Optional: allow limited trylock-based refill during wrapper calls
static int g_wrap_tiny_refill = 0;
// Remote-free drain controls
static int g_remote_drain_thresh = 32;  // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)

// ACE Learning Layer: Per-class remote drain thresholds
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};
// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
// Old: XOR RNG sampling (10-15 ns overhead)
// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)

// Step 2: Slab Registry (Hash Table)
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];

PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];

// Registry lock
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;

// Phase 6.14: Runtime toggle for Registry ON/OFF
// O(N) Sequential Access is faster than O(1) Random Access for Small-N (8-32 slabs)
// Reason: L1 cache hit rate 95%+ (Sequential) vs 50-70% (Random Hash)
static int g_use_registry = 1; // Default ON for thread-safety

// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
static int g_tiny_refill_max = 64;      // HAKMEM_TINY_REFILL_MAX (default 64)
static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)

// hakmem_tiny_tls_list.h already included at top
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
static int g_tls_list_enable = 0; // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static int g_fast_enable = 1;
static int g_fastcache_enable = 1; // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];


typedef void* (*TinyHotAllocFn)(void);
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
static __thread void* g_fast_head[TINY_NUM_CLASSES];
static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);

uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];

// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
// NEW: Per-thread active slabs (up to 2 per class)
static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];

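// Return whichever of this thread's two active slabs (A/B) for class_idx
// contains ptr (i.e. ptr falls inside [base, base + TINY_SLAB_SIZE)),
// or NULL if neither does.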
static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
    TinySlab* cand = g_tls_active_slab_a[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    cand = g_tls_active_slab_b[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    return NULL;
}

// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line.
extern int g_use_superslab;

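// Debug-build validation of a freshly returned allocation. With fail-fast
// level >= 2 it checks that the pointer resolves to a live SuperSlab and that
// the slab index, size class, block alignment, and capacity range all agree,
// aborting on any mismatch. When the remote-free guard is enabled it also
// records the allocation for remote-free tracking. Release builds compile
// this helper to a no-op.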
#if !HAKMEM_BUILD_RELEASE
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
    if (!ptr) return;
    // ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE
    void* base_ptr = ptr ? (void*)((uint8_t*)ptr - 1) : NULL;
    if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
            tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
        } else {
            int slab_idx = slab_index_for(ss, base_ptr);
            if (slab_idx < 0) {
                tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
            } else {
                // Fail-Fast: class vs SuperSlab size_class must be consistent.
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                if (meta->class_idx != (uint8_t)cls) {
                    tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
                }
                size_t blk = g_tiny_class_sizes[cls];
                uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
                uintptr_t delta = (uintptr_t)base_ptr - base;
                if (blk == 0 || (delta % blk) != 0) {
                    tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
                } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) {
                    tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
                }
            }
        }
    }
    if (!__builtin_expect(g_debug_remote_guard, 0)) return;
    if (!g_use_superslab) return;
    SuperSlab* ss = hak_super_lookup(ptr);
    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
    int slab_idx = slab_index_for(ss, base_ptr);
    if (slab_idx >= 0) {
        tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
    }
}
#else
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
#endif

// Debug counters for SuperSlab investigation
#if HAKMEM_DEBUG_COUNTERS
int g_superslab_alloc_count = 0;
int g_superslab_fail_count = 0;
int g_superslab_free_count = 0;       // Phase 7.6: Track SuperSlab frees
int g_empty_superslab_count = 0;      // Phase 7.6: Track empty SuperSlabs detected
int g_magazine_push_count = 0;        // Phase 7.6: Track Magazine pushes
int g_tiny_free_with_slab_count = 0;  // Phase 7.6: Track tiny_free_with_slab calls
#endif

// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
#define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default)
static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class
static int g_empty_counts[TINY_NUM_CLASSES] = {0};      // Count of empty SuperSlabs
static int g_empty_reserve = -1;                        // Env: HAKMEM_TINY_SS_RESERVE (default=1)
static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default
static uint32_t g_ss_partial_interval = 4;
static _Atomic uint32_t g_ss_partial_epoch = 0;

// Phase 6.24: Unified TLS slab cache (Medium fix)
// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
// Phase E4: 64B alignment for L1 cache optimization
__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64)));
static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];

// ----------------------------------------------------------------------------
// Per-class partial SuperSlab slot (single-slot publish/adopt)
// ----------------------------------------------------------------------------
// Small ring of partial SuperSlabs per class (publish/adopt)
#ifndef SS_PARTIAL_RING
#define SS_PARTIAL_RING 64
#endif
static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];

// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};
_Atomic int g_ss_remote_seen = 0;           // becomes 1 when any remote free occurs
static int g_ss_adopt_env = -2;             // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
static _Atomic int g_ss_adopt_runtime = 0;  // 0=inactive, 1=active
static _Atomic int g_ss_adopt_log_once = 0;

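// Adopt gate: publishing/adopting partial SuperSlabs stays inactive until
// either HAKMEM_TINY_SS_ADOPT forces it on (any value not starting with '0';
// "0" forces it off permanently) or the first remote free is observed via
// tiny_adopt_gate_on_remote_seen(). The first activation is logged once.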
static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
    if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
        fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
                reason ? reason : "unknown", class_idx);
    }
}

static inline void tiny_adopt_gate_parse_env(void) {
    if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
        const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
        if (!env || *env == '\0') {
            g_ss_adopt_env = 0; // auto
        } else if (*env == '0') {
            g_ss_adopt_env = -1; // forced OFF
            atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
        } else {
            g_ss_adopt_env = 1; // forced ON
            atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
            tiny_adopt_gate_log_activation("env", -1);
        }
    }
}

int tiny_adopt_gate_should_publish(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}

int tiny_adopt_gate_should_adopt(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}

void tiny_adopt_gate_on_remote_seen(int class_idx) {
    tiny_adopt_gate_parse_env();
    atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
    if (g_ss_adopt_env == -1) return;
    int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
    if (prev == 0) {
        tiny_adopt_gate_log_activation("remote", class_idx);
    }
}

// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers