hakmem/core/hakmem_tiny_globals_box.inc
Moe Charm (CI) acc64f2438 Phase ML1: reduce Pool v1 memset overhead (89.73%) (+15.34% improvement)
## Summary
- Fixed the setenv segfault in bench_profile.h (fix contributed via ChatGPT; switched to going through RTLD_NEXT)
- Added core/box/pool_zero_mode_box.h: ZERO_MODE is now managed in one place via a cached environment lookup (see the sketch after this list)
- core/hakmem_pool.c now gates its memset on the zero mode (full / header / off)
- A/B test result: +15.34% improvement with ZERO_MODE=header (1M iterations, C6-heavy)
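
For illustration only, a minimal sketch of what an env-cached zero-mode getter can look like (the enum values, the `ZERO_MODE` variable name, and the function name are assumptions here, not the actual `pool_zero_mode_box.h` API):

```c
#include <stdlib.h>
#include <string.h>

typedef enum {
    POOL_ZERO_FULL,   /* memset the whole block */
    POOL_ZERO_HEADER, /* clear only the header region */
    POOL_ZERO_OFF     /* skip the memset entirely */
} pool_zero_mode_t;

static pool_zero_mode_t pool_zero_mode(void) {
    static int cached = -1;                    /* parsed once, then reused */
    if (cached < 0) {
        const char* s = getenv("ZERO_MODE");   /* variable name assumed */
        if (s && strcmp(s, "header") == 0)      cached = POOL_ZERO_HEADER;
        else if (s && strcmp(s, "off") == 0)    cached = POOL_ZERO_OFF;
        else                                    cached = POOL_ZERO_FULL;
    }
    return (pool_zero_mode_t)cached;
}
```

The point of the cache is that the environment is parsed once rather than on every pool allocation.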

## Files Modified
- core/box/pool_api.inc.h: include pool_zero_mode_box.h
- core/bench_profile.h: glibc setenv → malloc+putenv to avoid the segfault (see the sketch after this list)
- core/hakmem_pool.c: zero-mode lookup and memset control logic
- core/box/pool_zero_mode_box.h (new): enum and getter
- CURRENT_TASK.md: record the Phase ML1 results
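
As context for the bench_profile.h change, the setenv → malloc+putenv pattern looks roughly like this (the helper name is hypothetical; putenv(3) keeps the given string in the environment, so the buffer must stay allocated for the lifetime of the process):

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical helper: build "NAME=VALUE" on the heap and hand it to putenv().
 * Unlike setenv(), the string becomes part of environ, so it is intentionally
 * never freed. */
static int set_env_via_putenv(const char* name, const char* value) {
    size_t len = strlen(name) + 1 + strlen(value) + 1;  /* NAME '=' VALUE '\0' */
    char* kv = malloc(len);
    if (!kv) return -1;
    snprintf(kv, len, "%s=%s", name, value);
    return putenv(kv);
}
```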

## Test Results
| Iterations | ZERO_MODE=full | ZERO_MODE=header | Improvement |
|-----------|----------------|-----------------|------------|
| 10K       | 3.06 M ops/s   | 3.17 M ops/s    | +3.65%     |
| 1M        | 23.71 M ops/s  | 27.34 M ops/s   | **+15.34%** |

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-10 09:08:18 +09:00


// ============================================================================
// Global State
// ============================================================================
// Global pool instance (extern declared in hakmem_tiny.h)
TinyPool g_tiny_pool;
int g_tiny_initialized = 0; // Not static (extern in header for inline access)
// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
//
// Results:
// Phase 1 (Push - deferred free): +1 instruction, zero benefit
// Phase 2 (Pull - background refill): +77 instructions, -3% performance
//
// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
// - TLS Magazine capacity: 2048 items
// - Benchmark working set: 100 items
// - Magazine hit rate: 100% after warmup
// - Slow path never executed!
//
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
// - glibc: ~40 instructions/op (5-7× faster)
// - Gap is architectural (bitmap vs free-list, research features)
//
// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
// Decision: Enable by default (proven production-ready)
static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!)
// Optional: allow limited trylock-based refill during wrapper calls
static int g_wrap_tiny_refill = 0;
// Remote-free drain controls
static int g_remote_drain_thresh = 32; // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)
// ACE Learning Layer: Per-class remote drain thresholds
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};
// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
// Old: XOR RNG sampling (10-15 ns overhead)
// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)
// Step 2: Slab Registry (Hash Table)
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];
PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];
// Registry lock
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;
// Phase 6.14: Runtime toggle for Registry ON/OFF
// O(N) sequential access is faster than O(1) random access for small N (8-32 slabs)
// Reason: L1 cache hit rate 95%+ (sequential) vs 50-70% (random hash)
static int g_use_registry = 1; // Default ON for thread-safety
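// Illustration (rough numbers, assuming pointer-sized entries and 64 B cache
// lines): 32 sequential entries span roughly 256 B, i.e. about 4 cache lines,
// so a linear scan stays on a handful of hot lines, whereas a hashed lookup
// tends to land on a different, cold line every time. At this scale the O(N)
// scan wins despite the worse asymptotics.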
// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
static int g_tiny_refill_max = 64; // HAKMEM_TINY_REFILL_MAX (default 64)
static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)
// hakmem_tiny_tls_list.h already included at top
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
static int g_tls_list_enable = 0; // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static int g_fast_enable = 1;
static int g_fastcache_enable = 1; // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];
typedef void* (*TinyHotAllocFn)(void);
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
static __thread void* g_fast_head[TINY_NUM_CLASSES];
static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);
uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];
// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
// NEW: Per-thread active slabs (up to 2 per class)
static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];
static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
    TinySlab* cand = g_tls_active_slab_a[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    cand = g_tls_active_slab_b[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    return NULL;
}
// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line.
extern int g_use_superslab;
#if !HAKMEM_BUILD_RELEASE
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
    if (!ptr) return;
    // ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE
    void* base_ptr = ptr ? (void*)((uint8_t*)ptr - 1) : NULL;
    if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
            tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
        } else {
            int slab_idx = slab_index_for(ss, base_ptr);
            if (slab_idx < 0) {
                tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
            } else {
                // Fail-Fast: class vs SuperSlab size_class must be consistent.
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                if (meta->class_idx != (uint8_t)cls) {
                    tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
                }
                size_t blk = g_tiny_class_sizes[cls];
                uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
                uintptr_t delta = (uintptr_t)base_ptr - base;
                if (blk == 0 || (delta % blk) != 0) {
                    tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
                } else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) {
                    tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
                }
            }
        }
    }
    if (!__builtin_expect(g_debug_remote_guard, 0)) return;
    if (!g_use_superslab) return;
    SuperSlab* ss = hak_super_lookup(ptr);
    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
    int slab_idx = slab_index_for(ss, base_ptr);
    if (slab_idx >= 0) {
        tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
    }
}
#else
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
#endif
// Debug counters for SuperSlab investigation
#if HAKMEM_DEBUG_COUNTERS
int g_superslab_alloc_count = 0;
int g_superslab_fail_count = 0;
int g_superslab_free_count = 0; // Phase 7.6: Track SuperSlab frees
int g_empty_superslab_count = 0; // Phase 7.6: Track empty SuperSlabs detected
int g_magazine_push_count = 0; // Phase 7.6: Track Magazine pushes
int g_tiny_free_with_slab_count = 0; // Phase 7.6: Track tiny_free_with_slab calls
#endif
// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
#define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default)
static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class
static int g_empty_counts[TINY_NUM_CLASSES] = {0}; // Count of empty SuperSlabs
static int g_empty_reserve = -1; // Env: HAKMEM_TINY_SS_RESERVE (default=1)
static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default
static uint32_t g_ss_partial_interval = 4;
static _Atomic uint32_t g_ss_partial_epoch = 0;
// Phase 6.24: Unified TLS slab cache (Medium fix)
// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
// Phase E4: 64B alignment for L1 cache optimization
__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64)));
static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];
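// Note (pattern inferred from the names, not verified in this file): the
// atomic g_tls_target_* values appear to broadcast per-class cap/refill/spill
// targets, with g_tls_param_seq / g_tls_trim_epoch bumped on updates; each
// thread compares its g_tls_param_seen / g_tls_trim_seen copies against them
// to decide when to re-read the targets or perform a trim.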
// ----------------------------------------------------------------------------
// Per-class partial SuperSlab publish/adopt
// ----------------------------------------------------------------------------
// Small ring of partial SuperSlabs per class (publish/adopt)
#ifndef SS_PARTIAL_RING
#define SS_PARTIAL_RING 64
#endif
static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];
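// Layout: per class, SS_PARTIAL_RING (default 64, overridable at compile
// time) atomic slots for published partial SuperSlabs, a round-robin cursor
// (g_ss_partial_rr, presumably used to pick the next slot), and one overflow
// slot (g_ss_partial_over), presumably used when the ring is full.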
// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};
_Atomic int g_ss_remote_seen = 0; // becomes 1 when any remote free occurs
static int g_ss_adopt_env = -2; // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
static _Atomic int g_ss_adopt_runtime = 0; // 0=inactive, 1=active
static _Atomic int g_ss_adopt_log_once = 0;
static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
    if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
        fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
                reason ? reason : "unknown", class_idx);
    }
}
static inline void tiny_adopt_gate_parse_env(void) {
    if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
        const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
        if (!env || *env == '\0') {
            g_ss_adopt_env = 0; // auto
        } else if (*env == '0') {
            g_ss_adopt_env = -1; // forced OFF
            atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
        } else {
            g_ss_adopt_env = 1; // forced ON
            atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
            tiny_adopt_gate_log_activation("env", -1);
        }
    }
}
int tiny_adopt_gate_should_publish(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}
int tiny_adopt_gate_should_adopt(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}
void tiny_adopt_gate_on_remote_seen(int class_idx) {
    tiny_adopt_gate_parse_env();
    atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
    if (g_ss_adopt_env == -1) return;
    int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
    if (prev == 0) {
        tiny_adopt_gate_log_activation("remote", class_idx);
    }
}
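// Gate summary (behaviour as implemented above):
//   HAKMEM_TINY_SS_ADOPT unset or empty      -> auto: publish/adopt stay inactive
//     until the first remote free flips g_ss_adopt_runtime via
//     tiny_adopt_gate_on_remote_seen().
//   HAKMEM_TINY_SS_ADOPT starting with '0'   -> forced OFF, even after remote frees.
//   Any other value                          -> forced ON from the first lookup.
// Activation is logged to stderr exactly once ("[ADOPT_GATE] activated ...").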
// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers