Files
hakmem/core/hakmem_tiny_alloc.inc
Moe Charm (CI) 8052e8b320 Phase 24-26: Hot path atomic telemetry prune (+2.00% cumulative)
Summary:
- Phase 24 (alloc stats): +0.93% GO
- Phase 25 (free stats): +1.07% GO
- Phase 26 (diagnostics): -0.33% NEUTRAL (code cleanliness)
- Total: 11 atomics compiled-out, +2.00% improvement

Phase 24: OBSERVE tax prune (tiny_class_stats_box.h)
- Added HAKMEM_TINY_CLASS_STATS_COMPILED (default: 0)
- Wrapped 5 stats functions: uc_miss, warm_hit, shared_lock, tls_carve_*
- Result: +0.93% (baseline 56.675M vs compiled-in 56.151M ops/s)

Phase 25: Tiny free stats prune (tiny_superslab_free.inc.h)
- Added HAKMEM_TINY_FREE_STATS_COMPILED (default: 0)
- Wrapped g_free_ss_enter atomic in free hot path
- Result: +1.07% (baseline 57.017M vs compiled-in 56.415M ops/s)

Phase 26: Hot path diagnostic atomics prune
- Added 5 compile gates for low-frequency error counters:
  - HAKMEM_TINY_C7_FREE_COUNT_COMPILED
  - HAKMEM_TINY_HDR_MISMATCH_LOG_COMPILED
  - HAKMEM_TINY_HDR_META_MISMATCH_COMPILED
  - HAKMEM_TINY_METRIC_BAD_CLASS_COMPILED
  - HAKMEM_TINY_HDR_META_FAST_COMPILED
- Result: -0.33% NEUTRAL (within noise, kept for cleanliness)

Alignment with mimalloc principles:
- "No atomics on hot path" - telemetry moved to compile-time opt-in
- Fixed per-op tax elimination
- Production builds: maximum performance (atomics compiled-out)
- Research builds: full diagnostics (COMPILED=1)

Generated with Claude Code
https://claude.com/claude-code

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-16 05:35:11 +09:00

344 lines
14 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ============================================================================
// Box TLS-SLL API
// ============================================================================
#include "box/tls_sll_box.h"
#include "front/tiny_heap_v2.h"
// Optional: track alloc->class routing for sizes near 1KB (env: HAKMEM_TINY_ALLOC_1024_METRIC)
extern _Atomic uint64_t g_tiny_alloc_ge1024[TINY_NUM_CLASSES];
static inline void tiny_diag_track_size_ge1024(size_t req_size, int class_idx) {
if (__builtin_expect(req_size < 1024, 1)) return;
static int s_metric_en = -1;
if (__builtin_expect(s_metric_en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_ALLOC_1024_METRIC");
s_metric_en = (e && *e && *e != '0') ? 1 : 0;
}
if (!__builtin_expect(s_metric_en, 0)) return;
if (__builtin_expect(class_idx >= 0 && class_idx < TINY_NUM_CLASSES, 1)) {
atomic_fetch_add_explicit(&g_tiny_alloc_ge1024[class_idx], 1, memory_order_relaxed);
} else {
// Phase 26D: Compile-out g_metric_bad_class_once atomic (default OFF)
#if HAKMEM_METRIC_BAD_CLASS_COMPILED
static _Atomic int g_metric_bad_class_once = 0;
if (atomic_fetch_add_explicit(&g_metric_bad_class_once, 1, memory_order_relaxed) == 0) {
fprintf(stderr, "[ALLOC_1024_METRIC] bad class_idx=%d size=%zu\n", class_idx, req_size);
}
#else
// No-op when compiled out
(void)0;
#endif
}
}
// ============================================================================
// Step 3: Cold-path outline - Wrapper Context Handler
// ============================================================================
// Purpose: Handle allocations during wrapper calls (rare execution)
// Rationale: Avoid re-entrancy hazards with pthread locks during wrapper calls
// Step 3d: Force inline for readability without performance loss
__attribute__((always_inline))
static inline void* hak_tiny_alloc_wrapper(int class_idx) {
ROUTE_BEGIN(class_idx);
// Wrapper-context fast path: magazine-only (never take locks or refill)
tiny_small_mags_init_once();
if (__builtin_expect(class_idx > 3, 0)) tiny_mag_init_if_needed(class_idx);
TinyTLSMag* mag = &g_tls_mags[class_idx];
if (mag->top > 0) {
void* p = mag->items[--mag->top].ptr;
HAK_RET_ALLOC(class_idx, p);
}
// Try TLS active slabs (owner-only, lock-free)
TinySlab* tls = g_tls_active_slab_a[class_idx];
if (!(tls && tls->free_count > 0)) tls = g_tls_active_slab_b[class_idx];
if (tls && tls->free_count > 0) {
tiny_remote_drain_owner(tls);
if (tls->free_count > 0) {
int block_idx = hak_tiny_find_free_block(tls);
if (block_idx >= 0) {
hak_tiny_set_used(tls, block_idx);
tls->free_count--;
size_t bs = g_tiny_class_sizes[class_idx];
void* p = (char*)tls->base + (block_idx * bs);
HAK_RET_ALLOC(class_idx, p);
}
}
}
// Optional: attempt limited refill under trylock (no remote drain)
if (g_wrap_tiny_refill) {
pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
if (pthread_mutex_trylock(lock) == 0) {
TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
if (slab && slab->free_count > 0) {
int room = mag->cap - mag->top;
if (room > 16) room = 16; // wrapper refill is small and quick
if (room > slab->free_count) room = slab->free_count;
if (room > 0) {
size_t bs = g_tiny_class_sizes[class_idx];
void* ret = NULL;
for (int i = 0; i < room; i++) {
int idx = hak_tiny_find_free_block(slab);
if (idx < 0) break;
hak_tiny_set_used(slab, idx);
slab->free_count--;
void* p = (char*)slab->base + (idx * bs);
if (i < room - 1) {
mag->items[mag->top].ptr = p;
mag->top++;
} else {
ret = p; // return one directly
}
}
if (slab->free_count == 0) {
move_to_full_list(class_idx, slab);
}
pthread_mutex_unlock(lock);
if (ret) { HAK_RET_ALLOC(class_idx, ret); }
} else {
pthread_mutex_unlock(lock);
}
} else {
pthread_mutex_unlock(lock);
}
}
}
return NULL; // empty → fallback to next allocator tier
}
void* hak_tiny_alloc(size_t size) {
#if !HAKMEM_BUILD_RELEASE
if (!g_tiny_initialized) hak_tiny_init();
#else
if (__builtin_expect(!g_tiny_initialized, 0)) {
hak_tiny_init();
}
#endif
// Default (safe): Avoid using Tiny during wrapper callsTLSガード or 関数)
// If HAKMEM_WRAP_TINY=1, allow Tiny even when called from wrapper.
#if !HAKMEM_BUILD_RELEASE
# if HAKMEM_WRAPPER_TLS_GUARD
if (!g_wrap_tiny_enabled && __builtin_expect(g_tls_in_wrapper != 0, 0)) {
static int log1 = 0;
if (log1 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: in_wrapper\n"); log1++; }
return NULL;
}
# else
extern int hak_in_wrapper(void);
if (!g_wrap_tiny_enabled && __builtin_expect(hak_in_wrapper() != 0, 0)) {
static int log2 = 0;
if (log2 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: hak_in_wrapper\n"); log2++; }
return NULL;
}
# endif
#endif
// ========================================================================
// Cooperative stats polling (SIGUSR1 trigger safe point)
hak_tiny_stats_poll();
// ========================================================================
// Phase 6-1.6: Metadata Header Front (optional)
// ========================================================================
// Design: "Simple Front + Smart Back" - inspired by Mid-Large HAKX +171%
// - 3-4 instruction fast path (Phase 6-1 style)
// - Existing SuperSlab + ACE + Learning backend
//
// NOTE:
// - Phase 6-1.5 (HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) はレガシー経路として
// アーカイブ済み。現在は Metadata variant のみサポート。
#ifdef HAKMEM_TINY_PHASE6_METADATA
return hak_tiny_alloc_metadata(size);
#endif
// ========================================================================
// 1. Size → class index
int class_idx = hak_tiny_size_to_class(size);
if (class_idx < 0) {
static int log3 = 0;
if (log3 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: class_idx < 0 for size %zu\n", size); log3++; }
return NULL; // >1KB
}
#define HAK_RET_ALLOC_WITH_METRIC(ptr) do { \
tiny_diag_track_size_ge1024(size, class_idx); \
HAK_RET_ALLOC(class_idx, (ptr)); \
} while(0)
// Route fingerprint begin (debug-only; no-op unless HAKMEM_ROUTE=1)
ROUTE_BEGIN(class_idx);
do {
static int g_alloc_ring = -1;
if (__builtin_expect(g_alloc_ring == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_ALLOC_RING");
g_alloc_ring = (e && *e && *e != '0') ? 1 : 0;
}
if (g_alloc_ring) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, (uint16_t)class_idx, (void*)(uintptr_t)size, 0);
}
} while (0);
// Phase 13-A/B: Tiny Heap v2 front (tcache-like, A/B)
if (__builtin_expect(tiny_heap_v2_enabled() && front_prune_heapv2_enabled() && class_idx <= 3, 0)) {
void* base = tiny_heap_v2_alloc_by_class(class_idx);
if (base) {
front_metrics_heapv2_hit(class_idx);
HAK_RET_ALLOC_WITH_METRIC(base); // Header write + return USER pointer
} else {
front_metrics_heapv2_miss(class_idx);
}
// Fall through to existing front path if HeapV2 misses
}
#if HAKMEM_TINY_MINIMAL_FRONT
// Minimal Front for hot tiny classes (bench-focused):
// SLL direct pop → minimal refill → pop, bypassing other layers.
if (__builtin_expect(class_idx <= 3, 1)) {
void* head = NULL;
if (tls_sll_pop(class_idx, &head)) {
HAK_RET_ALLOC_WITH_METRIC(head);
}
// Refill a small batch directly from TLS-cached SuperSlab
#if HAKMEM_TINY_P0_BATCH_REFILL
(void)sll_refill_batch_from_ss(class_idx, 32);
#else
(void)sll_refill_small_from_ss(class_idx, 32);
#endif
if (tls_sll_pop(class_idx, &head)) {
HAK_RET_ALLOC_WITH_METRIC(head);
}
// Fall through to slow path if still empty
}
#endif
// Ultra-Front - REMOVED (dead code cleanup 2025-11-27)
if (__builtin_expect(!g_debug_fast0, 1)) {
#ifdef HAKMEM_TINY_BENCH_FASTPATH
if (__builtin_expect(class_idx <= HAKMEM_TINY_BENCH_TINY_CLASSES, 1)) {
if (__builtin_expect(class_idx <= 3, 1)) {
unsigned char* done = &g_tls_bench_warm_done[class_idx];
if (__builtin_expect(*done == 0, 0)) {
int warm = (class_idx == 0) ? HAKMEM_TINY_BENCH_WARMUP8 :
(class_idx == 1) ? HAKMEM_TINY_BENCH_WARMUP16 :
(class_idx == 2) ? HAKMEM_TINY_BENCH_WARMUP32 :
HAKMEM_TINY_BENCH_WARMUP64;
#if HAKMEM_TINY_P0_BATCH_REFILL
if (warm > 0) (void)sll_refill_batch_from_ss(class_idx, warm);
#else
if (warm > 0) (void)sll_refill_small_from_ss(class_idx, warm);
#endif
*done = 1;
}
}
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
tiny_small_mags_init_once();
if (class_idx > 3) tiny_mag_init_if_needed(class_idx);
#endif
void* head = NULL;
if (tls_sll_pop(class_idx, &head)) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 0);
HAK_RET_ALLOC_WITH_METRIC(head);
}
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
TinyTLSMag* mag = &g_tls_mags[class_idx];
int t = mag->top;
if (__builtin_expect(t > 0, 1)) {
void* p = mag->items[--t].ptr;
mag->top = t;
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, p, 1);
HAK_RET_ALLOC_WITH_METRIC(p);
}
#endif
int bench_refill = (class_idx == 0) ? HAKMEM_TINY_BENCH_REFILL8 :
(class_idx == 1) ? HAKMEM_TINY_BENCH_REFILL16 :
(class_idx == 2) ? HAKMEM_TINY_BENCH_REFILL32 :
HAKMEM_TINY_BENCH_REFILL64;
#if HAKMEM_TINY_P0_BATCH_REFILL
if (__builtin_expect(sll_refill_batch_from_ss(class_idx, bench_refill) > 0, 0)) {
#else
if (__builtin_expect(sll_refill_small_from_ss(class_idx, bench_refill) > 0, 0)) {
#endif
if (tls_sll_pop(class_idx, &head)) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 2);
HAK_RET_ALLOC_WITH_METRIC(head);
}
}
// fallthrough to slow path on miss
}
#endif
// TinyHotMag front: fast-tierが枯渇したとき、キャッシュを再補充してから利用する
if (__builtin_expect(g_hotmag_enable && class_idx <= 2 && g_fast_head[class_idx] == NULL, 0)) {
hotmag_init_if_needed(class_idx);
TinyHotMag* hm = &g_tls_hot_mag[class_idx];
void* hotmag_ptr = hotmag_pop(class_idx);
if (__builtin_expect(hotmag_ptr == NULL, 0)) {
if (hotmag_try_refill(class_idx, hm) > 0) {
hotmag_ptr = hotmag_pop(class_idx);
}
}
if (__builtin_expect(hotmag_ptr != NULL, 1)) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, hotmag_ptr, 3);
HAK_RET_ALLOC_WITH_METRIC(hotmag_ptr);
}
}
if (g_hot_alloc_fn[class_idx] != NULL) {
void* fast_hot = NULL;
switch (class_idx) {
case 0:
fast_hot = tiny_hot_pop_class0();
break;
case 1:
fast_hot = tiny_hot_pop_class1();
break;
case 2:
fast_hot = tiny_hot_pop_class2();
break;
case 3:
fast_hot = tiny_hot_pop_class3();
break;
default:
fast_hot = NULL;
break;
}
if (__builtin_expect(fast_hot != NULL, 1)) {
#if HAKMEM_BUILD_DEBUG
g_tls_hit_count[class_idx]++;
#endif
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast_hot, 4);
HAK_RET_ALLOC_WITH_METRIC(fast_hot);
}
}
hak_base_ptr_t fast = tiny_fast_pop(class_idx);
if (__builtin_expect(!hak_base_is_null(fast), 0)) {
void* fast_raw = HAK_BASE_TO_RAW(fast);
#if HAKMEM_BUILD_DEBUG
g_tls_hit_count[class_idx]++;
#endif
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast_raw, 5);
HAK_RET_ALLOC_WITH_METRIC(fast_raw);
}
} else {
tiny_debug_ring_record(TINY_RING_EVENT_FRONT_BYPASS, (uint16_t)class_idx, NULL, 0);
}
void* slow_ptr = hak_tiny_alloc_slow(size, class_idx);
if (slow_ptr) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, slow_ptr, 6);
HAK_RET_ALLOC_WITH_METRIC(slow_ptr); // Increment stats for slow path success
}
#if !HAKMEM_BUILD_RELEASE
tiny_alloc_dump_tls_state(class_idx, "fail", &g_tls_slabs[class_idx]);
#endif
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_NULL, (uint16_t)class_idx, NULL, 0);
return slow_ptr;
}
#undef HAK_RET_ALLOC_WITH_METRIC