Files
hakmem/core/hakmem_tiny_alloc.inc
Moe Charm (CI) 52386401b3 Debug Counters Implementation - Clean History
Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 12:31:14 +09:00

286 lines
12 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ============================================================================
// Step 3: Cold-path outline - Wrapper Context Handler
// ============================================================================
// Purpose: Handle allocations during wrapper calls (rare execution)
// Rationale: Avoid re-entrancy hazards with pthread locks during wrapper calls
// Step 3d: Force inline for readability without performance loss
__attribute__((always_inline))
static inline void* hak_tiny_alloc_wrapper(int class_idx) {
// Wrapper-context fast path: magazine-only (never take locks or refill)
tiny_small_mags_init_once();
if (__builtin_expect(class_idx > 3, 0)) tiny_mag_init_if_needed(class_idx);
TinyTLSMag* mag = &g_tls_mags[class_idx];
if (mag->top > 0) {
void* p = mag->items[--mag->top].ptr;
HAK_RET_ALLOC(class_idx, p);
}
// Try TLS active slabs (owner-only, lock-free)
TinySlab* tls = g_tls_active_slab_a[class_idx];
if (!(tls && tls->free_count > 0)) tls = g_tls_active_slab_b[class_idx];
if (tls && tls->free_count > 0) {
tiny_remote_drain_owner(tls);
if (tls->free_count > 0) {
int block_idx = hak_tiny_find_free_block(tls);
if (block_idx >= 0) {
hak_tiny_set_used(tls, block_idx);
tls->free_count--;
size_t bs = g_tiny_class_sizes[class_idx];
void* p = (char*)tls->base + (block_idx * bs);
HAK_RET_ALLOC(class_idx, p);
}
}
}
// Optional: attempt limited refill under trylock (no remote drain)
if (g_wrap_tiny_refill) {
pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
if (pthread_mutex_trylock(lock) == 0) {
TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
if (slab && slab->free_count > 0) {
int room = mag->cap - mag->top;
if (room > 16) room = 16; // wrapper refill is small and quick
if (room > slab->free_count) room = slab->free_count;
if (room > 0) {
size_t bs = g_tiny_class_sizes[class_idx];
void* ret = NULL;
for (int i = 0; i < room; i++) {
int idx = hak_tiny_find_free_block(slab);
if (idx < 0) break;
hak_tiny_set_used(slab, idx);
slab->free_count--;
void* p = (char*)slab->base + (idx * bs);
if (i < room - 1) {
mag->items[mag->top].ptr = p;
mag->top++;
} else {
ret = p; // return one directly
}
}
if (slab->free_count == 0) {
move_to_full_list(class_idx, slab);
}
pthread_mutex_unlock(lock);
if (ret) { HAK_RET_ALLOC(class_idx, ret); }
} else {
pthread_mutex_unlock(lock);
}
} else {
pthread_mutex_unlock(lock);
}
}
}
return NULL; // empty → fallback to next allocator tier
}
void* hak_tiny_alloc(size_t size) {
#if !HAKMEM_BUILD_RELEASE
if (!g_tiny_initialized) hak_tiny_init();
#else
if (__builtin_expect(!g_tiny_initialized, 0)) {
hak_tiny_init();
}
#endif
// Default (safe): Avoid using Tiny during wrapper callsTLSガード or 関数)
// If HAKMEM_WRAP_TINY=1, allow Tiny even when called from wrapper.
#if !HAKMEM_BUILD_RELEASE
# if HAKMEM_WRAPPER_TLS_GUARD
if (!g_wrap_tiny_enabled && __builtin_expect(g_tls_in_wrapper != 0, 0)) {
static int log1 = 0;
if (log1 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: in_wrapper\n"); log1++; }
return NULL;
}
# else
extern int hak_in_wrapper(void);
if (!g_wrap_tiny_enabled && __builtin_expect(hak_in_wrapper() != 0, 0)) {
static int log2 = 0;
if (log2 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: hak_in_wrapper\n"); log2++; }
return NULL;
}
# endif
#endif
// ========================================================================
// Cooperative stats polling (SIGUSR1 trigger safe point)
hak_tiny_stats_poll();
// ========================================================================
// Phase 6-1.5: Ultra-Simple Fast Path (when enabled)
// ========================================================================
// Design: "Simple Front + Smart Back" - inspired by Mid-Large HAKX +171%
// - 3-4 instruction fast path (Phase 6-1 style)
// - Existing SuperSlab + ACE + Learning backend
// Two variants:
// Phase 6-1.5: -DHAKMEM_TINY_PHASE6_ULTRA_SIMPLE=1 (alignment guessing)
// Phase 6-1.6: -DHAKMEM_TINY_PHASE6_METADATA=1 (metadata header)
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
return hak_tiny_alloc_ultra_simple(size);
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
return hak_tiny_alloc_metadata(size);
#endif
// ========================================================================
// 1. Size → class index
int class_idx = hak_tiny_size_to_class(size);
if (class_idx < 0) {
static int log3 = 0;
if (log3 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: class_idx < 0 for size %zu\n", size); log3++; }
return NULL; // >1KB
}
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, (uint16_t)class_idx, (void*)(uintptr_t)size, 0);
#if HAKMEM_TINY_MINIMAL_FRONT
// Minimal Front for hot tiny classes (bench-focused):
// SLL direct pop → minimal refill → pop, bypassing other layers.
if (__builtin_expect(class_idx <= 3, 1)) {
void* head = g_tls_sll_head[class_idx];
if (__builtin_expect(head != NULL, 1)) {
g_tls_sll_head[class_idx] = *(void**)head;
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
HAK_RET_ALLOC(class_idx, head);
}
// Refill a small batch directly from TLS-cached SuperSlab
(void)sll_refill_small_from_ss(class_idx, 32);
head = g_tls_sll_head[class_idx];
if (__builtin_expect(head != NULL, 1)) {
g_tls_sll_head[class_idx] = *(void**)head;
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
HAK_RET_ALLOC(class_idx, head);
}
// Fall through to slow path if still empty
}
#endif
// Ultra-Front: minimal per-class stack for hot tiny classes (opt-in)
// Try ultra_pop → (optional) ultra_refill_small → ultra_pop before other layers
if (__builtin_expect(g_ultra_simple && class_idx <= 3, 0)) {
void* up = ultra_pop(class_idx);
if (__builtin_expect(up == NULL, 0)) {
(void)ultra_refill_small(class_idx);
up = ultra_pop(class_idx);
}
if (__builtin_expect(up != NULL, 0)) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, up, 0xF0);
HAK_RET_ALLOC(class_idx, up);
}
}
if (__builtin_expect(!g_debug_fast0, 1)) {
#ifdef HAKMEM_TINY_BENCH_FASTPATH
if (__builtin_expect(class_idx <= HAKMEM_TINY_BENCH_TINY_CLASSES, 1)) {
if (__builtin_expect(class_idx <= 3, 1)) {
unsigned char* done = &g_tls_bench_warm_done[class_idx];
if (__builtin_expect(*done == 0, 0)) {
int warm = (class_idx == 0) ? HAKMEM_TINY_BENCH_WARMUP8 :
(class_idx == 1) ? HAKMEM_TINY_BENCH_WARMUP16 :
(class_idx == 2) ? HAKMEM_TINY_BENCH_WARMUP32 :
HAKMEM_TINY_BENCH_WARMUP64;
if (warm > 0) (void)sll_refill_small_from_ss(class_idx, warm);
*done = 1;
}
}
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
tiny_small_mags_init_once();
if (class_idx > 3) tiny_mag_init_if_needed(class_idx);
#endif
void* head = g_tls_sll_head[class_idx];
if (__builtin_expect(head != NULL, 1)) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 0);
g_tls_sll_head[class_idx] = *(void**)head;
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
HAK_RET_ALLOC(class_idx, head);
}
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
TinyTLSMag* mag = &g_tls_mags[class_idx];
int t = mag->top;
if (__builtin_expect(t > 0, 1)) {
void* p = mag->items[--t].ptr;
mag->top = t;
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, p, 1);
HAK_RET_ALLOC(class_idx, p);
}
#endif
int bench_refill = (class_idx == 0) ? HAKMEM_TINY_BENCH_REFILL8 :
(class_idx == 1) ? HAKMEM_TINY_BENCH_REFILL16 :
(class_idx == 2) ? HAKMEM_TINY_BENCH_REFILL32 :
HAKMEM_TINY_BENCH_REFILL64;
if (__builtin_expect(sll_refill_small_from_ss(class_idx, bench_refill) > 0, 0)) {
head = g_tls_sll_head[class_idx];
if (head) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 2);
g_tls_sll_head[class_idx] = *(void**)head;
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
HAK_RET_ALLOC(class_idx, head);
}
}
// fallthrough to slow path on miss
}
#endif
// TinyHotMag front: fast-tierが枯渇したとき、キャッシュを再補充してから利用する
if (__builtin_expect(g_hotmag_enable && class_idx <= 2 && g_fast_head[class_idx] == NULL, 0)) {
hotmag_init_if_needed(class_idx);
TinyHotMag* hm = &g_tls_hot_mag[class_idx];
void* hotmag_ptr = hotmag_pop(class_idx);
if (__builtin_expect(hotmag_ptr == NULL, 0)) {
if (hotmag_try_refill(class_idx, hm) > 0) {
hotmag_ptr = hotmag_pop(class_idx);
}
}
if (__builtin_expect(hotmag_ptr != NULL, 1)) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, hotmag_ptr, 3);
HAK_RET_ALLOC(class_idx, hotmag_ptr);
}
}
if (g_hot_alloc_fn[class_idx] != NULL) {
void* fast_hot = NULL;
switch (class_idx) {
case 0:
fast_hot = tiny_hot_pop_class0();
break;
case 1:
fast_hot = tiny_hot_pop_class1();
break;
case 2:
fast_hot = tiny_hot_pop_class2();
break;
case 3:
fast_hot = tiny_hot_pop_class3();
break;
default:
fast_hot = NULL;
break;
}
if (__builtin_expect(fast_hot != NULL, 1)) {
#if HAKMEM_BUILD_DEBUG
g_tls_hit_count[class_idx]++;
#endif
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast_hot, 4);
HAK_RET_ALLOC(class_idx, fast_hot);
}
}
void* fast = tiny_fast_pop(class_idx);
if (__builtin_expect(fast != NULL, 0)) {
#if HAKMEM_BUILD_DEBUG
g_tls_hit_count[class_idx]++;
#endif
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast, 5);
HAK_RET_ALLOC(class_idx, fast);
}
} else {
tiny_debug_ring_record(TINY_RING_EVENT_FRONT_BYPASS, (uint16_t)class_idx, NULL, 0);
}
void* slow_ptr = hak_tiny_alloc_slow(size, class_idx);
if (slow_ptr) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, slow_ptr, 6);
HAK_RET_ALLOC(class_idx, slow_ptr); // Increment stats for slow path success
}
tiny_alloc_dump_tls_state(class_idx, "fail", &g_tls_slabs[class_idx]);
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_NULL, (uint16_t)class_idx, NULL, 0);
return slow_ptr;
}