// ============================================================================
// Box TLS-SLL API
// ============================================================================

#include "box/tls_sll_box.h"
#include "front/tiny_heap_v2.h"

// Optional: track alloc->class routing for sizes near 1KB (env: HAKMEM_TINY_ALLOC_1024_METRIC)
extern _Atomic uint64_t g_tiny_alloc_ge1024[TINY_NUM_CLASSES];

static inline void tiny_diag_track_size_ge1024(size_t req_size, int class_idx) {
    if (__builtin_expect(req_size < 1024, 1)) return;
    static int s_metric_en = -1;
    if (__builtin_expect(s_metric_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_ALLOC_1024_METRIC");
        s_metric_en = (e && *e && *e != '0') ? 1 : 0;
    }
    if (!__builtin_expect(s_metric_en, 0)) return;
    if (__builtin_expect(class_idx >= 0 && class_idx < TINY_NUM_CLASSES, 1)) {
        atomic_fetch_add_explicit(&g_tiny_alloc_ge1024[class_idx], 1, memory_order_relaxed);
    } else {
        static _Atomic int g_metric_bad_class_once = 0;
        if (atomic_fetch_add_explicit(&g_metric_bad_class_once, 1, memory_order_relaxed) == 0) {
            fprintf(stderr, "[ALLOC_1024_METRIC] bad class_idx=%d size=%zu\n", class_idx, req_size);
        }
    }
}

// ============================================================================
// Step 3: Cold-path outline - Wrapper Context Handler
// ============================================================================
// Purpose:   Handle allocations during wrapper calls (rare execution)
// Rationale: Avoid re-entrancy hazards with pthread locks during wrapper calls
// Step 3d:   Force inline for readability without performance loss
__attribute__((always_inline)) static inline void* hak_tiny_alloc_wrapper(int class_idx) {
    ROUTE_BEGIN(class_idx);

    // Wrapper-context fast path: magazine-only (never take locks or refill)
    tiny_small_mags_init_once();
    if (__builtin_expect(class_idx > 3, 0)) tiny_mag_init_if_needed(class_idx);
    TinyTLSMag* mag = &g_tls_mags[class_idx];
    if (mag->top > 0) {
        void* p = mag->items[--mag->top].ptr;
        HAK_RET_ALLOC(class_idx, p);
    }

    // Try TLS active slabs (owner-only, lock-free)
    TinySlab* tls = g_tls_active_slab_a[class_idx];
    if (!(tls && tls->free_count > 0)) tls = g_tls_active_slab_b[class_idx];
    if (tls && tls->free_count > 0) {
        tiny_remote_drain_owner(tls);
        if (tls->free_count > 0) {
            int block_idx = hak_tiny_find_free_block(tls);
            if (block_idx >= 0) {
                hak_tiny_set_used(tls, block_idx);
                tls->free_count--;
                size_t bs = g_tiny_class_sizes[class_idx];
                void* p = (char*)tls->base + (block_idx * bs);
                HAK_RET_ALLOC(class_idx, p);
            }
        }
    }

    // Optional: attempt limited refill under trylock (no remote drain)
    if (g_wrap_tiny_refill) {
        pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
        if (pthread_mutex_trylock(lock) == 0) {
            TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
            if (slab && slab->free_count > 0) {
                int room = mag->cap - mag->top;
                if (room > 16) room = 16;  // wrapper refill is small and quick
                if (room > slab->free_count) room = slab->free_count;
                if (room > 0) {
                    size_t bs = g_tiny_class_sizes[class_idx];
                    void* ret = NULL;
                    for (int i = 0; i < room; i++) {
                        int idx = hak_tiny_find_free_block(slab);
                        if (idx < 0) break;
                        hak_tiny_set_used(slab, idx);
                        slab->free_count--;
                        void* p = (char*)slab->base + (idx * bs);
                        if (i < room - 1) {
                            mag->items[mag->top].ptr = p;
                            mag->top++;
                        } else {
                            ret = p;  // return one directly
                        }
                    }
                    if (slab->free_count == 0) {
                        move_to_full_list(class_idx, slab);
                    }
                    pthread_mutex_unlock(lock);
                    if (ret) {
                        HAK_RET_ALLOC(class_idx, ret);
                    }
                } else {
                    pthread_mutex_unlock(lock);
                }
            } else {
                pthread_mutex_unlock(lock);
            }
        }
    }

    return NULL;  // empty → fallback to next allocator tier
}
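// Illustrative only (not part of the original file): hak_tiny_alloc_wrapper()
// never blocks (magazine pop, owner-slab carve, trylock refill only) and
// returns NULL when all of those are empty, so a wrapper-context caller is
// expected to chain to the next allocator tier, e.g.:
//
//     void* p = hak_tiny_alloc_wrapper(class_idx);
//     if (!p) p = hak_fallback_alloc(size);   // hypothetical next-tier hook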
void* hak_tiny_alloc(size_t size) {
#if !HAKMEM_BUILD_RELEASE
    if (!g_tiny_initialized) hak_tiny_init();
#else
    if (__builtin_expect(!g_tiny_initialized, 0)) {
        hak_tiny_init();
    }
#endif

    // Default (safe): avoid using Tiny during wrapper calls (TLS guard or function check).
    // If HAKMEM_WRAP_TINY=1, allow Tiny even when called from a wrapper.
#if !HAKMEM_BUILD_RELEASE
# if HAKMEM_WRAPPER_TLS_GUARD
    if (!g_wrap_tiny_enabled && __builtin_expect(g_tls_in_wrapper != 0, 0)) {
        static int log1 = 0;
        if (log1 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: in_wrapper\n"); log1++; }
        return NULL;
    }
# else
    extern int hak_in_wrapper(void);
    if (!g_wrap_tiny_enabled && __builtin_expect(hak_in_wrapper() != 0, 0)) {
        static int log2 = 0;
        if (log2 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: hak_in_wrapper\n"); log2++; }
        return NULL;
    }
# endif
#endif

    // ========================================================================
    // Cooperative stats polling (SIGUSR1 trigger safe point)
    hak_tiny_stats_poll();

    // ========================================================================
    // Phase 6-1.5: Ultra-Simple Fast Path (when enabled)
    // ========================================================================
    // Design: "Simple Front + Smart Back" - inspired by Mid-Large HAKX +171%
    //   - 3-4 instruction fast path (Phase 6-1 style)
    //   - Existing SuperSlab + ACE + Learning backend
    // Two variants:
    //   Phase 6-1.5: -DHAKMEM_TINY_PHASE6_ULTRA_SIMPLE=1 (alignment guessing)
    //   Phase 6-1.6: -DHAKMEM_TINY_PHASE6_METADATA=1     (metadata header)
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
    return hak_tiny_alloc_ultra_simple(size);
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
    return hak_tiny_alloc_metadata(size);
#endif

    // ========================================================================
    // 1. Size → class index
    int class_idx = hak_tiny_size_to_class(size);
    if (class_idx < 0) {
        static int log3 = 0;
        if (log3 < 2) {
            fprintf(stderr, "[DEBUG] Tiny blocked: class_idx < 0 for size %zu\n", size);
            log3++;
        }
        return NULL;  // >1KB
    }

#define HAK_RET_ALLOC_WITH_METRIC(ptr) do {              \
        tiny_diag_track_size_ge1024(size, class_idx);    \
        HAK_RET_ALLOC(class_idx, (ptr));                 \
    } while(0)

    // Route fingerprint begin (debug-only; no-op unless HAKMEM_ROUTE=1)
    ROUTE_BEGIN(class_idx);

    do {
        static int g_alloc_ring = -1;
        if (__builtin_expect(g_alloc_ring == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_ALLOC_RING");
            g_alloc_ring = (e && *e && *e != '0') ? 1 : 0;
        }
        if (g_alloc_ring) {
            tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, (uint16_t)class_idx,
                                   (void*)(uintptr_t)size, 0);
        }
    } while (0);

    // Phase 13-A/B: Tiny Heap v2 front (tcache-like, A/B)
    if (__builtin_expect(tiny_heap_v2_enabled() && front_prune_heapv2_enabled() && class_idx <= 3, 0)) {
        void* base = tiny_heap_v2_alloc_by_class(class_idx);
        if (base) {
            front_metrics_heapv2_hit(class_idx);
            HAK_RET_ALLOC_WITH_METRIC(base);  // Header write + return USER pointer
        } else {
            front_metrics_heapv2_miss(class_idx);
        }
        // Fall through to the existing front path if HeapV2 misses
    }
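    // Front-end order from here on, as implemented below: minimal SLL front
    // (HAKMEM_TINY_MINIMAL_FRONT) → bench fast path (HAKMEM_TINY_BENCH_FASTPATH)
    // → TinyHotMag → per-class hot pop → generic tiny_fast_pop →
    // hak_tiny_alloc_slow(). Each tier returns via HAK_RET_ALLOC_WITH_METRIC
    // on a hit and falls through on a miss.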
#if HAKMEM_TINY_MINIMAL_FRONT
    // Minimal Front for hot tiny classes (bench-focused):
    // SLL direct pop → minimal refill → pop, bypassing other layers.
    if (__builtin_expect(class_idx <= 3, 1)) {
        void* head = NULL;
        if (tls_sll_pop(class_idx, &head)) {
            HAK_RET_ALLOC_WITH_METRIC(head);
        }
        // Refill a small batch directly from TLS-cached SuperSlab
#if HAKMEM_TINY_P0_BATCH_REFILL
        (void)sll_refill_batch_from_ss(class_idx, 32);
#else
        (void)sll_refill_small_from_ss(class_idx, 32);
#endif
        if (tls_sll_pop(class_idx, &head)) {
            HAK_RET_ALLOC_WITH_METRIC(head);
        }
        // Fall through to slow path if still empty
    }
#endif

    // Ultra-Front - REMOVED (dead code cleanup 2025-11-27)

    if (__builtin_expect(!g_debug_fast0, 1)) {
#ifdef HAKMEM_TINY_BENCH_FASTPATH
        if (__builtin_expect(class_idx <= HAKMEM_TINY_BENCH_TINY_CLASSES, 1)) {
            if (__builtin_expect(class_idx <= 3, 1)) {
                unsigned char* done = &g_tls_bench_warm_done[class_idx];
                if (__builtin_expect(*done == 0, 0)) {
                    int warm = (class_idx == 0) ? HAKMEM_TINY_BENCH_WARMUP8
                             : (class_idx == 1) ? HAKMEM_TINY_BENCH_WARMUP16
                             : (class_idx == 2) ? HAKMEM_TINY_BENCH_WARMUP32
                                                : HAKMEM_TINY_BENCH_WARMUP64;
#if HAKMEM_TINY_P0_BATCH_REFILL
                    if (warm > 0) (void)sll_refill_batch_from_ss(class_idx, warm);
#else
                    if (warm > 0) (void)sll_refill_small_from_ss(class_idx, warm);
#endif
                    *done = 1;
                }
            }
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
            tiny_small_mags_init_once();
            if (class_idx > 3) tiny_mag_init_if_needed(class_idx);
#endif
            void* head = NULL;
            if (tls_sll_pop(class_idx, &head)) {
                tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 0);
                HAK_RET_ALLOC_WITH_METRIC(head);
            }
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
            TinyTLSMag* mag = &g_tls_mags[class_idx];
            int t = mag->top;
            if (__builtin_expect(t > 0, 1)) {
                void* p = mag->items[--t].ptr;
                mag->top = t;
                tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, p, 1);
                HAK_RET_ALLOC_WITH_METRIC(p);
            }
#endif
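            // SLL and magazine both missed: do one bench-tuned batch refill of
            // the per-thread SLL (count chosen per class via the
            // HAKMEM_TINY_BENCH_REFILL* knobs), retry a single pop, and
            // otherwise fall through to the slow path.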
            int bench_refill = (class_idx == 0) ? HAKMEM_TINY_BENCH_REFILL8
                             : (class_idx == 1) ? HAKMEM_TINY_BENCH_REFILL16
                             : (class_idx == 2) ? HAKMEM_TINY_BENCH_REFILL32
                                                : HAKMEM_TINY_BENCH_REFILL64;
#if HAKMEM_TINY_P0_BATCH_REFILL
            if (__builtin_expect(sll_refill_batch_from_ss(class_idx, bench_refill) > 0, 0)) {
#else
            if (__builtin_expect(sll_refill_small_from_ss(class_idx, bench_refill) > 0, 0)) {
#endif
                if (tls_sll_pop(class_idx, &head)) {
                    tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 2);
                    HAK_RET_ALLOC_WITH_METRIC(head);
                }
            }
            // fallthrough to slow path on miss
        }
#endif

        // TinyHotMag front: when the fast tier is exhausted, refill the cache before drawing from it
        if (__builtin_expect(g_hotmag_enable && class_idx <= 2 && g_fast_head[class_idx] == NULL, 0)) {
            hotmag_init_if_needed(class_idx);
            TinyHotMag* hm = &g_tls_hot_mag[class_idx];
            void* hotmag_ptr = hotmag_pop(class_idx);
            if (__builtin_expect(hotmag_ptr == NULL, 0)) {
                if (hotmag_try_refill(class_idx, hm) > 0) {
                    hotmag_ptr = hotmag_pop(class_idx);
                }
            }
            if (__builtin_expect(hotmag_ptr != NULL, 1)) {
                tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, hotmag_ptr, 3);
                HAK_RET_ALLOC_WITH_METRIC(hotmag_ptr);
            }
        }

        if (g_hot_alloc_fn[class_idx] != NULL) {
            void* fast_hot = NULL;
            switch (class_idx) {
                case 0: fast_hot = tiny_hot_pop_class0(); break;
                case 1: fast_hot = tiny_hot_pop_class1(); break;
                case 2: fast_hot = tiny_hot_pop_class2(); break;
                case 3: fast_hot = tiny_hot_pop_class3(); break;
                default: fast_hot = NULL; break;
            }
            if (__builtin_expect(fast_hot != NULL, 1)) {
#if HAKMEM_BUILD_DEBUG
                g_tls_hit_count[class_idx]++;
#endif
                tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast_hot, 4);
                HAK_RET_ALLOC_WITH_METRIC(fast_hot);
            }
        }

        hak_base_ptr_t fast = tiny_fast_pop(class_idx);
        if (__builtin_expect(!hak_base_is_null(fast), 0)) {
            void* fast_raw = HAK_BASE_TO_RAW(fast);
#if HAKMEM_BUILD_DEBUG
            g_tls_hit_count[class_idx]++;
#endif
            tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast_raw, 5);
            HAK_RET_ALLOC_WITH_METRIC(fast_raw);
        }
    } else {
        tiny_debug_ring_record(TINY_RING_EVENT_FRONT_BYPASS, (uint16_t)class_idx, NULL, 0);
    }

    void* slow_ptr = hak_tiny_alloc_slow(size, class_idx);
    if (slow_ptr) {
        tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, slow_ptr, 6);
        HAK_RET_ALLOC_WITH_METRIC(slow_ptr);  // Increment stats for slow path success
    }

#if !HAKMEM_BUILD_RELEASE
    tiny_alloc_dump_tls_state(class_idx, "fail", &g_tls_slabs[class_idx]);
#endif
    tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_NULL, (uint16_t)class_idx, NULL, 0);
    return slow_ptr;
}
#undef HAK_RET_ALLOC_WITH_METRIC
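// Illustrative only (not part of the original file): the opt-in diagnostics in
// this translation unit are read from environment variables on first use, with
// any value other than empty/"0" treated as enabled, e.g. (shell):
//
//     HAKMEM_TINY_ALLOC_1024_METRIC=1 HAKMEM_TINY_ALLOC_RING=1 ./your_benchmark
//
// where ./your_benchmark stands in for whatever binary links this allocator.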