248 lines
10 KiB
C
248 lines
10 KiB
C
// malloc_tiny_fast.h - Phase 26: Front Gate Unification (Tiny Fast Path)
//
// Goal: Eliminate 3-layer overhead (malloc → hak_alloc_at → wrapper → tiny_alloc_fast)
// Target: +10-15% performance (11.35M → 12.5-13.5M ops/s)
//
// Design (ChatGPT analysis):
// - Replace: malloc → hak_alloc_at (236 lines) → wrapper (diagnostics) → tiny_alloc_fast
// - With: malloc → malloc_tiny_fast (single-layer, direct to Unified Cache)
// - Preserves: Safety checks (lock depth, initializing, LD_SAFE, jemalloc block)
// - Leverages: Phase 23 Unified Cache (tcache-style, 2-3 cache misses)
//
// Performance:
// - Current overhead: malloc(8.97%) + routing + wrapper(3.63%) + tiny(5.37%) = 17.97%
// - BenchFast ceiling: 8-10 instructions (~1-2% overhead)
// - Gap: ~16%
// - Target: Close half the gap (+10-15% improvement)
//
// ENV Variables:
//   HAKMEM_FRONT_GATE_UNIFIED=0   # Disable Front Gate Unification
//                                 # (default: ON; only a value starting with '0' disables it)
|
||
|
||
#ifndef HAK_FRONT_MALLOC_TINY_FAST_H
|
||
#define HAK_FRONT_MALLOC_TINY_FAST_H
|
||
|
||
#include <stdint.h>
|
||
#include <stdlib.h>
|
||
#include <stdio.h>
|
||
#include <pthread.h> // For pthread_self() in cross-thread check
|
||
#include "../hakmem_build_flags.h"
|
||
#include "../hakmem_tiny_config.h" // For TINY_NUM_CLASSES
|
||
#include "../hakmem_super_registry.h" // For cross-thread owner check
|
||
#include "../superslab/superslab_inline.h" // For ss_fast_lookup, slab_index_for (Phase 12)
|
||
#include "../box/ss_slab_meta_box.h" // For ss_slab_meta_owner_tid_low_get
|
||
#include "../box/free_remote_box.h" // For tiny_free_remote_box
|
||
#include "tiny_unified_cache.h" // For unified_cache_pop_or_refill
|
||
#include "../tiny_region_id.h" // For tiny_region_id_write_header
|
||
#include "../hakmem_tiny.h" // For hak_tiny_size_to_class
|
||
#include "../box/tiny_front_hot_box.h" // Phase 4-Step2: Hot Path Box
|
||
#include "../box/tiny_front_cold_box.h" // Phase 4-Step2: Cold Path Box
|
||
|
||
// Helper: current thread id (low 32 bits) for the owner check in
// free_tiny_fast().
//
// Guarded by TINY_SELF_U32_LOCAL_DEFINED, so the definition is skipped when
// that macro has already been defined before this point.
#ifndef TINY_SELF_U32_LOCAL_DEFINED
#define TINY_SELF_U32_LOCAL_DEFINED
static inline uint32_t tiny_self_u32_local(void) {
    // The pthread_t handle is converted through uintptr_t and then narrowed,
    // so only the low 32 bits of its value are returned.
    // NOTE(review): this cast assumes pthread_t is an integer or pointer type
    // on the target platform -- confirm before porting.
    return (uint32_t)(uintptr_t)pthread_self();
}
#endif
|
||
|
||
// ============================================================================
|
||
// ENV Control (cached, lazy init)
|
||
// ============================================================================
|
||
|
||
// Enable flag for Front Gate Unification (default: ON).
//
// Reads HAKMEM_FRONT_GATE_UNIFIED on the first call and caches the answer in
// a function-local static. The gate is turned off only when the variable is
// set and its first character is '0'; unset, empty, or any other value leaves
// it on.
//
// Returns: 1 when enabled, 0 when disabled.
static inline int front_gate_unified_enabled(void) {
    static int g_enable = -1;  // -1 = environment not consulted yet
    if (__builtin_expect(g_enable == -1, 0)) {
        const char* e = getenv("HAKMEM_FRONT_GATE_UNIFIED");
        // A leading '0' is the only "off" spelling. Comparing the first
        // character against '0' already rules out the empty string.
        g_enable = (e != NULL && e[0] == '0') ? 0 : 1;
#if !HAKMEM_BUILD_RELEASE
        // Non-release builds announce the decision once, and only when the
        // gate ends up enabled.
        if (g_enable) {
            fprintf(stderr, "[FrontGate-INIT] front_gate_unified_enabled() = %d\n", g_enable);
            fflush(stderr);
        }
#endif
    }
    return g_enable;
}
|
||
|
||
// ============================================================================
|
||
// Phase 4-Step2: malloc_tiny_fast() - Hot/Cold Path Box (ACTIVE)
|
||
// ============================================================================
|
||
|
||
// malloc_tiny_fast() - ultra-thin Tiny allocation using the Hot/Cold Path Box
// (Phase 4-Step2).
//
// IMPROVEMENTS over Phase 26-A:
//   - Branch reduction: Hot path has only 1 branch (cache empty check)
//   - Branch hints: TINY_HOT_LIKELY/UNLIKELY for better CPU prediction
//   - Hot/Cold separation: Keeps hot path small (better i-cache locality)
//   - Explicit fallback: Clear hot→cold transition
//
// PERFORMANCE:
//   - Baseline (Phase 26-A, no PGO): 53.3 M ops/s
//   - Hot/Cold Box (no PGO):         57.2 M ops/s (+7.3%)
//
// DESIGN:
//   1. size → class_idx (same as Phase 26-A)
//   2. Hot path:  tiny_hot_alloc_fast()        - cache hit (1 branch)
//   3. Cold path: tiny_cold_refill_and_alloc() - cache miss
//
// Preconditions:
//   - Called AFTER malloc() safety checks (lock depth, initializing, LD_SAFE)
//   - size <= tiny_get_max_size() (caller verified). The class index returned
//     by hak_tiny_size_to_class() is NOT range-checked in this function
//     before it is passed on.
//
// Returns:
//   - USER pointer on success
//   - NULL on failure (caller falls back to normal path)
__attribute__((always_inline))
static inline void* malloc_tiny_fast(size_t size) {
    // 1. size → class_idx
    int class_idx = hak_tiny_size_to_class(size);

    // 2. Hot path: a non-NULL result is returned to the caller unchanged.
    void* ptr = tiny_hot_alloc_fast(class_idx);
    if (TINY_HOT_LIKELY(ptr != NULL)) {
        return ptr;
    }

    // 3. Cold path: the hot path returned NULL, so delegate to the refill
    //    helper and return whatever it yields (possibly NULL).
    //    NOTE(review): the helper is described as noinline/cold in the box
    //    header -- confirm there; it is not visible from this file.
    return tiny_cold_refill_and_alloc(class_idx);
}
|
||
|
||
// ============================================================================
|
||
// Phase 26-B: free_tiny_fast() - Ultra-thin Tiny deallocation
|
||
// ============================================================================
|
||
|
||
// free_tiny_fast() - single-layer Tiny deallocation
// (bypasses hak_free_at + wrapper + diagnostics).
//
// Preconditions:
//   - Front Gate Unified is enabled
//   - ptr is expected to come from malloc_tiny_fast() (has a valid header).
//     Pointers that fail the header or SuperSlab checks below are not
//     trusted; they are rejected with 0.
//
// Returns:
//   - 1 when the block was consumed here: pushed to the Unified Cache, or
//     handed to the remote-free box after a cross-thread free was detected
//   - 0 when the caller must fall back to the normal free path: NULL ptr,
//     ptr at a 4 KiB boundary, header magic mismatch, class index out of
//     range, SuperSlab lookup/magic failure, remote push failure, Unified
//     Cache push failure, or HAKMEM_TINY_HEADER_CLASSIDX not defined
__attribute__((always_inline))
static inline int free_tiny_fast(void* ptr) {
    if (__builtin_expect(!ptr, 0)) return 0;

#ifdef HAKMEM_TINY_HEADER_CLASSIDX
    // 1. Page-boundary guard:
    //    When ptr is at the start of a page (offset == 0), ptr-1 may be in a
    //    different page or in an unmapped region. In that case skip the header
    //    read and fall back to the normal free path.
    //    NOTE(review): the 0xFFF mask hard-codes a 4 KiB page size -- confirm
    //    for targets with larger pages.
    uintptr_t off = (uintptr_t)ptr & 0xFFFu;
    if (__builtin_expect(off == 0, 0)) {
        return 0;
    }

    // 2. Fast header magic validation (required).
    //    In release builds tiny_region_id_read_header() omits the magic check,
    //    so the Tiny-specific header (0xA0) is validated here directly.
    uint8_t* header_ptr = (uint8_t*)ptr - 1;
    uint8_t header = *header_ptr;
    uint8_t magic = header & 0xF0u;
    if (__builtin_expect(magic != HEADER_MAGIC, 0)) {
        // Not a Tiny header → Mid/Large/foreign pointer, use the normal free path
        return 0;
    }

    // 3. Extract class_idx (low 4 bits)
    int class_idx = (int)(header & HEADER_CLASS_MASK);
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        return 0;
    }

    // 4. Compute BASE (one byte before the user pointer); this is what gets
    //    pushed to the Unified Cache at the end of the function.
    void* base = (void*)((char*)ptr - 1);

    // 5. Confirm SuperSlab registration (guards against misclassification)
    SuperSlab* ss_guard = hak_super_lookup(ptr);
    if (__builtin_expect(!(ss_guard && ss_guard->magic == SUPERSLAB_MAGIC), 0)) {
        return 0;  // not managed by hakmem → normal free path
    }

    // Cross-thread free detection (Larson MT crash fix, ENV gated).
    // HAKMEM_TINY_LARSON_FIX is read once per thread; the check is active only
    // when the variable is set, non-empty, and does not start with '0'.
    {
        static __thread int g_larson_fix = -1;
        if (__builtin_expect(g_larson_fix == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
            g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
#if !HAKMEM_BUILD_RELEASE
            fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
            fflush(stderr);
#endif
        }

        if (__builtin_expect(g_larson_fix, 0)) {
            // Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
            SuperSlab* ss = ss_fast_lookup(base);
            if (ss) {
                int slab_idx = slab_index_for(ss, base);
                if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
                    uint32_t self_tid = tiny_self_u32_local();
                    uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
                    // LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes)
                    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
#if !HAKMEM_BUILD_RELEASE
                    // Debug: trace the first 10 owner checks.
                    static _Atomic uint64_t g_owner_check_count = 0;
                    uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
                    if (oc < 10) {
                        fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
                                ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
                        fflush(stderr);
                    }
#endif

                    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
                        // Cross-thread free → route to remote queue instead of poisoning TLS cache
#if !HAKMEM_BUILD_RELEASE
                        // Debug: trace the first 20 cross-thread frees.
                        static _Atomic uint64_t g_cross_thread_count = 0;
                        uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
                        if (ct < 20) {
                            fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
                                    ptr, owner_tid_low, self_tid_cmp, self_tid);
                            fflush(stderr);
                        }
#endif
                        // NOTE(review): the remote box is given the USER pointer
                        // (ptr), not base -- confirm this matches its contract.
                        TinySlabMeta* meta = &ss->slabs[slab_idx];
                        if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
                            return 1;  // handled via remote queue
                        }
                        return 0;  // remote push failed; fall back to normal path
                    }
                }
            }
        }
    }

    // Debug: Log free operations (first 5000, all classes)
#if !HAKMEM_BUILD_RELEASE
    {
        extern _Atomic uint64_t g_debug_op_count;
        extern __thread TinyTLSSLL g_tls_sll[];
        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
        // Note: Shares g_debug_op_count with alloc logging, so bump the window.
        if (op < 5000) {
            fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p from=free_tiny_fast tls_count_before=%u\n",
                    (unsigned long)op, class_idx, ptr, base,
                    g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif

    int pushed = unified_cache_push(class_idx, base);
    if (__builtin_expect(pushed, 1)) {
        return 1;  // Success
    }

    // Unified Cache full → normal free path
    return 0;
#else
    // No header mode - fall back to normal free
    return 0;
#endif
}
|
||
|
||
#endif // HAK_FRONT_MALLOC_TINY_FAST_H
|