// malloc_tiny_fast.h - Phase 26: Front Gate Unification (Tiny Fast Path) // // Goal: Eliminate 3-layer overhead (malloc → hak_alloc_at → wrapper → tiny_alloc_fast) // Target: +10-15% performance (11.35M → 12.5-13.5M ops/s) // // Design (ChatGPT analysis): // - Replace: malloc → hak_alloc_at (236 lines) → wrapper (diagnostics) → tiny_alloc_fast // - With: malloc → malloc_tiny_fast (single-layer, direct to Unified Cache) // - Preserves: Safety checks (lock depth, initializing, LD_SAFE, jemalloc block) // - Leverages: Phase 23 Unified Cache (tcache-style, 2-3 cache misses) // // Performance: // - Current overhead: malloc(8.97%) + routing + wrapper(3.63%) + tiny(5.37%) = 17.97% // - BenchFast ceiling: 8-10 instructions (~1-2% overhead) // - Gap: ~16% // - Target: Close half the gap (+10-15% improvement) // // ENV Variables: // HAKMEM_FRONT_GATE_UNIFIED=1 # Enable Front Gate Unification (default: 0, OFF) #ifndef HAK_FRONT_MALLOC_TINY_FAST_H #define HAK_FRONT_MALLOC_TINY_FAST_H #include #include #include #include // For pthread_self() in cross-thread check #include "../hakmem_build_flags.h" #include "../hakmem_tiny_config.h" // For TINY_NUM_CLASSES #include "../hakmem_super_registry.h" // For cross-thread owner check #include "../superslab/superslab_inline.h" // For slab_index_for #include "../box/ss_slab_meta_box.h" // For ss_slab_meta_owner_tid_low_get #include "../box/free_remote_box.h" // For tiny_free_remote_box #include "tiny_unified_cache.h" // For unified_cache_pop_or_refill #include "../tiny_region_id.h" // For tiny_region_id_write_header #include "../hakmem_tiny.h" // For hak_tiny_size_to_class // Helper: current thread id (low 32 bits) for owner check #ifndef TINY_SELF_U32_LOCAL_DEFINED #define TINY_SELF_U32_LOCAL_DEFINED static inline uint32_t tiny_self_u32_local(void) { return (uint32_t)(uintptr_t)pthread_self(); } #endif // ============================================================================ // ENV Control (cached, lazy init) // ============================================================================ // Enable flag (default: 0, OFF) static inline int front_gate_unified_enabled(void) { static int g_enable = -1; if (__builtin_expect(g_enable == -1, 0)) { const char* e = getenv("HAKMEM_FRONT_GATE_UNIFIED"); g_enable = (e && *e && *e == '0') ? 0 : 1; // default ON #if !HAKMEM_BUILD_RELEASE if (g_enable) { fprintf(stderr, "[FrontGate-INIT] front_gate_unified_enabled() = %d\n", g_enable); fflush(stderr); } #endif } return g_enable; } // ============================================================================ // Phase 26-A: malloc_tiny_fast() - Ultra-thin Tiny allocation // ============================================================================ // Single-layer Tiny allocation (bypasses hak_alloc_at + wrapper + diagnostics) // Preconditions: // - Called AFTER malloc() safety checks (lock depth, initializing, LD_SAFE) // - size <= tiny_get_max_size() (caller verified) // Returns: // - USER pointer on success // - NULL on Unified Cache miss (caller falls back to normal path) __attribute__((always_inline)) static inline void* malloc_tiny_fast(size_t size) { // 1. size → class_idx (inline table lookup, 1-2 instructions) int class_idx = hak_tiny_size_to_class(size); if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) { return NULL; // Out of range (should not happen if caller checked tiny_get_max_size()) } // 2. Phase 23: Unified Cache pop-or-refill (tcache-style, 2-3 cache misses) // This internally handles: // - Cache hit: direct pop (fast path) // - Cache miss: batch refill from SuperSlab (slow path) void* base = unified_cache_pop_or_refill(class_idx); if (__builtin_expect(base == NULL, 0)) { // Unified Cache disabled OR refill failed // Fall back to normal path (caller handles via hak_alloc_at) return NULL; } // 3. Write header + return USER pointer (2-3 instructions) #ifdef HAKMEM_TINY_HEADER_CLASSIDX tiny_region_id_write_header(base, class_idx); // Write 1-byte header (BASE first!) return (void*)((char*)base + 1); // Return USER pointer #else return base; // No header mode - return BASE directly #endif } // ============================================================================ // Phase 26-B: free_tiny_fast() - Ultra-thin Tiny deallocation // ============================================================================ // Single-layer Tiny deallocation (bypasses hak_free_at + wrapper + diagnostics) // Preconditions: // - ptr is from malloc_tiny_fast() (has valid header) // - Front Gate Unified is enabled // Returns: // - 1 on success (pushed to Unified Cache) // - 0 on failure (caller falls back to normal free path) __attribute__((always_inline)) static inline int free_tiny_fast(void* ptr) { if (__builtin_expect(!ptr, 0)) return 0; #ifdef HAKMEM_TINY_HEADER_CLASSIDX // 1. ページ境界ガード: // ptr がページ先頭 (offset==0) の場合、ptr-1 は別ページか未マップ領域になる可能性がある。 // その場合はヘッダ読みを行わず、通常 free 経路にフォールバックする。 uintptr_t off = (uintptr_t)ptr & 0xFFFu; if (__builtin_expect(off == 0, 0)) { return 0; } // 2. Fast header magic validation (必須) // Release ビルドでは tiny_region_id_read_header() が magic を省略するため、 // ここで自前に Tiny 専用ヘッダ (0xA0) を検証しておく。 uint8_t* header_ptr = (uint8_t*)ptr - 1; uint8_t header = *header_ptr; uint8_t magic = header & 0xF0u; if (__builtin_expect(magic != HEADER_MAGIC, 0)) { // Tiny ヘッダではない → Mid/Large/外部ポインタなので通常 free 経路へ return 0; } // 3. class_idx 抽出(下位4bit) int class_idx = (int)(header & HEADER_CLASS_MASK); if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) { return 0; } // 4. BASE を計算して Unified Cache に push void* base = (void*)((char*)ptr - 1); // Cross-thread free detection (Larson MT crash fix, ENV gated) { static __thread int g_larson_fix = -1; if (__builtin_expect(g_larson_fix == -1, 0)) { const char* e = getenv("HAKMEM_TINY_LARSON_FIX"); g_larson_fix = (e && *e && *e != '0') ? 1 : 0; #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL"); fflush(stderr); #endif } if (__builtin_expect(g_larson_fix, 0)) { SuperSlab* ss = hak_super_lookup(base); if (ss && ss->magic == SUPERSLAB_MAGIC) { int slab_idx = slab_index_for(ss, base); if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) { uint32_t self_tid = tiny_self_u32_local(); uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx); // LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes) uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu); #if !HAKMEM_BUILD_RELEASE static _Atomic uint64_t g_owner_check_count = 0; uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1); if (oc < 10) { fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n", ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp)); fflush(stderr); } #endif if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) { // Cross-thread free → route to remote queue instead of poisoning TLS cache #if !HAKMEM_BUILD_RELEASE static _Atomic uint64_t g_cross_thread_count = 0; uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1); if (ct < 20) { fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n", ptr, owner_tid_low, self_tid_cmp, self_tid); fflush(stderr); } #endif TinySlabMeta* meta = &ss->slabs[slab_idx]; if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) { return 1; // handled via remote queue } return 0; // remote push failed; fall back to normal path } } } } } // Debug: Log free operations (first 5000, all classes) #if !HAKMEM_BUILD_RELEASE { extern _Atomic uint64_t g_debug_op_count; extern __thread TinyTLSSLL g_tls_sll[]; uint64_t op = atomic_fetch_add(&g_debug_op_count, 1); // Note: Shares g_debug_op_count with alloc logging, so bump the window. if (op < 5000) { fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p from=free_tiny_fast tls_count_before=%u\n", (unsigned long)op, class_idx, ptr, base, g_tls_sll[class_idx].count); fflush(stderr); } } #endif int pushed = unified_cache_push(class_idx, base); if (__builtin_expect(pushed, 1)) { return 1; // Success } // Unified Cache full → 通常 free 経路へ return 0; #else // No header mode - fall back to normal free return 0; #endif } #endif // HAK_FRONT_MALLOC_TINY_FAST_H