diff --git a/core/box/front_gate_classifier.c b/core/box/front_gate_classifier.c
index 557d9878..464be322 100644
--- a/core/box/front_gate_classifier.c
+++ b/core/box/front_gate_classifier.c
@@ -8,6 +8,7 @@
 #include <stdio.h>    // For fprintf in debug
 #include <stdlib.h>   // For abort in debug
+#include <sys/mman.h> // For mincore() in Step 3 safety check
 #include "front_gate_classifier.h"
 #include "../tiny_region_id.h"  // Must come before hakmem_tiny_superslab.h for HEADER_MAGIC
 #include "../hakmem_tiny_superslab.h"
@@ -207,27 +208,25 @@ ptr_classification_t classify_ptr(void* ptr) {
         return result;
     }
 
-    // Step 3: Try AllocHeader (HAKMEM header) for Mid/Large/Mmap
-    do {
-        if (!ptr) break;
-        // Quick page-safety check: avoid crossing page for header read
-        uintptr_t off = (uintptr_t)ptr & 0xFFFu;
-        int safe_same_page = (off >= HEADER_SIZE);
-        void* raw = (char*)ptr - HEADER_SIZE;
-        if (!safe_same_page) {
-            if (!hak_is_memory_readable(raw)) break;
-        }
-        AllocHeader* hdr = (AllocHeader*)raw;
-        if (hdr->magic == HAKMEM_MAGIC) {
-            result.kind = PTR_KIND_MID_LARGE;  // HAKMEM-owned (non-Tiny)
-#if !HAKMEM_BUILD_RELEASE
-            g_classify_unknown_hit++;  // reuse for stats without adding a new counter
-#endif
-            return result;
-        }
-    } while (0);
-
-    // Step 4: Not recognized → UNKNOWN (route to libc or slow path)
+    // Step 3: SAFETY FIX - Skip AllocHeader probe for unknown pointers
+    //
+    // RATIONALE:
+    // - If the pointer isn't in the Pool TLS or SuperSlab registries, it's either:
+    //   1. Mid/Large allocation (has AllocHeader)
+    //   2. External allocation (libc, stack, etc.)
+    // - We CANNOT safely distinguish (1) from (2) without dereferencing memory
+    // - Dereferencing unknown memory can SEGV (e.g., ptr at a page boundary)
+    // - SAFER approach: return UNKNOWN and let the free wrapper handle it
+    //
+    // FREE WRAPPER BEHAVIOR (hak_free_api.inc.h):
+    // - PTR_KIND_UNKNOWN routes to Mid/Large registry lookups (hak_pool_mid_lookup, hak_l25_lookup)
+    // - If those fail → routes to AllocHeader dispatch (safe, same-page check)
+    // - If AllocHeader invalid → routes to __libc_free()
+    //
+    // PERFORMANCE IMPACT:
+    // - Only affects pointers NOT in our registries (rare)
+    // - Avoids SEGV on external pointers (correctness > performance)
+    //
     result.kind = PTR_KIND_UNKNOWN;
 #if !HAKMEM_BUILD_RELEASE
diff --git a/core/box/hak_alloc_api.inc.h b/core/box/hak_alloc_api.inc.h
index 51622243..bde1289a 100644
--- a/core/box/hak_alloc_api.inc.h
+++ b/core/box/hak_alloc_api.inc.h
@@ -21,14 +21,6 @@ static inline void* hak_os_map_boundary(size_t size, uintptr_t site_id) {
 
 __attribute__((always_inline)) inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
-#if !HAKMEM_BUILD_RELEASE
-    static _Atomic uint64_t hak_alloc_call_count = 0;
-    uint64_t call_num = atomic_fetch_add(&hak_alloc_call_count, 1);
-    if (call_num > 14250 && call_num < 14280 && size <= 1024) {
-        fprintf(stderr, "[HAK_ALLOC_AT] call=%lu size=%zu\n", call_num, size);
-        fflush(stderr);
-    }
-#endif
 #if HAKMEM_DEBUG_TIMING
     HKM_TIME_START(t0);
@@ -38,30 +30,12 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
     uintptr_t site_id = (uintptr_t)site;
 
     if (__builtin_expect(size <= TINY_MAX_SIZE, 1)) {
-#if !HAKMEM_BUILD_RELEASE
-        if (call_num > 14250 && call_num < 14280 && size <= 1024) {
-            fprintf(stderr, "[HAK_ALLOC_AT] call=%lu entering tiny path\n", call_num);
-            fflush(stderr);
-        }
-#endif
 #if HAKMEM_DEBUG_TIMING
         HKM_TIME_START(t_tiny);
 #endif
         void* tiny_ptr = NULL;
 #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
-#if !HAKMEM_BUILD_RELEASE
-        if (call_num > 14250 && call_num < 14280 && size <= 1024) {
-            fprintf(stderr, "[HAK_ALLOC_AT] call=%lu calling hak_tiny_alloc_fast_wrapper\n", call_num);
-            fflush(stderr);
-        }
-#endif
         tiny_ptr = hak_tiny_alloc_fast_wrapper(size);
-#if !HAKMEM_BUILD_RELEASE
-        if (call_num > 14250 && call_num < 14280 && size <= 1024) {
-            fprintf(stderr, "[HAK_ALLOC_AT] call=%lu hak_tiny_alloc_fast_wrapper returned %p\n", call_num, tiny_ptr);
-            fflush(stderr);
-        }
-#endif
 #elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
         tiny_ptr = hak_tiny_alloc_ultra_simple(size);
 #elif defined(HAKMEM_TINY_PHASE6_METADATA)
diff --git a/core/box/hak_free_api.inc.h b/core/box/hak_free_api.inc.h
index 2014da86..a468af47 100644
--- a/core/box/hak_free_api.inc.h
+++ b/core/box/hak_free_api.inc.h
@@ -2,6 +2,7 @@
 #ifndef HAK_FREE_API_INC_H
 #define HAK_FREE_API_INC_H
 
+#include <sys/mman.h>                  // For mincore() in AllocHeader safety check
 #include "hakmem_tiny_superslab.h"     // For SUPERSLAB_MAGIC, SuperSlab
 #include "../tiny_free_fast_v2.inc.h"  // Phase 7: Header-based ultra-fast free
 #include "../ptr_trace.h"              // Debug: pointer trace immediate dump on libc fallback
@@ -191,9 +192,26 @@ void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
   {
     void* raw = (char*)ptr - HEADER_SIZE;
 
-    // CRITICAL FIX (2025-11-07): Check if memory is accessible before dereferencing
-    // This prevents SEGV when ptr has no header (Tiny alloc where SS lookup failed, or libc alloc)
-    if (!hak_is_memory_readable(raw)) {
+    // CRITICAL FIX (2025-11-14): Use real mincore() to check memory accessibility
+    // Phase 9 gutted hak_is_memory_readable() to always return 1 (unsafe!)
+    // We MUST verify memory is mapped before dereferencing AllocHeader
+    int is_mapped = 0;
+    #ifdef __linux__
+    {
+      unsigned char vec;
+      // Check both pages if the header crosses a page boundary
+      void* page1 = (void*)((uintptr_t)raw & ~0xFFFUL);
+      void* page2 = (void*)(((uintptr_t)raw + sizeof(AllocHeader) - 1) & ~0xFFFUL);
+      is_mapped = (mincore(page1, 1, &vec) == 0);
+      if (is_mapped && page2 != page1) {
+        is_mapped = (mincore(page2, 1, &vec) == 0);
+      }
+    }
+    #else
+    is_mapped = 1;  // Assume mapped on non-Linux
+    #endif
+
+    if (!is_mapped) {  // Memory not accessible, ptr likely has no header
      hak_free_route_log("unmapped_header_fallback", ptr);
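The mincore()-based probe above is the piece both the classifier change and the free-path change rely on. Below is a minimal standalone sketch of that two-page check, assuming Linux, 4 KiB pages, and sizeof(AllocHeader) <= 4 KiB; the helper name hak_hdr_is_mapped is illustrative and does not exist in the tree.

// Sketch only (not part of the patch): returns 1 if every page covered by the
// header bytes at `raw` is mapped, 0 if any of them is unmapped.
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>   // mincore()

static int hak_hdr_is_mapped(const void* raw, size_t header_size) {
#ifdef __linux__
    unsigned char vec;
    uintptr_t first = (uintptr_t)raw & ~0xFFFUL;                        // page of first header byte
    uintptr_t last  = ((uintptr_t)raw + header_size - 1) & ~0xFFFUL;    // page of last header byte
    if (mincore((void*)first, 1, &vec) != 0) return 0;                  // first page not mapped
    if (last != first && mincore((void*)last, 1, &vec) != 0) return 0;  // header crosses into an unmapped page
    return 1;
#else
    (void)raw; (void)header_size;
    return 1;   // assume mapped on non-Linux, matching the #else branch in the hunk above
#endif
}

Factoring the check this way would let classify_ptr() and hak_free_at() share one implementation instead of open-coding the page math twice; whether that refactor is worthwhile is outside the scope of this patch.
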
{
-            fprintf(stderr, "[HAK_ALLOC_AT] call=%lu calling hak_tiny_alloc_fast_wrapper\n", call_num);
diff --git a/core/hakmem_build_flags.h b/core/hakmem_build_flags.h
index 638bc8ce..96ea335a 100644
--- a/core/hakmem_build_flags.h
+++ b/core/hakmem_build_flags.h
@@ -63,6 +63,13 @@
 # define HAKMEM_TINY_AGGRESSIVE_INLINE 0
 #endif
 
+// Inline TLS SLL pop (experimental, A/B only)
+// Default: OFF (HAKMEM_TINY_INLINE_SLL=0) to keep Box TLS-SLL API as the standard path.
+// Enable explicitly via build flag: -DHAKMEM_TINY_INLINE_SLL=1 (bench/debug only).
+#ifndef HAKMEM_TINY_INLINE_SLL +# define HAKMEM_TINY_INLINE_SLL 0 +#endif + // Phase 7 Task 3: Pre-warm TLS cache at init // Default: OFF (enable after implementation) // Build: make PREWARM_TLS=1 or make phase7 diff --git a/core/hakmem_pool.c.bak2 b/core/hakmem_pool.c.bak2 deleted file mode 100644 index 0b507e3f..00000000 --- a/core/hakmem_pool.c.bak2 +++ /dev/null @@ -1,1454 +0,0 @@ -// ============================================================================ -// hakmem_pool.c - L2 Hybrid Pool Implementation (Mid-Size: 2-32KiB) -// ============================================================================ -// -// サイズクラス定義: -// ┌──────────┬─────────┬──────────────┬─────────────┐ -// │ クラス │ サイズ │ 初期CAP │ ページ構成 │ -// ├──────────┼─────────┼──────────────┼─────────────┤ -// │ Class 0 │ 2 KiB │ 64 pages │ 32 blocks/p │ -// │ Class 1 │ 4 KiB │ 64 pages │ 16 blocks/p │ -// │ Class 2 │ 8 KiB │ 64 pages │ 8 blocks/p │ -// │ Class 3 │ 16 KiB │ 32 pages │ 4 blocks/p │ -// │ Class 4 │ 32 KiB │ 16 pages │ 2 blocks/p │ -// │ DYN1 │ 6 KiB* │ 0 (無効) │ 可変 │ -// │ DYN2 │ (未使用)│ 0 (無効) │ 可変 │ -// └──────────┴─────────┴──────────────┴─────────────┘ -// * DYN1はギャップ(8-16KB)を埋めるための動的クラス -// -// W_MAX (切り上げ許容倍率): -// - 意味: 要求サイズの何倍までのクラスを許容するか -// - デフォルト: 1.40 (40%までの切り上げを許容) -// - 例: 3KiBの要求 → 4KiBクラス使用OK (1.33倍 < 1.40) -// - 環境変数: HAKMEM_WMAX_MID=1.6 で変更可能 -// -// CAP (在庫量): -// - 意味: 各クラスで保持する最大ページ数 -// - 初期値: {64,64,64,32,16} - 保守的(フットプリント優先) -// - 推奨値: {256,256,256,128,64} - パフォーマンス優先 -// - 環境変数: HAKMEM_CAP_MID=256,256,256,128,64 で設定 -// - 学習モード: HAKMEM_LEARN=1 で自動調整 -// -// TLSリング構造: -// - POOL_L2_RING_CAP: リングバッファ容量(デフォルト16) -// - ActivePage A/B: bump-run方式(ロックフリー) -// - LIFO overflow: リングから溢れた分 -// -// パフォーマンスチューニング: -// 1. 初期CAP 4倍化: HAKMEM_CAP_MID=256,256,256,128,64 -// 2. W_MAX緩和: HAKMEM_WMAX_MID=1.6 -// 3. DYN1有効化: HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64 -// 4. 学習モード: HAKMEM_LEARN=1 -// -// License: MIT -// Last Updated: 2025-10-26 (Code Cleanup完了) - -#include "hakmem_pool.h" -#include "hakmem_config.h" -#include "hakmem_internal.h" // For AllocHeader and HAKMEM_MAGIC -#include "hakmem_syscall.h" // Box 3 syscall layer (bypasses LD_PRELOAD) -#include -#include -#include -#include -#include -#include -#include -#include "hakmem_prof.h" -#include "hakmem_policy.h" // FrozenPolicy caps (Soft CAP gating) -#include "hakmem_debug.h" - -// False sharing mitigation: padded mutex type (64B) -typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64)]; } PaddedMutex; - -// =========================================================================== -// Internal Data Structures -// =========================================================================== -#include "box/pool_tls_types.inc.h" - -// Mid page descriptor registry (64KiB pages → {class_idx, owner_tid}) -#include "box/pool_mid_desc.inc.h" - -// ---------------- Transfer Cache (per-thread per-class inbox) -------------- -#include "box/pool_mid_tc.inc.h" - -// =========================================================================== -// MF2 Per-Page Sharding: Mimalloc-Inspired Architecture -// =========================================================================== -// -// Key idea: Each 64KB page has independent freelist (no sharing!) 
-// - O(1) page lookup from block address: (addr & ~0xFFFF) -// - Owner thread: fast path (no locks, no atomics) -// - Cross-thread free: lock-free remote stack -// - Expected: +50% (13.78 → 20.7 M/s, 60-75% of mimalloc) - -// MF2 Configuration Constants (Quick Win #5) -#define MF2_PENDING_QUEUE_BUDGET 4 // Max pages to drain from pending queue -#define MF2_DEBUG_SAMPLE_COUNT 20 // Number of debug samples to log -#define MF2_TSC_CYCLES_PER_US 3000 // Estimated TSC cycles per microsecond -#define MF2_PAGE_SIZE_SHIFT 16 // log2(64KB) for fast division -#define MF2_PAGE_ALIGNMENT 65536 // 64KB alignment for mmap - -// Debug Logging Macros (Quick Win #6) -// Conditional compilation for debug logs - set HAKMEM_DEBUG_MF2=1 to enable -#ifdef HAKMEM_DEBUG_MF2 - #define MF2_DEBUG_LOG(fmt, ...) fprintf(stderr, "[MF2] " fmt "\n", ##__VA_ARGS__) - #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) -#else - #define MF2_DEBUG_LOG(fmt, ...) ((void)0) - #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) -#endif - -// Forward declarations -static size_t g_class_sizes[POOL_NUM_CLASSES]; - -// MF2 Page descriptor: per-page metadata (one per 64KB page) -typedef struct MidPage { - // Page identity - void* base; // Page base address (64KB aligned) - uint8_t class_idx; // Size class index (0-6) - uint8_t flags; // Page flags (reserved for future use) - uint16_t _pad0; - - // Ownership - pthread_t owner_tid; // Owner thread ID (for fast-path check) - struct MF2_ThreadPages* owner_tp; // Owner thread's heap (for pending queue access) - uint64_t last_transfer_time; // rdtsc() timestamp of last ownership transfer (lease mechanism) - - // Page-local freelist (owner-only, NO LOCK!) - PoolBlock* freelist; // Local freelist head - uint16_t free_count; // Number of free blocks - uint16_t capacity; // Total blocks per page - - // Remote frees (cross-thread, lock-free MPSC stack) - atomic_uintptr_t remote_head; // Lock-free remote free stack - atomic_uint remote_count; // Remote free count (for quick check) - - // Lifecycle - atomic_int in_use; // Live allocations on this page - atomic_int pending_dn; // DONTNEED enqueued flag - - // Linkage (thread-local page lists) - struct MidPage* next_page; // Next page in thread's list - struct MidPage* prev_page; // Previous page in thread's list - - // Pending queue (remote drain notification) - _Atomic(_Bool) in_remote_pending; // Is this page in pending queue? - struct MidPage* next_pending; // Next page in pending queue - - // Padding to cache line boundary (avoid false sharing) - char _pad[64 - ((sizeof(void*) * 5 + sizeof(PoolBlock*) + sizeof(uint16_t) * 2 + - sizeof(atomic_uintptr_t) + sizeof(atomic_uint) + - sizeof(atomic_int) * 2 + sizeof(pthread_t) + - sizeof(_Atomic(_Bool)) + 4) % 64)]; -} MidPage; - -// Page registry: O(1) lookup from block address -// Use direct indexing: (addr >> 16) & MASK -#define MF2_PAGE_REGISTRY_BITS 16 // 64K entries (4GB address space with 64KB pages) -#define MF2_PAGE_REGISTRY_SIZE (1 << MF2_PAGE_REGISTRY_BITS) -#define MF2_PAGE_REGISTRY_MASK (MF2_PAGE_REGISTRY_SIZE - 1) - -typedef struct { - // Direct-mapped page table (no hash collisions!) 
- MidPage* pages[MF2_PAGE_REGISTRY_SIZE]; - - // Coarse-grained locks for rare updates (page alloc/free) - // 256 locks = 256-way parallelism for page registration - pthread_mutex_t locks[256]; - - // Statistics - atomic_uint_fast64_t total_pages; // Total pages allocated - atomic_uint_fast64_t active_pages; // Pages with live allocations -} MF2_PageRegistry; - -// Thread-local page lists (one list per size class) -typedef struct MF2_ThreadPages { - // Active pages (have free blocks) - MidPage* active_page[POOL_NUM_CLASSES]; - - // Partial pages (drained pages with free blocks, LIFO for cache locality) - // Checked before allocating new pages (fast reuse path) - MidPage* partial_pages[POOL_NUM_CLASSES]; - - // Full pages (no free blocks, but may receive remote frees) - // TODO: Gradually deprecate in favor of partial_pages - MidPage* full_pages[POOL_NUM_CLASSES]; - - // Pending queue (pages with remote frees, MPSC lock-free stack) - atomic_uintptr_t pages_remote_pending[POOL_NUM_CLASSES]; - - // Pending claim flags (prevent multi-consumer CAS thrashing) - // One adopter at a time per queue (test_and_set to claim, clear to release) - atomic_flag pending_claim[POOL_NUM_CLASSES]; - - // Page ownership count (for statistics) - uint32_t page_count[POOL_NUM_CLASSES]; - - // Thread identity (cached for fast comparison) - pthread_t my_tid; - - // Route P: Activity tracking for idle-based adoption - // Updated on every allocation (mf2_alloc_fast) - // Read by adopters to check if owner is idle - atomic_uint_fast64_t last_alloc_tsc; -} MF2_ThreadPages; - -// Global page registry (shared, rarely accessed) -static MF2_PageRegistry g_mf2_page_registry; - -// Thread-local page lists (hot path, no sharing!) -static __thread MF2_ThreadPages* t_mf2_pages = NULL; - -// =========================================================================== -// MF2 Global State (Quick Win #3b - Structured Globals) -// =========================================================================== -// Individual globals replaced with structured state below. -// Old declarations removed, replaced with macro-mapped struct instances. 
-// -// Benefits: -// - Logical grouping (config, registry, stats) -// - Better documentation -// - Easier to extend or refactor -// - Single source of truth for each category - -#define MF2_MAX_THREADS 256 - -// MF2 Configuration (environment variables) -typedef struct { - int enabled; // HAKMEM_MF2_ENABLE (0=disabled, 1=enabled) - int max_queues; // HAKMEM_MF2_MAX_QUEUES (default: 2) - int lease_ms; // HAKMEM_MF2_LEASE_MS (default: 10ms, 0=disabled) - int idle_threshold_us; // HAKMEM_MF2_IDLE_THRESHOLD_US (default: 150µs) -} MF2_Config; - -// MF2 Thread Registry (cross-thread coordination) -typedef struct { - MF2_ThreadPages* all_thread_pages[MF2_MAX_THREADS]; // Global registry - _Atomic int num_thread_pages; // Active thread count - _Atomic int adoptable_count[POOL_NUM_CLASSES]; // Non-empty pending queues - pthread_key_t tls_key; // Thread-local storage key - pthread_once_t key_once; // TLS initialization guard -} MF2_Registry; - -// MF2 Statistics (debug instrumentation) -typedef struct { - // Allocation path - atomic_uint_fast64_t alloc_fast_hit; - atomic_uint_fast64_t alloc_slow_hit; - atomic_uint_fast64_t page_reuse_count; - atomic_uint_fast64_t new_page_count; - - // Free path - atomic_uint_fast64_t free_owner_count; - atomic_uint_fast64_t free_remote_count; - - // Drain operations - atomic_uint_fast64_t drain_count; - atomic_uint_fast64_t drain_blocks; - atomic_uint_fast64_t drain_attempts; - atomic_uint_fast64_t drain_success; - atomic_uint_fast64_t slow_checked_drain; - atomic_uint_fast64_t slow_found_remote; - - // Full page scan (obsolete, kept for historical tracking) - atomic_uint_fast64_t full_scan_checked; - atomic_uint_fast64_t full_scan_found_remote; - atomic_uint_fast64_t eager_drain_scanned; - atomic_uint_fast64_t eager_drain_found; - - // Pending queue - atomic_uint_fast64_t pending_enqueued; - atomic_uint_fast64_t pending_drained; - atomic_uint_fast64_t pending_requeued; -} MF2_Stats; - -// Instantiate structured global state (Quick Win #3b) -static MF2_Config g_mf2_config = { - .enabled = 0, // Will be set by env var - .max_queues = 2, - .lease_ms = 10, - .idle_threshold_us = 150 -}; - -static MF2_Registry g_mf2_registry = { - .all_thread_pages = {0}, - .num_thread_pages = 0, - .adoptable_count = {0}, - .tls_key = 0, - .key_once = PTHREAD_ONCE_INIT -}; - -static MF2_Stats g_mf2_stats = { - // All fields initialized to 0 (atomic zero-initialization is valid) - .alloc_fast_hit = 0, - .alloc_slow_hit = 0, - .page_reuse_count = 0, - .new_page_count = 0, - .free_owner_count = 0, - .free_remote_count = 0, - .drain_count = 0, - .drain_blocks = 0, - .drain_attempts = 0, - .drain_success = 0, - .slow_checked_drain = 0, - .slow_found_remote = 0, - .full_scan_checked = 0, - .full_scan_found_remote = 0, - .eager_drain_scanned = 0, - .eager_drain_found = 0, - .pending_enqueued = 0, - .pending_drained = 0, - .pending_requeued = 0 -}; - -// Compatibility macros: Map old global names to struct fields -// This allows existing code to work unchanged while using structured state -#define g_mf2_enabled (g_mf2_config.enabled) -#define g_mf2_max_queues (g_mf2_config.max_queues) -#define g_mf2_lease_ms (g_mf2_config.lease_ms) -#define g_mf2_idle_threshold_us (g_mf2_config.idle_threshold_us) - -#define g_all_thread_pages (g_mf2_registry.all_thread_pages) -#define g_num_thread_pages (g_mf2_registry.num_thread_pages) -#define g_adoptable_count (g_mf2_registry.adoptable_count) -#define g_mf2_tls_key (g_mf2_registry.tls_key) -#define g_mf2_key_once (g_mf2_registry.key_once) - -#define 
g_mf2_alloc_fast_hit (g_mf2_stats.alloc_fast_hit) -#define g_mf2_alloc_slow_hit (g_mf2_stats.alloc_slow_hit) -#define g_mf2_page_reuse_count (g_mf2_stats.page_reuse_count) -#define g_mf2_new_page_count (g_mf2_stats.new_page_count) -#define g_mf2_free_owner_count (g_mf2_stats.free_owner_count) -#define g_mf2_free_remote_count (g_mf2_stats.free_remote_count) -#define g_mf2_drain_count (g_mf2_stats.drain_count) -#define g_mf2_drain_blocks (g_mf2_stats.drain_blocks) -#define g_mf2_drain_attempts (g_mf2_stats.drain_attempts) -#define g_mf2_drain_success (g_mf2_stats.drain_success) -#define g_mf2_slow_checked_drain (g_mf2_stats.slow_checked_drain) -#define g_mf2_slow_found_remote (g_mf2_stats.slow_found_remote) -#define g_mf2_full_scan_checked (g_mf2_stats.full_scan_checked) -#define g_mf2_full_scan_found_remote (g_mf2_stats.full_scan_found_remote) -#define g_mf2_eager_drain_scanned (g_mf2_stats.eager_drain_scanned) -#define g_mf2_eager_drain_found (g_mf2_stats.eager_drain_found) -#define g_mf2_pending_enqueued (g_mf2_stats.pending_enqueued) -#define g_mf2_pending_drained (g_mf2_stats.pending_drained) -#define g_mf2_pending_requeued (g_mf2_stats.pending_requeued) - -// =========================================================================== -// End of MF2 Data Structures -// =========================================================================== - -// --- MF2 Initialization Functions --- - -// Thread-safe initialization using pthread_once -static pthread_once_t mf2_page_registry_init_control = PTHREAD_ONCE_INIT; -static void mf2_page_registry_init_impl(void) { - // Initialize all page slots to NULL - memset(&g_mf2_page_registry, 0, sizeof(g_mf2_page_registry)); - - // Initialize 256 coarse-grained locks for registry updates - for (int i = 0; i < 256; i++) { - pthread_mutex_init(&g_mf2_page_registry.locks[i], NULL); - } - - // Initialize counters - atomic_store(&g_mf2_page_registry.total_pages, 0); - atomic_store(&g_mf2_page_registry.active_pages, 0); -} -static void mf2_page_registry_init(void) { - pthread_once(&mf2_page_registry_init_control, mf2_page_registry_init_impl); -} - -// Strategy A: ThreadPages destructor (cleanup on thread exit) -static void mf2_thread_pages_destructor(void* arg) { - MF2_ThreadPages* tp = (MF2_ThreadPages*)arg; - if (!tp) return; - - // SAFETY: Don't remove from global registry or free memory - // Reason: Causes "malloc(): unsorted double linked list corrupted" crashes - // Tradeoff: Small memory leak (one ThreadPages struct per thread lifetime) - // TODO: Investigate safe cleanup mechanism - - // Remove from global registry (DISABLED for safety) - // for (int i = 0; i < atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); i++) { - // if (atomic_load_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], memory_order_acquire) == (uintptr_t)tp) { - // atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], 0, memory_order_release); - // break; - // } - // } - - // Free all pages owned by this thread (DISABLED for safety) - // hkm_libc_free(tp); - - (void)tp; // Suppress unused warning -} - -// Strategy A: Initialize pthread_key (once only) -static void mf2_init_tls_key(void) { - pthread_key_create(&g_mf2_tls_key, mf2_thread_pages_destructor); -} - -// Helper: rdtsc() - Read CPU timestamp counter (for Route P idle detection) -static inline uint64_t mf2_rdtsc(void) { -#if defined(__x86_64__) || defined(__i386__) - uint32_t lo, hi; - __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); - return ((uint64_t)hi << 32) | lo; -#else - // Fallback for 
non-x86 architectures (use clock_gettime approximation) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; -#endif -} - -static MF2_ThreadPages* mf2_thread_pages_get(void) { - if (t_mf2_pages) return t_mf2_pages; - - // Initialize pthread_key (once only) - pthread_once(&g_mf2_key_once, mf2_init_tls_key); - - // Allocate thread-local page lists - MF2_ThreadPages* tp = (MF2_ThreadPages*)hkm_libc_calloc(1, sizeof(MF2_ThreadPages)); - if (!tp) return NULL; - - // Initialize with current thread ID - tp->my_tid = pthread_self(); - - // All page lists start empty (NULL) - for (int c = 0; c < POOL_NUM_CLASSES; c++) { - tp->active_page[c] = NULL; - tp->full_pages[c] = NULL; - atomic_store_explicit(&tp->pages_remote_pending[c], 0, memory_order_relaxed); - atomic_flag_clear_explicit(&tp->pending_claim[c], memory_order_relaxed); - tp->page_count[c] = 0; - } - - // Route P: Initialize activity tracking - atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); - - // Strategy A: Register in global array for round-robin drain - int idx = atomic_fetch_add_explicit(&g_num_thread_pages, 1, memory_order_acq_rel); - if (idx < MF2_MAX_THREADS) { - atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[idx], (uintptr_t)tp, memory_order_release); - - // DEBUG: Log first 10 thread registrations - Disabled for performance - // static _Atomic int reg_samples = 0; - // int rs = atomic_fetch_add_explicit(®_samples, 1, memory_order_relaxed); - // if (rs < 10) { - // fprintf(stderr, "[TLS_REGISTER %d] tid=%lu, tp=%p, idx=%d\n", - // rs, (unsigned long)tp->my_tid, tp, idx); - // } - } else { - MF2_ERROR_LOG("Too many threads! MAX=%d", MF2_MAX_THREADS); - } - - // Set pthread-specific data for destructor - pthread_setspecific(g_mf2_tls_key, tp); - - t_mf2_pages = tp; - return tp; -} - -// --- MF2 Page Allocation & Lookup --- - -// O(1) page lookup from block address (mimalloc's secret sauce!) -static inline MidPage* mf2_addr_to_page(void* addr) { - // Step 1: Get page base address (64KB aligned) - // 0xFFFF = 65535, ~0xFFFF clears bottom 16 bits - void* page_base = (void*)((uintptr_t)addr & ~0xFFFFULL); - - // Step 2: Index into registry (direct-mapped, 64K entries) - // (addr >> 16) extracts page index, & 0xFFFF wraps to registry size - size_t idx = ((uintptr_t)page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - - // Step 3: Direct lookup (no hash collision handling needed with 64K entries) - MidPage* page = g_mf2_page_registry.pages[idx]; - - // ALIGNMENT VERIFICATION (Step 3) - Sample first 100 lookups - static _Atomic int lookup_count = 0; - // DEBUG: Disabled for performance - // int count = atomic_fetch_add_explicit(&lookup_count, 1, memory_order_relaxed); - // if (count < 100) { - // int found = (page != NULL); - // int match = (page && page->base == page_base); - // fprintf(stderr, "[LOOKUP %d] addr=%p → page_base=%p → idx=%zu → found=%s", - // count, addr, page_base, idx, found ? "YES" : "NO"); - // if (page) { - // fprintf(stderr, ", page->base=%p, match=%s", - // page->base, match ? 
"YES" : "NO"); - // } - // fprintf(stderr, "\n"); - // } - - // Validation: Ensure page base matches (handles potential collisions) - if (page && page->base == page_base) { - return page; - } - - // Collision or not registered (shouldn't happen in normal operation) - return NULL; -} - -// Register a page in the global registry (called once per page allocation) -static void mf2_register_page(MidPage* page) { - if (!page) return; - - // Calculate registry index from page base - size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - - // ALIGNMENT VERIFICATION (Step 2) - DEBUG: Disabled for performance - // static int register_count = 0; - // if (register_count < 10) { - // fprintf(stderr, "[REGISTER %d] Page %p → idx %zu (aligned=%s)\n", - // register_count, page->base, idx, - // (((uintptr_t)page->base & 0xFFFF) == 0) ? "YES" : "NO"); - // register_count++; - // } - - // Coarse-grained lock (256 locks for 64K entries = 256 entries/lock) - int lock_idx = idx % 256; - pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); - - // Check for collision (should be rare with 64K entries) - if (g_mf2_page_registry.pages[idx] != NULL) { - // Collision detected - this is a problem! - // For MVP, we'll just log and overwrite (TODO: handle collisions properly) - HAKMEM_LOG("[MF2] WARNING: Page registry collision at index %zu\n", idx); - } - - // Register the page - g_mf2_page_registry.pages[idx] = page; - - // Update counters - atomic_fetch_add_explicit(&g_mf2_page_registry.total_pages, 1, memory_order_relaxed); - atomic_fetch_add_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); - - pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); -} - -// Unregister a page from the global registry (called when returning page to OS) -__attribute__((unused)) static void mf2_unregister_page(MidPage* page) { - if (!page) return; - - size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - int lock_idx = idx % 256; - - pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); - - if (g_mf2_page_registry.pages[idx] == page) { - g_mf2_page_registry.pages[idx] = NULL; - atomic_fetch_sub_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); - } - - pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); -} - -// Allocate and initialize a new 64KB page for given size class -static MidPage* mf2_alloc_new_page(int class_idx) { - if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return NULL; - - // Get user size class (2KB, 4KB, 8KB, 16KB, 32KB) - size_t user_size = g_class_sizes[class_idx]; - if (user_size == 0) return NULL; // Dynamic class disabled - - // CRITICAL FIX: Each block needs HEADER_SIZE + user_size - // The header stores metadata (AllocHeader), user_size is the usable space - size_t block_size = HEADER_SIZE + user_size; - - // Step 1: Allocate 64KB page (aligned to 64KB boundary) - // CRITICAL FIX #4: Must ensure 64KB alignment! - // mmap() only guarantees 4KB alignment, breaking addr_to_page() lookup. - // This caused 97% of frees to fail silently (fatal bug!) - // - // CRITICAL FIX: Use mmap() + alignment adjustment to avoid recursion! - // Using wrapped posix_memalign with WRAP_L2=1 causes infinite recursion. 
- - // Allocate 2x size to allow alignment adjustment - size_t alloc_size = POOL_PAGE_SIZE * 2; // 128KB - void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (raw == MAP_FAILED) { - return NULL; // OOM - } - - // Find 64KB aligned address within allocation - uintptr_t addr = (uintptr_t)raw; - uintptr_t aligned = (addr + 0xFFFF) & ~0xFFFFULL; // Round up to 64KB boundary - void* page_base = (void*)aligned; - - // Free unused prefix (if any) - size_t prefix_size = aligned - addr; - if (prefix_size > 0) { - munmap(raw, prefix_size); - } - - // Free unused suffix - size_t suffix_offset = prefix_size + POOL_PAGE_SIZE; - if (suffix_offset < alloc_size) { - munmap((char*)raw + suffix_offset, alloc_size - suffix_offset); - } - - // DEBUG: Log first few allocations - static _Atomic int mmap_count = 0; - int mc = atomic_fetch_add_explicit(&mmap_count, 1, memory_order_relaxed); - if (mc < 5) { - MF2_DEBUG_LOG("MMAP_ALLOC %d: raw=%p, aligned=%p, prefix=%zu, suffix=%zu", - mc, raw, page_base, prefix_size, alloc_size - suffix_offset); - } - - // ALIGNMENT VERIFICATION (Step 1) - if (((uintptr_t)page_base & 0xFFFF) != 0) { - MF2_ERROR_LOG("ALIGNMENT BUG: Page %p not 64KB aligned! (offset=%zu)", - page_base, ((uintptr_t)page_base & 0xFFFF)); - } - - // Zero-fill (required for posix_memalign) - // Note: This adds ~15μs overhead, but is necessary for correctness - memset(page_base, 0, POOL_PAGE_SIZE); - - // Step 2: Allocate MidPage descriptor - MidPage* page = (MidPage*)hkm_libc_calloc(1, sizeof(MidPage)); - if (!page) { - // CRITICAL FIX: Use munmap for mmap-allocated memory - munmap(page_base, POOL_PAGE_SIZE); - return NULL; - } - - // Step 3: Initialize page descriptor - page->base = page_base; - page->class_idx = (uint8_t)class_idx; - page->flags = 0; - page->owner_tid = pthread_self(); - page->owner_tp = mf2_thread_pages_get(); // Store owner's ThreadPages for pending queue - page->last_transfer_time = 0; // No transfer yet (lease mechanism) - - // Step 4: Build freelist chain (walk through page and link blocks) - // Calculate how many blocks fit in 64KB page (including header overhead) - size_t usable_size = POOL_PAGE_SIZE; - size_t num_blocks = usable_size / block_size; - - page->capacity = (uint16_t)num_blocks; - page->free_count = (uint16_t)num_blocks; - - // Build linked list of free blocks - PoolBlock* freelist_head = NULL; - PoolBlock* freelist_tail = NULL; - - for (size_t i = 0; i < num_blocks; i++) { - char* block_addr = (char*)page_base + (i * block_size); - PoolBlock* block = (PoolBlock*)block_addr; - - block->next = NULL; - - if (freelist_head == NULL) { - freelist_head = block; - freelist_tail = block; - } else { - freelist_tail->next = block; - freelist_tail = block; - } - } - - page->freelist = freelist_head; - - // Step 5: Initialize remote stack (for cross-thread frees) - atomic_store(&page->remote_head, (uintptr_t)0); - atomic_store(&page->remote_count, 0); - - // Step 6: Initialize lifecycle counters - atomic_store(&page->in_use, 0); // No blocks allocated yet - atomic_store(&page->pending_dn, 0); - - // Step 7: Initialize linkage - page->next_page = NULL; - page->prev_page = NULL; - - // Initialize pending queue fields - atomic_store_explicit(&page->in_remote_pending, false, memory_order_relaxed); - page->next_pending = NULL; - - // Step 8: Register page in global registry - mf2_register_page(page); - - return page; -} - -// --- MF2 Allocation & Free Operations --- - -// Forward declarations -static void 
mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page); - -// Drain remote frees (cross-thread) into page's local freelist -// Called by owner thread when local freelist is empty -static int mf2_drain_remote_frees(MidPage* page) { - if (!page) return 0; - - atomic_fetch_add(&g_mf2_drain_attempts, 1); - - // Check if there are any remote frees (FIX #6: use seq_cst to ensure total ordering - DEBUG) - unsigned int remote_count = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - if (remote_count == 0) { - return 0; // Nothing to drain - } - - // Atomically swap remote stack head with NULL (lock-free pop all) - uintptr_t head = atomic_exchange_explicit(&page->remote_head, (uintptr_t)0, - memory_order_acq_rel); - if (!head) { - atomic_store_explicit(&page->remote_count, 0, memory_order_release); - return 0; // Race: someone else drained it - } - - // Reset remote count (FIX #6: use release for future drain checks to see) - atomic_store_explicit(&page->remote_count, 0, memory_order_release); - - // Walk the remote stack and count blocks - int drained = 0; - PoolBlock* cur = (PoolBlock*)head; - PoolBlock* tail = NULL; - - while (cur) { - drained++; - tail = cur; - cur = cur->next; - } - - // Append remote stack to local freelist (splice in front for simplicity) - if (tail) { - tail->next = page->freelist; - page->freelist = (PoolBlock*)head; - page->free_count += drained; - } - - atomic_fetch_add(&g_mf2_drain_count, 1); - atomic_fetch_add(&g_mf2_drain_blocks, drained); - - // CRITICAL FIX: Check if new remotes arrived DURING drain - // If so, re-enqueue to owner's pending queue (avoid losing remotes!) - unsigned int post_drain_count = atomic_load_explicit(&page->remote_count, memory_order_acquire); - if (post_drain_count >= 1 && page->owner_tp) { // Use same threshold as initial enqueue - // New remotes arrived during drain, re-enqueue for next round - // Note: This is safe because flag was cleared earlier - mf2_enqueue_pending(page->owner_tp, page); - } - - return drained; -} - -// =========================================================================== -// Pending Queue Operations (MPSC Lock-Free Stack) -// =========================================================================== - -// Enqueue page to owner's pending queue (called by remote threads) -// MPSC: Multiple producers (remote free threads), single consumer (owner) -static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page) { - if (!owner_tp || !page) return; - - // Already in pending? 
Skip (avoid duplicate enqueue) - _Bool was_pending = atomic_exchange_explicit(&page->in_remote_pending, true, memory_order_acq_rel); - if (was_pending) { - return; // Already enqueued, nothing to do - } - - atomic_fetch_add(&g_mf2_pending_enqueued, 1); - - // Push to owner's pending stack (Treiber stack algorithm) - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&owner_tp->pages_remote_pending[page->class_idx], memory_order_relaxed); - page->next_pending = (MidPage*)old_head; - } while (!atomic_compare_exchange_weak_explicit( - &owner_tp->pages_remote_pending[page->class_idx], - &old_head, (uintptr_t)page, - memory_order_release, // Publish page - memory_order_relaxed)); - - // 0→1 detection: Increment adoptable count for this class - // This enables O(1) early return in try_adopt (if count==0, no scan needed) - if (old_head == 0) { - atomic_fetch_add_explicit(&g_adoptable_count[page->class_idx], 1, memory_order_relaxed); - } -} - -// Dequeue one page from pending queue (called by owner thread or adopter) -// Uses CAS for correctness (multi-consumer in adoption path) -static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_acquire); - if (old_head == 0) { - return NULL; // Queue empty - } - MidPage* page = (MidPage*)old_head; - - // CAS to pop head - if (atomic_compare_exchange_weak_explicit( - &tp->pages_remote_pending[class_idx], - &old_head, (uintptr_t)page->next_pending, - memory_order_acq_rel, memory_order_relaxed)) { - // Successfully dequeued - MidPage* next = page->next_pending; - page->next_pending = NULL; // Clear link - - // If queue became empty (next==NULL), decrement adoptable count - // This enables O(1) early return in try_adopt when all queues empty - if (next == NULL) { - atomic_fetch_sub_explicit(&g_adoptable_count[class_idx], 1, memory_order_relaxed); - } - - return page; - } - } while (1); -} - -// =========================================================================== -// End of Pending Queue Operations -// =========================================================================== - -// Forward declarations -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id); - -// =========================================================================== -// Helper Functions (Clean & Modular) -// =========================================================================== - -// Helper: Make page active (move old active to full_pages) -static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return; - - // Move old active page to full_pages (if any) - if (tp->active_page[class_idx]) { - MidPage* old_active = tp->active_page[class_idx]; - old_active->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_active; - } - - // Set new page as active - tp->active_page[class_idx] = page; - page->next_page = NULL; -} - -// Helper: Drain page and add to partial list (LIFO for cache locality) -// Returns true if page has free blocks after drain -static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, add to partial list (LIFO) - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - page->next_page = 
tp->partial_pages[class_idx]; - tp->partial_pages[class_idx] = page; - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Drain page and activate if successful (Direct Handoff - backward compat) -// Returns true if page was activated -static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, make it active immediately - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - mf2_make_page_active(tp, class_idx, page); - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1) -// Returns true if a page was successfully drained and activated -static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - // Budget: Process up to N pages to avoid blocking - for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) { - MidPage* pending_page = mf2_dequeue_pending(tp, class_idx); - if (!pending_page) break; // Queue empty - - atomic_fetch_add(&g_mf2_pending_drained, 1); - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release); - - // DIRECT HANDOFF: Drain and activate if successful - if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) { - return true; // Success! Page is now active - } - // No freelist after drain, page returned to full_pages by helper - } - return false; // No pages available for reuse -} - -// Helper: Try to drain remotes from active page (must-reuse gate part 2) -// Returns true if active page has freelist after drain -static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - MidPage* page = tp->active_page[class_idx]; - if (!page) return false; - - atomic_fetch_add(&g_mf2_slow_checked_drain, 1); - unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - - if (remote_cnt > 0) { - atomic_fetch_add(&g_mf2_slow_found_remote, 1); - int drained = mf2_drain_remote_frees(page); - if (drained > 0 && page->freelist) { - atomic_fetch_add(&g_mf2_drain_success, 1); - return true; // Success! 
Active page now has freelist - } - } - return false; // No remotes or drain failed -} - -// Helper: Allocate new page and make it active -// Returns the newly allocated page (or NULL on OOM) -static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - atomic_fetch_add(&g_mf2_new_page_count, 1); - - // DEBUG: Log why we're allocating new page (first N samples) - static _Atomic int new_page_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed); - if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) { - // Count adoptable pages across all threads - int total_adoptable = 0; - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed); - } - MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p", - sample_idx, class_idx, - (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed), - total_adoptable, - tp->active_page[class_idx], - tp->full_pages[class_idx]); - } - - MidPage* page = mf2_alloc_new_page(class_idx); - if (!page) { - return NULL; // OOM - } - - // Move current active page to full list (if any) - if (tp->active_page[class_idx]) { - MidPage* old_page = tp->active_page[class_idx]; - old_page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_page; - } - - // Set new page as active - tp->active_page[class_idx] = page; - tp->page_count[class_idx]++; - - return page; -} - -// =========================================================================== -// End of Helper Functions -// =========================================================================== - -// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue -// Returns true if a page was successfully adopted and activated -// Called from alloc_slow when allocating thread needs memory -static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) { - if (!me) return false; - - // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating) - // Avoids scanning empty queues (major performance win!) - int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed); - if (adoptable == 0) return false; // All queues empty, no scan needed - - // Get global thread registry - int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); - if (num_tp == 0) return false; - - // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES) - // Prevents excessive scanning overhead (2-8 threads is usually enough) - int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues; - - // Round-robin scan (limited number of threads, not ALL!) 
- static _Atomic uint64_t adopt_counter = 0; - uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed); - - for (int i = 0; i < scan_limit; i++) { - int tp_idx = (start_idx + i) % num_tp; - MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit( - (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire); - - if (!other_tp) continue; - - // Route P: Idle Detection - Only adopt from idle owners - // Check if owner is still actively allocating (threshold configurable via env var) - uint64_t now_tsc = mf2_rdtsc(); - uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed); - uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US; - - if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) { - continue; // Owner still active, skip adoption - } - - // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!) - // Only one thread scans each queue at a time → eliminates CAS contention - if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) { - continue; // Another thread is already scanning this queue, skip - } - - // Try to dequeue a pending page from this thread - MidPage* page = mf2_dequeue_pending(other_tp, class_idx); - if (!page) { - // Queue empty, release claim and try next thread - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; - } - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&page->in_remote_pending, false, memory_order_release); - - // Check lease: Has enough time passed since last transfer? (configurable via HAKMEM_MF2_LEASE_MS) - // 0ms = disabled (no lease check), >0 = lease period in milliseconds - uint64_t now = mf2_rdtsc(); - uint64_t last_transfer = page->last_transfer_time; - if (g_mf2_lease_ms > 0 && last_transfer != 0) { - // Calculate lease cycles from ms (approx 3GHz CPU) - uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL); - if ((now - last_transfer) < lease_cycles) { - // Lease still active, return page to full_pages (don't thrash ownership) - page->next_page = other_tp->full_pages[class_idx]; - other_tp->full_pages[class_idx] = page; - // Release claim before continuing - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; // Try next thread - } - } - - // Try to transfer ownership using CAS - pthread_t old_owner = page->owner_tid; - pthread_t new_owner = pthread_self(); - - // Note: pthread_t may not be atomic-compatible on all platforms - // For now, we'll use a simple write (ownership transfer is rare) - // TODO: If thrashing is observed, add atomic CAS with serialization - page->owner_tid = new_owner; - page->owner_tp = me; - page->last_transfer_time = now; - - // DEBUG: Log drain state - static _Atomic int adopt_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed); - unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed); - unsigned int pre_free = page->free_count; - PoolBlock* pre_freelist = page->freelist; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // DEBUG: Log result (first 10 samples) - if (sample_idx < 10) { - MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p", - sample_idx, class_idx, pre_remote, drained, - 
pre_free, page->free_count, pre_freelist, page->freelist); - } - - // Make adopted page ACTIVE immediately (not partial!) - // Adoption needs immediate activation for caller's mf2_alloc_fast() - // Partial list is only for own pending queue drains - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - atomic_fetch_add(&g_mf2_pending_drained, 1); - atomic_fetch_add(&g_mf2_drain_success, 1); - - // Make it active (move old active to full_pages) - mf2_make_page_active(me, class_idx, page); - - // Release claim before returning SUCCESS - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - return true; // SUCCESS! Page adopted and activated - } - - // No freelist after drain, return to MY full_pages (I'm the new owner!) - page->next_page = me->full_pages[class_idx]; - me->full_pages[class_idx] = page; - // Release claim before continuing search - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - // Continue searching for a better page - } - - return false; // No adoptable pages found -} - -// Fast allocation path (owner thread, NO LOCK!) -static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) { - // Get thread-local page lists - MF2_ThreadPages* tp = mf2_thread_pages_get(); - if (!tp) return NULL; - - // Get active page for this class - MidPage* page = tp->active_page[class_idx]; - if (!page) { - // No active page, go to slow path - return mf2_alloc_slow(class_idx, size, site_id); - } - - // FAST PATH: Pop from page-local freelist (NO LOCK!) - if (page->freelist) { - atomic_fetch_add(&g_mf2_alloc_fast_hit, 1); - - // Route P: Update activity tracking for idle detection - atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); - - PoolBlock* block = page->freelist; - page->freelist = block->next; - page->free_count--; - - // Increment in-use count (atomic for cross-thread visibility) - atomic_fetch_add_explicit(&page->in_use, 1, memory_order_relaxed); - - // Return user pointer (skip header) - return (char*)block + HEADER_SIZE; - } - - // Local freelist empty, go to slow path - return mf2_alloc_slow(class_idx, size, site_id); -} - -// Slow allocation path (drain remote or allocate new page) -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id) { - (void)site_id; // Unused for now - - atomic_fetch_add(&g_mf2_alloc_slow_hit, 1); - - // Get thread-local page lists - MF2_ThreadPages* tp = mf2_thread_pages_get(); - if (!tp) return NULL; - - // =========================================================================== - // Allocation Strategy (Must-Reuse Order) - // =========================================================================== - // 1. MUST-REUSE GATE (Part 1): Drain own pending queue - // - Process up to 4 pages to avoid blocking - // - Direct handoff: activate first successful drain immediately - if (mf2_try_reuse_own_pending(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // 2. 
MUST-REUSE GATE (Part 2): Drain active page remotes - // - Check if current active page has remote frees - // - Drain and retry allocation if successful - if (mf2_try_drain_active_remotes(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // HISTORICAL NOTE: full_pages scan removed - // Old approach: Scan full_pages looking for pages with remotes - // Problem: Drained pages consumed before owner can scan them - // New approach: Direct Handoff immediately activates drained pages - // Result: full_pages scan always finds 0 pages (100% waste) - // - // Benchmark evidence (before removal): - // - Full scan checked: 1,879,484 pages - // - Full scan found: 0 pages (0% success rate!) - - // 3. Consumer-Driven Adoption (Route P with idle detection) - // - Only adopt from idle owners (haven't allocated in >150µs) - // - Prevents "adoption stealing" from active owners - if (mf2_try_adopt_pending(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // 4. MUST-REUSE GATE (Final): Allocate new page (last resort) - // - Only reached after exhausting all reuse opportunities - // - Order: pending queue → active drain → adoption → NEW - MidPage* page = mf2_alloc_and_activate_new_page(tp, class_idx); - if (!page) { - return NULL; // OOM - } - - // Retry allocation from new page - return mf2_alloc_fast(class_idx, size, site_id); -} - -// Forward declaration of slow free path -static void mf2_free_slow(MidPage* page, void* ptr); - -// Strategy A: Global Round-Robin Drain (Cross-Thread Pending Queue) -// Fast free path (owner thread, NO LOCK!) -static inline void mf2_free_fast(MidPage* page, void* ptr) { - if (!page || !ptr) return; - - atomic_fetch_add(&g_mf2_free_owner_count, 1); - - // Get block pointer (rewind to header) - PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); - - // FAST PATH: Push to page-local freelist (NO LOCK!) 
- block->next = page->freelist; - page->freelist = block; - page->free_count++; - - // Decrement in-use count (atomic for cross-thread visibility) - int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); - - // Check if page is now empty (all blocks free) - if (old_in_use == 1 && page->free_count == page->capacity) { - // Memory efficiency: Return empty pages to OS via MADV_DONTNEED - // Keeps VA mapped (no munmap), but releases physical memory - hak_batch_add_page(page->base, POOL_PAGE_SIZE); - } -} - -// Slow free path (cross-thread free to remote stack) -static void mf2_free_slow(MidPage* page, void* ptr) { - if (!page || !ptr) return; - - atomic_fetch_add(&g_mf2_free_remote_count, 1); - - // Get block pointer - PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); - - // Push to page's remote stack (lock-free MPSC) - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire); - block->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit( - &page->remote_head, &old_head, (uintptr_t)block, - memory_order_release, memory_order_relaxed)); - - // Increment remote count and detect threshold for enqueueing - unsigned int old_count = atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_seq_cst); - - // CRITICAL FIX: Use threshold-based enqueueing instead of 0→1 edge - // Problem: 0→1 causes ping-pong (drain 1 block → next free triggers 0→1 again) - // Solution: Only enqueue when remotes accumulate to threshold (better batching) - // - // Threshold values (configurable via HAKMEM_MF2_ENQUEUE_THRESHOLD, default=4): - // 1 = immediate (0→1 edge, causes ping-pong) - // 4 = balanced (batch 4 blocks before notifying owner) - // 8 = aggressive batching (higher latency, but better efficiency) - // - // We enqueue on transitions TO the threshold (old_count == threshold-1) - static int g_enqueue_threshold = 1; // 1=immediate (0→1 edge), 2=batch-2, 4=batch-4 - if (old_count + 1 == (unsigned int)g_enqueue_threshold) { - // Remote count just reached threshold, notify owner - if (page->owner_tp) { - mf2_enqueue_pending(page->owner_tp, page); - } - } - - // DEBUG: Sample first 10 remote frees - Disabled for performance - // static _Atomic int remote_free_samples = 0; - // int sample = atomic_fetch_add_explicit(&remote_free_samples, 1, memory_order_relaxed); - // if (sample < 10) { - // fprintf(stderr, "[REMOTE_FREE %d] ptr=%p → page=%p (base=%p), remote_count=%u (was %u), EDGE=%s\n", - // sample, ptr, page, page->base, old_count + 1, old_count, (old_count == 0) ? "YES" : "NO"); - // } - - // Decrement in-use count - int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); - - // Check if page is now empty (FIX #6: acquire to see all remote frees) - if (old_in_use == 1 && page->free_count + atomic_load_explicit(&page->remote_count, memory_order_acquire) >= page->capacity) { - // Memory efficiency: Return empty pages to OS via MADV_DONTNEED - // Keeps VA mapped (no munmap), but releases physical memory - hak_batch_add_page(page->base, POOL_PAGE_SIZE); - } -} - -// Top-level free dispatcher -static void mf2_free(void* ptr) { - if (!ptr) return; - - // O(1) page lookup (mimalloc's magic!) 
- MidPage* page = mf2_addr_to_page(ptr); - if (!page) { - // Not a MF2 page (shouldn't happen if MF2 is enabled properly) - return; - } - - // Check if we're the owner (fast path) - MF2_ThreadPages* tp = mf2_thread_pages_get(); - - if (tp && page->owner_tid == tp->my_tid) { - // Fast: Owner thread, push to local freelist (NO LOCK!) - mf2_free_fast(page, ptr); - } else { - // Slow: Cross-thread free, push to remote stack (lock-free) - mf2_free_slow(page, ptr); - } -} - -// =========================================================================== -// Global pool state (simplified: single-threaded for MVP) -static struct { - PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Locks: per (class, shard) freelist to allow concurrent operations - PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Non-empty bitmap (O(1) empty class skip) - // Bit i = 1 if freelist[class][shard] is non-empty - // Use atomic to avoid class-wide locks - atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES]; // 1 bit per shard - - // Remote-free MPSC stacks per (class, shard): lock-free producers, drained under lock on alloc - atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - atomic_uint remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Statistics - uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t total_bytes_allocated __attribute__((aligned(64))); - uint64_t total_pages_allocated __attribute__((aligned(64))); - - // Per-class page accounting (for Soft CAP guidance) - uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64))); - - // ACE: per-class bundle factor for refill (1..4) + last snapshot - int bundle_factor[POOL_NUM_CLASSES]; - uint64_t last_hits[POOL_NUM_CLASSES]; - uint64_t last_misses[POOL_NUM_CLASSES]; - - int initialized; - int tls_free_enabled; // env: HAKMEM_POOL_TLS_FREE (default: 1) - - // Extra metrics (for learner logging): all relaxed atomics - atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64))); - atomic_uint_fast64_t trylock_success __attribute__((aligned(64))); - atomic_uint_fast64_t ring_underflow __attribute__((aligned(64))); -} g_pool; - -static int g_wrap_l2_enabled = 0; // env: HAKMEM_WRAP_L2=1 to allow in wrappers -static int g_shard_mix_enabled = 0; // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing -static int g_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING=1 to enable TLS ring -static int g_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES (1..8) -static int g_ring_return_div = 2; // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third) -static int g_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX (LIFO size cap) -int g_hdr_light_enabled = 0; // env: HAKMEM_HDR_LIGHT=1 (minimize extra fields), =2 (no header writes/validation) -static int g_pool_min_bundle = 2; // env: HAKMEM_POOL_MIN_BUNDLE (default 2) -// Sampled counter updates to reduce hot-path stores: 1/2^k -static int g_count_sample_exp = 10; // env: HAKMEM_POOL_COUNT_SAMPLE (0..16) -static __thread uint32_t t_pool_rng = 0x243f6a88u; // per-thread RNG for sampling - -// Size class table (for O(1) lookup). Index 5/6 are Bridge classes for 32-64KB gap. 
-// 7 classes including Bridge classes (40KB, 52KB) to fill 32-64KB gap -static size_t g_class_sizes[POOL_NUM_CLASSES] = { - POOL_CLASS_2KB, // 2 KB - POOL_CLASS_4KB, // 4 KB - POOL_CLASS_8KB, // 8 KB - POOL_CLASS_16KB, // 16 KB - POOL_CLASS_32KB, // 32 KB - POOL_CLASS_40KB, // 40 KB (Bridge class 0) - POOL_CLASS_52KB // 52 KB (Bridge class 1) -}; - -// Blocks per page (for each class) -__attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = { - POOL_PAGE_SIZE / POOL_CLASS_2KB, // 32 blocks (2KiB) - POOL_PAGE_SIZE / POOL_CLASS_4KB, // 16 blocks (4KiB) - POOL_PAGE_SIZE / POOL_CLASS_8KB, // 8 blocks (8KiB) - POOL_PAGE_SIZE / POOL_CLASS_16KB, // 4 blocks (16KiB) - POOL_PAGE_SIZE / POOL_CLASS_32KB, // 2 blocks (32KiB) - POOL_PAGE_SIZE / POOL_CLASS_40KB, // 1 block (40KiB Bridge) - POOL_PAGE_SIZE / POOL_CLASS_52KB // 1 block (52KiB Bridge) -}; - -// =========================================================================== -// Helper Functions -// =========================================================================== - -// Write minimal header for Mid allocation (fast-return friendly) -static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) { - // For Mid, prefer headerless operation when HDR_LIGHT>=1. - // Debug or non-Mid callers can still write full headers elsewhere. - if (g_hdr_light_enabled >= 1) return; // skip header on alloc hot path - hdr->magic = HAKMEM_MAGIC; - hdr->method = ALLOC_METHOD_POOL; - hdr->size = class_sz; - if (!g_hdr_light_enabled) { - hdr->alloc_site = site_id; - hdr->class_bytes = 0; - hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self(); - } -} - -// Branchless LUT (Lookup Table) for O(1) class determination -// Expanded to 53 entries for Bridge classes (40KB, 52KB) -static const uint8_t SIZE_TO_CLASS[53] = { - 0,0,0, // 0-2KB → Class 0 - 1,1, // 3-4KB → Class 1 - 2,2,2,2, // 5-8KB → Class 2 - 3,3,3,3,3,3,3,3, // 9-16KB → Class 3 - 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, // 17-32KB → Class 4 - 5,5,5,5,5,5,5,5, // 33-40KB → Class 5 (Bridge class 0) - 6,6,6,6,6,6,6,6,6,6,6,6 // 41-52KB → Class 6 (Bridge class 1) -}; - -// Get size class index from size (0-6, or -1 if out of range) -// Updated range check for Bridge classes (0-52KB) -static inline int hak_pool_get_class_index(size_t size) { - // Fast path: exact match against configured class sizes (covers Bridge classes) - // Note: size passed here should already be a rounded class size from ACE. - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - size_t cs = g_class_sizes[i]; - if (cs != 0 && size == cs) return i; - } - // Fallback: map arbitrary size to nearest fixed class range via LUT (legacy behavior) - uint32_t kb = (uint32_t)((size + 1023) >> 10); // Round up to KB units - return (kb < 53) ? 
SIZE_TO_CLASS[kb] : -1; // Expanded to 53KB for Bridge classes -} - -// Get shard index from site_id (0-63) -int hak_pool_get_shard_index(uintptr_t site_id) { - if (!g_shard_mix_enabled) { - // Legacy: Shift by 4 to reduce collision (instruction alignment) - return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1)); - } - // SplitMix64-like mixer with thread id salt for better dispersion - uint64_t x = (uint64_t)site_id; - uint64_t tid = (uint64_t)(uintptr_t)pthread_self(); - x ^= (tid << 1); - x += 0x9e3779b97f4a7c15ULL; - x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; - x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; - x = (x ^ (x >> 31)); - return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1)); -} - -// TLS helpers -#include "box/pool_tls_core.inc.h" - - -// Refill/ACE (boxed) -#include "box/pool_refill.inc.h" - -// Init/Shutdown + MF2 debug (boxed) -#include "box/pool_init_api.inc.h" - -// Pool statistics (boxed) -#include "box/pool_stats.inc.h" - -// Public API (boxed): alloc/free/lookup/free_fast -#include "box/pool_api.inc.h" diff --git a/core/hakmem_pool.c.bak3 b/core/hakmem_pool.c.bak3 deleted file mode 100644 index f7dec263..00000000 --- a/core/hakmem_pool.c.bak3 +++ /dev/null @@ -1,1190 +0,0 @@ -// ============================================================================ -// hakmem_pool.c - L2 Hybrid Pool Implementation (Mid-Size: 2-32KiB) -// ============================================================================ -// -// サイズクラス定義: -// ┌──────────┬─────────┬──────────────┬─────────────┐ -// │ クラス │ サイズ │ 初期CAP │ ページ構成 │ -// ├──────────┼─────────┼──────────────┼─────────────┤ -// │ Class 0 │ 2 KiB │ 64 pages │ 32 blocks/p │ -// │ Class 1 │ 4 KiB │ 64 pages │ 16 blocks/p │ -// │ Class 2 │ 8 KiB │ 64 pages │ 8 blocks/p │ -// │ Class 3 │ 16 KiB │ 32 pages │ 4 blocks/p │ -// │ Class 4 │ 32 KiB │ 16 pages │ 2 blocks/p │ -// │ DYN1 │ 6 KiB* │ 0 (無効) │ 可変 │ -// │ DYN2 │ (未使用)│ 0 (無効) │ 可変 │ -// └──────────┴─────────┴──────────────┴─────────────┘ -// * DYN1はギャップ(8-16KB)を埋めるための動的クラス -// -// W_MAX (切り上げ許容倍率): -// - 意味: 要求サイズの何倍までのクラスを許容するか -// - デフォルト: 1.40 (40%までの切り上げを許容) -// - 例: 3KiBの要求 → 4KiBクラス使用OK (1.33倍 < 1.40) -// - 環境変数: HAKMEM_WMAX_MID=1.6 で変更可能 -// -// CAP (在庫量): -// - 意味: 各クラスで保持する最大ページ数 -// - 初期値: {64,64,64,32,16} - 保守的(フットプリント優先) -// - 推奨値: {256,256,256,128,64} - パフォーマンス優先 -// - 環境変数: HAKMEM_CAP_MID=256,256,256,128,64 で設定 -// - 学習モード: HAKMEM_LEARN=1 で自動調整 -// -// TLSリング構造: -// - POOL_L2_RING_CAP: リングバッファ容量(デフォルト16) -// - ActivePage A/B: bump-run方式(ロックフリー) -// - LIFO overflow: リングから溢れた分 -// -// パフォーマンスチューニング: -// 1. 初期CAP 4倍化: HAKMEM_CAP_MID=256,256,256,128,64 -// 2. W_MAX緩和: HAKMEM_WMAX_MID=1.6 -// 3. DYN1有効化: HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64 -// 4. 
学習モード: HAKMEM_LEARN=1 -// -// License: MIT -// Last Updated: 2025-10-26 (Code Cleanup完了) - -#include "hakmem_pool.h" -#include "hakmem_config.h" -#include "hakmem_internal.h" // For AllocHeader and HAKMEM_MAGIC -#include "hakmem_syscall.h" // Box 3 syscall layer (bypasses LD_PRELOAD) -#include -#include -#include -#include -#include -#include -#include -#include "hakmem_prof.h" -#include "hakmem_policy.h" // FrozenPolicy caps (Soft CAP gating) -#include "hakmem_debug.h" - -// False sharing mitigation: padded mutex type (64B) -typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64)]; } PaddedMutex; - -// =========================================================================== -// Internal Data Structures -// =========================================================================== -#include "box/pool_tls_types.inc.h" - -// Mid page descriptor registry (64KiB pages → {class_idx, owner_tid}) -#include "box/pool_mid_desc.inc.h" - -// ---------------- Transfer Cache (per-thread per-class inbox) -------------- -#include "box/pool_mid_tc.inc.h" - -#include "box/pool_mf2_types.inc.h" - - -// --- MF2 Initialization Functions --- - -// Thread-safe initialization using pthread_once -static pthread_once_t mf2_page_registry_init_control = PTHREAD_ONCE_INIT; -static void mf2_page_registry_init_impl(void) { - // Initialize all page slots to NULL - memset(&g_mf2_page_registry, 0, sizeof(g_mf2_page_registry)); - - // Initialize 256 coarse-grained locks for registry updates - for (int i = 0; i < 256; i++) { - pthread_mutex_init(&g_mf2_page_registry.locks[i], NULL); - } - - // Initialize counters - atomic_store(&g_mf2_page_registry.total_pages, 0); - atomic_store(&g_mf2_page_registry.active_pages, 0); -} -static void mf2_page_registry_init(void) { - pthread_once(&mf2_page_registry_init_control, mf2_page_registry_init_impl); -} - -// Strategy A: ThreadPages destructor (cleanup on thread exit) -static void mf2_thread_pages_destructor(void* arg) { - MF2_ThreadPages* tp = (MF2_ThreadPages*)arg; - if (!tp) return; - - // SAFETY: Don't remove from global registry or free memory - // Reason: Causes "malloc(): unsorted double linked list corrupted" crashes - // Tradeoff: Small memory leak (one ThreadPages struct per thread lifetime) - // TODO: Investigate safe cleanup mechanism - - // Remove from global registry (DISABLED for safety) - // for (int i = 0; i < atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); i++) { - // if (atomic_load_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], memory_order_acquire) == (uintptr_t)tp) { - // atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], 0, memory_order_release); - // break; - // } - // } - - // Free all pages owned by this thread (DISABLED for safety) - // hkm_libc_free(tp); - - (void)tp; // Suppress unused warning -} - -// Strategy A: Initialize pthread_key (once only) -static void mf2_init_tls_key(void) { - pthread_key_create(&g_mf2_tls_key, mf2_thread_pages_destructor); -} - -// Helper: rdtsc() - Read CPU timestamp counter (for Route P idle detection) -static inline uint64_t mf2_rdtsc(void) { -#if defined(__x86_64__) || defined(__i386__) - uint32_t lo, hi; - __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); - return ((uint64_t)hi << 32) | lo; -#else - // Fallback for non-x86 architectures (use clock_gettime approximation) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; -#endif -} - -static MF2_ThreadPages* 
mf2_thread_pages_get(void) { - if (t_mf2_pages) return t_mf2_pages; - - // Initialize pthread_key (once only) - pthread_once(&g_mf2_key_once, mf2_init_tls_key); - - // Allocate thread-local page lists - MF2_ThreadPages* tp = (MF2_ThreadPages*)hkm_libc_calloc(1, sizeof(MF2_ThreadPages)); - if (!tp) return NULL; - - // Initialize with current thread ID - tp->my_tid = pthread_self(); - - // All page lists start empty (NULL) - for (int c = 0; c < POOL_NUM_CLASSES; c++) { - tp->active_page[c] = NULL; - tp->full_pages[c] = NULL; - atomic_store_explicit(&tp->pages_remote_pending[c], 0, memory_order_relaxed); - atomic_flag_clear_explicit(&tp->pending_claim[c], memory_order_relaxed); - tp->page_count[c] = 0; - } - - // Route P: Initialize activity tracking - atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); - - // Strategy A: Register in global array for round-robin drain - int idx = atomic_fetch_add_explicit(&g_num_thread_pages, 1, memory_order_acq_rel); - if (idx < MF2_MAX_THREADS) { - atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[idx], (uintptr_t)tp, memory_order_release); - - // DEBUG: Log first 10 thread registrations - Disabled for performance - // static _Atomic int reg_samples = 0; - // int rs = atomic_fetch_add_explicit(®_samples, 1, memory_order_relaxed); - // if (rs < 10) { - // fprintf(stderr, "[TLS_REGISTER %d] tid=%lu, tp=%p, idx=%d\n", - // rs, (unsigned long)tp->my_tid, tp, idx); - // } - } else { - MF2_ERROR_LOG("Too many threads! MAX=%d", MF2_MAX_THREADS); - } - - // Set pthread-specific data for destructor - pthread_setspecific(g_mf2_tls_key, tp); - - t_mf2_pages = tp; - return tp; -} - -// --- MF2 Page Allocation & Lookup --- - -// O(1) page lookup from block address (mimalloc's secret sauce!) -static inline MidPage* mf2_addr_to_page(void* addr) { - // Step 1: Get page base address (64KB aligned) - // 0xFFFF = 65535, ~0xFFFF clears bottom 16 bits - void* page_base = (void*)((uintptr_t)addr & ~0xFFFFULL); - - // Step 2: Index into registry (direct-mapped, 64K entries) - // (addr >> 16) extracts page index, & 0xFFFF wraps to registry size - size_t idx = ((uintptr_t)page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - - // Step 3: Direct lookup (no hash collision handling needed with 64K entries) - MidPage* page = g_mf2_page_registry.pages[idx]; - - // ALIGNMENT VERIFICATION (Step 3) - Sample first 100 lookups - static _Atomic int lookup_count = 0; - // DEBUG: Disabled for performance - // int count = atomic_fetch_add_explicit(&lookup_count, 1, memory_order_relaxed); - // if (count < 100) { - // int found = (page != NULL); - // int match = (page && page->base == page_base); - // fprintf(stderr, "[LOOKUP %d] addr=%p → page_base=%p → idx=%zu → found=%s", - // count, addr, page_base, idx, found ? "YES" : "NO"); - // if (page) { - // fprintf(stderr, ", page->base=%p, match=%s", - // page->base, match ? 
"YES" : "NO"); - // } - // fprintf(stderr, "\n"); - // } - - // Validation: Ensure page base matches (handles potential collisions) - if (page && page->base == page_base) { - return page; - } - - // Collision or not registered (shouldn't happen in normal operation) - return NULL; -} - -// Register a page in the global registry (called once per page allocation) -static void mf2_register_page(MidPage* page) { - if (!page) return; - - // Calculate registry index from page base - size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - - // ALIGNMENT VERIFICATION (Step 2) - DEBUG: Disabled for performance - // static int register_count = 0; - // if (register_count < 10) { - // fprintf(stderr, "[REGISTER %d] Page %p → idx %zu (aligned=%s)\n", - // register_count, page->base, idx, - // (((uintptr_t)page->base & 0xFFFF) == 0) ? "YES" : "NO"); - // register_count++; - // } - - // Coarse-grained lock (256 locks for 64K entries = 256 entries/lock) - int lock_idx = idx % 256; - pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); - - // Check for collision (should be rare with 64K entries) - if (g_mf2_page_registry.pages[idx] != NULL) { - // Collision detected - this is a problem! - // For MVP, we'll just log and overwrite (TODO: handle collisions properly) - HAKMEM_LOG("[MF2] WARNING: Page registry collision at index %zu\n", idx); - } - - // Register the page - g_mf2_page_registry.pages[idx] = page; - - // Update counters - atomic_fetch_add_explicit(&g_mf2_page_registry.total_pages, 1, memory_order_relaxed); - atomic_fetch_add_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); - - pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); -} - -// Unregister a page from the global registry (called when returning page to OS) -__attribute__((unused)) static void mf2_unregister_page(MidPage* page) { - if (!page) return; - - size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - int lock_idx = idx % 256; - - pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); - - if (g_mf2_page_registry.pages[idx] == page) { - g_mf2_page_registry.pages[idx] = NULL; - atomic_fetch_sub_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); - } - - pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); -} - -// Allocate and initialize a new 64KB page for given size class -static MidPage* mf2_alloc_new_page(int class_idx) { - if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return NULL; - - // Get user size class (2KB, 4KB, 8KB, 16KB, 32KB) - size_t user_size = g_class_sizes[class_idx]; - if (user_size == 0) return NULL; // Dynamic class disabled - - // CRITICAL FIX: Each block needs HEADER_SIZE + user_size - // The header stores metadata (AllocHeader), user_size is the usable space - size_t block_size = HEADER_SIZE + user_size; - - // Step 1: Allocate 64KB page (aligned to 64KB boundary) - // CRITICAL FIX #4: Must ensure 64KB alignment! - // mmap() only guarantees 4KB alignment, breaking addr_to_page() lookup. - // This caused 97% of frees to fail silently (fatal bug!) - // - // CRITICAL FIX: Use mmap() + alignment adjustment to avoid recursion! - // Using wrapped posix_memalign with WRAP_L2=1 causes infinite recursion. 
- - // Allocate 2x size to allow alignment adjustment - size_t alloc_size = POOL_PAGE_SIZE * 2; // 128KB - void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (raw == MAP_FAILED) { - return NULL; // OOM - } - - // Find 64KB aligned address within allocation - uintptr_t addr = (uintptr_t)raw; - uintptr_t aligned = (addr + 0xFFFF) & ~0xFFFFULL; // Round up to 64KB boundary - void* page_base = (void*)aligned; - - // Free unused prefix (if any) - size_t prefix_size = aligned - addr; - if (prefix_size > 0) { - munmap(raw, prefix_size); - } - - // Free unused suffix - size_t suffix_offset = prefix_size + POOL_PAGE_SIZE; - if (suffix_offset < alloc_size) { - munmap((char*)raw + suffix_offset, alloc_size - suffix_offset); - } - - // DEBUG: Log first few allocations - static _Atomic int mmap_count = 0; - int mc = atomic_fetch_add_explicit(&mmap_count, 1, memory_order_relaxed); - if (mc < 5) { - MF2_DEBUG_LOG("MMAP_ALLOC %d: raw=%p, aligned=%p, prefix=%zu, suffix=%zu", - mc, raw, page_base, prefix_size, alloc_size - suffix_offset); - } - - // ALIGNMENT VERIFICATION (Step 1) - if (((uintptr_t)page_base & 0xFFFF) != 0) { - MF2_ERROR_LOG("ALIGNMENT BUG: Page %p not 64KB aligned! (offset=%zu)", - page_base, ((uintptr_t)page_base & 0xFFFF)); - } - - // Zero-fill (required for posix_memalign) - // Note: This adds ~15μs overhead, but is necessary for correctness - memset(page_base, 0, POOL_PAGE_SIZE); - - // Step 2: Allocate MidPage descriptor - MidPage* page = (MidPage*)hkm_libc_calloc(1, sizeof(MidPage)); - if (!page) { - // CRITICAL FIX: Use munmap for mmap-allocated memory - munmap(page_base, POOL_PAGE_SIZE); - return NULL; - } - - // Step 3: Initialize page descriptor - page->base = page_base; - page->class_idx = (uint8_t)class_idx; - page->flags = 0; - page->owner_tid = pthread_self(); - page->owner_tp = mf2_thread_pages_get(); // Store owner's ThreadPages for pending queue - page->last_transfer_time = 0; // No transfer yet (lease mechanism) - - // Step 4: Build freelist chain (walk through page and link blocks) - // Calculate how many blocks fit in 64KB page (including header overhead) - size_t usable_size = POOL_PAGE_SIZE; - size_t num_blocks = usable_size / block_size; - - page->capacity = (uint16_t)num_blocks; - page->free_count = (uint16_t)num_blocks; - - // Build linked list of free blocks - PoolBlock* freelist_head = NULL; - PoolBlock* freelist_tail = NULL; - - for (size_t i = 0; i < num_blocks; i++) { - char* block_addr = (char*)page_base + (i * block_size); - PoolBlock* block = (PoolBlock*)block_addr; - - block->next = NULL; - - if (freelist_head == NULL) { - freelist_head = block; - freelist_tail = block; - } else { - freelist_tail->next = block; - freelist_tail = block; - } - } - - page->freelist = freelist_head; - - // Step 5: Initialize remote stack (for cross-thread frees) - atomic_store(&page->remote_head, (uintptr_t)0); - atomic_store(&page->remote_count, 0); - - // Step 6: Initialize lifecycle counters - atomic_store(&page->in_use, 0); // No blocks allocated yet - atomic_store(&page->pending_dn, 0); - - // Step 7: Initialize linkage - page->next_page = NULL; - page->prev_page = NULL; - - // Initialize pending queue fields - atomic_store_explicit(&page->in_remote_pending, false, memory_order_relaxed); - page->next_pending = NULL; - - // Step 8: Register page in global registry - mf2_register_page(page); - - return page; -} - -// --- MF2 Allocation & Free Operations --- - -// Forward declarations -static void 
mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page); - -// Drain remote frees (cross-thread) into page's local freelist -// Called by owner thread when local freelist is empty -static int mf2_drain_remote_frees(MidPage* page) { - if (!page) return 0; - - atomic_fetch_add(&g_mf2_drain_attempts, 1); - - // Check if there are any remote frees (FIX #6: use seq_cst to ensure total ordering - DEBUG) - unsigned int remote_count = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - if (remote_count == 0) { - return 0; // Nothing to drain - } - - // Atomically swap remote stack head with NULL (lock-free pop all) - uintptr_t head = atomic_exchange_explicit(&page->remote_head, (uintptr_t)0, - memory_order_acq_rel); - if (!head) { - atomic_store_explicit(&page->remote_count, 0, memory_order_release); - return 0; // Race: someone else drained it - } - - // Reset remote count (FIX #6: use release for future drain checks to see) - atomic_store_explicit(&page->remote_count, 0, memory_order_release); - - // Walk the remote stack and count blocks - int drained = 0; - PoolBlock* cur = (PoolBlock*)head; - PoolBlock* tail = NULL; - - while (cur) { - drained++; - tail = cur; - cur = cur->next; - } - - // Append remote stack to local freelist (splice in front for simplicity) - if (tail) { - tail->next = page->freelist; - page->freelist = (PoolBlock*)head; - page->free_count += drained; - } - - atomic_fetch_add(&g_mf2_drain_count, 1); - atomic_fetch_add(&g_mf2_drain_blocks, drained); - - // CRITICAL FIX: Check if new remotes arrived DURING drain - // If so, re-enqueue to owner's pending queue (avoid losing remotes!) - unsigned int post_drain_count = atomic_load_explicit(&page->remote_count, memory_order_acquire); - if (post_drain_count >= 1 && page->owner_tp) { // Use same threshold as initial enqueue - // New remotes arrived during drain, re-enqueue for next round - // Note: This is safe because flag was cleared earlier - mf2_enqueue_pending(page->owner_tp, page); - } - - return drained; -} - -// =========================================================================== -// Pending Queue Operations (MPSC Lock-Free Stack) -// =========================================================================== - -// Enqueue page to owner's pending queue (called by remote threads) -// MPSC: Multiple producers (remote free threads), single consumer (owner) -static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page) { - if (!owner_tp || !page) return; - - // Already in pending? 
Skip (avoid duplicate enqueue) - _Bool was_pending = atomic_exchange_explicit(&page->in_remote_pending, true, memory_order_acq_rel); - if (was_pending) { - return; // Already enqueued, nothing to do - } - - atomic_fetch_add(&g_mf2_pending_enqueued, 1); - - // Push to owner's pending stack (Treiber stack algorithm) - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&owner_tp->pages_remote_pending[page->class_idx], memory_order_relaxed); - page->next_pending = (MidPage*)old_head; - } while (!atomic_compare_exchange_weak_explicit( - &owner_tp->pages_remote_pending[page->class_idx], - &old_head, (uintptr_t)page, - memory_order_release, // Publish page - memory_order_relaxed)); - - // 0→1 detection: Increment adoptable count for this class - // This enables O(1) early return in try_adopt (if count==0, no scan needed) - if (old_head == 0) { - atomic_fetch_add_explicit(&g_adoptable_count[page->class_idx], 1, memory_order_relaxed); - } -} - -// Dequeue one page from pending queue (called by owner thread or adopter) -// Uses CAS for correctness (multi-consumer in adoption path) -static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_acquire); - if (old_head == 0) { - return NULL; // Queue empty - } - MidPage* page = (MidPage*)old_head; - - // CAS to pop head - if (atomic_compare_exchange_weak_explicit( - &tp->pages_remote_pending[class_idx], - &old_head, (uintptr_t)page->next_pending, - memory_order_acq_rel, memory_order_relaxed)) { - // Successfully dequeued - MidPage* next = page->next_pending; - page->next_pending = NULL; // Clear link - - // If queue became empty (next==NULL), decrement adoptable count - // This enables O(1) early return in try_adopt when all queues empty - if (next == NULL) { - atomic_fetch_sub_explicit(&g_adoptable_count[class_idx], 1, memory_order_relaxed); - } - - return page; - } - } while (1); -} - -// =========================================================================== -// End of Pending Queue Operations -// =========================================================================== - -// Forward declarations -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id); - -// =========================================================================== -// Helper Functions (Clean & Modular) -// =========================================================================== - -// Helper: Make page active (move old active to full_pages) -static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return; - - // Move old active page to full_pages (if any) - if (tp->active_page[class_idx]) { - MidPage* old_active = tp->active_page[class_idx]; - old_active->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_active; - } - - // Set new page as active - tp->active_page[class_idx] = page; - page->next_page = NULL; -} - -// Helper: Drain page and add to partial list (LIFO for cache locality) -// Returns true if page has free blocks after drain -static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, add to partial list (LIFO) - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - page->next_page = 
tp->partial_pages[class_idx]; - tp->partial_pages[class_idx] = page; - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Drain page and activate if successful (Direct Handoff - backward compat) -// Returns true if page was activated -static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, make it active immediately - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - mf2_make_page_active(tp, class_idx, page); - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1) -// Returns true if a page was successfully drained and activated -static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - // Budget: Process up to N pages to avoid blocking - for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) { - MidPage* pending_page = mf2_dequeue_pending(tp, class_idx); - if (!pending_page) break; // Queue empty - - atomic_fetch_add(&g_mf2_pending_drained, 1); - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release); - - // DIRECT HANDOFF: Drain and activate if successful - if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) { - return true; // Success! Page is now active - } - // No freelist after drain, page returned to full_pages by helper - } - return false; // No pages available for reuse -} - -// Helper: Try to drain remotes from active page (must-reuse gate part 2) -// Returns true if active page has freelist after drain -static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - MidPage* page = tp->active_page[class_idx]; - if (!page) return false; - - atomic_fetch_add(&g_mf2_slow_checked_drain, 1); - unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - - if (remote_cnt > 0) { - atomic_fetch_add(&g_mf2_slow_found_remote, 1); - int drained = mf2_drain_remote_frees(page); - if (drained > 0 && page->freelist) { - atomic_fetch_add(&g_mf2_drain_success, 1); - return true; // Success! 
Active page now has freelist - } - } - return false; // No remotes or drain failed -} - -// Helper: Allocate new page and make it active -// Returns the newly allocated page (or NULL on OOM) -static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - atomic_fetch_add(&g_mf2_new_page_count, 1); - - // DEBUG: Log why we're allocating new page (first N samples) - static _Atomic int new_page_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed); - if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) { - // Count adoptable pages across all threads - int total_adoptable = 0; - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed); - } - MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p", - sample_idx, class_idx, - (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed), - total_adoptable, - tp->active_page[class_idx], - tp->full_pages[class_idx]); - } - - MidPage* page = mf2_alloc_new_page(class_idx); - if (!page) { - return NULL; // OOM - } - - // Move current active page to full list (if any) - if (tp->active_page[class_idx]) { - MidPage* old_page = tp->active_page[class_idx]; - old_page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_page; - } - - // Set new page as active - tp->active_page[class_idx] = page; - tp->page_count[class_idx]++; - - return page; -} - -// =========================================================================== -// End of Helper Functions -// =========================================================================== - -// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue -// Returns true if a page was successfully adopted and activated -// Called from alloc_slow when allocating thread needs memory -static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) { - if (!me) return false; - - // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating) - // Avoids scanning empty queues (major performance win!) - int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed); - if (adoptable == 0) return false; // All queues empty, no scan needed - - // Get global thread registry - int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); - if (num_tp == 0) return false; - - // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES) - // Prevents excessive scanning overhead (2-8 threads is usually enough) - int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues; - - // Round-robin scan (limited number of threads, not ALL!) 
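One detail of the idle gate applied inside the scan loop below: the microsecond threshold is converted to TSC cycles with a fixed cycles-per-microsecond constant. A sketch of that conversion, assuming MF2_TSC_CYCLES_PER_US reflects the roughly 3 GHz approximation mentioned in the lease comment further down (about 3000 cycles per microsecond):

    #include <stdint.h>

    // Hypothetical stand-in for MF2_TSC_CYCLES_PER_US (~3 GHz core, roughly 3000 cycles per microsecond).
    #define EXAMPLE_TSC_CYCLES_PER_US 3000ULL

    // e.g. a 150 us threshold -> 150 * 3000 = 450000 cycles since last_alloc_tsc before an owner counts as idle.
    static inline uint64_t example_idle_threshold_cycles(uint64_t threshold_us) {
        return threshold_us * EXAMPLE_TSC_CYCLES_PER_US;
    }

The fixed constant is only an approximation of the real TSC rate, which is acceptable because the gate is a heuristic, not a correctness condition.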
- static _Atomic uint64_t adopt_counter = 0; - uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed); - - for (int i = 0; i < scan_limit; i++) { - int tp_idx = (start_idx + i) % num_tp; - MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit( - (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire); - - if (!other_tp) continue; - - // Route P: Idle Detection - Only adopt from idle owners - // Check if owner is still actively allocating (threshold configurable via env var) - uint64_t now_tsc = mf2_rdtsc(); - uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed); - uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US; - - if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) { - continue; // Owner still active, skip adoption - } - - // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!) - // Only one thread scans each queue at a time → eliminates CAS contention - if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) { - continue; // Another thread is already scanning this queue, skip - } - - // Try to dequeue a pending page from this thread - MidPage* page = mf2_dequeue_pending(other_tp, class_idx); - if (!page) { - // Queue empty, release claim and try next thread - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; - } - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&page->in_remote_pending, false, memory_order_release); - - // Check lease: Has enough time passed since last transfer? (configurable via HAKMEM_MF2_LEASE_MS) - // 0ms = disabled (no lease check), >0 = lease period in milliseconds - uint64_t now = mf2_rdtsc(); - uint64_t last_transfer = page->last_transfer_time; - if (g_mf2_lease_ms > 0 && last_transfer != 0) { - // Calculate lease cycles from ms (approx 3GHz CPU) - uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL); - if ((now - last_transfer) < lease_cycles) { - // Lease still active, return page to full_pages (don't thrash ownership) - page->next_page = other_tp->full_pages[class_idx]; - other_tp->full_pages[class_idx] = page; - // Release claim before continuing - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; // Try next thread - } - } - - // Try to transfer ownership using CAS - pthread_t old_owner = page->owner_tid; - pthread_t new_owner = pthread_self(); - - // Note: pthread_t may not be atomic-compatible on all platforms - // For now, we'll use a simple write (ownership transfer is rare) - // TODO: If thrashing is observed, add atomic CAS with serialization - page->owner_tid = new_owner; - page->owner_tp = me; - page->last_transfer_time = now; - - // DEBUG: Log drain state - static _Atomic int adopt_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed); - unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed); - unsigned int pre_free = page->free_count; - PoolBlock* pre_freelist = page->freelist; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // DEBUG: Log result (first 10 samples) - if (sample_idx < 10) { - MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p", - sample_idx, class_idx, pre_remote, drained, - 
pre_free, page->free_count, pre_freelist, page->freelist); - } - - // Make adopted page ACTIVE immediately (not partial!) - // Adoption needs immediate activation for caller's mf2_alloc_fast() - // Partial list is only for own pending queue drains - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - atomic_fetch_add(&g_mf2_pending_drained, 1); - atomic_fetch_add(&g_mf2_drain_success, 1); - - // Make it active (move old active to full_pages) - mf2_make_page_active(me, class_idx, page); - - // Release claim before returning SUCCESS - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - return true; // SUCCESS! Page adopted and activated - } - - // No freelist after drain, return to MY full_pages (I'm the new owner!) - page->next_page = me->full_pages[class_idx]; - me->full_pages[class_idx] = page; - // Release claim before continuing search - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - // Continue searching for a better page - } - - return false; // No adoptable pages found -} - -// Fast allocation path (owner thread, NO LOCK!) -static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) { - // Get thread-local page lists - MF2_ThreadPages* tp = mf2_thread_pages_get(); - if (!tp) return NULL; - - // Get active page for this class - MidPage* page = tp->active_page[class_idx]; - if (!page) { - // No active page, go to slow path - return mf2_alloc_slow(class_idx, size, site_id); - } - - // FAST PATH: Pop from page-local freelist (NO LOCK!) - if (page->freelist) { - atomic_fetch_add(&g_mf2_alloc_fast_hit, 1); - - // Route P: Update activity tracking for idle detection - atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); - - PoolBlock* block = page->freelist; - page->freelist = block->next; - page->free_count--; - - // Increment in-use count (atomic for cross-thread visibility) - atomic_fetch_add_explicit(&page->in_use, 1, memory_order_relaxed); - - // Return user pointer (skip header) - return (char*)block + HEADER_SIZE; - } - - // Local freelist empty, go to slow path - return mf2_alloc_slow(class_idx, size, site_id); -} - -// Slow allocation path (drain remote or allocate new page) -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id) { - (void)site_id; // Unused for now - - atomic_fetch_add(&g_mf2_alloc_slow_hit, 1); - - // Get thread-local page lists - MF2_ThreadPages* tp = mf2_thread_pages_get(); - if (!tp) return NULL; - - // =========================================================================== - // Allocation Strategy (Must-Reuse Order) - // =========================================================================== - // 1. MUST-REUSE GATE (Part 1): Drain own pending queue - // - Process up to 4 pages to avoid blocking - // - Direct handoff: activate first successful drain immediately - if (mf2_try_reuse_own_pending(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // 2. 
MUST-REUSE GATE (Part 2): Drain active page remotes - // - Check if current active page has remote frees - // - Drain and retry allocation if successful - if (mf2_try_drain_active_remotes(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // HISTORICAL NOTE: full_pages scan removed - // Old approach: Scan full_pages looking for pages with remotes - // Problem: Drained pages consumed before owner can scan them - // New approach: Direct Handoff immediately activates drained pages - // Result: full_pages scan always finds 0 pages (100% waste) - // - // Benchmark evidence (before removal): - // - Full scan checked: 1,879,484 pages - // - Full scan found: 0 pages (0% success rate!) - - // 3. Consumer-Driven Adoption (Route P with idle detection) - // - Only adopt from idle owners (haven't allocated in >150µs) - // - Prevents "adoption stealing" from active owners - if (mf2_try_adopt_pending(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // 4. MUST-REUSE GATE (Final): Allocate new page (last resort) - // - Only reached after exhausting all reuse opportunities - // - Order: pending queue → active drain → adoption → NEW - MidPage* page = mf2_alloc_and_activate_new_page(tp, class_idx); - if (!page) { - return NULL; // OOM - } - - // Retry allocation from new page - return mf2_alloc_fast(class_idx, size, site_id); -} - -// Forward declaration of slow free path -static void mf2_free_slow(MidPage* page, void* ptr); - -// Strategy A: Global Round-Robin Drain (Cross-Thread Pending Queue) -// Fast free path (owner thread, NO LOCK!) -static inline void mf2_free_fast(MidPage* page, void* ptr) { - if (!page || !ptr) return; - - atomic_fetch_add(&g_mf2_free_owner_count, 1); - - // Get block pointer (rewind to header) - PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); - - // FAST PATH: Push to page-local freelist (NO LOCK!) 
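Both free paths in this file hand fully-empty 64 KiB pages to hak_batch_add_page() for MADV_DONTNEED; the batching code itself is not part of this hunk, so the following is only a minimal illustration of the underlying advice (the VA range stays mapped while the kernel reclaims the physical pages):

    #include <stddef.h>
    #include <sys/mman.h>

    // Illustration only: the real path batches pages and issues the advice on page-aligned 64 KiB ranges.
    static inline int example_release_physical(void* page_base, size_t len) {
        // For private anonymous mappings, MADV_DONTNEED drops the backing pages; the next touch
        // refaults as zero-filled memory, so callers must treat the old contents as gone.
        return madvise(page_base, len, MADV_DONTNEED);
    }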
- block->next = page->freelist; - page->freelist = block; - page->free_count++; - - // Decrement in-use count (atomic for cross-thread visibility) - int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); - - // Check if page is now empty (all blocks free) - if (old_in_use == 1 && page->free_count == page->capacity) { - // Memory efficiency: Return empty pages to OS via MADV_DONTNEED - // Keeps VA mapped (no munmap), but releases physical memory - hak_batch_add_page(page->base, POOL_PAGE_SIZE); - } -} - -// Slow free path (cross-thread free to remote stack) -static void mf2_free_slow(MidPage* page, void* ptr) { - if (!page || !ptr) return; - - atomic_fetch_add(&g_mf2_free_remote_count, 1); - - // Get block pointer - PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); - - // Push to page's remote stack (lock-free MPSC) - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire); - block->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit( - &page->remote_head, &old_head, (uintptr_t)block, - memory_order_release, memory_order_relaxed)); - - // Increment remote count and detect threshold for enqueueing - unsigned int old_count = atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_seq_cst); - - // CRITICAL FIX: Use threshold-based enqueueing instead of 0→1 edge - // Problem: 0→1 causes ping-pong (drain 1 block → next free triggers 0→1 again) - // Solution: Only enqueue when remotes accumulate to threshold (better batching) - // - // Threshold values (configurable via HAKMEM_MF2_ENQUEUE_THRESHOLD, default=4): - // 1 = immediate (0→1 edge, causes ping-pong) - // 4 = balanced (batch 4 blocks before notifying owner) - // 8 = aggressive batching (higher latency, but better efficiency) - // - // We enqueue on transitions TO the threshold (old_count == threshold-1) - static int g_enqueue_threshold = 1; // 1=immediate (0→1 edge), 2=batch-2, 4=batch-4 - if (old_count + 1 == (unsigned int)g_enqueue_threshold) { - // Remote count just reached threshold, notify owner - if (page->owner_tp) { - mf2_enqueue_pending(page->owner_tp, page); - } - } - - // DEBUG: Sample first 10 remote frees - Disabled for performance - // static _Atomic int remote_free_samples = 0; - // int sample = atomic_fetch_add_explicit(&remote_free_samples, 1, memory_order_relaxed); - // if (sample < 10) { - // fprintf(stderr, "[REMOTE_FREE %d] ptr=%p → page=%p (base=%p), remote_count=%u (was %u), EDGE=%s\n", - // sample, ptr, page, page->base, old_count + 1, old_count, (old_count == 0) ? "YES" : "NO"); - // } - - // Decrement in-use count - int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); - - // Check if page is now empty (FIX #6: acquire to see all remote frees) - if (old_in_use == 1 && page->free_count + atomic_load_explicit(&page->remote_count, memory_order_acquire) >= page->capacity) { - // Memory efficiency: Return empty pages to OS via MADV_DONTNEED - // Keeps VA mapped (no munmap), but releases physical memory - hak_batch_add_page(page->base, POOL_PAGE_SIZE); - } -} - -// Top-level free dispatcher -static void mf2_free(void* ptr) { - if (!ptr) return; - - // O(1) page lookup (mimalloc's magic!) 
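A note on the enqueue threshold in mf2_free_slow above: the block comment describes HAKMEM_MF2_ENQUEUE_THRESHOLD with a default of 4, but both copies of the function in this diff hard-code a static threshold of 1 and never read the environment. If the env override is ever wired up, it would presumably look something like the following hypothetical helper (not present in the source):

    #include <stdlib.h>

    // Hypothetical one-time parse of HAKMEM_MF2_ENQUEUE_THRESHOLD; the code in this diff currently
    // behaves as if this always returned 1. Benign race: concurrent first calls parse the same value twice.
    static int example_enqueue_threshold(void) {
        static int cached = 0;
        if (cached == 0) {
            const char* s = getenv("HAKMEM_MF2_ENQUEUE_THRESHOLD");
            int v = s ? atoi(s) : 1;
            if (v < 1) v = 1;
            if (v > 64) v = 64;
            cached = v;
        }
        return cached;
    }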
- MidPage* page = mf2_addr_to_page(ptr); - if (!page) { - // Not a MF2 page (shouldn't happen if MF2 is enabled properly) - return; - } - - // Check if we're the owner (fast path) - MF2_ThreadPages* tp = mf2_thread_pages_get(); - - if (tp && page->owner_tid == tp->my_tid) { - // Fast: Owner thread, push to local freelist (NO LOCK!) - mf2_free_fast(page, ptr); - } else { - // Slow: Cross-thread free, push to remote stack (lock-free) - mf2_free_slow(page, ptr); - } -} - -// =========================================================================== -// Global pool state (simplified: single-threaded for MVP) -static struct { - PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Locks: per (class, shard) freelist to allow concurrent operations - PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Non-empty bitmap (O(1) empty class skip) - // Bit i = 1 if freelist[class][shard] is non-empty - // Use atomic to avoid class-wide locks - atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES]; // 1 bit per shard - - // Remote-free MPSC stacks per (class, shard): lock-free producers, drained under lock on alloc - atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - atomic_uint remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Statistics - uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t total_bytes_allocated __attribute__((aligned(64))); - uint64_t total_pages_allocated __attribute__((aligned(64))); - - // Per-class page accounting (for Soft CAP guidance) - uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64))); - - // ACE: per-class bundle factor for refill (1..4) + last snapshot - int bundle_factor[POOL_NUM_CLASSES]; - uint64_t last_hits[POOL_NUM_CLASSES]; - uint64_t last_misses[POOL_NUM_CLASSES]; - - int initialized; - int tls_free_enabled; // env: HAKMEM_POOL_TLS_FREE (default: 1) - - // Extra metrics (for learner logging): all relaxed atomics - atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64))); - atomic_uint_fast64_t trylock_success __attribute__((aligned(64))); - atomic_uint_fast64_t ring_underflow __attribute__((aligned(64))); -} g_pool; - -static int g_wrap_l2_enabled = 0; // env: HAKMEM_WRAP_L2=1 to allow in wrappers -static int g_shard_mix_enabled = 0; // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing -static int g_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING=1 to enable TLS ring -static int g_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES (1..8) -static int g_ring_return_div = 2; // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third) -static int g_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX (LIFO size cap) -int g_hdr_light_enabled = 0; // env: HAKMEM_HDR_LIGHT=1 (minimize extra fields), =2 (no header writes/validation) -static int g_pool_min_bundle = 2; // env: HAKMEM_POOL_MIN_BUNDLE (default 2) -// Sampled counter updates to reduce hot-path stores: 1/2^k -static int g_count_sample_exp = 10; // env: HAKMEM_POOL_COUNT_SAMPLE (0..16) -static __thread uint32_t t_pool_rng = 0x243f6a88u; // per-thread RNG for sampling - -// Size class table (for O(1) lookup). Index 5/6 are Bridge classes for 32-64KB gap. 
-// 7 classes including Bridge classes (40KB, 52KB) to fill 32-64KB gap -static size_t g_class_sizes[POOL_NUM_CLASSES] = { - POOL_CLASS_2KB, // 2 KB - POOL_CLASS_4KB, // 4 KB - POOL_CLASS_8KB, // 8 KB - POOL_CLASS_16KB, // 16 KB - POOL_CLASS_32KB, // 32 KB - POOL_CLASS_40KB, // 40 KB (Bridge class 0) - POOL_CLASS_52KB // 52 KB (Bridge class 1) -}; - -// Blocks per page (for each class) -__attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = { - POOL_PAGE_SIZE / POOL_CLASS_2KB, // 32 blocks (2KiB) - POOL_PAGE_SIZE / POOL_CLASS_4KB, // 16 blocks (4KiB) - POOL_PAGE_SIZE / POOL_CLASS_8KB, // 8 blocks (8KiB) - POOL_PAGE_SIZE / POOL_CLASS_16KB, // 4 blocks (16KiB) - POOL_PAGE_SIZE / POOL_CLASS_32KB, // 2 blocks (32KiB) - POOL_PAGE_SIZE / POOL_CLASS_40KB, // 1 block (40KiB Bridge) - POOL_PAGE_SIZE / POOL_CLASS_52KB // 1 block (52KiB Bridge) -}; - -// =========================================================================== -// Helper Functions -// =========================================================================== - -// Write minimal header for Mid allocation (fast-return friendly) -static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) { - // For Mid, prefer headerless operation when HDR_LIGHT>=1. - // Debug or non-Mid callers can still write full headers elsewhere. - if (g_hdr_light_enabled >= 1) return; // skip header on alloc hot path - hdr->magic = HAKMEM_MAGIC; - hdr->method = ALLOC_METHOD_POOL; - hdr->size = class_sz; - if (!g_hdr_light_enabled) { - hdr->alloc_site = site_id; - hdr->class_bytes = 0; - hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self(); - } -} - -// Branchless LUT (Lookup Table) for O(1) class determination -// Expanded to 53 entries for Bridge classes (40KB, 52KB) -static const uint8_t SIZE_TO_CLASS[53] = { - 0,0,0, // 0-2KB → Class 0 - 1,1, // 3-4KB → Class 1 - 2,2,2,2, // 5-8KB → Class 2 - 3,3,3,3,3,3,3,3, // 9-16KB → Class 3 - 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, // 17-32KB → Class 4 - 5,5,5,5,5,5,5,5, // 33-40KB → Class 5 (Bridge class 0) - 6,6,6,6,6,6,6,6,6,6,6,6 // 41-52KB → Class 6 (Bridge class 1) -}; - -// Get size class index from size (0-6, or -1 if out of range) -// Updated range check for Bridge classes (0-52KB) -static inline int hak_pool_get_class_index(size_t size) { - // Fast path: exact match against configured class sizes (covers Bridge classes) - // Note: size passed here should already be a rounded class size from ACE. - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - size_t cs = g_class_sizes[i]; - if (cs != 0 && size == cs) return i; - } - // Fallback: map arbitrary size to nearest fixed class range via LUT (legacy behavior) - uint32_t kb = (uint32_t)((size + 1023) >> 10); // Round up to KB units - return (kb < 53) ? 
SIZE_TO_CLASS[kb] : -1; // Expanded to 53KB for Bridge classes -} - -// Get shard index from site_id (0-63) -int hak_pool_get_shard_index(uintptr_t site_id) { - if (!g_shard_mix_enabled) { - // Legacy: Shift by 4 to reduce collision (instruction alignment) - return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1)); - } - // SplitMix64-like mixer with thread id salt for better dispersion - uint64_t x = (uint64_t)site_id; - uint64_t tid = (uint64_t)(uintptr_t)pthread_self(); - x ^= (tid << 1); - x += 0x9e3779b97f4a7c15ULL; - x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; - x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; - x = (x ^ (x >> 31)); - return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1)); -} - -// TLS helpers -#include "box/pool_tls_core.inc.h" - - -// Refill/ACE (boxed) -#include "box/pool_refill.inc.h" - -// Init/Shutdown + MF2 debug (boxed) -#include "box/pool_init_api.inc.h" - -// Pool statistics (boxed) -#include "box/pool_stats.inc.h" - -// Public API (boxed): alloc/free/lookup/free_fast -#include "box/pool_api.inc.h" diff --git a/core/tiny_alloc_fast.inc.h b/core/tiny_alloc_fast.inc.h index 6e6aa758..080356db 100644 --- a/core/tiny_alloc_fast.inc.h +++ b/core/tiny_alloc_fast.inc.h @@ -612,11 +612,11 @@ static inline void* tiny_alloc_fast(size_t size) { if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) { // For classes 0..3 keep ultra-inline POP; for >=4 use safe Box POP to avoid UB on bad heads. if (class_idx <= 3) { -#if defined(HAKMEM_TINY_INLINE_SLL) && HAKMEM_TINY_AGGRESSIVE_INLINE - // Experimental: Use inline SLL pop macro (enable via HAKMEM_TINY_INLINE_SLL=1) +#if HAKMEM_TINY_INLINE_SLL + // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1) TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr); #else - // Default: Safe Box API (bypasses inline SLL when Front-Direct) + // Default: Safe Box API (Box TLS-SLL) for all standard builds ptr = tiny_alloc_fast_pop(class_idx); #endif } else { @@ -656,11 +656,11 @@ static inline void* tiny_alloc_fast(size_t size) { // Skip SLL retry if Front-Direct OR SLL disabled if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) { if (class_idx <= 3) { -#if defined(HAKMEM_TINY_INLINE_SLL) && HAKMEM_TINY_AGGRESSIVE_INLINE - // Experimental: Use inline SLL pop macro (enable via HAKMEM_TINY_INLINE_SLL=1) +#if HAKMEM_TINY_INLINE_SLL + // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1) TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr); #else - // Default: Safe Box API (bypasses inline SLL when Front-Direct) + // Default: Safe Box API (Box TLS-SLL) for all standard builds ptr = tiny_alloc_fast_pop(class_idx); #endif } else {
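On the guard change in the two tiny_alloc_fast.inc.h hunks above: with a plain #if, an identifier that is not defined evaluates to 0, so builds that never pass -DHAKMEM_TINY_INLINE_SLL=1 fall through to the safe Box API pop, and the inline path no longer depends on HAKMEM_TINY_AGGRESSIVE_INLINE at all. A tiny illustrative sketch of how the new guard resolves (example macro only):

    // Illustrative only: how the new guard resolves for different build flags.
    //   (flag absent)               -> HAKMEM_TINY_INLINE_SLL is treated as 0 in #if -> Box API pop
    //   -DHAKMEM_TINY_INLINE_SLL=0  -> 0 -> Box API pop
    //   -DHAKMEM_TINY_INLINE_SLL=1  -> 1 -> inline SLL pop (A/B experiment)
    #if HAKMEM_TINY_INLINE_SLL
    #  define EXAMPLE_TINY_POP "inline-sll"
    #else
    #  define EXAMPLE_TINY_POP "box-api"
    #endif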