// ============================================================================
// hakmem_pool.c - L2 Hybrid Pool Implementation (Mid-Size: 2-32KiB)
// ============================================================================
//
// Size class definitions:
// ┌──────────┬──────────┬──────────────┬──────────────┐
// │ Class    │ Size     │ Initial CAP  │ Page layout  │
// ├──────────┼──────────┼──────────────┼──────────────┤
// │ Class 0  │  2 KiB   │ 64 pages     │ 32 blocks/p  │
// │ Class 1  │  4 KiB   │ 64 pages     │ 16 blocks/p  │
// │ Class 2  │  8 KiB   │ 64 pages     │  8 blocks/p  │
// │ Class 3  │ 16 KiB   │ 32 pages     │  4 blocks/p  │
// │ Class 4  │ 32 KiB   │ 16 pages     │  2 blocks/p  │
// │ DYN1     │  6 KiB*  │ 0 (disabled) │ variable     │
// │ DYN2     │ (unused) │ 0 (disabled) │ variable     │
// └──────────┴──────────┴──────────────┴──────────────┘
// * DYN1 is a dynamic class used to fill the gap (8-16KB)
//
// W_MAX (round-up tolerance factor):
// - Meaning: how large a class may be relative to the requested size
// - Default: 1.40 (allows up to 40% round-up)
// - Example: 3 KiB request → 4 KiB class is OK (1.33x < 1.40)
// - Env var: HAKMEM_WMAX_MID=1.6 to override
//
// CAP (inventory):
// - Meaning: maximum number of pages kept per class
// - Initial: {64,64,64,32,16} - conservative (footprint-first)
// - Recommended: {256,256,256,128,64} - performance-first
// - Env var: HAKMEM_CAP_MID=256,256,256,128,64
// - Learning mode: HAKMEM_LEARN=1 enables automatic tuning
//
// TLS ring structure:
// - POOL_L2_RING_CAP: ring buffer capacity (default 16)
// - ActivePage A/B: bump-run scheme (lock-free)
// - LIFO overflow: blocks that spill out of the ring
//
// Performance tuning:
// 1. Quadruple the initial CAP: HAKMEM_CAP_MID=256,256,256,128,64
// 2. Relax W_MAX:               HAKMEM_WMAX_MID=1.6
// 3. Enable DYN1:               HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64
// 4. Learning mode:             HAKMEM_LEARN=1
//
// (Illustrative, non-compiled sketches of the W_MAX rule and the core MF2
//  mechanisms appear at the end of this file.)
//
// License: MIT
// Last Updated: 2025-10-26 (code cleanup complete)

#include "hakmem_pool.h"
#include "hakmem_config.h"
#include "hakmem_internal.h"  // For AllocHeader and HAKMEM_MAGIC
#include "hakmem_syscall.h"   // Box 3 syscall layer (bypasses LD_PRELOAD)

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>
#include <pthread.h>
#include <sys/mman.h>
#include <time.h>

#include "hakmem_prof.h"
#include "hakmem_policy.h"    // FrozenPolicy caps (Soft CAP gating)
#include "hakmem_debug.h"

// False sharing mitigation: padded mutex type (64B)
typedef struct {
    pthread_mutex_t m;
    char _pad[64 - (sizeof(pthread_mutex_t) % 64)];
} PaddedMutex;

// ===========================================================================
// Internal Data Structures
// ===========================================================================
#include "box/pool_tls_types.inc.h"

// Mid page descriptor registry (64KiB pages → {class_idx, owner_tid})
#include "box/pool_mid_desc.inc.h"

// ---------------- Transfer Cache (per-thread per-class inbox) --------------
#include "box/pool_mid_tc.inc.h"

// ===========================================================================
// MF2 Per-Page Sharding: Mimalloc-Inspired Architecture
// ===========================================================================
//
// Key idea: Each 64KB page has independent freelist (no sharing!)
// - O(1) page lookup from block address: (addr & ~0xFFFF) // - Owner thread: fast path (no locks, no atomics) // - Cross-thread free: lock-free remote stack // - Expected: +50% (13.78 → 20.7 M/s, 60-75% of mimalloc) // MF2 Configuration Constants (Quick Win #5) #define MF2_PENDING_QUEUE_BUDGET 4 // Max pages to drain from pending queue #define MF2_DEBUG_SAMPLE_COUNT 20 // Number of debug samples to log #define MF2_TSC_CYCLES_PER_US 3000 // Estimated TSC cycles per microsecond #define MF2_PAGE_SIZE_SHIFT 16 // log2(64KB) for fast division #define MF2_PAGE_ALIGNMENT 65536 // 64KB alignment for mmap // Debug Logging Macros (Quick Win #6) // Conditional compilation for debug logs - set HAKMEM_DEBUG_MF2=1 to enable #ifdef HAKMEM_DEBUG_MF2 #define MF2_DEBUG_LOG(fmt, ...) fprintf(stderr, "[MF2] " fmt "\n", ##__VA_ARGS__) #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) #else #define MF2_DEBUG_LOG(fmt, ...) ((void)0) #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) #endif // Forward declarations static size_t g_class_sizes[POOL_NUM_CLASSES]; // MF2 Page descriptor: per-page metadata (one per 64KB page) typedef struct MidPage { // Page identity void* base; // Page base address (64KB aligned) uint8_t class_idx; // Size class index (0-6) uint8_t flags; // Page flags (reserved for future use) uint16_t _pad0; // Ownership pthread_t owner_tid; // Owner thread ID (for fast-path check) struct MF2_ThreadPages* owner_tp; // Owner thread's heap (for pending queue access) uint64_t last_transfer_time; // rdtsc() timestamp of last ownership transfer (lease mechanism) // Page-local freelist (owner-only, NO LOCK!) PoolBlock* freelist; // Local freelist head uint16_t free_count; // Number of free blocks uint16_t capacity; // Total blocks per page // Remote frees (cross-thread, lock-free MPSC stack) atomic_uintptr_t remote_head; // Lock-free remote free stack atomic_uint remote_count; // Remote free count (for quick check) // Lifecycle atomic_int in_use; // Live allocations on this page atomic_int pending_dn; // DONTNEED enqueued flag // Linkage (thread-local page lists) struct MidPage* next_page; // Next page in thread's list struct MidPage* prev_page; // Previous page in thread's list // Pending queue (remote drain notification) _Atomic(_Bool) in_remote_pending; // Is this page in pending queue? struct MidPage* next_pending; // Next page in pending queue // Padding to cache line boundary (avoid false sharing) char _pad[64 - ((sizeof(void*) * 5 + sizeof(PoolBlock*) + sizeof(uint16_t) * 2 + sizeof(atomic_uintptr_t) + sizeof(atomic_uint) + sizeof(atomic_int) * 2 + sizeof(pthread_t) + sizeof(_Atomic(_Bool)) + 4) % 64)]; } MidPage; // Page registry: O(1) lookup from block address // Use direct indexing: (addr >> 16) & MASK #define MF2_PAGE_REGISTRY_BITS 16 // 64K entries (4GB address space with 64KB pages) #define MF2_PAGE_REGISTRY_SIZE (1 << MF2_PAGE_REGISTRY_BITS) #define MF2_PAGE_REGISTRY_MASK (MF2_PAGE_REGISTRY_SIZE - 1) typedef struct { // Direct-mapped page table (no hash collisions!) 
MidPage* pages[MF2_PAGE_REGISTRY_SIZE]; // Coarse-grained locks for rare updates (page alloc/free) // 256 locks = 256-way parallelism for page registration pthread_mutex_t locks[256]; // Statistics atomic_uint_fast64_t total_pages; // Total pages allocated atomic_uint_fast64_t active_pages; // Pages with live allocations } MF2_PageRegistry; // Thread-local page lists (one list per size class) typedef struct MF2_ThreadPages { // Active pages (have free blocks) MidPage* active_page[POOL_NUM_CLASSES]; // Partial pages (drained pages with free blocks, LIFO for cache locality) // Checked before allocating new pages (fast reuse path) MidPage* partial_pages[POOL_NUM_CLASSES]; // Full pages (no free blocks, but may receive remote frees) // TODO: Gradually deprecate in favor of partial_pages MidPage* full_pages[POOL_NUM_CLASSES]; // Pending queue (pages with remote frees, MPSC lock-free stack) atomic_uintptr_t pages_remote_pending[POOL_NUM_CLASSES]; // Pending claim flags (prevent multi-consumer CAS thrashing) // One adopter at a time per queue (test_and_set to claim, clear to release) atomic_flag pending_claim[POOL_NUM_CLASSES]; // Page ownership count (for statistics) uint32_t page_count[POOL_NUM_CLASSES]; // Thread identity (cached for fast comparison) pthread_t my_tid; // Route P: Activity tracking for idle-based adoption // Updated on every allocation (mf2_alloc_fast) // Read by adopters to check if owner is idle atomic_uint_fast64_t last_alloc_tsc; } MF2_ThreadPages; // Global page registry (shared, rarely accessed) static MF2_PageRegistry g_mf2_page_registry; // Thread-local page lists (hot path, no sharing!) static __thread MF2_ThreadPages* t_mf2_pages = NULL; // =========================================================================== // MF2 Global State (Quick Win #3b - Structured Globals) // =========================================================================== // Individual globals replaced with structured state below. // Old declarations removed, replaced with macro-mapped struct instances. 
// // Benefits: // - Logical grouping (config, registry, stats) // - Better documentation // - Easier to extend or refactor // - Single source of truth for each category #define MF2_MAX_THREADS 256 // MF2 Configuration (environment variables) typedef struct { int enabled; // HAKMEM_MF2_ENABLE (0=disabled, 1=enabled) int max_queues; // HAKMEM_MF2_MAX_QUEUES (default: 2) int lease_ms; // HAKMEM_MF2_LEASE_MS (default: 10ms, 0=disabled) int idle_threshold_us; // HAKMEM_MF2_IDLE_THRESHOLD_US (default: 150µs) } MF2_Config; // MF2 Thread Registry (cross-thread coordination) typedef struct { MF2_ThreadPages* all_thread_pages[MF2_MAX_THREADS]; // Global registry _Atomic int num_thread_pages; // Active thread count _Atomic int adoptable_count[POOL_NUM_CLASSES]; // Non-empty pending queues pthread_key_t tls_key; // Thread-local storage key pthread_once_t key_once; // TLS initialization guard } MF2_Registry; // MF2 Statistics (debug instrumentation) typedef struct { // Allocation path atomic_uint_fast64_t alloc_fast_hit; atomic_uint_fast64_t alloc_slow_hit; atomic_uint_fast64_t page_reuse_count; atomic_uint_fast64_t new_page_count; // Free path atomic_uint_fast64_t free_owner_count; atomic_uint_fast64_t free_remote_count; // Drain operations atomic_uint_fast64_t drain_count; atomic_uint_fast64_t drain_blocks; atomic_uint_fast64_t drain_attempts; atomic_uint_fast64_t drain_success; atomic_uint_fast64_t slow_checked_drain; atomic_uint_fast64_t slow_found_remote; // Full page scan (obsolete, kept for historical tracking) atomic_uint_fast64_t full_scan_checked; atomic_uint_fast64_t full_scan_found_remote; atomic_uint_fast64_t eager_drain_scanned; atomic_uint_fast64_t eager_drain_found; // Pending queue atomic_uint_fast64_t pending_enqueued; atomic_uint_fast64_t pending_drained; atomic_uint_fast64_t pending_requeued; } MF2_Stats; // Instantiate structured global state (Quick Win #3b) static MF2_Config g_mf2_config = { .enabled = 0, // Will be set by env var .max_queues = 2, .lease_ms = 10, .idle_threshold_us = 150 }; static MF2_Registry g_mf2_registry = { .all_thread_pages = {0}, .num_thread_pages = 0, .adoptable_count = {0}, .tls_key = 0, .key_once = PTHREAD_ONCE_INIT }; static MF2_Stats g_mf2_stats = { // All fields initialized to 0 (atomic zero-initialization is valid) .alloc_fast_hit = 0, .alloc_slow_hit = 0, .page_reuse_count = 0, .new_page_count = 0, .free_owner_count = 0, .free_remote_count = 0, .drain_count = 0, .drain_blocks = 0, .drain_attempts = 0, .drain_success = 0, .slow_checked_drain = 0, .slow_found_remote = 0, .full_scan_checked = 0, .full_scan_found_remote = 0, .eager_drain_scanned = 0, .eager_drain_found = 0, .pending_enqueued = 0, .pending_drained = 0, .pending_requeued = 0 }; // Compatibility macros: Map old global names to struct fields // This allows existing code to work unchanged while using structured state #define g_mf2_enabled (g_mf2_config.enabled) #define g_mf2_max_queues (g_mf2_config.max_queues) #define g_mf2_lease_ms (g_mf2_config.lease_ms) #define g_mf2_idle_threshold_us (g_mf2_config.idle_threshold_us) #define g_all_thread_pages (g_mf2_registry.all_thread_pages) #define g_num_thread_pages (g_mf2_registry.num_thread_pages) #define g_adoptable_count (g_mf2_registry.adoptable_count) #define g_mf2_tls_key (g_mf2_registry.tls_key) #define g_mf2_key_once (g_mf2_registry.key_once) #define g_mf2_alloc_fast_hit (g_mf2_stats.alloc_fast_hit) #define g_mf2_alloc_slow_hit (g_mf2_stats.alloc_slow_hit) #define g_mf2_page_reuse_count (g_mf2_stats.page_reuse_count) #define 
g_mf2_new_page_count (g_mf2_stats.new_page_count) #define g_mf2_free_owner_count (g_mf2_stats.free_owner_count) #define g_mf2_free_remote_count (g_mf2_stats.free_remote_count) #define g_mf2_drain_count (g_mf2_stats.drain_count) #define g_mf2_drain_blocks (g_mf2_stats.drain_blocks) #define g_mf2_drain_attempts (g_mf2_stats.drain_attempts) #define g_mf2_drain_success (g_mf2_stats.drain_success) #define g_mf2_slow_checked_drain (g_mf2_stats.slow_checked_drain) #define g_mf2_slow_found_remote (g_mf2_stats.slow_found_remote) #define g_mf2_full_scan_checked (g_mf2_stats.full_scan_checked) #define g_mf2_full_scan_found_remote (g_mf2_stats.full_scan_found_remote) #define g_mf2_eager_drain_scanned (g_mf2_stats.eager_drain_scanned) #define g_mf2_eager_drain_found (g_mf2_stats.eager_drain_found) #define g_mf2_pending_enqueued (g_mf2_stats.pending_enqueued) #define g_mf2_pending_drained (g_mf2_stats.pending_drained) #define g_mf2_pending_requeued (g_mf2_stats.pending_requeued) // =========================================================================== // End of MF2 Data Structures // =========================================================================== // --- MF2 Initialization Functions --- // Thread-safe initialization using pthread_once static pthread_once_t mf2_page_registry_init_control = PTHREAD_ONCE_INIT; static void mf2_page_registry_init_impl(void) { // Initialize all page slots to NULL memset(&g_mf2_page_registry, 0, sizeof(g_mf2_page_registry)); // Initialize 256 coarse-grained locks for registry updates for (int i = 0; i < 256; i++) { pthread_mutex_init(&g_mf2_page_registry.locks[i], NULL); } // Initialize counters atomic_store(&g_mf2_page_registry.total_pages, 0); atomic_store(&g_mf2_page_registry.active_pages, 0); } static void mf2_page_registry_init(void) { pthread_once(&mf2_page_registry_init_control, mf2_page_registry_init_impl); } // Strategy A: ThreadPages destructor (cleanup on thread exit) static void mf2_thread_pages_destructor(void* arg) { MF2_ThreadPages* tp = (MF2_ThreadPages*)arg; if (!tp) return; // SAFETY: Don't remove from global registry or free memory // Reason: Causes "malloc(): unsorted double linked list corrupted" crashes // Tradeoff: Small memory leak (one ThreadPages struct per thread lifetime) // TODO: Investigate safe cleanup mechanism // Remove from global registry (DISABLED for safety) // for (int i = 0; i < atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); i++) { // if (atomic_load_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], memory_order_acquire) == (uintptr_t)tp) { // atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], 0, memory_order_release); // break; // } // } // Free all pages owned by this thread (DISABLED for safety) // hkm_libc_free(tp); (void)tp; // Suppress unused warning } // Strategy A: Initialize pthread_key (once only) static void mf2_init_tls_key(void) { pthread_key_create(&g_mf2_tls_key, mf2_thread_pages_destructor); } // Helper: rdtsc() - Read CPU timestamp counter (for Route P idle detection) static inline uint64_t mf2_rdtsc(void) { #if defined(__x86_64__) || defined(__i386__) uint32_t lo, hi; __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); return ((uint64_t)hi << 32) | lo; #else // Fallback for non-x86 architectures (use clock_gettime approximation) struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; #endif } static MF2_ThreadPages* mf2_thread_pages_get(void) { if (t_mf2_pages) return t_mf2_pages; // Initialize 
pthread_key (once only) pthread_once(&g_mf2_key_once, mf2_init_tls_key); // Allocate thread-local page lists MF2_ThreadPages* tp = (MF2_ThreadPages*)hkm_libc_calloc(1, sizeof(MF2_ThreadPages)); if (!tp) return NULL; // Initialize with current thread ID tp->my_tid = pthread_self(); // All page lists start empty (NULL) for (int c = 0; c < POOL_NUM_CLASSES; c++) { tp->active_page[c] = NULL; tp->full_pages[c] = NULL; atomic_store_explicit(&tp->pages_remote_pending[c], 0, memory_order_relaxed); atomic_flag_clear_explicit(&tp->pending_claim[c], memory_order_relaxed); tp->page_count[c] = 0; } // Route P: Initialize activity tracking atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); // Strategy A: Register in global array for round-robin drain int idx = atomic_fetch_add_explicit(&g_num_thread_pages, 1, memory_order_acq_rel); if (idx < MF2_MAX_THREADS) { atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[idx], (uintptr_t)tp, memory_order_release); // DEBUG: Log first 10 thread registrations - Disabled for performance // static _Atomic int reg_samples = 0; // int rs = atomic_fetch_add_explicit(®_samples, 1, memory_order_relaxed); // if (rs < 10) { // fprintf(stderr, "[TLS_REGISTER %d] tid=%lu, tp=%p, idx=%d\n", // rs, (unsigned long)tp->my_tid, tp, idx); // } } else { MF2_ERROR_LOG("Too many threads! MAX=%d", MF2_MAX_THREADS); } // Set pthread-specific data for destructor pthread_setspecific(g_mf2_tls_key, tp); t_mf2_pages = tp; return tp; } // --- MF2 Page Allocation & Lookup --- // O(1) page lookup from block address (mimalloc's secret sauce!) static inline MidPage* mf2_addr_to_page(void* addr) { // Step 1: Get page base address (64KB aligned) // 0xFFFF = 65535, ~0xFFFF clears bottom 16 bits void* page_base = (void*)((uintptr_t)addr & ~0xFFFFULL); // Step 2: Index into registry (direct-mapped, 64K entries) // (addr >> 16) extracts page index, & 0xFFFF wraps to registry size size_t idx = ((uintptr_t)page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); // Step 3: Direct lookup (no hash collision handling needed with 64K entries) MidPage* page = g_mf2_page_registry.pages[idx]; // ALIGNMENT VERIFICATION (Step 3) - Sample first 100 lookups static _Atomic int lookup_count = 0; // DEBUG: Disabled for performance // int count = atomic_fetch_add_explicit(&lookup_count, 1, memory_order_relaxed); // if (count < 100) { // int found = (page != NULL); // int match = (page && page->base == page_base); // fprintf(stderr, "[LOOKUP %d] addr=%p → page_base=%p → idx=%zu → found=%s", // count, addr, page_base, idx, found ? "YES" : "NO"); // if (page) { // fprintf(stderr, ", page->base=%p, match=%s", // page->base, match ? "YES" : "NO"); // } // fprintf(stderr, "\n"); // } // Validation: Ensure page base matches (handles potential collisions) if (page && page->base == page_base) { return page; } // Collision or not registered (shouldn't happen in normal operation) return NULL; } // Register a page in the global registry (called once per page allocation) static void mf2_register_page(MidPage* page) { if (!page) return; // Calculate registry index from page base size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); // ALIGNMENT VERIFICATION (Step 2) - DEBUG: Disabled for performance // static int register_count = 0; // if (register_count < 10) { // fprintf(stderr, "[REGISTER %d] Page %p → idx %zu (aligned=%s)\n", // register_count, page->base, idx, // (((uintptr_t)page->base & 0xFFFF) == 0) ? 
"YES" : "NO"); // register_count++; // } // Coarse-grained lock (256 locks for 64K entries = 256 entries/lock) int lock_idx = idx % 256; pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); // Check for collision (should be rare with 64K entries) if (g_mf2_page_registry.pages[idx] != NULL) { // Collision detected - this is a problem! // For MVP, we'll just log and overwrite (TODO: handle collisions properly) HAKMEM_LOG("[MF2] WARNING: Page registry collision at index %zu\n", idx); } // Register the page g_mf2_page_registry.pages[idx] = page; // Update counters atomic_fetch_add_explicit(&g_mf2_page_registry.total_pages, 1, memory_order_relaxed); atomic_fetch_add_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); } // Unregister a page from the global registry (called when returning page to OS) __attribute__((unused)) static void mf2_unregister_page(MidPage* page) { if (!page) return; size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); int lock_idx = idx % 256; pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); if (g_mf2_page_registry.pages[idx] == page) { g_mf2_page_registry.pages[idx] = NULL; atomic_fetch_sub_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); } pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); } // Allocate and initialize a new 64KB page for given size class static MidPage* mf2_alloc_new_page(int class_idx) { if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return NULL; // Get user size class (2KB, 4KB, 8KB, 16KB, 32KB) size_t user_size = g_class_sizes[class_idx]; if (user_size == 0) return NULL; // Dynamic class disabled // CRITICAL FIX: Each block needs HEADER_SIZE + user_size // The header stores metadata (AllocHeader), user_size is the usable space size_t block_size = HEADER_SIZE + user_size; // Step 1: Allocate 64KB page (aligned to 64KB boundary) // CRITICAL FIX #4: Must ensure 64KB alignment! // mmap() only guarantees 4KB alignment, breaking addr_to_page() lookup. // This caused 97% of frees to fail silently (fatal bug!) // // CRITICAL FIX: Use mmap() + alignment adjustment to avoid recursion! // Using wrapped posix_memalign with WRAP_L2=1 causes infinite recursion. // Allocate 2x size to allow alignment adjustment size_t alloc_size = POOL_PAGE_SIZE * 2; // 128KB void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (raw == MAP_FAILED) { return NULL; // OOM } // Find 64KB aligned address within allocation uintptr_t addr = (uintptr_t)raw; uintptr_t aligned = (addr + 0xFFFF) & ~0xFFFFULL; // Round up to 64KB boundary void* page_base = (void*)aligned; // Free unused prefix (if any) size_t prefix_size = aligned - addr; if (prefix_size > 0) { munmap(raw, prefix_size); } // Free unused suffix size_t suffix_offset = prefix_size + POOL_PAGE_SIZE; if (suffix_offset < alloc_size) { munmap((char*)raw + suffix_offset, alloc_size - suffix_offset); } // DEBUG: Log first few allocations static _Atomic int mmap_count = 0; int mc = atomic_fetch_add_explicit(&mmap_count, 1, memory_order_relaxed); if (mc < 5) { MF2_DEBUG_LOG("MMAP_ALLOC %d: raw=%p, aligned=%p, prefix=%zu, suffix=%zu", mc, raw, page_base, prefix_size, alloc_size - suffix_offset); } // ALIGNMENT VERIFICATION (Step 1) if (((uintptr_t)page_base & 0xFFFF) != 0) { MF2_ERROR_LOG("ALIGNMENT BUG: Page %p not 64KB aligned! 
(offset=%zu)", page_base, ((uintptr_t)page_base & 0xFFFF)); } // Zero-fill (required for posix_memalign) // Note: This adds ~15μs overhead, but is necessary for correctness memset(page_base, 0, POOL_PAGE_SIZE); // Step 2: Allocate MidPage descriptor MidPage* page = (MidPage*)hkm_libc_calloc(1, sizeof(MidPage)); if (!page) { // CRITICAL FIX: Use munmap for mmap-allocated memory munmap(page_base, POOL_PAGE_SIZE); return NULL; } // Step 3: Initialize page descriptor page->base = page_base; page->class_idx = (uint8_t)class_idx; page->flags = 0; page->owner_tid = pthread_self(); page->owner_tp = mf2_thread_pages_get(); // Store owner's ThreadPages for pending queue page->last_transfer_time = 0; // No transfer yet (lease mechanism) // Step 4: Build freelist chain (walk through page and link blocks) // Calculate how many blocks fit in 64KB page (including header overhead) size_t usable_size = POOL_PAGE_SIZE; size_t num_blocks = usable_size / block_size; page->capacity = (uint16_t)num_blocks; page->free_count = (uint16_t)num_blocks; // Build linked list of free blocks PoolBlock* freelist_head = NULL; PoolBlock* freelist_tail = NULL; for (size_t i = 0; i < num_blocks; i++) { char* block_addr = (char*)page_base + (i * block_size); PoolBlock* block = (PoolBlock*)block_addr; block->next = NULL; if (freelist_head == NULL) { freelist_head = block; freelist_tail = block; } else { freelist_tail->next = block; freelist_tail = block; } } page->freelist = freelist_head; // Step 5: Initialize remote stack (for cross-thread frees) atomic_store(&page->remote_head, (uintptr_t)0); atomic_store(&page->remote_count, 0); // Step 6: Initialize lifecycle counters atomic_store(&page->in_use, 0); // No blocks allocated yet atomic_store(&page->pending_dn, 0); // Step 7: Initialize linkage page->next_page = NULL; page->prev_page = NULL; // Initialize pending queue fields atomic_store_explicit(&page->in_remote_pending, false, memory_order_relaxed); page->next_pending = NULL; // Step 8: Register page in global registry mf2_register_page(page); return page; } // --- MF2 Allocation & Free Operations --- // Forward declarations static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page); // Drain remote frees (cross-thread) into page's local freelist // Called by owner thread when local freelist is empty static int mf2_drain_remote_frees(MidPage* page) { if (!page) return 0; atomic_fetch_add(&g_mf2_drain_attempts, 1); // Check if there are any remote frees (FIX #6: use seq_cst to ensure total ordering - DEBUG) unsigned int remote_count = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); if (remote_count == 0) { return 0; // Nothing to drain } // Atomically swap remote stack head with NULL (lock-free pop all) uintptr_t head = atomic_exchange_explicit(&page->remote_head, (uintptr_t)0, memory_order_acq_rel); if (!head) { atomic_store_explicit(&page->remote_count, 0, memory_order_release); return 0; // Race: someone else drained it } // Reset remote count (FIX #6: use release for future drain checks to see) atomic_store_explicit(&page->remote_count, 0, memory_order_release); // Walk the remote stack and count blocks int drained = 0; PoolBlock* cur = (PoolBlock*)head; PoolBlock* tail = NULL; while (cur) { drained++; tail = cur; cur = cur->next; } // Append remote stack to local freelist (splice in front for simplicity) if (tail) { tail->next = page->freelist; page->freelist = (PoolBlock*)head; page->free_count += drained; } atomic_fetch_add(&g_mf2_drain_count, 1); atomic_fetch_add(&g_mf2_drain_blocks, 
drained); // CRITICAL FIX: Check if new remotes arrived DURING drain // If so, re-enqueue to owner's pending queue (avoid losing remotes!) unsigned int post_drain_count = atomic_load_explicit(&page->remote_count, memory_order_acquire); if (post_drain_count >= 1 && page->owner_tp) { // Use same threshold as initial enqueue // New remotes arrived during drain, re-enqueue for next round // Note: This is safe because flag was cleared earlier mf2_enqueue_pending(page->owner_tp, page); } return drained; } // =========================================================================== // Pending Queue Operations (MPSC Lock-Free Stack) // =========================================================================== // Enqueue page to owner's pending queue (called by remote threads) // MPSC: Multiple producers (remote free threads), single consumer (owner) static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page) { if (!owner_tp || !page) return; // Already in pending? Skip (avoid duplicate enqueue) _Bool was_pending = atomic_exchange_explicit(&page->in_remote_pending, true, memory_order_acq_rel); if (was_pending) { return; // Already enqueued, nothing to do } atomic_fetch_add(&g_mf2_pending_enqueued, 1); // Push to owner's pending stack (Treiber stack algorithm) uintptr_t old_head; do { old_head = atomic_load_explicit(&owner_tp->pages_remote_pending[page->class_idx], memory_order_relaxed); page->next_pending = (MidPage*)old_head; } while (!atomic_compare_exchange_weak_explicit( &owner_tp->pages_remote_pending[page->class_idx], &old_head, (uintptr_t)page, memory_order_release, // Publish page memory_order_relaxed)); // 0→1 detection: Increment adoptable count for this class // This enables O(1) early return in try_adopt (if count==0, no scan needed) if (old_head == 0) { atomic_fetch_add_explicit(&g_adoptable_count[page->class_idx], 1, memory_order_relaxed); } } // Dequeue one page from pending queue (called by owner thread or adopter) // Uses CAS for correctness (multi-consumer in adoption path) static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) { if (!tp) return NULL; uintptr_t old_head; do { old_head = atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_acquire); if (old_head == 0) { return NULL; // Queue empty } MidPage* page = (MidPage*)old_head; // CAS to pop head if (atomic_compare_exchange_weak_explicit( &tp->pages_remote_pending[class_idx], &old_head, (uintptr_t)page->next_pending, memory_order_acq_rel, memory_order_relaxed)) { // Successfully dequeued MidPage* next = page->next_pending; page->next_pending = NULL; // Clear link // If queue became empty (next==NULL), decrement adoptable count // This enables O(1) early return in try_adopt when all queues empty if (next == NULL) { atomic_fetch_sub_explicit(&g_adoptable_count[class_idx], 1, memory_order_relaxed); } return page; } } while (1); } // =========================================================================== // End of Pending Queue Operations // =========================================================================== // Forward declarations static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id); // =========================================================================== // Helper Functions (Clean & Modular) // =========================================================================== // Helper: Make page active (move old active to full_pages) static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) { if (!tp || 
!page) return; // Move old active page to full_pages (if any) if (tp->active_page[class_idx]) { MidPage* old_active = tp->active_page[class_idx]; old_active->next_page = tp->full_pages[class_idx]; tp->full_pages[class_idx] = old_active; } // Set new page as active tp->active_page[class_idx] = page; page->next_page = NULL; } // Helper: Drain page and add to partial list (LIFO for cache locality) // Returns true if page has free blocks after drain static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) { if (!tp || !page) return false; // Drain remote frees int drained = mf2_drain_remote_frees(page); // If page has freelist after drain, add to partial list (LIFO) if (page->freelist) { atomic_fetch_add(&g_mf2_page_reuse_count, 1); page->next_page = tp->partial_pages[class_idx]; tp->partial_pages[class_idx] = page; return true; } // No freelist, return to full_pages page->next_page = tp->full_pages[class_idx]; tp->full_pages[class_idx] = page; return false; } // Helper: Drain page and activate if successful (Direct Handoff - backward compat) // Returns true if page was activated static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) { if (!tp || !page) return false; // Drain remote frees int drained = mf2_drain_remote_frees(page); // If page has freelist after drain, make it active immediately if (page->freelist) { atomic_fetch_add(&g_mf2_page_reuse_count, 1); mf2_make_page_active(tp, class_idx, page); return true; } // No freelist, return to full_pages page->next_page = tp->full_pages[class_idx]; tp->full_pages[class_idx] = page; return false; } // Helper: Try to reuse pages from own pending queue (must-reuse gate part 1) // Returns true if a page was successfully drained and activated static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) { if (!tp) return false; // Budget: Process up to N pages to avoid blocking for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) { MidPage* pending_page = mf2_dequeue_pending(tp, class_idx); if (!pending_page) break; // Queue empty atomic_fetch_add(&g_mf2_pending_drained, 1); // Clear pending flag (no longer in queue) atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release); // DIRECT HANDOFF: Drain and activate if successful if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) { return true; // Success! Page is now active } // No freelist after drain, page returned to full_pages by helper } return false; // No pages available for reuse } // Helper: Try to drain remotes from active page (must-reuse gate part 2) // Returns true if active page has freelist after drain static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) { if (!tp) return false; MidPage* page = tp->active_page[class_idx]; if (!page) return false; atomic_fetch_add(&g_mf2_slow_checked_drain, 1); unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); if (remote_cnt > 0) { atomic_fetch_add(&g_mf2_slow_found_remote, 1); int drained = mf2_drain_remote_frees(page); if (drained > 0 && page->freelist) { atomic_fetch_add(&g_mf2_drain_success, 1); return true; // Success! 
Active page now has freelist } } return false; // No remotes or drain failed } // Helper: Allocate new page and make it active // Returns the newly allocated page (or NULL on OOM) static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) { if (!tp) return NULL; atomic_fetch_add(&g_mf2_new_page_count, 1); // DEBUG: Log why we're allocating new page (first N samples) static _Atomic int new_page_samples = 0; int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed); if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) { // Count adoptable pages across all threads int total_adoptable = 0; for (int i = 0; i < POOL_NUM_CLASSES; i++) { total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed); } MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p", sample_idx, class_idx, (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed), total_adoptable, tp->active_page[class_idx], tp->full_pages[class_idx]); } MidPage* page = mf2_alloc_new_page(class_idx); if (!page) { return NULL; // OOM } // Move current active page to full list (if any) if (tp->active_page[class_idx]) { MidPage* old_page = tp->active_page[class_idx]; old_page->next_page = tp->full_pages[class_idx]; tp->full_pages[class_idx] = old_page; } // Set new page as active tp->active_page[class_idx] = page; tp->page_count[class_idx]++; return page; } // =========================================================================== // End of Helper Functions // =========================================================================== // Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue // Returns true if a page was successfully adopted and activated // Called from alloc_slow when allocating thread needs memory static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) { if (!me) return false; // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating) // Avoids scanning empty queues (major performance win!) int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed); if (adoptable == 0) return false; // All queues empty, no scan needed // Get global thread registry int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); if (num_tp == 0) return false; // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES) // Prevents excessive scanning overhead (2-8 threads is usually enough) int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues; // Round-robin scan (limited number of threads, not ALL!) 
static _Atomic uint64_t adopt_counter = 0; uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed); for (int i = 0; i < scan_limit; i++) { int tp_idx = (start_idx + i) % num_tp; MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit( (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire); if (!other_tp) continue; // Route P: Idle Detection - Only adopt from idle owners // Check if owner is still actively allocating (threshold configurable via env var) uint64_t now_tsc = mf2_rdtsc(); uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed); uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US; if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) { continue; // Owner still active, skip adoption } // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!) // Only one thread scans each queue at a time → eliminates CAS contention if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) { continue; // Another thread is already scanning this queue, skip } // Try to dequeue a pending page from this thread MidPage* page = mf2_dequeue_pending(other_tp, class_idx); if (!page) { // Queue empty, release claim and try next thread atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); continue; } // Clear pending flag (no longer in queue) atomic_store_explicit(&page->in_remote_pending, false, memory_order_release); // Check lease: Has enough time passed since last transfer? (configurable via HAKMEM_MF2_LEASE_MS) // 0ms = disabled (no lease check), >0 = lease period in milliseconds uint64_t now = mf2_rdtsc(); uint64_t last_transfer = page->last_transfer_time; if (g_mf2_lease_ms > 0 && last_transfer != 0) { // Calculate lease cycles from ms (approx 3GHz CPU) uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL); if ((now - last_transfer) < lease_cycles) { // Lease still active, return page to full_pages (don't thrash ownership) page->next_page = other_tp->full_pages[class_idx]; other_tp->full_pages[class_idx] = page; // Release claim before continuing atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); continue; // Try next thread } } // Try to transfer ownership using CAS pthread_t old_owner = page->owner_tid; pthread_t new_owner = pthread_self(); // Note: pthread_t may not be atomic-compatible on all platforms // For now, we'll use a simple write (ownership transfer is rare) // TODO: If thrashing is observed, add atomic CAS with serialization page->owner_tid = new_owner; page->owner_tp = me; page->last_transfer_time = now; // DEBUG: Log drain state static _Atomic int adopt_samples = 0; int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed); unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed); unsigned int pre_free = page->free_count; PoolBlock* pre_freelist = page->freelist; // Drain remote frees int drained = mf2_drain_remote_frees(page); // DEBUG: Log result (first 10 samples) if (sample_idx < 10) { MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p", sample_idx, class_idx, pre_remote, drained, pre_free, page->free_count, pre_freelist, page->freelist); } // Make adopted page ACTIVE immediately (not partial!) 
// Adoption needs immediate activation for caller's mf2_alloc_fast() // Partial list is only for own pending queue drains if (page->freelist) { atomic_fetch_add(&g_mf2_page_reuse_count, 1); atomic_fetch_add(&g_mf2_pending_drained, 1); atomic_fetch_add(&g_mf2_drain_success, 1); // Make it active (move old active to full_pages) mf2_make_page_active(me, class_idx, page); // Release claim before returning SUCCESS atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); return true; // SUCCESS! Page adopted and activated } // No freelist after drain, return to MY full_pages (I'm the new owner!) page->next_page = me->full_pages[class_idx]; me->full_pages[class_idx] = page; // Release claim before continuing search atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); // Continue searching for a better page } return false; // No adoptable pages found } // Fast allocation path (owner thread, NO LOCK!) static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) { // Get thread-local page lists MF2_ThreadPages* tp = mf2_thread_pages_get(); if (!tp) return NULL; // Get active page for this class MidPage* page = tp->active_page[class_idx]; if (!page) { // No active page, go to slow path return mf2_alloc_slow(class_idx, size, site_id); } // FAST PATH: Pop from page-local freelist (NO LOCK!) if (page->freelist) { atomic_fetch_add(&g_mf2_alloc_fast_hit, 1); // Route P: Update activity tracking for idle detection atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); PoolBlock* block = page->freelist; page->freelist = block->next; page->free_count--; // Increment in-use count (atomic for cross-thread visibility) atomic_fetch_add_explicit(&page->in_use, 1, memory_order_relaxed); // Return user pointer (skip header) return (char*)block + HEADER_SIZE; } // Local freelist empty, go to slow path return mf2_alloc_slow(class_idx, size, site_id); } // Slow allocation path (drain remote or allocate new page) static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id) { (void)site_id; // Unused for now atomic_fetch_add(&g_mf2_alloc_slow_hit, 1); // Get thread-local page lists MF2_ThreadPages* tp = mf2_thread_pages_get(); if (!tp) return NULL; // =========================================================================== // Allocation Strategy (Must-Reuse Order) // =========================================================================== // 1. MUST-REUSE GATE (Part 1): Drain own pending queue // - Process up to 4 pages to avoid blocking // - Direct handoff: activate first successful drain immediately if (mf2_try_reuse_own_pending(tp, class_idx)) { return mf2_alloc_fast(class_idx, size, site_id); } // 2. MUST-REUSE GATE (Part 2): Drain active page remotes // - Check if current active page has remote frees // - Drain and retry allocation if successful if (mf2_try_drain_active_remotes(tp, class_idx)) { return mf2_alloc_fast(class_idx, size, site_id); } // HISTORICAL NOTE: full_pages scan removed // Old approach: Scan full_pages looking for pages with remotes // Problem: Drained pages consumed before owner can scan them // New approach: Direct Handoff immediately activates drained pages // Result: full_pages scan always finds 0 pages (100% waste) // // Benchmark evidence (before removal): // - Full scan checked: 1,879,484 pages // - Full scan found: 0 pages (0% success rate!) // 3. 
Consumer-Driven Adoption (Route P with idle detection) // - Only adopt from idle owners (haven't allocated in >150µs) // - Prevents "adoption stealing" from active owners if (mf2_try_adopt_pending(tp, class_idx)) { return mf2_alloc_fast(class_idx, size, site_id); } // 4. MUST-REUSE GATE (Final): Allocate new page (last resort) // - Only reached after exhausting all reuse opportunities // - Order: pending queue → active drain → adoption → NEW MidPage* page = mf2_alloc_and_activate_new_page(tp, class_idx); if (!page) { return NULL; // OOM } // Retry allocation from new page return mf2_alloc_fast(class_idx, size, site_id); } // Forward declaration of slow free path static void mf2_free_slow(MidPage* page, void* ptr); // Strategy A: Global Round-Robin Drain (Cross-Thread Pending Queue) // Fast free path (owner thread, NO LOCK!) static inline void mf2_free_fast(MidPage* page, void* ptr) { if (!page || !ptr) return; atomic_fetch_add(&g_mf2_free_owner_count, 1); // Get block pointer (rewind to header) PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); // FAST PATH: Push to page-local freelist (NO LOCK!) block->next = page->freelist; page->freelist = block; page->free_count++; // Decrement in-use count (atomic for cross-thread visibility) int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); // Check if page is now empty (all blocks free) if (old_in_use == 1 && page->free_count == page->capacity) { // Memory efficiency: Return empty pages to OS via MADV_DONTNEED // Keeps VA mapped (no munmap), but releases physical memory hak_batch_add_page(page->base, POOL_PAGE_SIZE); } } // Slow free path (cross-thread free to remote stack) static void mf2_free_slow(MidPage* page, void* ptr) { if (!page || !ptr) return; atomic_fetch_add(&g_mf2_free_remote_count, 1); // Get block pointer PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); // Push to page's remote stack (lock-free MPSC) uintptr_t old_head; do { old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire); block->next = (PoolBlock*)old_head; } while (!atomic_compare_exchange_weak_explicit( &page->remote_head, &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed)); // Increment remote count and detect threshold for enqueueing unsigned int old_count = atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_seq_cst); // CRITICAL FIX: Use threshold-based enqueueing instead of 0→1 edge // Problem: 0→1 causes ping-pong (drain 1 block → next free triggers 0→1 again) // Solution: Only enqueue when remotes accumulate to threshold (better batching) // // Threshold values (configurable via HAKMEM_MF2_ENQUEUE_THRESHOLD, default=4): // 1 = immediate (0→1 edge, causes ping-pong) // 4 = balanced (batch 4 blocks before notifying owner) // 8 = aggressive batching (higher latency, but better efficiency) // // We enqueue on transitions TO the threshold (old_count == threshold-1) static int g_enqueue_threshold = 1; // 1=immediate (0→1 edge), 2=batch-2, 4=batch-4 if (old_count + 1 == (unsigned int)g_enqueue_threshold) { // Remote count just reached threshold, notify owner if (page->owner_tp) { mf2_enqueue_pending(page->owner_tp, page); } } // DEBUG: Sample first 10 remote frees - Disabled for performance // static _Atomic int remote_free_samples = 0; // int sample = atomic_fetch_add_explicit(&remote_free_samples, 1, memory_order_relaxed); // if (sample < 10) { // fprintf(stderr, "[REMOTE_FREE %d] ptr=%p → page=%p (base=%p), remote_count=%u (was %u), EDGE=%s\n", // sample, ptr, page, 
page->base, old_count + 1, old_count, (old_count == 0) ? "YES" : "NO"); // } // Decrement in-use count int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); // Check if page is now empty (FIX #6: acquire to see all remote frees) if (old_in_use == 1 && page->free_count + atomic_load_explicit(&page->remote_count, memory_order_acquire) >= page->capacity) { // Memory efficiency: Return empty pages to OS via MADV_DONTNEED // Keeps VA mapped (no munmap), but releases physical memory hak_batch_add_page(page->base, POOL_PAGE_SIZE); } } // Top-level free dispatcher static void mf2_free(void* ptr) { if (!ptr) return; // O(1) page lookup (mimalloc's magic!) MidPage* page = mf2_addr_to_page(ptr); if (!page) { // Not a MF2 page (shouldn't happen if MF2 is enabled properly) return; } // Check if we're the owner (fast path) MF2_ThreadPages* tp = mf2_thread_pages_get(); if (tp && page->owner_tid == tp->my_tid) { // Fast: Owner thread, push to local freelist (NO LOCK!) mf2_free_fast(page, ptr); } else { // Slow: Cross-thread free, push to remote stack (lock-free) mf2_free_slow(page, ptr); } } // =========================================================================== // Global pool state (simplified: single-threaded for MVP) static struct { PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; // Locks: per (class, shard) freelist to allow concurrent operations PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; // Non-empty bitmap (O(1) empty class skip) // Bit i = 1 if freelist[class][shard] is non-empty // Use atomic to avoid class-wide locks atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES]; // 1 bit per shard // Remote-free MPSC stacks per (class, shard): lock-free producers, drained under lock on alloc atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; atomic_uint remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; // Statistics uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64))); uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64))); uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64))); uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64))); uint64_t total_bytes_allocated __attribute__((aligned(64))); uint64_t total_pages_allocated __attribute__((aligned(64))); // Per-class page accounting (for Soft CAP guidance) uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64))); // ACE: per-class bundle factor for refill (1..4) + last snapshot int bundle_factor[POOL_NUM_CLASSES]; uint64_t last_hits[POOL_NUM_CLASSES]; uint64_t last_misses[POOL_NUM_CLASSES]; int initialized; int tls_free_enabled; // env: HAKMEM_POOL_TLS_FREE (default: 1) // Extra metrics (for learner logging): all relaxed atomics atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64))); atomic_uint_fast64_t trylock_success __attribute__((aligned(64))); atomic_uint_fast64_t ring_underflow __attribute__((aligned(64))); } g_pool; static int g_wrap_l2_enabled = 0; // env: HAKMEM_WRAP_L2=1 to allow in wrappers static int g_shard_mix_enabled = 0; // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing static int g_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING=1 to enable TLS ring static int g_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES (1..8) static int g_ring_return_div = 2; // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third) static int g_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX (LIFO size cap) int g_hdr_light_enabled = 0; // env: HAKMEM_HDR_LIGHT=1 (minimize extra fields), =2 (no header writes/validation) static int 
g_pool_min_bundle = 2; // env: HAKMEM_POOL_MIN_BUNDLE (default 2) // Sampled counter updates to reduce hot-path stores: 1/2^k static int g_count_sample_exp = 10; // env: HAKMEM_POOL_COUNT_SAMPLE (0..16) static __thread uint32_t t_pool_rng = 0x243f6a88u; // per-thread RNG for sampling // Size class table (for O(1) lookup). Index 5/6 are Bridge classes for 32-64KB gap. // 7 classes including Bridge classes (40KB, 52KB) to fill 32-64KB gap static size_t g_class_sizes[POOL_NUM_CLASSES] = { POOL_CLASS_2KB, // 2 KB POOL_CLASS_4KB, // 4 KB POOL_CLASS_8KB, // 8 KB POOL_CLASS_16KB, // 16 KB POOL_CLASS_32KB, // 32 KB POOL_CLASS_40KB, // 40 KB (Bridge class 0) POOL_CLASS_52KB // 52 KB (Bridge class 1) }; // Blocks per page (for each class) __attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = { POOL_PAGE_SIZE / POOL_CLASS_2KB, // 32 blocks (2KiB) POOL_PAGE_SIZE / POOL_CLASS_4KB, // 16 blocks (4KiB) POOL_PAGE_SIZE / POOL_CLASS_8KB, // 8 blocks (8KiB) POOL_PAGE_SIZE / POOL_CLASS_16KB, // 4 blocks (16KiB) POOL_PAGE_SIZE / POOL_CLASS_32KB, // 2 blocks (32KiB) POOL_PAGE_SIZE / POOL_CLASS_40KB, // 1 block (40KiB Bridge) POOL_PAGE_SIZE / POOL_CLASS_52KB // 1 block (52KiB Bridge) }; // =========================================================================== // Helper Functions // =========================================================================== // Write minimal header for Mid allocation (fast-return friendly) static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) { // For Mid, prefer headerless operation when HDR_LIGHT>=1. // Debug or non-Mid callers can still write full headers elsewhere. if (g_hdr_light_enabled >= 1) return; // skip header on alloc hot path hdr->magic = HAKMEM_MAGIC; hdr->method = ALLOC_METHOD_POOL; hdr->size = class_sz; if (!g_hdr_light_enabled) { hdr->alloc_site = site_id; hdr->class_bytes = 0; hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self(); } } // Branchless LUT (Lookup Table) for O(1) class determination // Expanded to 53 entries for Bridge classes (40KB, 52KB) static const uint8_t SIZE_TO_CLASS[53] = { 0,0,0, // 0-2KB → Class 0 1,1, // 3-4KB → Class 1 2,2,2,2, // 5-8KB → Class 2 3,3,3,3,3,3,3,3, // 9-16KB → Class 3 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, // 17-32KB → Class 4 5,5,5,5,5,5,5,5, // 33-40KB → Class 5 (Bridge class 0) 6,6,6,6,6,6,6,6,6,6,6,6 // 41-52KB → Class 6 (Bridge class 1) }; // Get size class index from size (0-6, or -1 if out of range) // Updated range check for Bridge classes (0-52KB) static inline int hak_pool_get_class_index(size_t size) { // Fast path: exact match against configured class sizes (covers Bridge classes) // Note: size passed here should already be a rounded class size from ACE. for (int i = 0; i < POOL_NUM_CLASSES; i++) { size_t cs = g_class_sizes[i]; if (cs != 0 && size == cs) return i; } // Fallback: map arbitrary size to nearest fixed class range via LUT (legacy behavior) uint32_t kb = (uint32_t)((size + 1023) >> 10); // Round up to KB units return (kb < 53) ? 
SIZE_TO_CLASS[kb] : -1; // Expanded to 53KB for Bridge classes } // Get shard index from site_id (0-63) int hak_pool_get_shard_index(uintptr_t site_id) { if (!g_shard_mix_enabled) { // Legacy: Shift by 4 to reduce collision (instruction alignment) return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1)); } // SplitMix64-like mixer with thread id salt for better dispersion uint64_t x = (uint64_t)site_id; uint64_t tid = (uint64_t)(uintptr_t)pthread_self(); x ^= (tid << 1); x += 0x9e3779b97f4a7c15ULL; x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; x = (x ^ (x >> 31)); return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1)); } // TLS helpers #include "box/pool_tls_core.inc.h" // Refill/ACE (boxed) #include "box/pool_refill.inc.h" // Init/Shutdown + MF2 debug (boxed) #include "box/pool_init_api.inc.h" // Pool statistics (boxed) #include "box/pool_stats.inc.h" // Public API (boxed): alloc/free/lookup/free_fast #include "box/pool_api.inc.h"
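
// ===========================================================================
// Illustrative sketches (documentation only, not compiled)
// ===========================================================================

// Sketch of the W_MAX round-up rule described in the file header: a request may
// be served from a larger class as long as the class size does not exceed
// W_MAX times the requested size. The helper name `wmax_allows()` and this
// standalone form are illustrative only, not part of the pool API.
#if 0
#include <stddef.h>

// Returns 1 if a request of `req` bytes may be served from a class of
// `class_sz` bytes under the given W_MAX tolerance (e.g. 1.40).
static int wmax_allows(size_t req, size_t class_sz, double wmax) {
    if (class_sz < req) return 0;                   // class must cover the request
    return (double)class_sz <= wmax * (double)req;  // e.g. 4096 <= 1.40 * 3072 (4300.8)
}

// Example: a 3 KiB request may use the 4 KiB class (4096 / 3072 = 1.33 < 1.40),
// but would be rejected under a stricter setting such as HAKMEM_WMAX_MID=1.2.
#endif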
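
// Sketch of the address math used by mf2_addr_to_page(): masking the low 16 bits
// of any block pointer yields its 64KB page base, and shifting by 16 yields the
// direct-mapped registry index. It reuses MF2_PAGE_REGISTRY_SIZE defined above;
// the address value is an arbitrary example.
#if 0
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uintptr_t addr      = (uintptr_t)0x7f12345a7b10ULL;   // some block pointer inside a page
    uintptr_t page_base = addr & ~(uintptr_t)0xFFFF;      // 0x7f12345a0000 (64KB aligned)
    size_t    idx       = (page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); // wraps to 64K slots

    printf("base=%#lx idx=%zu\n", (unsigned long)page_base, idx);
    // Two different 64KB pages can land on the same slot (the registry is
    // direct-mapped), which is why the real lookup also verifies
    // page->base == page_base before trusting the entry.
    return 0;
}
#endif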
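
// Sketch of the 64KB-alignment trick used in mf2_alloc_new_page(): mmap() only
// guarantees 4KB alignment, so the page is over-allocated 2x and the misaligned
// prefix/suffix are unmapped. `mmap_aligned_64k()` is an illustrative name, not
// an exported function.
#if 0
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

static void* mmap_aligned_64k(size_t size /* e.g. POOL_PAGE_SIZE */) {
    size_t alloc = size * 2;
    void* raw = mmap(NULL, alloc, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (raw == MAP_FAILED) return NULL;

    uintptr_t aligned = ((uintptr_t)raw + 0xFFFF) & ~(uintptr_t)0xFFFF;
    size_t prefix = aligned - (uintptr_t)raw;

    if (prefix) munmap(raw, prefix);                     // drop misaligned head
    if (prefix + size < alloc)                           // drop unused tail
        munmap((char*)raw + prefix + size, alloc - (prefix + size));

    return (void*)aligned;                               // MAP_ANONYMOUS memory is zero-filled
}
#endif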
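
// Sketch of the MPSC remote-free pattern used by mf2_free_slow() (producer push)
// and mf2_drain_remote_frees() (owner drain), reduced to a standalone Treiber
// stack. `node_t`, `remote_push`, and `remote_drain` are illustrative names.
#if 0
#include <stdatomic.h>
#include <stddef.h>

typedef struct node { struct node* next; } node_t;

// Producer side (any thread): push one freed block onto the page's remote stack.
static void remote_push(atomic_uintptr_t* head, node_t* n) {
    uintptr_t old;
    do {
        old = atomic_load_explicit(head, memory_order_acquire);
        n->next = (node_t*)old;
    } while (!atomic_compare_exchange_weak_explicit(
                 head, &old, (uintptr_t)n,
                 memory_order_release, memory_order_relaxed));
}

// Consumer side (owner thread): detach the whole stack with one exchange,
// then walk it privately -- no further synchronization is needed.
static node_t* remote_drain(atomic_uintptr_t* head) {
    return (node_t*)atomic_exchange_explicit(head, (uintptr_t)0,
                                             memory_order_acq_rel);
}
#endif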