diff --git a/core/box/pool_mf2_adoption.inc.h b/core/box/pool_mf2_adoption.inc.h new file mode 100644 index 00000000..56ab27f7 --- /dev/null +++ b/core/box/pool_mf2_adoption.inc.h @@ -0,0 +1,129 @@ +// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue +// Returns true if a page was successfully adopted and activated +// Called from alloc_slow when allocating thread needs memory +static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) { + if (!me) return false; + + // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating) + // Avoids scanning empty queues (major performance win!) + int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed); + if (adoptable == 0) return false; // All queues empty, no scan needed + + // Get global thread registry + int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); + if (num_tp == 0) return false; + + // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES) + // Prevents excessive scanning overhead (2-8 threads is usually enough) + int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues; + + // Round-robin scan (limited number of threads, not ALL!) + static _Atomic uint64_t adopt_counter = 0; + uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed); + + for (int i = 0; i < scan_limit; i++) { + int tp_idx = (start_idx + i) % num_tp; + MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit( + (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire); + + if (!other_tp) continue; + + // Route P: Idle Detection - Only adopt from idle owners + // Check if owner is still actively allocating (threshold configurable via env var) + uint64_t now_tsc = mf2_rdtsc(); + uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed); + uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US; + + if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) { + continue; // Owner still active, skip adoption + } + + // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!) + // Only one thread scans each queue at a time → eliminates CAS contention + if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) { + continue; // Another thread is already scanning this queue, skip + } + + // Try to dequeue a pending page from this thread + MidPage* page = mf2_dequeue_pending(other_tp, class_idx); + if (!page) { + // Queue empty, release claim and try next thread + atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); + continue; + } + + // Clear pending flag (no longer in queue) + atomic_store_explicit(&page->in_remote_pending, false, memory_order_release); + + // Check lease: Has enough time passed since last transfer? 
(configurable via HAKMEM_MF2_LEASE_MS) + // 0ms = disabled (no lease check), >0 = lease period in milliseconds + uint64_t now = mf2_rdtsc(); + uint64_t last_transfer = page->last_transfer_time; + if (g_mf2_lease_ms > 0 && last_transfer != 0) { + // Calculate lease cycles from ms (approx 3GHz CPU) + uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL); + if ((now - last_transfer) < lease_cycles) { + // Lease still active, return page to full_pages (don't thrash ownership) + page->next_page = other_tp->full_pages[class_idx]; + other_tp->full_pages[class_idx] = page; + // Release claim before continuing + atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); + continue; // Try next thread + } + } + + // Try to transfer ownership using CAS + pthread_t old_owner = page->owner_tid; + pthread_t new_owner = pthread_self(); + + // Note: pthread_t may not be atomic-compatible on all platforms + // For now, we'll use a simple write (ownership transfer is rare) + // TODO: If thrashing is observed, add atomic CAS with serialization + page->owner_tid = new_owner; + page->owner_tp = me; + page->last_transfer_time = now; + + // DEBUG: Log drain state + static _Atomic int adopt_samples = 0; + int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed); + unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed); + unsigned int pre_free = page->free_count; + PoolBlock* pre_freelist = page->freelist; + + // Drain remote frees + int drained = mf2_drain_remote_frees(page); + + // DEBUG: Log result (first 10 samples) + if (sample_idx < 10) { + MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p", + sample_idx, class_idx, pre_remote, drained, + pre_free, page->free_count, pre_freelist, page->freelist); + } + + // Make adopted page ACTIVE immediately (not partial!) + // Adoption needs immediate activation for caller's mf2_alloc_fast() + // Partial list is only for own pending queue drains + if (page->freelist) { + atomic_fetch_add(&g_mf2_page_reuse_count, 1); + atomic_fetch_add(&g_mf2_pending_drained, 1); + atomic_fetch_add(&g_mf2_drain_success, 1); + + // Make it active (move old active to full_pages) + mf2_make_page_active(me, class_idx, page); + + // Release claim before returning SUCCESS + atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); + return true; // SUCCESS! Page adopted and activated + } + + // No freelist after drain, return to MY full_pages (I'm the new owner!) 
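+        // Keeping the page under the new owner is still useful even without a
+        // usable freelist: owner_tp now points at this thread, so future remote
+        // frees should land in OUR pending queue and the page can be reused
+        // later without another adoption scan.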
+ page->next_page = me->full_pages[class_idx]; + me->full_pages[class_idx] = page; + // Release claim before continuing search + atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); + // Continue searching for a better page + } + + return false; // No adoptable pages found +} + diff --git a/core/box/pool_mf2_helpers.inc.h b/core/box/pool_mf2_helpers.inc.h new file mode 100644 index 00000000..37367082 --- /dev/null +++ b/core/box/pool_mf2_helpers.inc.h @@ -0,0 +1,158 @@ +// Forward declarations +static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id); + +// =========================================================================== +// Helper Functions (Clean & Modular) +// =========================================================================== + +// Helper: Make page active (move old active to full_pages) +static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) { + if (!tp || !page) return; + + // Move old active page to full_pages (if any) + if (tp->active_page[class_idx]) { + MidPage* old_active = tp->active_page[class_idx]; + old_active->next_page = tp->full_pages[class_idx]; + tp->full_pages[class_idx] = old_active; + } + + // Set new page as active + tp->active_page[class_idx] = page; + page->next_page = NULL; +} + +// Helper: Drain page and add to partial list (LIFO for cache locality) +// Returns true if page has free blocks after drain +static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) { + if (!tp || !page) return false; + + // Drain remote frees + int drained = mf2_drain_remote_frees(page); + + // If page has freelist after drain, add to partial list (LIFO) + if (page->freelist) { + atomic_fetch_add(&g_mf2_page_reuse_count, 1); + page->next_page = tp->partial_pages[class_idx]; + tp->partial_pages[class_idx] = page; + return true; + } + + // No freelist, return to full_pages + page->next_page = tp->full_pages[class_idx]; + tp->full_pages[class_idx] = page; + return false; +} + +// Helper: Drain page and activate if successful (Direct Handoff - backward compat) +// Returns true if page was activated +static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) { + if (!tp || !page) return false; + + // Drain remote frees + int drained = mf2_drain_remote_frees(page); + + // If page has freelist after drain, make it active immediately + if (page->freelist) { + atomic_fetch_add(&g_mf2_page_reuse_count, 1); + mf2_make_page_active(tp, class_idx, page); + return true; + } + + // No freelist, return to full_pages + page->next_page = tp->full_pages[class_idx]; + tp->full_pages[class_idx] = page; + return false; +} + +// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1) +// Returns true if a page was successfully drained and activated +static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) { + if (!tp) return false; + + // Budget: Process up to N pages to avoid blocking + for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) { + MidPage* pending_page = mf2_dequeue_pending(tp, class_idx); + if (!pending_page) break; // Queue empty + + atomic_fetch_add(&g_mf2_pending_drained, 1); + + // Clear pending flag (no longer in queue) + atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release); + + // DIRECT HANDOFF: Drain and activate if successful + if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) { + return true; // Success! 
Page is now active + } + // No freelist after drain, page returned to full_pages by helper + } + return false; // No pages available for reuse +} + +// Helper: Try to drain remotes from active page (must-reuse gate part 2) +// Returns true if active page has freelist after drain +static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) { + if (!tp) return false; + + MidPage* page = tp->active_page[class_idx]; + if (!page) return false; + + atomic_fetch_add(&g_mf2_slow_checked_drain, 1); + unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); + + if (remote_cnt > 0) { + atomic_fetch_add(&g_mf2_slow_found_remote, 1); + int drained = mf2_drain_remote_frees(page); + if (drained > 0 && page->freelist) { + atomic_fetch_add(&g_mf2_drain_success, 1); + return true; // Success! Active page now has freelist + } + } + return false; // No remotes or drain failed +} + +// Helper: Allocate new page and make it active +// Returns the newly allocated page (or NULL on OOM) +static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) { + if (!tp) return NULL; + + atomic_fetch_add(&g_mf2_new_page_count, 1); + + // DEBUG: Log why we're allocating new page (first N samples) + static _Atomic int new_page_samples = 0; + int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed); + if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) { + // Count adoptable pages across all threads + int total_adoptable = 0; + for (int i = 0; i < POOL_NUM_CLASSES; i++) { + total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed); + } + MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p", + sample_idx, class_idx, + (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed), + total_adoptable, + tp->active_page[class_idx], + tp->full_pages[class_idx]); + } + + MidPage* page = mf2_alloc_new_page(class_idx); + if (!page) { + return NULL; // OOM + } + + // Move current active page to full list (if any) + if (tp->active_page[class_idx]) { + MidPage* old_page = tp->active_page[class_idx]; + old_page->next_page = tp->full_pages[class_idx]; + tp->full_pages[class_idx] = old_page; + } + + // Set new page as active + tp->active_page[class_idx] = page; + tp->page_count[class_idx]++; + + return page; +} + +// =========================================================================== +// End of Helper Functions +// =========================================================================== diff --git a/core/box/pool_mf2_types.inc.h b/core/box/pool_mf2_types.inc.h new file mode 100644 index 00000000..203a6a9c --- /dev/null +++ b/core/box/pool_mf2_types.inc.h @@ -0,0 +1,266 @@ +// =========================================================================== +// MF2 Per-Page Sharding: Mimalloc-Inspired Architecture +// =========================================================================== +// +// Key idea: Each 64KB page has independent freelist (no sharing!) 
+// - O(1) page lookup from block address: (addr & ~0xFFFF) +// - Owner thread: fast path (no locks, no atomics) +// - Cross-thread free: lock-free remote stack +// - Expected: +50% (13.78 → 20.7 M/s, 60-75% of mimalloc) + +// MF2 Configuration Constants (Quick Win #5) +#define MF2_PENDING_QUEUE_BUDGET 4 // Max pages to drain from pending queue +#define MF2_DEBUG_SAMPLE_COUNT 20 // Number of debug samples to log +#define MF2_TSC_CYCLES_PER_US 3000 // Estimated TSC cycles per microsecond +#define MF2_PAGE_SIZE_SHIFT 16 // log2(64KB) for fast division +#define MF2_PAGE_ALIGNMENT 65536 // 64KB alignment for mmap + +// Debug Logging Macros (Quick Win #6) +// Conditional compilation for debug logs - set HAKMEM_DEBUG_MF2=1 to enable +#ifdef HAKMEM_DEBUG_MF2 + #define MF2_DEBUG_LOG(fmt, ...) fprintf(stderr, "[MF2] " fmt "\n", ##__VA_ARGS__) + #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) +#else + #define MF2_DEBUG_LOG(fmt, ...) ((void)0) + #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) +#endif + +// Forward declarations +static size_t g_class_sizes[POOL_NUM_CLASSES]; + +// MF2 Page descriptor: per-page metadata (one per 64KB page) +typedef struct MidPage { + // Page identity + void* base; // Page base address (64KB aligned) + uint8_t class_idx; // Size class index (0-6) + uint8_t flags; // Page flags (reserved for future use) + uint16_t _pad0; + + // Ownership + pthread_t owner_tid; // Owner thread ID (for fast-path check) + struct MF2_ThreadPages* owner_tp; // Owner thread's heap (for pending queue access) + uint64_t last_transfer_time; // rdtsc() timestamp of last ownership transfer (lease mechanism) + + // Page-local freelist (owner-only, NO LOCK!) + PoolBlock* freelist; // Local freelist head + uint16_t free_count; // Number of free blocks + uint16_t capacity; // Total blocks per page + + // Remote frees (cross-thread, lock-free MPSC stack) + atomic_uintptr_t remote_head; // Lock-free remote free stack + atomic_uint remote_count; // Remote free count (for quick check) + + // Lifecycle + atomic_int in_use; // Live allocations on this page + atomic_int pending_dn; // DONTNEED enqueued flag + + // Linkage (thread-local page lists) + struct MidPage* next_page; // Next page in thread's list + struct MidPage* prev_page; // Previous page in thread's list + + // Pending queue (remote drain notification) + _Atomic(_Bool) in_remote_pending; // Is this page in pending queue? + struct MidPage* next_pending; // Next page in pending queue + + // Padding to cache line boundary (avoid false sharing) + char _pad[64 - ((sizeof(void*) * 5 + sizeof(PoolBlock*) + sizeof(uint16_t) * 2 + + sizeof(atomic_uintptr_t) + sizeof(atomic_uint) + + sizeof(atomic_int) * 2 + sizeof(pthread_t) + + sizeof(_Atomic(_Bool)) + 4) % 64)]; +} MidPage; + +// Page registry: O(1) lookup from block address +// Use direct indexing: (addr >> 16) & MASK +#define MF2_PAGE_REGISTRY_BITS 16 // 64K entries (4GB address space with 64KB pages) +#define MF2_PAGE_REGISTRY_SIZE (1 << MF2_PAGE_REGISTRY_BITS) +#define MF2_PAGE_REGISTRY_MASK (MF2_PAGE_REGISTRY_SIZE - 1) + +typedef struct { + // Direct-mapped page table (no hash collisions!) 
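+    // Note: with MF2_PAGE_REGISTRY_BITS = 16 the index is (addr >> 16) & MASK,
+    // so "no collisions" only holds within the 4GB window assumed above; two
+    // 64KB pages whose addresses differ by a multiple of 4 GiB map to the same slot.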
+ MidPage* pages[MF2_PAGE_REGISTRY_SIZE]; + + // Coarse-grained locks for rare updates (page alloc/free) + // 256 locks = 256-way parallelism for page registration + pthread_mutex_t locks[256]; + + // Statistics + atomic_uint_fast64_t total_pages; // Total pages allocated + atomic_uint_fast64_t active_pages; // Pages with live allocations +} MF2_PageRegistry; + +// Thread-local page lists (one list per size class) +typedef struct MF2_ThreadPages { + // Active pages (have free blocks) + MidPage* active_page[POOL_NUM_CLASSES]; + + // Partial pages (drained pages with free blocks, LIFO for cache locality) + // Checked before allocating new pages (fast reuse path) + MidPage* partial_pages[POOL_NUM_CLASSES]; + + // Full pages (no free blocks, but may receive remote frees) + // TODO: Gradually deprecate in favor of partial_pages + MidPage* full_pages[POOL_NUM_CLASSES]; + + // Pending queue (pages with remote frees, MPSC lock-free stack) + atomic_uintptr_t pages_remote_pending[POOL_NUM_CLASSES]; + + // Pending claim flags (prevent multi-consumer CAS thrashing) + // One adopter at a time per queue (test_and_set to claim, clear to release) + atomic_flag pending_claim[POOL_NUM_CLASSES]; + + // Page ownership count (for statistics) + uint32_t page_count[POOL_NUM_CLASSES]; + + // Thread identity (cached for fast comparison) + pthread_t my_tid; + + // Route P: Activity tracking for idle-based adoption + // Updated on every allocation (mf2_alloc_fast) + // Read by adopters to check if owner is idle + atomic_uint_fast64_t last_alloc_tsc; +} MF2_ThreadPages; + +// Global page registry (shared, rarely accessed) +static MF2_PageRegistry g_mf2_page_registry; + +// Thread-local page lists (hot path, no sharing!) +static __thread MF2_ThreadPages* t_mf2_pages = NULL; + +// =========================================================================== +// MF2 Global State (Quick Win #3b - Structured Globals) +// =========================================================================== +// Individual globals replaced with structured state below. +// Old declarations removed, replaced with macro-mapped struct instances. 
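+// As a concrete illustration of the macro mapping (using the stats macros
+// defined at the end of this file), a call such as
+//     atomic_fetch_add(&g_mf2_new_page_count, 1);
+// expands to
+//     atomic_fetch_add(&g_mf2_stats.new_page_count, 1);
+// so existing call sites compile unchanged.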
+// +// Benefits: +// - Logical grouping (config, registry, stats) +// - Better documentation +// - Easier to extend or refactor +// - Single source of truth for each category + +#define MF2_MAX_THREADS 256 + +// MF2 Configuration (environment variables) +typedef struct { + int enabled; // HAKMEM_MF2_ENABLE (0=disabled, 1=enabled) + int max_queues; // HAKMEM_MF2_MAX_QUEUES (default: 2) + int lease_ms; // HAKMEM_MF2_LEASE_MS (default: 10ms, 0=disabled) + int idle_threshold_us; // HAKMEM_MF2_IDLE_THRESHOLD_US (default: 150µs) +} MF2_Config; + +// MF2 Thread Registry (cross-thread coordination) +typedef struct { + MF2_ThreadPages* all_thread_pages[MF2_MAX_THREADS]; // Global registry + _Atomic int num_thread_pages; // Active thread count + _Atomic int adoptable_count[POOL_NUM_CLASSES]; // Non-empty pending queues + pthread_key_t tls_key; // Thread-local storage key + pthread_once_t key_once; // TLS initialization guard +} MF2_Registry; + +// MF2 Statistics (debug instrumentation) +typedef struct { + // Allocation path + atomic_uint_fast64_t alloc_fast_hit; + atomic_uint_fast64_t alloc_slow_hit; + atomic_uint_fast64_t page_reuse_count; + atomic_uint_fast64_t new_page_count; + + // Free path + atomic_uint_fast64_t free_owner_count; + atomic_uint_fast64_t free_remote_count; + + // Drain operations + atomic_uint_fast64_t drain_count; + atomic_uint_fast64_t drain_blocks; + atomic_uint_fast64_t drain_attempts; + atomic_uint_fast64_t drain_success; + atomic_uint_fast64_t slow_checked_drain; + atomic_uint_fast64_t slow_found_remote; + + // Full page scan (obsolete, kept for historical tracking) + atomic_uint_fast64_t full_scan_checked; + atomic_uint_fast64_t full_scan_found_remote; + atomic_uint_fast64_t eager_drain_scanned; + atomic_uint_fast64_t eager_drain_found; + + // Pending queue + atomic_uint_fast64_t pending_enqueued; + atomic_uint_fast64_t pending_drained; + atomic_uint_fast64_t pending_requeued; +} MF2_Stats; + +// Instantiate structured global state (Quick Win #3b) +static MF2_Config g_mf2_config = { + .enabled = 0, // Will be set by env var + .max_queues = 2, + .lease_ms = 10, + .idle_threshold_us = 150 +}; + +static MF2_Registry g_mf2_registry = { + .all_thread_pages = {0}, + .num_thread_pages = 0, + .adoptable_count = {0}, + .tls_key = 0, + .key_once = PTHREAD_ONCE_INIT +}; + +static MF2_Stats g_mf2_stats = { + // All fields initialized to 0 (atomic zero-initialization is valid) + .alloc_fast_hit = 0, + .alloc_slow_hit = 0, + .page_reuse_count = 0, + .new_page_count = 0, + .free_owner_count = 0, + .free_remote_count = 0, + .drain_count = 0, + .drain_blocks = 0, + .drain_attempts = 0, + .drain_success = 0, + .slow_checked_drain = 0, + .slow_found_remote = 0, + .full_scan_checked = 0, + .full_scan_found_remote = 0, + .eager_drain_scanned = 0, + .eager_drain_found = 0, + .pending_enqueued = 0, + .pending_drained = 0, + .pending_requeued = 0 +}; + +// Compatibility macros: Map old global names to struct fields +// This allows existing code to work unchanged while using structured state +#define g_mf2_enabled (g_mf2_config.enabled) +#define g_mf2_max_queues (g_mf2_config.max_queues) +#define g_mf2_lease_ms (g_mf2_config.lease_ms) +#define g_mf2_idle_threshold_us (g_mf2_config.idle_threshold_us) + +#define g_all_thread_pages (g_mf2_registry.all_thread_pages) +#define g_num_thread_pages (g_mf2_registry.num_thread_pages) +#define g_adoptable_count (g_mf2_registry.adoptable_count) +#define g_mf2_tls_key (g_mf2_registry.tls_key) +#define g_mf2_key_once (g_mf2_registry.key_once) + +#define 
g_mf2_alloc_fast_hit (g_mf2_stats.alloc_fast_hit) +#define g_mf2_alloc_slow_hit (g_mf2_stats.alloc_slow_hit) +#define g_mf2_page_reuse_count (g_mf2_stats.page_reuse_count) +#define g_mf2_new_page_count (g_mf2_stats.new_page_count) +#define g_mf2_free_owner_count (g_mf2_stats.free_owner_count) +#define g_mf2_free_remote_count (g_mf2_stats.free_remote_count) +#define g_mf2_drain_count (g_mf2_stats.drain_count) +#define g_mf2_drain_blocks (g_mf2_stats.drain_blocks) +#define g_mf2_drain_attempts (g_mf2_stats.drain_attempts) +#define g_mf2_drain_success (g_mf2_stats.drain_success) +#define g_mf2_slow_checked_drain (g_mf2_stats.slow_checked_drain) +#define g_mf2_slow_found_remote (g_mf2_stats.slow_found_remote) +#define g_mf2_full_scan_checked (g_mf2_stats.full_scan_checked) +#define g_mf2_full_scan_found_remote (g_mf2_stats.full_scan_found_remote) +#define g_mf2_eager_drain_scanned (g_mf2_stats.eager_drain_scanned) +#define g_mf2_eager_drain_found (g_mf2_stats.eager_drain_found) +#define g_mf2_pending_enqueued (g_mf2_stats.pending_enqueued) +#define g_mf2_pending_drained (g_mf2_stats.pending_drained) +#define g_mf2_pending_requeued (g_mf2_stats.pending_requeued) + +// =========================================================================== +// End of MF2 Data Structures +// =========================================================================== diff --git a/core/box/pool_tls_types.inc.h b/core/box/pool_tls_types.inc.h new file mode 100644 index 00000000..c27d32c4 --- /dev/null +++ b/core/box/pool_tls_types.inc.h @@ -0,0 +1,32 @@ +// =========================================================================== +// Internal Data Structures +// =========================================================================== + +// Freelist block header (embedded in allocated block) +typedef struct PoolBlock { + struct PoolBlock* next; // Next free block in freelist +} PoolBlock; + +// TLS cache: one block per class to avoid frequent locks (legacy single-slot) +__thread PoolBlock* tls_pool_cache[POOL_NUM_CLASSES] = {NULL}; + +// TLS ring buffer to further reduce lock traffic (configurable capacity) +// Separate ring size for L2 Pool (mid/large allocations: 8-32KB) +#ifndef POOL_L2_RING_CAP +#define POOL_L2_RING_CAP 48 // Optimized for L1 cache efficiency (384B, 6 cache lines) +#endif +typedef struct { PoolBlock* items[POOL_L2_RING_CAP]; int top; } PoolTLSRing; +typedef struct { PoolTLSRing ring; PoolBlock* lo_head; size_t lo_count; } PoolTLSBin; +static __thread PoolTLSBin g_tls_bin[POOL_NUM_CLASSES]; + +// TLS active pages (per class): bump-run (no per-block links) from privately owned pages (max 3) +typedef struct { + void* page; // page base + char* bump; // next raw allocation (header start) + char* end; // page end (bump-run limit) + int count; // remaining blocks (for quick checks) +} PoolTLSPage; +static __thread PoolTLSPage g_tls_active_page_a[POOL_NUM_CLASSES]; +static __thread PoolTLSPage g_tls_active_page_b[POOL_NUM_CLASSES]; +static __thread PoolTLSPage g_tls_active_page_c[POOL_NUM_CLASSES]; // QW2-adjusted: 3 slots (was 4) + diff --git a/core/hakmem_pool.c b/core/hakmem_pool.c index 696e3edb..92148e9b 100644 --- a/core/hakmem_pool.c +++ b/core/hakmem_pool.c @@ -64,489 +64,16 @@ typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64 // =========================================================================== // Internal Data Structures // =========================================================================== - -// Freelist block header (embedded in 
allocated block) -typedef struct PoolBlock { - struct PoolBlock* next; // Next free block in freelist -} PoolBlock; - -// TLS cache: one block per class to avoid frequent locks (legacy single-slot) -__thread PoolBlock* tls_pool_cache[POOL_NUM_CLASSES] = {NULL}; - -// TLS ring buffer to further reduce lock traffic (configurable capacity) -// Separate ring size for L2 Pool (mid/large allocations: 8-32KB) -#ifndef POOL_L2_RING_CAP -#define POOL_L2_RING_CAP 48 // Optimized for L1 cache efficiency (384B, 6 cache lines) -#endif -typedef struct { PoolBlock* items[POOL_L2_RING_CAP]; int top; } PoolTLSRing; -typedef struct { PoolTLSRing ring; PoolBlock* lo_head; size_t lo_count; } PoolTLSBin; -static __thread PoolTLSBin g_tls_bin[POOL_NUM_CLASSES]; - -// TLS active pages (per class): bump-run (no per-block links) from privately owned pages (max 3) -typedef struct { - void* page; // page base - char* bump; // next raw allocation (header start) - char* end; // page end (bump-run limit) - int count; // remaining blocks (for quick checks) -} PoolTLSPage; -static __thread PoolTLSPage g_tls_active_page_a[POOL_NUM_CLASSES]; -static __thread PoolTLSPage g_tls_active_page_b[POOL_NUM_CLASSES]; -static __thread PoolTLSPage g_tls_active_page_c[POOL_NUM_CLASSES]; // QW2-adjusted: 3 slots (was 4) +#include "box/pool_tls_types.inc.h" // Mid page descriptor registry (64KiB pages → {class_idx, owner_tid}) -#define MID_DESC_BUCKETS 2048 -typedef struct MidPageDesc { - void* page; - uint8_t class_idx; - uint8_t _pad0; - uint16_t _pad1; - uint64_t owner_tid; - atomic_int in_use; // live allocations on this page - int blocks_per_page; // total blocks on this page - atomic_int pending_dn; // background DONTNEED enqueued - struct MidPageDesc* next; -} MidPageDesc; -static pthread_mutex_t g_mid_desc_mu[MID_DESC_BUCKETS]; -static MidPageDesc* g_mid_desc_head[MID_DESC_BUCKETS]; - -static inline uint32_t mid_desc_hash(void* page) { - uintptr_t x = (uintptr_t)page >> 16; // 64KiB alignment granularity - // mix - x ^= x >> 33; x *= 0xff51afd7ed558ccdULL; x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL; x ^= x >> 33; - return (uint32_t)(x & (MID_DESC_BUCKETS - 1)); -} - -// Thread-safe initialization using pthread_once -static pthread_once_t mid_desc_init_once_control = PTHREAD_ONCE_INIT; -static void mid_desc_init_impl(void) { - for (int i = 0; i < MID_DESC_BUCKETS; i++) { - pthread_mutex_init(&g_mid_desc_mu[i], NULL); - g_mid_desc_head[i] = NULL; - } -} -static void mid_desc_init_once(void) { - pthread_once(&mid_desc_init_once_control, mid_desc_init_impl); -} - -static void mid_desc_register(void* page, int class_idx, uint64_t owner_tid) { - mid_desc_init_once(); - uint32_t h = mid_desc_hash(page); - pthread_mutex_lock(&g_mid_desc_mu[h]); - MidPageDesc* d = (MidPageDesc*)hkm_libc_malloc(sizeof(MidPageDesc)); // P0 Fix: Use libc malloc - if (d) { - d->page = page; d->class_idx = (uint8_t)class_idx; d->owner_tid = owner_tid; d->next = g_mid_desc_head[h]; - atomic_store(&d->in_use, 0); - d->blocks_per_page = 0; // optional; not used for emptiness in P0 - atomic_store(&d->pending_dn, 0); - g_mid_desc_head[h] = d; - } - pthread_mutex_unlock(&g_mid_desc_mu[h]); -} - -static MidPageDesc* mid_desc_lookup(void* addr) { - mid_desc_init_once(); - void* page = (void*)((uintptr_t)addr & ~((uintptr_t)POOL_PAGE_SIZE - 1)); - uint32_t h = mid_desc_hash(page); - for (MidPageDesc* d = g_mid_desc_head[h]; d; d = d->next) { - if (d->page == page) return d; - } - return NULL; -} - -static void mid_desc_adopt(void* addr, int class_idx, uint64_t 
owner_tid) { - if (owner_tid == 0) return; - void* page = (void*)((uintptr_t)addr & ~((uintptr_t)POOL_PAGE_SIZE - 1)); - uint32_t h = mid_desc_hash(page); - pthread_mutex_lock(&g_mid_desc_mu[h]); - MidPageDesc* d = g_mid_desc_head[h]; - while (d) { if (d->page == page) break; d = d->next; } - if (d) { - if (d->owner_tid == 0) d->owner_tid = owner_tid; - } else { - MidPageDesc* nd = (MidPageDesc*)hkm_libc_malloc(sizeof(MidPageDesc)); // P0 Fix: Use libc malloc - if (nd) { nd->page = page; nd->class_idx = (uint8_t)class_idx; nd->owner_tid = owner_tid; nd->next = g_mid_desc_head[h]; g_mid_desc_head[h] = nd; } - } - pthread_mutex_unlock(&g_mid_desc_mu[h]); -} - -// Increment page in-use counter for given raw block pointer -static inline void mid_page_inuse_inc(void* raw) { - MidPageDesc* d = mid_desc_lookup(raw); - if (d) atomic_fetch_add_explicit(&d->in_use, 1, memory_order_relaxed); -} - -// Decrement page in-use counter and enqueue DONTNEED when it drops to 0 -extern int hak_batch_add_page(void* page, size_t size); -static inline void mid_page_inuse_dec_and_maybe_dn(void* raw) { - MidPageDesc* d = mid_desc_lookup(raw); - if (!d) return; - int nv = atomic_fetch_sub_explicit(&d->in_use, 1, memory_order_relaxed) - 1; - if (nv <= 0) { - // Fire once per empty transition - if (atomic_exchange_explicit(&d->pending_dn, 1, memory_order_acq_rel) == 0) { - hak_batch_add_page(d->page, POOL_PAGE_SIZE); - } - } -} +#include "box/pool_mid_desc.inc.h" // ---------------- Transfer Cache (per-thread per-class inbox) -------------- -typedef struct MidTC { - atomic_uintptr_t inbox[POOL_NUM_CLASSES]; -} MidTC; +#include "box/pool_mid_tc.inc.h" -#define MID_TC_BUCKETS 1024 -typedef struct MidTCEntry { uint64_t tid; MidTC* tc; struct MidTCEntry* next; } MidTCEntry; -static pthread_mutex_t g_mid_tc_mu[MID_TC_BUCKETS]; -static MidTCEntry* g_mid_tc_head[MID_TC_BUCKETS]; -static __thread MidTC* t_mid_tc = NULL; -static int g_tc_enabled = 1; // env: HAKMEM_TC_ENABLE (default 1) -static int g_tc_drain_unbounded = 1; // env: HAKMEM_TC_UNBOUNDED (default 1) -static int g_tc_drain_max = 0; // env: HAKMEM_TC_DRAIN_MAX (0=unbounded) -static int g_tc_drain_trigger = 2; // env: HAKMEM_TC_DRAIN_TRIGGER (ring->top < trigger) +#include "box/pool_mf2_types.inc.h" -static inline uint32_t mid_tc_hash(uint64_t tid) { - tid ^= tid >> 33; tid *= 0xff51afd7ed558ccdULL; tid ^= tid >> 33; tid *= 0xc4ceb9fe1a85ec53ULL; tid ^= tid >> 33; - return (uint32_t)(tid & (MID_TC_BUCKETS - 1)); -} - -// Thread-safe initialization using pthread_once -static pthread_once_t mid_tc_init_once_control = PTHREAD_ONCE_INIT; -static void mid_tc_init_impl(void) { - for (int i = 0; i < MID_TC_BUCKETS; i++) { - pthread_mutex_init(&g_mid_tc_mu[i], NULL); - g_mid_tc_head[i] = NULL; - } -} -static void mid_tc_init_once(void) { - pthread_once(&mid_tc_init_once_control, mid_tc_init_impl); -} - -static MidTC* mid_tc_get(void) { - if (t_mid_tc) return t_mid_tc; - mid_tc_init_once(); - MidTC* tc = (MidTC*)hkm_libc_calloc(1, sizeof(MidTC)); // P0 Fix: Use libc malloc - if (!tc) return NULL; - uint64_t tid = (uint64_t)(uintptr_t)pthread_self(); - uint32_t h = mid_tc_hash(tid); - pthread_mutex_lock(&g_mid_tc_mu[h]); - MidTCEntry* e = (MidTCEntry*)hkm_libc_malloc(sizeof(MidTCEntry)); // P0 Fix: Use libc malloc - if (e) { e->tid = tid; e->tc = tc; e->next = g_mid_tc_head[h]; g_mid_tc_head[h] = e; } - pthread_mutex_unlock(&g_mid_tc_mu[h]); - t_mid_tc = tc; - return tc; -} - -static MidTC* mid_tc_lookup_by_tid(uint64_t tid) { - mid_tc_init_once(); - uint32_t h = 
mid_tc_hash(tid); - MidTCEntry* e = g_mid_tc_head[h]; - while (e) { if (e->tid == tid) return e->tc; e = e->next; } - return NULL; -} - -static inline void mid_tc_push(MidTC* tc, int class_idx, PoolBlock* b) { - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&tc->inbox[class_idx], memory_order_acquire); - b->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit(&tc->inbox[class_idx], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed)); -} - -static inline int mid_tc_drain_into_tls(int class_idx, PoolTLSRing* ring, PoolTLSBin* bin) { - MidTC* tc = mid_tc_get(); - if (!tc) return 0; - HKM_TIME_START(t_tc); - uintptr_t head = atomic_exchange_explicit(&tc->inbox[class_idx], (uintptr_t)0, memory_order_acq_rel); - if (!head) { HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc); return 0; } - int moved = 0; - int limit = (g_tc_drain_unbounded || g_tc_drain_max <= 0) ? INT32_MAX : g_tc_drain_max; - PoolBlock* cur = (PoolBlock*)head; - while (cur && moved < limit) { - PoolBlock* nxt = cur->next; - if (ring->top < POOL_L2_RING_CAP) { - ring->items[ring->top++] = cur; moved++; - } else { - cur->next = bin->lo_head; bin->lo_head = cur; bin->lo_count++; moved++; - } - cur = nxt; - } - while (cur) { PoolBlock* nxt = cur->next; mid_tc_push(tc, class_idx, cur); cur = nxt; } - HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc); - return moved; -} - -static inline int mid_tc_has_items(int class_idx) { - MidTC* tc = t_mid_tc; // do not allocate on peek - if (!tc) return 0; - return atomic_load_explicit(&tc->inbox[class_idx], memory_order_relaxed) != 0; -} - -// =========================================================================== -// MF2 Per-Page Sharding: Mimalloc-Inspired Architecture -// =========================================================================== -// -// Key idea: Each 64KB page has independent freelist (no sharing!) -// - O(1) page lookup from block address: (addr & ~0xFFFF) -// - Owner thread: fast path (no locks, no atomics) -// - Cross-thread free: lock-free remote stack -// - Expected: +50% (13.78 → 20.7 M/s, 60-75% of mimalloc) - -// MF2 Configuration Constants (Quick Win #5) -#define MF2_PENDING_QUEUE_BUDGET 4 // Max pages to drain from pending queue -#define MF2_DEBUG_SAMPLE_COUNT 20 // Number of debug samples to log -#define MF2_TSC_CYCLES_PER_US 3000 // Estimated TSC cycles per microsecond -#define MF2_PAGE_SIZE_SHIFT 16 // log2(64KB) for fast division -#define MF2_PAGE_ALIGNMENT 65536 // 64KB alignment for mmap - -// Debug Logging Macros (Quick Win #6) -// Conditional compilation for debug logs - set HAKMEM_DEBUG_MF2=1 to enable -#ifdef HAKMEM_DEBUG_MF2 - #define MF2_DEBUG_LOG(fmt, ...) fprintf(stderr, "[MF2] " fmt "\n", ##__VA_ARGS__) - #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) -#else - #define MF2_DEBUG_LOG(fmt, ...) ((void)0) - #define MF2_ERROR_LOG(fmt, ...) 
fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) -#endif - -// Forward declarations -static size_t g_class_sizes[POOL_NUM_CLASSES]; - -// MF2 Page descriptor: per-page metadata (one per 64KB page) -typedef struct MidPage { - // Page identity - void* base; // Page base address (64KB aligned) - uint8_t class_idx; // Size class index (0-6) - uint8_t flags; // Page flags (reserved for future use) - uint16_t _pad0; - - // Ownership - pthread_t owner_tid; // Owner thread ID (for fast-path check) - struct MF2_ThreadPages* owner_tp; // Owner thread's heap (for pending queue access) - uint64_t last_transfer_time; // rdtsc() timestamp of last ownership transfer (lease mechanism) - - // Page-local freelist (owner-only, NO LOCK!) - PoolBlock* freelist; // Local freelist head - uint16_t free_count; // Number of free blocks - uint16_t capacity; // Total blocks per page - - // Remote frees (cross-thread, lock-free MPSC stack) - atomic_uintptr_t remote_head; // Lock-free remote free stack - atomic_uint remote_count; // Remote free count (for quick check) - - // Lifecycle - atomic_int in_use; // Live allocations on this page - atomic_int pending_dn; // DONTNEED enqueued flag - - // Linkage (thread-local page lists) - struct MidPage* next_page; // Next page in thread's list - struct MidPage* prev_page; // Previous page in thread's list - - // Pending queue (remote drain notification) - _Atomic(_Bool) in_remote_pending; // Is this page in pending queue? - struct MidPage* next_pending; // Next page in pending queue - - // Padding to cache line boundary (avoid false sharing) - char _pad[64 - ((sizeof(void*) * 5 + sizeof(PoolBlock*) + sizeof(uint16_t) * 2 + - sizeof(atomic_uintptr_t) + sizeof(atomic_uint) + - sizeof(atomic_int) * 2 + sizeof(pthread_t) + - sizeof(_Atomic(_Bool)) + 4) % 64)]; -} MidPage; - -// Page registry: O(1) lookup from block address -// Use direct indexing: (addr >> 16) & MASK -#define MF2_PAGE_REGISTRY_BITS 16 // 64K entries (4GB address space with 64KB pages) -#define MF2_PAGE_REGISTRY_SIZE (1 << MF2_PAGE_REGISTRY_BITS) -#define MF2_PAGE_REGISTRY_MASK (MF2_PAGE_REGISTRY_SIZE - 1) - -typedef struct { - // Direct-mapped page table (no hash collisions!) 
- MidPage* pages[MF2_PAGE_REGISTRY_SIZE]; - - // Coarse-grained locks for rare updates (page alloc/free) - // 256 locks = 256-way parallelism for page registration - pthread_mutex_t locks[256]; - - // Statistics - atomic_uint_fast64_t total_pages; // Total pages allocated - atomic_uint_fast64_t active_pages; // Pages with live allocations -} MF2_PageRegistry; - -// Thread-local page lists (one list per size class) -typedef struct MF2_ThreadPages { - // Active pages (have free blocks) - MidPage* active_page[POOL_NUM_CLASSES]; - - // Partial pages (drained pages with free blocks, LIFO for cache locality) - // Checked before allocating new pages (fast reuse path) - MidPage* partial_pages[POOL_NUM_CLASSES]; - - // Full pages (no free blocks, but may receive remote frees) - // TODO: Gradually deprecate in favor of partial_pages - MidPage* full_pages[POOL_NUM_CLASSES]; - - // Pending queue (pages with remote frees, MPSC lock-free stack) - atomic_uintptr_t pages_remote_pending[POOL_NUM_CLASSES]; - - // Pending claim flags (prevent multi-consumer CAS thrashing) - // One adopter at a time per queue (test_and_set to claim, clear to release) - atomic_flag pending_claim[POOL_NUM_CLASSES]; - - // Page ownership count (for statistics) - uint32_t page_count[POOL_NUM_CLASSES]; - - // Thread identity (cached for fast comparison) - pthread_t my_tid; - - // Route P: Activity tracking for idle-based adoption - // Updated on every allocation (mf2_alloc_fast) - // Read by adopters to check if owner is idle - atomic_uint_fast64_t last_alloc_tsc; -} MF2_ThreadPages; - -// Global page registry (shared, rarely accessed) -static MF2_PageRegistry g_mf2_page_registry; - -// Thread-local page lists (hot path, no sharing!) -static __thread MF2_ThreadPages* t_mf2_pages = NULL; - -// =========================================================================== -// MF2 Global State (Quick Win #3b - Structured Globals) -// =========================================================================== -// Individual globals replaced with structured state below. -// Old declarations removed, replaced with macro-mapped struct instances. 
-// -// Benefits: -// - Logical grouping (config, registry, stats) -// - Better documentation -// - Easier to extend or refactor -// - Single source of truth for each category - -#define MF2_MAX_THREADS 256 - -// MF2 Configuration (environment variables) -typedef struct { - int enabled; // HAKMEM_MF2_ENABLE (0=disabled, 1=enabled) - int max_queues; // HAKMEM_MF2_MAX_QUEUES (default: 2) - int lease_ms; // HAKMEM_MF2_LEASE_MS (default: 10ms, 0=disabled) - int idle_threshold_us; // HAKMEM_MF2_IDLE_THRESHOLD_US (default: 150µs) -} MF2_Config; - -// MF2 Thread Registry (cross-thread coordination) -typedef struct { - MF2_ThreadPages* all_thread_pages[MF2_MAX_THREADS]; // Global registry - _Atomic int num_thread_pages; // Active thread count - _Atomic int adoptable_count[POOL_NUM_CLASSES]; // Non-empty pending queues - pthread_key_t tls_key; // Thread-local storage key - pthread_once_t key_once; // TLS initialization guard -} MF2_Registry; - -// MF2 Statistics (debug instrumentation) -typedef struct { - // Allocation path - atomic_uint_fast64_t alloc_fast_hit; - atomic_uint_fast64_t alloc_slow_hit; - atomic_uint_fast64_t page_reuse_count; - atomic_uint_fast64_t new_page_count; - - // Free path - atomic_uint_fast64_t free_owner_count; - atomic_uint_fast64_t free_remote_count; - - // Drain operations - atomic_uint_fast64_t drain_count; - atomic_uint_fast64_t drain_blocks; - atomic_uint_fast64_t drain_attempts; - atomic_uint_fast64_t drain_success; - atomic_uint_fast64_t slow_checked_drain; - atomic_uint_fast64_t slow_found_remote; - - // Full page scan (obsolete, kept for historical tracking) - atomic_uint_fast64_t full_scan_checked; - atomic_uint_fast64_t full_scan_found_remote; - atomic_uint_fast64_t eager_drain_scanned; - atomic_uint_fast64_t eager_drain_found; - - // Pending queue - atomic_uint_fast64_t pending_enqueued; - atomic_uint_fast64_t pending_drained; - atomic_uint_fast64_t pending_requeued; -} MF2_Stats; - -// Instantiate structured global state (Quick Win #3b) -static MF2_Config g_mf2_config = { - .enabled = 0, // Will be set by env var - .max_queues = 2, - .lease_ms = 10, - .idle_threshold_us = 150 -}; - -static MF2_Registry g_mf2_registry = { - .all_thread_pages = {0}, - .num_thread_pages = 0, - .adoptable_count = {0}, - .tls_key = 0, - .key_once = PTHREAD_ONCE_INIT -}; - -static MF2_Stats g_mf2_stats = { - // All fields initialized to 0 (atomic zero-initialization is valid) - .alloc_fast_hit = 0, - .alloc_slow_hit = 0, - .page_reuse_count = 0, - .new_page_count = 0, - .free_owner_count = 0, - .free_remote_count = 0, - .drain_count = 0, - .drain_blocks = 0, - .drain_attempts = 0, - .drain_success = 0, - .slow_checked_drain = 0, - .slow_found_remote = 0, - .full_scan_checked = 0, - .full_scan_found_remote = 0, - .eager_drain_scanned = 0, - .eager_drain_found = 0, - .pending_enqueued = 0, - .pending_drained = 0, - .pending_requeued = 0 -}; - -// Compatibility macros: Map old global names to struct fields -// This allows existing code to work unchanged while using structured state -#define g_mf2_enabled (g_mf2_config.enabled) -#define g_mf2_max_queues (g_mf2_config.max_queues) -#define g_mf2_lease_ms (g_mf2_config.lease_ms) -#define g_mf2_idle_threshold_us (g_mf2_config.idle_threshold_us) - -#define g_all_thread_pages (g_mf2_registry.all_thread_pages) -#define g_num_thread_pages (g_mf2_registry.num_thread_pages) -#define g_adoptable_count (g_mf2_registry.adoptable_count) -#define g_mf2_tls_key (g_mf2_registry.tls_key) -#define g_mf2_key_once (g_mf2_registry.key_once) - -#define 
g_mf2_alloc_fast_hit (g_mf2_stats.alloc_fast_hit) -#define g_mf2_alloc_slow_hit (g_mf2_stats.alloc_slow_hit) -#define g_mf2_page_reuse_count (g_mf2_stats.page_reuse_count) -#define g_mf2_new_page_count (g_mf2_stats.new_page_count) -#define g_mf2_free_owner_count (g_mf2_stats.free_owner_count) -#define g_mf2_free_remote_count (g_mf2_stats.free_remote_count) -#define g_mf2_drain_count (g_mf2_stats.drain_count) -#define g_mf2_drain_blocks (g_mf2_stats.drain_blocks) -#define g_mf2_drain_attempts (g_mf2_stats.drain_attempts) -#define g_mf2_drain_success (g_mf2_stats.drain_success) -#define g_mf2_slow_checked_drain (g_mf2_stats.slow_checked_drain) -#define g_mf2_slow_found_remote (g_mf2_stats.slow_found_remote) -#define g_mf2_full_scan_checked (g_mf2_stats.full_scan_checked) -#define g_mf2_full_scan_found_remote (g_mf2_stats.full_scan_found_remote) -#define g_mf2_eager_drain_scanned (g_mf2_stats.eager_drain_scanned) -#define g_mf2_eager_drain_found (g_mf2_stats.eager_drain_found) -#define g_mf2_pending_enqueued (g_mf2_stats.pending_enqueued) -#define g_mf2_pending_drained (g_mf2_stats.pending_drained) -#define g_mf2_pending_requeued (g_mf2_stats.pending_requeued) - -// =========================================================================== -// End of MF2 Data Structures -// =========================================================================== // --- MF2 Initialization Functions --- @@ -1018,293 +545,10 @@ static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) { // End of Pending Queue Operations // =========================================================================== -// Forward declarations -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id); +#include "box/pool_mf2_helpers.inc.h" -// =========================================================================== -// Helper Functions (Clean & Modular) -// =========================================================================== -// Helper: Make page active (move old active to full_pages) -static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return; - - // Move old active page to full_pages (if any) - if (tp->active_page[class_idx]) { - MidPage* old_active = tp->active_page[class_idx]; - old_active->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_active; - } - - // Set new page as active - tp->active_page[class_idx] = page; - page->next_page = NULL; -} - -// Helper: Drain page and add to partial list (LIFO for cache locality) -// Returns true if page has free blocks after drain -static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, add to partial list (LIFO) - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - page->next_page = tp->partial_pages[class_idx]; - tp->partial_pages[class_idx] = page; - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Drain page and activate if successful (Direct Handoff - backward compat) -// Returns true if page was activated -static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has 
freelist after drain, make it active immediately - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - mf2_make_page_active(tp, class_idx, page); - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1) -// Returns true if a page was successfully drained and activated -static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - // Budget: Process up to N pages to avoid blocking - for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) { - MidPage* pending_page = mf2_dequeue_pending(tp, class_idx); - if (!pending_page) break; // Queue empty - - atomic_fetch_add(&g_mf2_pending_drained, 1); - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release); - - // DIRECT HANDOFF: Drain and activate if successful - if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) { - return true; // Success! Page is now active - } - // No freelist after drain, page returned to full_pages by helper - } - return false; // No pages available for reuse -} - -// Helper: Try to drain remotes from active page (must-reuse gate part 2) -// Returns true if active page has freelist after drain -static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - MidPage* page = tp->active_page[class_idx]; - if (!page) return false; - - atomic_fetch_add(&g_mf2_slow_checked_drain, 1); - unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - - if (remote_cnt > 0) { - atomic_fetch_add(&g_mf2_slow_found_remote, 1); - int drained = mf2_drain_remote_frees(page); - if (drained > 0 && page->freelist) { - atomic_fetch_add(&g_mf2_drain_success, 1); - return true; // Success! 
Active page now has freelist - } - } - return false; // No remotes or drain failed -} - -// Helper: Allocate new page and make it active -// Returns the newly allocated page (or NULL on OOM) -static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - atomic_fetch_add(&g_mf2_new_page_count, 1); - - // DEBUG: Log why we're allocating new page (first N samples) - static _Atomic int new_page_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed); - if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) { - // Count adoptable pages across all threads - int total_adoptable = 0; - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed); - } - MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p", - sample_idx, class_idx, - (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed), - total_adoptable, - tp->active_page[class_idx], - tp->full_pages[class_idx]); - } - - MidPage* page = mf2_alloc_new_page(class_idx); - if (!page) { - return NULL; // OOM - } - - // Move current active page to full list (if any) - if (tp->active_page[class_idx]) { - MidPage* old_page = tp->active_page[class_idx]; - old_page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_page; - } - - // Set new page as active - tp->active_page[class_idx] = page; - tp->page_count[class_idx]++; - - return page; -} - -// =========================================================================== -// End of Helper Functions -// =========================================================================== - -// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue -// Returns true if a page was successfully adopted and activated -// Called from alloc_slow when allocating thread needs memory -static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) { - if (!me) return false; - - // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating) - // Avoids scanning empty queues (major performance win!) - int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed); - if (adoptable == 0) return false; // All queues empty, no scan needed - - // Get global thread registry - int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); - if (num_tp == 0) return false; - - // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES) - // Prevents excessive scanning overhead (2-8 threads is usually enough) - int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues; - - // Round-robin scan (limited number of threads, not ALL!) 
- static _Atomic uint64_t adopt_counter = 0; - uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed); - - for (int i = 0; i < scan_limit; i++) { - int tp_idx = (start_idx + i) % num_tp; - MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit( - (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire); - - if (!other_tp) continue; - - // Route P: Idle Detection - Only adopt from idle owners - // Check if owner is still actively allocating (threshold configurable via env var) - uint64_t now_tsc = mf2_rdtsc(); - uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed); - uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US; - - if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) { - continue; // Owner still active, skip adoption - } - - // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!) - // Only one thread scans each queue at a time → eliminates CAS contention - if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) { - continue; // Another thread is already scanning this queue, skip - } - - // Try to dequeue a pending page from this thread - MidPage* page = mf2_dequeue_pending(other_tp, class_idx); - if (!page) { - // Queue empty, release claim and try next thread - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; - } - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&page->in_remote_pending, false, memory_order_release); - - // Check lease: Has enough time passed since last transfer? (configurable via HAKMEM_MF2_LEASE_MS) - // 0ms = disabled (no lease check), >0 = lease period in milliseconds - uint64_t now = mf2_rdtsc(); - uint64_t last_transfer = page->last_transfer_time; - if (g_mf2_lease_ms > 0 && last_transfer != 0) { - // Calculate lease cycles from ms (approx 3GHz CPU) - uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL); - if ((now - last_transfer) < lease_cycles) { - // Lease still active, return page to full_pages (don't thrash ownership) - page->next_page = other_tp->full_pages[class_idx]; - other_tp->full_pages[class_idx] = page; - // Release claim before continuing - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; // Try next thread - } - } - - // Try to transfer ownership using CAS - pthread_t old_owner = page->owner_tid; - pthread_t new_owner = pthread_self(); - - // Note: pthread_t may not be atomic-compatible on all platforms - // For now, we'll use a simple write (ownership transfer is rare) - // TODO: If thrashing is observed, add atomic CAS with serialization - page->owner_tid = new_owner; - page->owner_tp = me; - page->last_transfer_time = now; - - // DEBUG: Log drain state - static _Atomic int adopt_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed); - unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed); - unsigned int pre_free = page->free_count; - PoolBlock* pre_freelist = page->freelist; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // DEBUG: Log result (first 10 samples) - if (sample_idx < 10) { - MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p", - sample_idx, class_idx, pre_remote, drained, - 
pre_free, page->free_count, pre_freelist, page->freelist); - } - - // Make adopted page ACTIVE immediately (not partial!) - // Adoption needs immediate activation for caller's mf2_alloc_fast() - // Partial list is only for own pending queue drains - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - atomic_fetch_add(&g_mf2_pending_drained, 1); - atomic_fetch_add(&g_mf2_drain_success, 1); - - // Make it active (move old active to full_pages) - mf2_make_page_active(me, class_idx, page); - - // Release claim before returning SUCCESS - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - return true; // SUCCESS! Page adopted and activated - } - - // No freelist after drain, return to MY full_pages (I'm the new owner!) - page->next_page = me->full_pages[class_idx]; - me->full_pages[class_idx] = page; - // Release claim before continuing search - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - // Continue searching for a better page - } - - return false; // No adoptable pages found -} +#include "box/pool_mf2_adoption.inc.h" // Fast allocation path (owner thread, NO LOCK!) static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) { @@ -1646,947 +890,18 @@ int hak_pool_get_shard_index(uintptr_t site_id) { return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1)); } -// Bitmap helpers (O(1) empty class detection) -static inline void set_nonempty_bit(int class_idx, int shard_idx) { - // Set bit: freelist[class][shard] is non-empty (atomic OR) - atomic_fetch_or(&g_pool.nonempty_mask[class_idx], (uint64_t)(1ULL << shard_idx)); -} +// TLS helpers +#include "box/pool_tls_core.inc.h" -static inline void clear_nonempty_bit(int class_idx, int shard_idx) { - // Clear bit: freelist[class][shard] is empty (atomic AND) - atomic_fetch_and(&g_pool.nonempty_mask[class_idx], ~(uint64_t)(1ULL << shard_idx)); -} -static inline int is_shard_nonempty(int class_idx, int shard_idx) { - // Check if shard has blocks (atomic load) - uint64_t mask = atomic_load(&g_pool.nonempty_mask[class_idx]); - return (mask & (1ULL << shard_idx)) != 0; -} +// Refill/ACE (boxed) +#include "box/pool_refill.inc.h" -// Drain remote-free MPSC stack into freelist under the shard lock -static inline void drain_remote_locked(int class_idx, int shard_idx) { - uintptr_t head = atomic_exchange_explicit(&g_pool.remote_head[class_idx][shard_idx], (uintptr_t)0, memory_order_acq_rel); - unsigned drained = 0; - while (head) { - PoolBlock* b = (PoolBlock*)head; - head = (uintptr_t)b->next; // next pointer stored in first word - b->next = g_pool.freelist[class_idx][shard_idx]; - g_pool.freelist[class_idx][shard_idx] = b; - drained++; - } - if (drained) { - atomic_fetch_sub_explicit(&g_pool.remote_count[class_idx][shard_idx], drained, memory_order_relaxed); - if (g_pool.freelist[class_idx][shard_idx]) set_nonempty_bit(class_idx, shard_idx); - } -} +// Init/Shutdown + MF2 debug (boxed) +#include "box/pool_init_api.inc.h" -// Choose a non-empty shard near preferred using the nonempty mask. If none, return preferred. 
-static inline int choose_nonempty_shard(int class_idx, int preferred) { - uint64_t mask = atomic_load_explicit(&g_pool.nonempty_mask[class_idx], memory_order_acquire); - if (!mask) return preferred; - // Rotate so preferred becomes bit0 - int shift = preferred & 63; - uint64_t rot = (mask >> shift) | (mask << (64 - shift)); - if (!rot) return preferred; - int off = __builtin_ctzll(rot); - return (preferred + off) & (POOL_NUM_SHARDS - 1); -} -// Allocate a private page for TLS active page and split into a local list -static int alloc_tls_page(int class_idx, PoolTLSPage* ap) { - size_t user_size = g_class_sizes[class_idx]; - size_t block_size = HEADER_SIZE + user_size; - int blocks_per_page = POOL_PAGE_SIZE / block_size; - if (blocks_per_page <= 0) return 0; - void* page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!page) return 0; - // Bump-run initialization (no per-block linking) - ap->page = page; - ap->bump = (char*)page; - ap->end = (char*)page + POOL_PAGE_SIZE; - ap->count = blocks_per_page; - // Register page with owner (this thread) for owner-fast free detection - mid_desc_register(page, class_idx, (uint64_t)(uintptr_t)pthread_self()); - g_pool.refills[class_idx]++; - g_pool.total_pages_allocated++; - g_pool.pages_by_class[class_idx]++; - g_pool.total_bytes_allocated += POOL_PAGE_SIZE; - return 1; -} +// Pool statistics (boxed) +#include "box/pool_stats.inc.h" -// Refill TLS ring/LIFO from active page without building links. Returns number added. -static inline int refill_tls_from_active_page(int class_idx, PoolTLSRing* ring, PoolTLSBin* bin, PoolTLSPage* ap, int need) { - if (!ap || !ap->page || ap->count <= 0 || ap->bump >= ap->end) return 0; - size_t blk = HEADER_SIZE + g_class_sizes[class_idx]; - int moved = 0; - int to_add = need; - while (to_add > 0 && ap->bump < ap->end && ap->count > 0) { - PoolBlock* b = (PoolBlock*)(void*)ap->bump; - if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { - ring->items[ring->top++] = b; - } else { - b->next = bin->lo_head; bin->lo_head = b; bin->lo_count++; - } - ap->bump += blk; - ap->count--; - moved++; - to_add--; - } - if (ap->bump >= ap->end || ap->count <= 0) { - ap->page = NULL; ap->bump = ap->end; ap->count = 0; - } - return moved; -} - - -// ACE: adjust bundle factor per class based on windowed hits/misses -static inline void pool_update_bundle_factor(int class_idx) { - // Compute deltas since last snapshot - uint64_t h = g_pool.hits[class_idx]; - uint64_t m = g_pool.misses[class_idx]; - uint64_t dh = h - g_pool.last_hits[class_idx]; - uint64_t dm = m - g_pool.last_misses[class_idx]; - uint64_t dt = dh + dm; - if (dt < 256) return; // wait for window to accumulate - - int bf = g_pool.bundle_factor[class_idx]; - if (bf <= 0) bf = 1; - - // Ifミス優勢(ヒット率<60% かつ ミスがヒット+一定閾値超)→増やす - if (dt > 0) { - double hit_rate = (double)dh / (double)dt; - if (hit_rate < 0.60 && dm > (dh + 16)) { - if (bf < 4) bf++; - } else if (hit_rate > 0.90 && dh > (dm + 32)) { - if (bf > 1) bf--; - } - } - - g_pool.bundle_factor[class_idx] = bf; - // Advance snapshot - g_pool.last_hits[class_idx] = h; - g_pool.last_misses[class_idx] = m; -} - -// Refill freelist by allocating a new page (64KiB) -// Args: class_idx - size class index (0-4) -// shard_idx - shard index (0-63) -// Returns: 1 on success, 0 on failure -// -// Each block now includes AllocHeader + user data -static int refill_freelist(int class_idx, int shard_idx) { - if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return 0; - if (shard_idx < 
0 || shard_idx >= POOL_NUM_SHARDS) return 0; - - size_t user_size = g_class_sizes[class_idx]; - size_t block_size = HEADER_SIZE + user_size; // Header + user data - - // Calculate blocks per page (with header overhead) - int blocks_per_page = POOL_PAGE_SIZE / block_size; - if (blocks_per_page == 0) return 0; // Safety: class too large for 64KiB page - - // Allocate page via mmap (page-granular, avoids malloc overhead) - void* page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!page) return 0; - - // Update bundle factor based on windowed stats - pool_update_bundle_factor(class_idx); - int bundles = g_pool.bundle_factor[class_idx]; - if (bundles < 1) bundles = 1; - if (bundles > 4) bundles = 4; - - // Soft CAP guidance: use FrozenPolicy mid_cap to modulate bundling - // Semantics: mid_cap[class] is a soft target (in pages). We do not trim yet. - // If at/over cap → restrict bundling to 1; if far under cap → allow bundling up to deficit (max 4). - const FrozenPolicy* pol = hkm_policy_get(); - if (pol) { - uint16_t cap = 0; - if (class_idx < 5) cap = pol->mid_cap[class_idx]; - else if (class_idx == 5 && pol->mid_dyn1_bytes != 0) cap = pol->mid_cap_dyn1; - else if (class_idx == 6 && pol->mid_dyn2_bytes != 0) cap = pol->mid_cap_dyn2; - if (cap > 0) { - uint64_t have = g_pool.pages_by_class[class_idx]; - if (have >= cap) { - bundles = 1; // over cap: refill minimally - } else { - uint64_t deficit = (cap - have); - if (deficit < (uint64_t)bundles) bundles = (int)deficit; // don't exceed deficit - if (bundles < 1) bundles = 1; - if (bundles > 4) bundles = 4; - // Ensure at least min bundle under deficit for faster warm-up - if (deficit >= (uint64_t)g_pool_min_bundle && bundles < g_pool_min_bundle) bundles = g_pool_min_bundle; - } - } - } - - int pages_allocated_this_call = 0; - for (int b = 0; b < bundles; b++) { - // Split page into blocks and link into freelist - PoolBlock* freelist_head = NULL; - - for (int i = 0; i < blocks_per_page; i++) { - void* raw_block = (char*)page + (i * block_size); - // Prefetch next block header to reduce cache miss on link - __builtin_prefetch((char*)raw_block + block_size, 1, 1); - // Freelist uses raw pointer (header start). Header will be - // constructed after pop in hak_pool_try_alloc. 
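// Editorial note: while a block sits on the freelist its first word (the header area) is reused as
// the `next` link, so no valid AllocHeader exists here; hak_pool_try_alloc() rebuilds the header with
// mid_set_header() immediately after popping a block.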
- PoolBlock* block = (PoolBlock*)raw_block; - block->next = freelist_head; - freelist_head = block; - } - - // Prepend to existing freelist (if any) - if (g_pool.freelist[class_idx][shard_idx]) { - // Find tail of new list - PoolBlock* tail = freelist_head; - while (tail->next) { - tail = tail->next; - } - tail->next = g_pool.freelist[class_idx][shard_idx]; - } - - g_pool.freelist[class_idx][shard_idx] = freelist_head; - // Register this 64KiB page (shared owner) - mid_desc_register(page, class_idx, 0); - - // Next page if bundling - if (b + 1 < bundles) { - page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!page) break; - } - pages_allocated_this_call++; - } - - // Set non-empty bit (freelist now has blocks) - set_nonempty_bit(class_idx, shard_idx); - - // Update statistics - g_pool.refills[class_idx]++; - g_pool.total_pages_allocated += pages_allocated_this_call; - g_pool.pages_by_class[class_idx] += pages_allocated_this_call; - g_pool.total_bytes_allocated += (uint64_t)pages_allocated_this_call * (uint64_t)POOL_PAGE_SIZE; - - return 1; -} - -// =========================================================================== -// Public API -// =========================================================================== - -// Thread-safe initialization using pthread_once -static pthread_once_t hak_pool_init_once_control = PTHREAD_ONCE_INIT; -static void hak_pool_init_impl(void) { - // NOTE: Do NOT use memset() here! It would clobber 448 mutexes during concurrent init. - // All fields are explicitly initialized below. - // Configure dynamic Mid classes from FrozenPolicy (index 5/6) - const FrozenPolicy* pol = hkm_policy_get(); - if (pol && pol->mid_dyn1_bytes >= POOL_MIN_SIZE && pol->mid_dyn1_bytes <= POOL_MAX_SIZE) { - g_class_sizes[5] = pol->mid_dyn1_bytes; - } else { - g_class_sizes[5] = 0; // disabled - } - if (pol && pol->mid_dyn2_bytes >= POOL_MIN_SIZE && pol->mid_dyn2_bytes <= POOL_MAX_SIZE) { - g_class_sizes[6] = pol->mid_dyn2_bytes; - } else { - g_class_sizes[6] = 0; - } - // Initialize all g_pool fields explicitly (no memset!) - for (int c = 0; c < POOL_NUM_CLASSES; c++) { - // Initialize freelists to NULL - for (int s = 0; s < POOL_NUM_SHARDS; s++) { - g_pool.freelist[c][s] = NULL; - } - - // Initialize atomic variables and locks - atomic_store(&g_pool.nonempty_mask[c], 0); - for (int s = 0; s < POOL_NUM_SHARDS; s++) { - pthread_mutex_init(&g_pool.freelist_locks[c][s].m, NULL); - atomic_store(&g_pool.remote_head[c][s], (uintptr_t)0); - atomic_store(&g_pool.remote_count[c][s], 0); - } - - // Initialize per-class statistics - g_pool.hits[c] = 0; - g_pool.misses[c] = 0; - g_pool.refills[c] = 0; - g_pool.frees[c] = 0; - g_pool.pages_by_class[c] = 0; - - // Initialize ACE variables - g_pool.bundle_factor[c] = 1; - g_pool.last_hits[c] = 0; - g_pool.last_misses[c] = 0; - } - - // Initialize global statistics - g_pool.total_bytes_allocated = 0; - g_pool.total_pages_allocated = 0; - - // Initialize atomic metrics - atomic_store(&g_pool.trylock_attempts, 0); - atomic_store(&g_pool.trylock_success, 0); - atomic_store(&g_pool.ring_underflow, 0); - const char* e_tls = getenv("HAKMEM_POOL_TLS_FREE"); - g_pool.tls_free_enabled = (e_tls == NULL) ? 1 : (atoi(e_tls) != 0); - const char* e_wrap = getenv("HAKMEM_WRAP_L2"); - g_wrap_l2_enabled = (e_wrap && atoi(e_wrap) != 0) ? 
1 : 0; - const char* e_minb = getenv("HAKMEM_POOL_MIN_BUNDLE"); - if (e_minb) { int v = atoi(e_minb); if (v >= 1 && v <= 8) g_pool_min_bundle = v; } - const char* e_mix = getenv("HAKMEM_SHARD_MIX"); - g_shard_mix_enabled = (e_mix && atoi(e_mix) != 0) ? 1 : 0; - const char* e_ring = getenv("HAKMEM_POOL_TLS_RING"); - if (e_ring) g_tls_ring_enabled = (atoi(e_ring) != 0); - const char* e_hdr = getenv("HAKMEM_HDR_LIGHT"); - if (e_hdr) g_hdr_light_enabled = atoi(e_hdr); // 0=full, 1=minimal, 2=skip header writes/validation - const char* e_probe = getenv("HAKMEM_TRYLOCK_PROBES"); - if (e_probe) { int v = atoi(e_probe); if (v>=1 && v<=8) g_trylock_probes = v; } - const char* e_div = getenv("HAKMEM_RING_RETURN_DIV"); - if (e_div) { int v = atoi(e_div); if (v>=2 && v<=4) g_ring_return_div = v; } - const char* e_lo = getenv("HAKMEM_TLS_LO_MAX"); - if (e_lo) { int v = atoi(e_lo); if (v>=32 && v<=16384) g_tls_lo_max = v; } - const char* e_cs = getenv("HAKMEM_POOL_COUNT_SAMPLE"); - if (e_cs) { int v = atoi(e_cs); if (v>=0 && v<=16) g_count_sample_exp = v; } - const char* e_tc = getenv("HAKMEM_TC_ENABLE"); - if (e_tc) g_tc_enabled = (atoi(e_tc) != 0); - const char* e_tcu = getenv("HAKMEM_TC_UNBOUNDED"); - if (e_tcu) g_tc_drain_unbounded = (atoi(e_tcu) != 0); - const char* e_tcm = getenv("HAKMEM_TC_DRAIN_MAX"); - if (e_tcm) { int v = atoi(e_tcm); if (v>=0 && v<=65536) g_tc_drain_max = v; } - const char* e_tct = getenv("HAKMEM_TC_DRAIN_TRIGGER"); - if (e_tct) { int v = atoi(e_tct); if (v>=0 && v<=POOL_L2_RING_CAP) g_tc_drain_trigger = v; } - - // MF2: Per-Page Sharding - const char* e_mf2 = getenv("HAKMEM_MF2_ENABLE"); - if (e_mf2 && atoi(e_mf2) != 0) { - g_mf2_enabled = 1; - mf2_page_registry_init(); - - // MF2 tuning parameters - const char* e_maxq = getenv("HAKMEM_MF2_MAX_QUEUES"); - if (e_maxq) { - int v = atoi(e_maxq); - if (v >= 1 && v <= 256) g_mf2_max_queues = v; - } - const char* e_lease = getenv("HAKMEM_MF2_LEASE_MS"); - if (e_lease) { - int v = atoi(e_lease); - if (v >= 0 && v <= 1000) g_mf2_lease_ms = v; // 0=disabled, max 1000ms - } - const char* e_idle = getenv("HAKMEM_MF2_IDLE_THRESHOLD_US"); - if (e_idle) { - int v = atoi(e_idle); - if (v >= 0 && v <= 10000) g_mf2_idle_threshold_us = v; // 0µs~10ms - } - - HAKMEM_LOG("[Pool] MF2 Per-Page Sharding enabled\n"); - HAKMEM_LOG("[MF2] max_queues=%d, lease_ms=%d, idle_threshold_us=%d\n", - g_mf2_max_queues, g_mf2_lease_ms, g_mf2_idle_threshold_us); - } - - g_pool.initialized = 1; - - HAKMEM_LOG("[Pool] Initialized (L2 Hybrid Pool)\n"); - if (g_class_sizes[5] != 0 || g_class_sizes[6] != 0) { - HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB%s%s%s\n", - g_class_sizes[5] ? ", dyn1=" : "", - g_class_sizes[5] ? (const char*)"" : (g_class_sizes[6]?",":""), - (g_class_sizes[5]||g_class_sizes[6]) ? 
"" : ""); - } else { - HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB\n"); - } - HAKMEM_LOG("[Pool] Page size: %d KB\n", POOL_PAGE_SIZE / 1024); - HAKMEM_LOG("[Pool] Shards: %d (site-based)\n", POOL_NUM_SHARDS); -} - -void hak_pool_init(void) { - pthread_once(&hak_pool_init_once_control, hak_pool_init_impl); -} - -static void mf2_print_debug_stats(void) { - if (!g_mf2_enabled) return; - - fprintf(stderr, "\n[MF2 DEBUG STATS]\n"); - fprintf(stderr, "Alloc fast hits: %12lu\n", (unsigned long)atomic_load(&g_mf2_alloc_fast_hit)); - fprintf(stderr, "Alloc slow hits: %12lu\n", (unsigned long)atomic_load(&g_mf2_alloc_slow_hit)); - fprintf(stderr, "Page reuses: %12lu\n", (unsigned long)atomic_load(&g_mf2_page_reuse_count)); - fprintf(stderr, "New pages: %12lu\n", (unsigned long)atomic_load(&g_mf2_new_page_count)); - fprintf(stderr, "Owner frees: %12lu\n", (unsigned long)atomic_load(&g_mf2_free_owner_count)); - fprintf(stderr, "Remote frees: %12lu\n", (unsigned long)atomic_load(&g_mf2_free_remote_count)); - fprintf(stderr, "Slow checked: %12lu\n", (unsigned long)atomic_load(&g_mf2_slow_checked_drain)); - fprintf(stderr, "Slow found rem: %12lu\n", (unsigned long)atomic_load(&g_mf2_slow_found_remote)); - fprintf(stderr, "Full scan chk: %12lu\n", (unsigned long)atomic_load(&g_mf2_full_scan_checked)); - fprintf(stderr, "Full scan rem: %12lu\n", (unsigned long)atomic_load(&g_mf2_full_scan_found_remote)); - fprintf(stderr, "Eager scan: %12lu\n", (unsigned long)atomic_load(&g_mf2_eager_drain_scanned)); - fprintf(stderr, "Eager found: %12lu\n", (unsigned long)atomic_load(&g_mf2_eager_drain_found)); - fprintf(stderr, "Drain attempts: %12lu\n", (unsigned long)atomic_load(&g_mf2_drain_attempts)); - fprintf(stderr, "Drain successes: %12lu\n", (unsigned long)atomic_load(&g_mf2_drain_success)); - fprintf(stderr, "Remote drains: %12lu (blocks: %lu)\n", - (unsigned long)atomic_load(&g_mf2_drain_count), - (unsigned long)atomic_load(&g_mf2_drain_blocks)); - - // Pending queue statistics - fprintf(stderr, "\n[PENDING QUEUE]\n"); - fprintf(stderr, "Pending enqueued: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_enqueued)); - fprintf(stderr, "Pending drained: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_drained)); - fprintf(stderr, "Pending requeued: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_requeued)); - - // Calculate ratios - uint64_t total_allocs = atomic_load(&g_mf2_alloc_fast_hit) + atomic_load(&g_mf2_alloc_slow_hit); - uint64_t total_frees = atomic_load(&g_mf2_free_owner_count) + atomic_load(&g_mf2_free_remote_count); - if (total_allocs > 0) { - fprintf(stderr, "\nFast path hit rate: %.2f%%\n", - 100.0 * atomic_load(&g_mf2_alloc_fast_hit) / total_allocs); - } - if (total_frees > 0) { - fprintf(stderr, "Owner free rate: %.2f%%\n", - 100.0 * atomic_load(&g_mf2_free_owner_count) / total_frees); - } - fflush(stderr); -} - -__attribute__((destructor)) -static void mf2_destructor(void) { - mf2_print_debug_stats(); -} - -void hak_pool_shutdown(void) { - if (!g_pool.initialized) return; - - hak_pool_print_stats(); - mf2_print_debug_stats(); - - // Free all pages (walk freelists and free page heads) - // MVP: Skip for now (pages allocated via malloc, will be freed by system) - // Future: Track page allocations and munmap explicitly - - g_pool.initialized = 0; -} - -void* hak_pool_try_alloc(size_t size, uintptr_t site_id) { - hak_pool_init(); // pthread_once() ensures thread-safe init (no data race!) 
- // P1.7 approach: Avoid using pool during ALL wrapper calls (conservative but safe) - extern int hak_in_wrapper(void); - if (hak_in_wrapper() && !g_wrap_l2_enabled) return NULL; - if (!hak_pool_is_poolable(size)) return NULL; - - // Get class and shard indices - int class_idx = hak_pool_get_class_index(size); - if (class_idx < 0) return NULL; - - // MF2: Per-Page Sharding path - if (g_mf2_enabled) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // OLD PATH: TLS fast path (ring then local LIFO); drain TC only when needed - PoolTLSRing* ring = &g_tls_bin[class_idx].ring; - if (g_tc_enabled && ring->top < g_tc_drain_trigger && mid_tc_has_items(class_idx)) { - HKM_TIME_START(t_tc_drain); - if (mid_tc_drain_into_tls(class_idx, ring, &g_tls_bin[class_idx])) { - HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc_drain); - if (ring->top > 0) { - HKM_TIME_START(t_ring_pop0); - PoolBlock* tlsb = ring->items[--ring->top]; - HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop0); - void* raw = (void*)tlsb; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<top == 0) { - atomic_fetch_add_explicit(&g_pool.ring_underflow, 1, memory_order_relaxed); - } - if (ring->top > 0) { - HKM_TIME_START(t_ring_pop1); - PoolBlock* tlsb = ring->items[--ring->top]; - HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop1); - void* raw = (void*)tlsb; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<next; - if (g_tls_bin[class_idx].lo_count) g_tls_bin[class_idx].lo_count--; - HKM_TIME_END(HKM_CAT_POOL_TLS_LIFO_POP, t_lifo_pop0); - void* raw = (void*)b; AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<top; if (to_ring < 0) to_ring = 0; - while (head && to_ring-- > 0) { PoolBlock* nxt = head->next; ring->items[ring->top++] = head; head = nxt; } - while (head) { PoolBlock* nxt = head->next; head->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = head; g_tls_bin[class_idx].lo_count++; head = nxt; } - g_pool.freelist[class_idx][s] = head; - if (!head) clear_nonempty_bit(class_idx, s); - pthread_mutex_unlock(l); - if (ring->top > 0) { - PoolBlock* tlsb = ring->items[--ring->top]; - void* raw = (void*)tlsb; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u< 0 && g_tls_active_page_a[class_idx].bump < g_tls_active_page_a[class_idx].end) ap = &g_tls_active_page_a[class_idx]; - else if (g_tls_active_page_b[class_idx].page && g_tls_active_page_b[class_idx].count > 0 && g_tls_active_page_b[class_idx].bump < g_tls_active_page_b[class_idx].end) ap = &g_tls_active_page_b[class_idx]; - else if (g_tls_active_page_c[class_idx].page && g_tls_active_page_c[class_idx].count > 0 && g_tls_active_page_c[class_idx].bump < g_tls_active_page_c[class_idx].end) ap = &g_tls_active_page_c[class_idx]; // QW2-adjusted - if (ap) { - // Opportunistically fill TLS ring from active page as well - if 
(g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { - int need = POOL_L2_RING_CAP - ring->top; - (void)refill_tls_from_active_page(class_idx, ring, &g_tls_bin[class_idx], ap, need); - } - PoolBlock* b = NULL; - if (ring->top > 0) { b = ring->items[--ring->top]; } - else if (ap->page && ap->count > 0 && ap->bump < ap->end) { - b = (PoolBlock*)(void*)ap->bump; ap->bump += (HEADER_SIZE + g_class_sizes[class_idx]); ap->count--; if (ap->bump >= ap->end || ap->count<=0){ ap->page=NULL; ap->count=0; } - } - if (b) { - void* raw = (void*)b; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - g_pool.hits[class_idx]++; - return (char*)raw + HEADER_SIZE; - } - } - - // Lock the shard freelist for this (class, shard) - pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx].m; - HKM_TIME_START(t_lock); - struct timespec ts_lk1; int lk1 = hkm_prof_begin(&ts_lk1); - (void)ts_lk1; (void)lk1; // Unused profiling variables - pthread_mutex_lock(lock); - HKM_TIME_END(HKM_CAT_POOL_LOCK, t_lock); - hkm_prof_end(lk1, HKP_POOL_LOCK, &ts_lk1); - - // Try to pop from freelist - PoolBlock* block = g_pool.freelist[class_idx][shard_idx]; - - if (!block) { - // Before refilling, try draining remote stack and simple shard steal - int stole = 0; - const FrozenPolicy* pol = hkm_policy_get(); - if (pol) { - uint16_t cap = 0; - if (class_idx < 5) cap = pol->mid_cap[class_idx]; - else if (class_idx == 5 && pol->mid_dyn1_bytes != 0) cap = pol->mid_cap_dyn1; - else if (class_idx == 6 && pol->mid_dyn2_bytes != 0) cap = pol->mid_cap_dyn2; - // Drain remote stack regardless of cap (cheap and helps reuse) - if (atomic_load_explicit(&g_pool.remote_count[class_idx][shard_idx], memory_order_relaxed) != 0) { - drain_remote_locked(class_idx, shard_idx); - block = g_pool.freelist[class_idx][shard_idx]; - } - if (!block && cap > 0 && g_pool.pages_by_class[class_idx] >= cap) { - HKM_TIME_START(t_steal); - for (int d = 1; d <= 4 && !stole; d++) { - int s1 = (shard_idx + d) & (POOL_NUM_SHARDS - 1); - int s2 = (shard_idx - d) & (POOL_NUM_SHARDS - 1); - if (is_shard_nonempty(class_idx, s1)) { - pthread_mutex_t* l2 = &g_pool.freelist_locks[class_idx][s1].m; - pthread_mutex_lock(l2); - PoolBlock* b2 = g_pool.freelist[class_idx][s1]; - if (b2) { - g_pool.freelist[class_idx][s1] = b2->next; - if (!g_pool.freelist[class_idx][s1]) clear_nonempty_bit(class_idx, s1); - block = b2; stole = 1; - } - pthread_mutex_unlock(l2); - } - if (!stole && is_shard_nonempty(class_idx, s2)) { - pthread_mutex_t* l3 = &g_pool.freelist_locks[class_idx][s2].m; - pthread_mutex_lock(l3); - PoolBlock* b3 = g_pool.freelist[class_idx][s2]; - if (b3) { - g_pool.freelist[class_idx][s2] = b3->next; - if (!g_pool.freelist[class_idx][s2]) clear_nonempty_bit(class_idx, s2); - block = b3; stole = 1; - } - pthread_mutex_unlock(l3); - } - } - HKM_TIME_END(HKM_CAT_SHARD_STEAL, t_steal); - } - } - - if (!stole && !block) { - // Freelist empty, refill page - { - // choose empty TLS slot for new page (check all 3 slots) // QW2-adjusted - PoolTLSPage* tap = NULL; - if (g_tls_active_page_a[class_idx].page == NULL || g_tls_active_page_a[class_idx].count == 0) tap = &g_tls_active_page_a[class_idx]; - else if (g_tls_active_page_b[class_idx].page == NULL || g_tls_active_page_b[class_idx].count == 0) tap = &g_tls_active_page_b[class_idx]; - else if (g_tls_active_page_c[class_idx].page == NULL || g_tls_active_page_c[class_idx].count == 0) tap = &g_tls_active_page_c[class_idx]; // QW2-adjusted - else tap 
= &g_tls_active_page_a[class_idx]; // fallback overwrite oldest if all 3 busy - HKM_TIME_START(t_alloc_page); - if (alloc_tls_page(class_idx, tap)) { - HKM_TIME_END(HKM_CAT_POOL_ALLOC_TLS_PAGE, t_alloc_page); - pthread_mutex_unlock(lock); - // rebind to the page we just allocated and top-up ring from bump-run - ap = tap; - if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { - int need = POOL_L2_RING_CAP - ring->top; - (void)refill_tls_from_active_page(class_idx, ring, &g_tls_bin[class_idx], ap, need); - } - PoolBlock* takeb = NULL; - if (ring->top > 0) { HKM_TIME_START(t_ring_pop2); takeb = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop2);} - else if (ap->page && ap->count > 0 && ap->bump < ap->end) { takeb = (PoolBlock*)(void*)ap->bump; ap->bump += (HEADER_SIZE + g_class_sizes[class_idx]); ap->count--; if (ap->bump >= ap->end || ap->count==0){ ap->page=NULL; ap->count=0; } } - void* raw2 = (void*)takeb; - AllocHeader* hdr2 = (AllocHeader*)raw2; - mid_set_header(hdr2, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw2); - g_pool.hits[class_idx]++; - return (char*)raw2 + HEADER_SIZE; - } - HKM_TIME_START(t_refill); - struct timespec ts_rf; int rf = hkm_prof_begin(&ts_rf); - int ok = refill_freelist(class_idx, shard_idx); - HKM_TIME_END(HKM_CAT_POOL_REFILL, t_refill); - hkm_prof_end(rf, HKP_POOL_REFILL, &ts_rf); - if (!ok) { - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<next; - // Adopt shared page to this thread (first touch) to improve TC routing - mid_desc_adopt(block, class_idx, (uint64_t)(uintptr_t)pthread_self()); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<top < POOL_L2_RING_CAP) { HKM_CNT(HKM_CAT_TLS_FAST); ring->items[ring->top++] = block; HKM_TIME_START(t_ring_pop4); take = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop4); } - else { block->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = block; g_tls_bin[class_idx].lo_count++; - if (g_tls_ring_enabled && ring->top > 0) { HKM_CNT(HKM_CAT_TLS_FAST); HKM_TIME_START(t_ring_pop5); take = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop5); } - else { HKM_TIME_START(t_lifo_pop2); take = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = take->next; if (g_tls_bin[class_idx].lo_count) g_tls_bin[class_idx].lo_count--; HKM_TIME_END(HKM_CAT_POOL_TLS_LIFO_POP, t_lifo_pop2); } } - - // Construct header fields now (freelist used header area for links) - void* raw = (void*)take; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - - // Calculate user pointer (skip header) - void* user_ptr = (char*)raw + HEADER_SIZE; - - // ゼロ化禁止(calloc以外) - // デバッグモードのみパターン埋め - #ifdef HAKMEM_DEBUG_SANITIZE - memset(user_ptr, 0xA5, g_class_sizes[class_idx]); // パターン埋め - #endif - // 本番: ゼロ化なし(15-25% 高速化) - - return user_ptr; -} - -void hak_pool_free(void* ptr, size_t size, uintptr_t site_id) { - if (!ptr) return; - hak_pool_init(); // pthread_once() ensures thread-safe init (no data race!) 
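// Editorial sketch (not part of this patch): several statistics-sampling expressions in
// hak_pool_try_alloc() above (and in hak_pool_free() below) are truncated in this diff (the text
// after "(1u<" was lost). The visible parts follow the classic xorshift32 update (shifts 13/17/5)
// on the thread-local t_pool_rng, gated by a power-of-two mask. The sketch assumes the exponent is
// g_count_sample_exp (HAKMEM_POOL_COUNT_SAMPLE, 0..16, read in hak_pool_init_impl); the helper name
// and the gated statement are illustrative only.
#include <stdint.h>
static __thread uint32_t t_pool_rng_sketch = 0x9E3779B9u;  // xorshift32 needs a non-zero seed
static inline int pool_stat_sample(int sample_exp) {
    t_pool_rng_sketch ^= t_pool_rng_sketch << 13;
    t_pool_rng_sketch ^= t_pool_rng_sketch >> 17;
    t_pool_rng_sketch ^= t_pool_rng_sketch << 5;
    // sample_exp == 0 -> mask 0 -> count every event; otherwise roughly 1 in 2^sample_exp events.
    return (t_pool_rng_sketch & ((1u << sample_exp) - 1u)) == 0;
}
// Illustrative use: if (pool_stat_sample(g_count_sample_exp)) g_pool.hits[class_idx]++;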
- if (!hak_pool_is_poolable(size)) return; - - // MF2: Per-Page Sharding path - if (g_mf2_enabled) { - mf2_free(ptr); - return; - } - - // OLD PATH: ptr is user pointer, get raw pointer (header start) - void* raw = (char*)ptr - HEADER_SIZE; - - // Validate header unless we can prove Mid ownership via page descriptor. - AllocHeader* hdr = (AllocHeader*)raw; - int mid_by_desc = 0; - MidPageDesc* d_desc = mid_desc_lookup(ptr); - if (d_desc) mid_by_desc = 1; - if (!mid_by_desc && g_hdr_light_enabled < 2) { - if (hdr->magic != HAKMEM_MAGIC) { - MF2_ERROR_LOG("Invalid magic 0x%X in pool_free, expected 0x%X", - hdr->magic, HAKMEM_MAGIC); - return; // Skip free (corruption detected) - } - if (hdr->method != ALLOC_METHOD_POOL) { - MF2_ERROR_LOG("Wrong method %d in pool_free, expected POOL (%d)", - hdr->method, ALLOC_METHOD_POOL); - return; // Skip free (not a pool allocation) - } - } - - // Get class and shard indices - int class_idx = mid_by_desc ? (int)d_desc->class_idx : hak_pool_get_class_index(size); - if (class_idx < 0) return; - - int shard_idx = hak_pool_get_shard_index(site_id); - (void)shard_idx; // Unused in MF2 path - - PoolBlock* block = (PoolBlock*)raw; - if (g_pool.tls_free_enabled) { - // Same-thread fast path: prefer TLS caches. If header lacks owner (light), - // consult page descriptor for TLS-owned pages; otherwise fall back to remote. - int same_thread = 0; - if (g_hdr_light_enabled >= 1) { - MidPageDesc* d = mid_desc_lookup(raw); - if (d && d->owner_tid != 0 && d->owner_tid == (uint64_t)(uintptr_t)pthread_self()) { - same_thread = 1; - } - } else if (hdr->owner_tid != 0 && hdr->owner_tid == (uintptr_t)(uintptr_t)pthread_self()) { - same_thread = 1; - } - if (same_thread) { - PoolTLSRing* ring = &g_tls_bin[class_idx].ring; - if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { - ring->items[ring->top++] = block; - } else { - // Push to TLS local LIFO; only溢れたときにremoteへ少量spill - block->next = g_tls_bin[class_idx].lo_head; - g_tls_bin[class_idx].lo_head = block; - g_tls_bin[class_idx].lo_count++; - if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) { - size_t spill = g_tls_bin[class_idx].lo_count / 2; - int shard = hak_pool_get_shard_index(site_id); - while (spill-- && g_tls_bin[class_idx].lo_head) { - PoolBlock* b = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = b->next; g_tls_bin[class_idx].lo_count--; - HKM_TIME_START(t_remote_push1); - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire); - b->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed)); - atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed); - HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1); - } - set_nonempty_bit(class_idx, shard); - } - } - - } else { - // Cross-thread: remote push to target shard - if (g_tc_enabled) { - uint64_t owner_tid = 0; - if (g_hdr_light_enabled < 2) owner_tid = hdr->owner_tid; - if (owner_tid == 0) { - MidPageDesc* d = mid_desc_lookup(raw); - if (d) owner_tid = d->owner_tid; - } - if (owner_tid != 0) { - MidTC* otc = mid_tc_lookup_by_tid(owner_tid); - if (otc) { mid_tc_push(otc, class_idx, block); return; } - } - } - int shard = hak_pool_get_shard_index(site_id); - uintptr_t old_head; - HKM_TIME_START(t_remote_push2); - do { - old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire); - 
block->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed)); - atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed); - HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push2); - set_nonempty_bit(class_idx, shard); - } - } else { - // Return to global freelist (A/B testing path) - int shard_idx = hak_pool_get_shard_index(site_id); - pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx].m; - pthread_mutex_lock(lock); - block->next = g_pool.freelist[class_idx][shard_idx]; - g_pool.freelist[class_idx][shard_idx] = block; - set_nonempty_bit(class_idx, shard_idx); - pthread_mutex_unlock(lock); - } - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u< 0) { - double hit_rate = (double)g_pool.hits[i] / (g_pool.hits[i] + g_pool.misses[i]) * 100.0; - printf(" Hit rate: %.1f%%\n", hit_rate); - } - } - - printf("\n----------------------------------------\n"); - printf("Summary:\n"); - printf(" Total hits: %lu\n", (unsigned long)total_hits); - printf(" Total misses: %lu\n", (unsigned long)total_misses); - printf(" Total refills: %lu\n", (unsigned long)total_refills); - printf(" Total frees: %lu\n", (unsigned long)total_frees); - printf(" Pages allocated: %lu\n", (unsigned long)g_pool.total_pages_allocated); - printf(" Bytes allocated: %lu KB\n", (unsigned long)(g_pool.total_bytes_allocated / 1024)); - - if (total_hits + total_misses > 0) { - double hit_rate = (double)total_hits / (total_hits + total_misses) * 100.0; - printf(" Overall hit rate: %.1f%%\n", hit_rate); - } - - printf("========================================\n"); -} - -void hak_pool_stats_snapshot(uint64_t hits[], uint64_t misses[], uint64_t refills[], uint64_t frees[]) { - if (!g_pool.initialized) { - // Zero out if not initialized - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - if (hits) hits[i] = 0; - if (misses) misses[i] = 0; - if (refills) refills[i] = 0; - if (frees) frees[i] = 0; - } - return; - } - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - if (hits) hits[i] = g_pool.hits[i]; - if (misses) misses[i] = g_pool.misses[i]; - if (refills) refills[i] = g_pool.refills[i]; - if (frees) frees[i] = g_pool.frees[i]; - } -} - -void hak_pool_extra_metrics_snapshot(uint64_t* trylock_attempts, uint64_t* trylock_success, uint64_t* ring_underflow) { - if (trylock_attempts) { - *trylock_attempts = atomic_load_explicit(&g_pool.trylock_attempts, memory_order_relaxed); - } - if (trylock_success) { - *trylock_success = atomic_load_explicit(&g_pool.trylock_success, memory_order_relaxed); - } - if (ring_underflow) { - *ring_underflow = atomic_load_explicit(&g_pool.ring_underflow, memory_order_relaxed); - } -} -int hak_pool_mid_lookup(void* ptr, size_t* out_size) { - // CRITICAL FIX: Check MF2 registry first if MF2 is enabled - if (g_mf2_enabled) { - MidPage* page = mf2_addr_to_page(ptr); - if (page) { - int c = (int)page->class_idx; - if (c < 0 || c >= POOL_NUM_CLASSES) return 0; - size_t sz = g_class_sizes[c]; - if (sz == 0) return 0; - if (out_size) *out_size = sz; - return 1; - } - // Not an MF2 page - fall through to old lookup - } - - // OLD PATH: Use mid_desc lookup - MidPageDesc* d = mid_desc_lookup(ptr); - if (!d) return 0; - int c = (int)d->class_idx; - if (c < 0 || c >= POOL_NUM_CLASSES) return 0; - size_t sz = g_class_sizes[c]; - if (sz == 0) return 0; - if (out_size) *out_size = sz; 
- return 1; -} - -void hak_pool_free_fast(void* ptr, uintptr_t site_id) { - if (!ptr || !g_pool.initialized) return; - - // CRITICAL FIX: If MF2 is enabled, mid_desc_lookup will FAIL because MF2 pages - // are registered in g_mf2_page_registry, not mid_desc! Route directly to MF2. - if (g_mf2_enabled) { - // Check if this is an MF2 page by looking it up in the MF2 registry - MidPage* page = mf2_addr_to_page(ptr); - - if (page) { - // MF2 page found - free through MF2 path - mf2_free(ptr); - return; - } - // Not an MF2 page - might be from old allocator or another tier - // Fall through to old path (though this shouldn't happen if MF2 is exclusive) - } - - // OLD PATH: Use mid_desc lookup - MidPageDesc* d = mid_desc_lookup(ptr); - if (!d) return; - size_t sz = g_class_sizes[(int)d->class_idx]; - if (sz == 0) return; - hak_pool_free(ptr, sz, site_id); -} +// Public API (boxed): alloc/free/lookup/free_fast +#include "box/pool_api.inc.h" diff --git a/core/hakmem_tiny.c b/core/hakmem_tiny.c index 0941af62..1b327046 100644 --- a/core/hakmem_tiny.c +++ b/core/hakmem_tiny.c @@ -9,29 +9,19 @@ #include "hakmem_tiny_batch_refill.h" // Phase 1: Batch refill/spill for mini-magazine #include "hakmem_tiny_stats.h" // Phase 1: Batched statistics (replaces XOR RNG) // Phase 2B modules -#include "hakmem_tiny_stats_api.h" // Phase 2B: Stats API -#include "hakmem_tiny_query_api.h" // Phase 2B-1: Query API -#include "hakmem_tiny_rss_api.h" // Phase 2B-2: RSS Utils -#include "hakmem_tiny_registry_api.h" // Phase 2B-3: Registry +#include "tiny_api.h" // Consolidated: stats_api, query_api, rss_api, registry_api #include "tiny_tls.h" #include "tiny_debug.h" #include "tiny_mmap_gate.h" #include "tiny_debug_ring.h" +#include "tiny_route.h" #include "tiny_tls_guard.h" +#include "tiny_ready.h" #include "hakmem_tiny_tls_list.h" #include "hakmem_tiny_remote_target.h" // Phase 2C-1: Remote target queue #include "hakmem_tiny_bg_spill.h" // Phase 2C-2: Background spill queue // NOTE: hakmem_tiny_tls_ops.h included later (after type definitions) -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "tiny_system.h" // Consolidated: stdio, stdlib, string, etc. 
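// Editorial sketch (not part of this patch): the cross-thread path in hak_pool_free() above pushes
// blocks onto g_pool.remote_head[class][shard] with a CAS loop, and drain_remote_locked() later
// detaches the whole stack with a single atomic_exchange and splices it into the shard freelist.
// A minimal, self-contained sketch of that MPSC (multi-producer/single-consumer) stack pattern;
// the node type and function names here are illustrative, not the pool's actual types.
#include <stdatomic.h>
#include <stdint.h>
typedef struct SketchNode { struct SketchNode* next; } SketchNode;
static _Atomic(uintptr_t) g_sketch_head = 0;

static void sketch_remote_push(SketchNode* n) {            // any thread (producer)
    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&g_sketch_head, memory_order_acquire);
        n->next = (SketchNode*)old_head;                    // link to the observed head
    } while (!atomic_compare_exchange_weak_explicit(&g_sketch_head, &old_head, (uintptr_t)n,
                                                    memory_order_release, memory_order_relaxed));
}

static SketchNode* sketch_remote_drain(void) {              // owner/consumer, e.g. under the shard lock
    // One exchange detaches the entire stack; the caller then walks it without further atomics.
    return (SketchNode*)atomic_exchange_explicit(&g_sketch_head, (uintptr_t)0, memory_order_acq_rel);
}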
#include "hakmem_prof.h" #include "hakmem_trace.h" // Optional USDT (perf) tracepoints @@ -123,6 +113,7 @@ static __thread unsigned char g_tls_bench_warm_done[4]; // Return helper: record tiny alloc stat (guarded) then return pointer static inline void tiny_debug_track_alloc_ret(int cls, void* ptr); +// Inject route commit into return helper so any successful allocation commits a fingerprint #ifdef HAKMEM_ENABLE_STATS // Optional: sampling(ビルド時に有効化)。ホットパスは直接インライン呼び出し(間接分岐なし)。 #ifdef HAKMEM_TINY_STAT_SAMPLING @@ -136,9 +127,9 @@ static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { #else static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { stats_record_alloc(cls); } #endif -#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); hkm_stat_alloc((cls)); return (ptr); } while(0) +#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); hkm_stat_alloc((cls)); ROUTE_COMMIT((cls), 0x7F); return (ptr); } while(0) #else -#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); return (ptr); } while(0) +#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); ROUTE_COMMIT((cls), 0x7F); return (ptr); } while(0) #endif // Free-side stats: compile-time zero when stats disabled @@ -205,6 +196,61 @@ void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class static void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class_idx); #endif +// --------------------------------------------------------------------------- +// Box: adopt_gate_try (implementation moved from header for robust linkage) +// --------------------------------------------------------------------------- +#include "box/adopt_gate_box.h" +extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; +extern int g_super_reg_class_size[TINY_NUM_CLASSES]; +extern unsigned long long g_adopt_gate_calls[]; +extern unsigned long long g_adopt_gate_success[]; +extern unsigned long long g_reg_scan_attempts[]; +extern unsigned long long g_reg_scan_hits[]; +SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) { + g_adopt_gate_calls[class_idx]++; + ROUTE_MARK(13); + SuperSlab* ss = tiny_refill_try_fast(class_idx, tls); + if (ss) { g_adopt_gate_success[class_idx]++; return ss; } + g_reg_scan_attempts[class_idx]++; + int reg_size = g_super_reg_class_size[class_idx]; + int scan_limit = tiny_reg_scan_max(); + if (scan_limit > reg_size) scan_limit = reg_size; + uint32_t self_tid = tiny_self_u32(); + for (int i = 0; i < scan_limit; i++) { + SuperSlab* cand = g_super_reg_by_class[class_idx][i]; + if (!(cand && cand->magic == SUPERSLAB_MAGIC)) continue; + // Fast path: use nonempty_mask / freelist_mask to locate candidates in O(1) + uint32_t mask = cand->nonempty_mask; + // Fallback to atomic freelist_mask for cross-thread visibility + if (mask == 0) { + mask = atomic_load_explicit(&cand->freelist_mask, memory_order_acquire); + } + if (mask == 0) continue; // No visible freelists in this SS + int cap = ss_slabs_capacity(cand); + // Iterate set bits only + while (mask) { + int sidx = __builtin_ctz(mask); + mask &= (mask - 1); // clear lowest set bit + if (sidx >= cap) continue; + SlabHandle h = slab_try_acquire(cand, sidx, self_tid); + if (!slab_is_valid(&h)) continue; + if (slab_remote_pending(&h)) { + slab_drain_remote_full(&h); + } + if (slab_is_safe_to_bind(&h)) { + tiny_tls_bind_slab(tls, h.ss, h.slab_idx); + g_adopt_gate_success[class_idx]++; + 
g_reg_scan_hits[class_idx]++; + ROUTE_MARK(14); ROUTE_COMMIT(class_idx, 0x07); + slab_release(&h); + return h.ss; + } + slab_release(&h); + } + } + return NULL; +} + // ============================================================================ // Global State // ============================================================================ @@ -264,7 +310,7 @@ static int g_use_registry = 1; // Default ON for thread-safety static int g_tiny_refill_max = 64; // HAKMEM_TINY_REFILL_MAX (default 64) static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192) -#include "hakmem_tiny_tls_list.h" +// hakmem_tiny_tls_list.h already included at top static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES]; static int g_tls_list_enable = 1; static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want); @@ -436,7 +482,7 @@ void tiny_adopt_gate_on_remote_seen(int class_idx) { #include "tiny_sticky.h" // Mailbox box -#include "tiny_mailbox.h" +#include "box/mailbox_box.h" // Publish pipeline counters (visibility) unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0}; @@ -513,6 +559,7 @@ static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES]; unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0}; @@ -535,6 +582,10 @@ unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0}; +// Refill item source breakdown (freelist vs carve) +unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0}; + static int g_rf_trace_en = -1; static inline int rf_trace_enabled(void) { if (__builtin_expect(g_rf_trace_en == -1, 0)) { @@ -566,6 +617,22 @@ unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0}; + +// Front Gate Breakdown (debug counters) +unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0}; +unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0}; +unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0}; +unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0}; + +// Free-side trigger counters +unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0}; +unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0}; + +// Adopt/Registry gate counters +unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0}; +unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0}; +unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0}; +unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0}; @@ -622,7 +689,7 @@ static inline uintptr_t hot_slot_pop(int class_idx) { // moved to tiny_publish.c -static void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) { +static 
__attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) { if (!ss) return; uintptr_t ent = slab_entry_make(ss, slab_idx); for (int i = 0; i < SLAB_PARTIAL_RING; i++) { @@ -650,7 +717,7 @@ static void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) { g_slab_publish_dbg[class_idx]++; } -static uintptr_t slab_partial_adopt(int class_idx) { +static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) { for (int i = 0; i < SLAB_PARTIAL_RING; i++) { uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel); if (ent) return ent; @@ -703,7 +770,13 @@ void ss_partial_publish(int class_idx, SuperSlab* ss) { + (has_remote ? 1u : 0u); if (score > best_score) { best_score = score; best = s; } } - if (best >= 0 && best < 256) ss->publish_hint = (uint8_t)best; else ss->publish_hint = 0xFF; + if (best >= 0 && best < 256) { + ss->publish_hint = (uint8_t)best; + // Box: Ready push — provide slab-level candidate to adopters + tiny_ready_push(class_idx, ss, best); + } else { + ss->publish_hint = 0xFF; + } for (int i = 0; i < SS_PARTIAL_RING; i++) { SuperSlab* expected = NULL; if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss, @@ -842,7 +915,7 @@ static inline int tiny_fast_push(int class_idx, void* ptr); // Functions: tiny_hot_pop_class0(), tiny_hot_pop_class1(), tiny_hot_pop_class2(), tiny_hot_pop_class3() // 88 lines (lines 407-494) -static __attribute__((cold, noinline)) void* tiny_slow_alloc_fast(int class_idx) { +static __attribute__((cold, noinline, unused)) void* tiny_slow_alloc_fast(int class_idx) { int tls_enabled = g_tls_list_enable; TinyTLSList* tls = &g_tls_lists[class_idx]; pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; @@ -939,7 +1012,7 @@ static __attribute__((cold, noinline)) void* tiny_slow_alloc_fast(int class_idx) // Function: tiny_fast_refill_and_take() - 39 lines (lines 584-622) // Hot-path cheap sampling counter to avoid rand() in allocation path // Phase 9.4: TLS single-linked freelist (mimalloc-inspired) for hottest classes (≤128B/≤256B) -static int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable +int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable // Phase 6-1.7: Export TLS variables for box refactor (Box 5/6 need access from hakmem.c) #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR __thread void* g_tls_sll_head[TINY_NUM_CLASSES]; @@ -952,27 +1025,27 @@ static int g_tiny_ultra = 0; // HAKMEM_TINY_ULTRA=1 for SLL- static int g_ultra_validate = 0; // HAKMEM_TINY_ULTRA_VALIDATE=1 to enable per-pop validation // Ultra debug counters #if HAKMEM_DEBUG_COUNTERS -static uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0}; static uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES] = {0}; -static uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0}; #endif // Path counters (normal mode visibility): lightweight, for debugging/bench only #if HAKMEM_DEBUG_COUNTERS -static uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0}; -static uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0}; -static uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0}; -static uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0}; -static uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0}; +static 
__attribute__((unused)) uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0}; // New: slow/bitmap/bump/bin instrumentation -static uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0}; -static uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0}; -static uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0}; -static uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0}; -static uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0}; -static uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0}; -static uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0}; -static uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0}; #endif static int g_path_debug_enabled = 0; @@ -1039,7 +1112,7 @@ static inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) { } #include "tiny_refill.h" -#include "tiny_mmap_gate.h" +// tiny_mmap_gate.h already included at top #include "tiny_publish.h" static int g_sll_cap_override[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_SLL_CAP_C{0..7} @@ -1524,12 +1597,18 @@ TinySlab* hak_tiny_owner_slab(void* ptr) { // - Eliminates: Registry lookups, mid_lookup, owner checks // ============================================================================ +// Forward declarations for Phase 6 alloc/free functions +#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE + void* hak_tiny_alloc_ultra_simple(size_t size); + void hak_tiny_free_ultra_simple(void* ptr); +#endif + #if defined(HAKMEM_TINY_PHASE6_METADATA) && defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) #error "Cannot enable both PHASE6_METADATA and PHASE6_ULTRA_SIMPLE" #endif // Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check -#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR +#if HAKMEM_TINY_PHASE6_BOX_REFACTOR #if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) #error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options" #endif @@ -1563,14 +1642,33 @@ TinySlab* hak_tiny_owner_slab(void* ptr) { #elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) // Phase 6-1.5: Alignment guessing (legacy) + + // Refill count globals (needed for compatibility) + int g_refill_count_global = 0; + int g_refill_count_hot = 0; + int g_refill_count_mid = 0; + int g_refill_count_class[TINY_NUM_CLASSES] = {0}; + #include "hakmem_tiny_ultra_simple.inc" + + // Wrapper functions for hakmem.c compatibility (not used in ULTRA_SIMPLE but needed for linking) + void* hak_tiny_alloc_fast_wrapper(size_t size) { + return hak_tiny_alloc_ultra_simple(size); + } + + void hak_tiny_free_fast_wrapper(void* ptr) { + hak_tiny_free_ultra_simple(ptr); + } #elif defined(HAKMEM_TINY_PHASE6_METADATA) // Phase 6-1.6: Metadata header (recommended) #include "hakmem_tiny_metadata.inc" #endif // Layer 
1-3: Main allocation function (simplified) -#define HAKMEM_TINY_USE_NEW_3LAYER 0 // TEMP: Disable for baseline comparison +// Build-time configurable via: -DHAKMEM_TINY_USE_NEW_3LAYER=1 +#ifndef HAKMEM_TINY_USE_NEW_3LAYER +#define HAKMEM_TINY_USE_NEW_3LAYER 0 // default OFF (legacy path) +#endif #if HAKMEM_TINY_USE_NEW_3LAYER #include "hakmem_tiny_alloc_new.inc" #else diff --git a/core/hakmem_tiny_free.inc b/core/hakmem_tiny_free.inc index 6553601e..59564560 100644 --- a/core/hakmem_tiny_free.inc +++ b/core/hakmem_tiny_free.inc @@ -3,6 +3,7 @@ #include "slab_handle.h" #include "tiny_refill.h" #include "tiny_tls_guard.h" +#include "box/free_publish_box.h" #include "mid_tcache.h" extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES]; extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES]; @@ -132,6 +133,20 @@ void hak_tiny_free_with_slab(void* ptr, TinySlab* slab) { return; } + // A/B: Force SS freelist path for same-thread frees (publish on first-free) + do { + static int g_free_to_ss2 = -1; + if (__builtin_expect(g_free_to_ss2 == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREE_TO_SS"); + g_free_to_ss2 = (e && *e && *e != '0') ? 1 : 0; // default OFF + } + if (g_free_to_ss2) { + hak_tiny_free_superslab(ptr, ss); + HAK_STAT_FREE(class_idx); + return; + } + } while (0); + if (__builtin_expect(g_debug_fast0, 0)) { tiny_debug_ring_record(TINY_RING_EVENT_FRONT_BYPASS, (uint16_t)class_idx, ptr, (uintptr_t)slab_idx); void* prev = meta->freelist; @@ -190,1227 +205,14 @@ void hak_tiny_free_with_slab(void* ptr, TinySlab* slab) { return; } -#if !HAKMEM_BUILD_RELEASE - // SuperSlab uses Magazine for TLS caching (same as TinySlab) - tiny_small_mags_init_once(); - if (class_idx > 3) tiny_mag_init_if_needed(class_idx); - TinyTLSMag* mag = &g_tls_mags[class_idx]; - int cap = mag->cap; - - // 32/64B: SLL優先(mag優先は無効化) - // Prefer TinyQuickSlot (compile-out if HAKMEM_TINY_NO_QUICK) -#if !defined(HAKMEM_TINY_NO_QUICK) - if (g_quick_enable && class_idx <= 4) { - TinyQuickSlot* qs = &g_tls_quick[class_idx]; - if (__builtin_expect(qs->top < QUICK_CAP, 1)) { - qs->items[qs->top++] = ptr; - HAK_STAT_FREE(class_idx); - return; - } - } -#endif - - // Fast path: TLS SLL push for hottest classes - if (!g_tls_list_enable && g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, (uint32_t)cap)) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - // Active → Inactive: count down immediately (TLS保管中は"使用中"ではない) - ss_active_dec_one(ss); - HAK_TP1(sll_push, class_idx); - tiny_debug_ring_record(TINY_RING_EVENT_FREE_LOCAL, (uint16_t)class_idx, ptr, 3); - HAK_STAT_FREE(class_idx); - return; - } - - // Next: Magazine push(必要ならmag→SLLへバルク転送で空きを作る) - // Hysteresis: allow slight overfill before deciding to spill under lock - if (mag->top >= cap && g_spill_hyst > 0) { - (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); - } - if (mag->top < cap + g_spill_hyst) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = NULL; // SuperSlab owner not a TinySlab; leave NULL -#endif - mag->top++; -#if HAKMEM_DEBUG_COUNTERS - g_magazine_push_count++; // Phase 7.6: Track pushes -#endif - // Active → Inactive: decrement now(アプリ解放時に非アクティブ扱い) - ss_active_dec_one(ss); - HAK_TP1(mag_push, class_idx); - tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 2); - HAK_STAT_FREE(class_idx); - return; - } - - // Background spill: queue to BG thread instead of locking (when enabled) - if 
(g_bg_spill_enable) { - uint32_t qlen = atomic_load_explicit(&g_bg_spill_len[class_idx], memory_order_relaxed); - if ((int)qlen < g_bg_spill_target) { - // Build a small chain: include current ptr and pop from mag up to limit - int limit = g_bg_spill_max_batch; - if (limit > cap/2) limit = cap/2; - if (limit > 32) limit = 32; // keep free-path bounded - void* head = ptr; - *(void**)head = NULL; - void* tail = head; // current tail - int taken = 1; - while (taken < limit && mag->top > 0) { - void* p2 = mag->items[--mag->top].ptr; - *(void**)p2 = head; - head = p2; - taken++; - } - // Push chain to spill queue (single CAS) - bg_spill_push_chain(class_idx, head, tail, taken); - tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 3); - HAK_STAT_FREE(class_idx); - return; - } - } - - // Spill half (SuperSlab version - simpler than TinySlab) - pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; - hkm_prof_begin(NULL); - pthread_mutex_lock(lock); - // Batch spill: reduce lock frequency and work per call - int spill = cap / 2; - int over = mag->top - (cap + g_spill_hyst); - if (over > 0 && over < spill) spill = over; - - for (int i = 0; i < spill && mag->top > 0; i++) { - TinyMagItem it = mag->items[--mag->top]; - - // Phase 7.6: SuperSlab spill - return to freelist - SuperSlab* owner_ss = hak_super_lookup(it.ptr); - if (owner_ss && owner_ss->magic == SUPERSLAB_MAGIC) { - // Direct freelist push (same as old hak_tiny_free_superslab) - int slab_idx = slab_index_for(owner_ss, it.ptr); - TinySlabMeta* meta = &owner_ss->slabs[slab_idx]; - *(void**)it.ptr = meta->freelist; - meta->freelist = it.ptr; - meta->used--; - // Decrement SuperSlab active counter (spill returns blocks to SS) - ss_active_dec_one(owner_ss); - - // Phase 8.4: Empty SuperSlab detection (will use meta->used scan) - // TODO: Implement scan-based empty detection - // Empty SuperSlab detection/munmapは別途フラッシュAPIで実施(ホットパスから除外) - } - } - - pthread_mutex_unlock(lock); - hkm_prof_end(ss_time, HKP_TINY_SPILL, &tss); - - // Adaptive increase of cap after spill - int max_cap = tiny_cap_max_for_class(class_idx); - if (mag->cap < max_cap) { - int new_cap = mag->cap + (mag->cap / 2); - if (new_cap > max_cap) new_cap = max_cap; - if (new_cap > TINY_TLS_MAG_CAP) new_cap = TINY_TLS_MAG_CAP; - mag->cap = new_cap; - } - - // Finally, try FastCache push first (≤128B) — compile-out if HAKMEM_TINY_NO_FRONT_CACHE -#if !defined(HAKMEM_TINY_NO_FRONT_CACHE) - if (g_fastcache_enable && class_idx <= 4) { - if (fastcache_push(class_idx, ptr)) { - HAK_TP1(front_push, class_idx); - HAK_STAT_FREE(class_idx); - return; - } - } -#endif - // Then TLS SLL if room, else magazine - if (g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, (uint32_t)mag->cap)) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - } else { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - -#if HAKMEM_DEBUG_COUNTERS - g_magazine_push_count++; // Phase 7.6: Track pushes -#endif - HAK_STAT_FREE(class_idx); - return; -#endif // HAKMEM_BUILD_RELEASE - } - - // Phase 7.6: TinySlab path (original) - //g_tiny_free_with_slab_count++; // Phase 7.6: Track calls - DISABLED due to segfault - // Same-thread → TLS magazine; remote-thread → MPSC stack - if (pthread_equal(slab->owner_tid, tiny_self_pt())) { - int class_idx = slab->class_idx; - - if (g_tls_list_enable) { - TinyTLSList* tls = 
&g_tls_lists[class_idx]; - uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); - if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { - tiny_tls_refresh_params(class_idx, tls); - } - // TinyHotMag front push(8/16/32B, A/B) - if (__builtin_expect(g_hotmag_enable && class_idx <= 2, 1)) { - if (hotmag_push(class_idx, ptr)) { - HAK_STAT_FREE(class_idx); - return; - } - } - if (tls->count < tls->cap) { - tiny_tls_list_guard_push(class_idx, tls, ptr); - tls_list_push(tls, ptr); - HAK_STAT_FREE(class_idx); - return; - } - seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); - if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { - tiny_tls_refresh_params(class_idx, tls); - } - tiny_tls_list_guard_push(class_idx, tls, ptr); - tls_list_push(tls, ptr); - if (tls_list_should_spill(tls)) { - tls_list_spill_excess(class_idx, tls); - } - HAK_STAT_FREE(class_idx); - return; - } - - tiny_mag_init_if_needed(class_idx); - TinyTLSMag* mag = &g_tls_mags[class_idx]; - int cap = mag->cap; - // 32/64B: SLL優先(mag優先は無効化) - // Fast path: FastCache push (preferred for ≤128B), then TLS SLL - if (g_fastcache_enable && class_idx <= 4) { - if (fastcache_push(class_idx, ptr)) { - HAK_STAT_FREE(class_idx); - return; - } - } - // Fast path: TLS SLL push (preferred) - if (!g_tls_list_enable && g_tls_sll_enable && class_idx <= 5) { - uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)cap); - if (g_tls_sll_count[class_idx] < sll_cap) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - HAK_STAT_FREE(class_idx); - return; - } - } - // Next: if magazine has room, push immediately and return(満杯ならmag→SLLへバルク) - if (mag->top >= cap) { - (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); - } - // Remote-drain can be handled opportunistically on future calls. - if (mag->top < cap) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - -#if HAKMEM_DEBUG_COUNTERS - g_magazine_push_count++; // Phase 7.6: Track pushes -#endif - // Note: SuperSlab uses separate path (slab == NULL branch above) - HAK_STAT_FREE(class_idx); // Phase 3 - return; - } - // Magazine full: before spilling, opportunistically drain remotes once under lock. 
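// Editorial note: the check below fires when the slab's remote_count reaches the per-class drain
// threshold, or whenever remote_head is non-NULL; draining under the class lock here recovers
// remotely-freed blocks before the more expensive spill of half the magazine that follows.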
- if (atomic_load_explicit(&slab->remote_count, memory_order_relaxed) >= (unsigned)g_remote_drain_thresh_per_class[class_idx] || atomic_load_explicit(&slab->remote_head, memory_order_acquire)) { - pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; - pthread_mutex_lock(lock); - HAK_TP1(remote_drain, class_idx); - tiny_remote_drain_locked(slab); - pthread_mutex_unlock(lock); - } - // Spill half under class lock - pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; - pthread_mutex_lock(lock); - int spill = cap / 2; - - // Phase 4.2: High-water threshold for gating Phase 4 logic - int high_water = (cap * 3) / 4; // 75% of capacity - - for (int i = 0; i < spill && mag->top > 0; i++) { - TinyMagItem it = mag->items[--mag->top]; - - // Phase 7.6: Check for SuperSlab first (mixed Magazine support) - SuperSlab* ss_owner = hak_super_lookup(it.ptr); - if (ss_owner && ss_owner->magic == SUPERSLAB_MAGIC) { - // SuperSlab spill - return to freelist - int slab_idx = slab_index_for(ss_owner, it.ptr); - TinySlabMeta* meta = &ss_owner->slabs[slab_idx]; - *(void**)it.ptr = meta->freelist; - meta->freelist = it.ptr; - meta->used--; - // 空SuperSlab処理はフラッシュ/バックグラウンドで対応(ホットパス除外) - HAK_STAT_FREE(class_idx); - continue; // Skip TinySlab processing - } - - TinySlab* owner = -#if HAKMEM_TINY_MAG_OWNER - it.owner; -#else - NULL; -#endif - if (!owner) { - owner = tls_active_owner_for_ptr(class_idx, it.ptr); - } - if (!owner) { - owner = hak_tiny_owner_slab(it.ptr); - } - if (!owner) continue; - - // Phase 4.2: Adaptive gating - skip Phase 4 when TLS Magazine is high-water - // Rationale: When mag->top >= 75%, next alloc will come from TLS anyway - // so pushing to mini-mag is wasted work - int is_high_water = (mag->top >= high_water); - - if (!is_high_water) { - // Low-water: Phase 4.1 logic (try mini-magazine first) - uint8_t cidx = owner->class_idx; // Option A: 1回だけ読む - TinySlab* tls_a = g_tls_active_slab_a[cidx]; - TinySlab* tls_b = g_tls_active_slab_b[cidx]; - - // Option B: Branch prediction hint (spill → TLS-active への戻りが likely) - if (__builtin_expect((owner == tls_a || owner == tls_b) && - !mini_mag_is_full(&owner->mini_mag), 1)) { - // Fast path: mini-magazineに戻す(bitmap触らない) - mini_mag_push(&owner->mini_mag, it.ptr); - HAK_TP1(spill_tiny, cidx); - HAK_STAT_FREE(cidx); - continue; // bitmap操作スキップ - } - } - // High-water or Phase 4.1 mini-mag full: fall through to bitmap - - // Slow path: bitmap直接書き込み(既存ロジック) - size_t bs = g_tiny_class_sizes[owner->class_idx]; - int idx = ((uintptr_t)it.ptr - (uintptr_t)owner->base) / bs; - if (hak_tiny_is_used(owner, idx)) { - hak_tiny_set_free(owner, idx); - int was_full = (owner->free_count == 0); - owner->free_count++; - if (was_full) move_to_free_list(owner->class_idx, owner); - if (owner->free_count == owner->total_count) { - // If this slab is TLS-active for this thread, clear the pointer before releasing - if (g_tls_active_slab_a[owner->class_idx] == owner) g_tls_active_slab_a[owner->class_idx] = NULL; - if (g_tls_active_slab_b[owner->class_idx] == owner) g_tls_active_slab_b[owner->class_idx] = NULL; - TinySlab** headp = &g_tiny_pool.free_slabs[owner->class_idx]; - TinySlab* prev = NULL; - for (TinySlab* s = *headp; s; prev = s, s = s->next) { - if (s == owner) { if (prev) prev->next = s->next; else *headp = s->next; break; } - } - release_slab(owner); - } - HAK_TP1(spill_tiny, owner->class_idx); - HAK_STAT_FREE(owner->class_idx); - } - } - pthread_mutex_unlock(lock); - hkm_prof_end(ss, HKP_TINY_SPILL, &tss); - // Adaptive increase of cap after spill - int 
max_cap = tiny_cap_max_for_class(class_idx); - if (mag->cap < max_cap) { - int new_cap = mag->cap + (mag->cap / 2); - if (new_cap > max_cap) new_cap = max_cap; - if (new_cap > TINY_TLS_MAG_CAP) new_cap = TINY_TLS_MAG_CAP; - mag->cap = new_cap; - } - // Finally: prefer TinyQuickSlot → SLL → UltraFront → HotMag → Magazine(順序で局所性を確保) -#if !HAKMEM_BUILD_RELEASE && !defined(HAKMEM_TINY_NO_QUICK) - if (g_quick_enable && class_idx <= 4) { - TinyQuickSlot* qs = &g_tls_quick[class_idx]; - if (__builtin_expect(qs->top < QUICK_CAP, 1)) { - qs->items[qs->top++] = ptr; - } else if (g_tls_sll_enable) { - uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); - if (g_tls_sll_count[class_idx] < sll_cap2) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - } else if (!tiny_optional_push(class_idx, ptr)) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - } else { - if (!tiny_optional_push(class_idx, ptr)) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - } - } else -#endif - { - if (g_tls_sll_enable && class_idx <= 5) { - uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); - if (g_tls_sll_count[class_idx] < sll_cap2) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - } else if (!tiny_optional_push(class_idx, ptr)) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - } else { - if (!tiny_optional_push(class_idx, ptr)) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - } - } - -#if HAKMEM_DEBUG_COUNTERS - g_magazine_push_count++; // Phase 7.6: Track pushes -#endif - // Note: SuperSlab uses separate path (slab == NULL branch above) - HAK_STAT_FREE(class_idx); // Phase 3 - return; - } else { - tiny_remote_push(slab, ptr); - } -} - +#include "tiny_free_magazine.inc.h" // ============================================================================ // Phase 6.23: SuperSlab Allocation Helpers // ============================================================================ // Phase 6.24: Allocate from SuperSlab slab (lazy freelist + linear allocation) -static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) { - TinySlabMeta* meta = &ss->slabs[slab_idx]; - - // Ensure remote queue is drained before handing blocks back to TLS - if (atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0) { - uint32_t self_tid = tiny_self_u32(); - SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid); - if (slab_is_valid(&h)) { - slab_drain_remote_full(&h); - int pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0; - if (__builtin_expect(pending, 0)) { - if (__builtin_expect(g_debug_remote_guard, 0)) { - uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed); - tiny_remote_watch_note("alloc_pending_remote", - ss, - slab_idx, - (void*)head, - 0xA243u, - self_tid, - 0); - } - slab_release(&h); - return NULL; - } - slab_release(&h); - } else { - if (__builtin_expect(g_debug_remote_guard, 0)) { - tiny_remote_watch_note("alloc_acquire_fail", - ss, - slab_idx, - meta, - 0xA244u, - self_tid, - 0); - } - return NULL; - } - } - - if (__builtin_expect(g_debug_remote_guard, 0)) { 
- uintptr_t head_pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); - if (head_pending != 0) { - tiny_remote_watch_note("alloc_remote_pending", - ss, - slab_idx, - (void*)head_pending, - 0xA247u, - tiny_self_u32(), - 1); - return NULL; - } - } - - // Phase 6.24: Linear allocation mode (freelist == NULL) - // This avoids the 4000-8000 cycle cost of building freelist on init - if (meta->freelist == NULL && meta->used < meta->capacity) { - // Linear allocation: sequential memory access (cache-friendly!) - size_t block_size = g_tiny_class_sizes[ss->size_class]; - void* slab_start = slab_data_start(ss, slab_idx); - - // First slab: skip SuperSlab header - if (slab_idx == 0) { - slab_start = (char*)slab_start + 1024; - } - - void* block = (char*)slab_start + (meta->used * block_size); - meta->used++; - tiny_remote_track_on_alloc(ss, slab_idx, block, "linear_alloc", 0); - tiny_remote_assert_not_remote(ss, slab_idx, block, "linear_alloc_ret", 0); - return block; // Fast path: O(1) pointer arithmetic - } - - // Freelist mode (after first free()) - if (meta->freelist) { - void* block = meta->freelist; - meta->freelist = *(void**)block; // Pop from freelist - meta->used++; - tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0); - tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0); - return block; - } - - return NULL; // Slab is full -} - -// Phase 6.24 & 7.6: Refill TLS SuperSlab (with unified TLS cache + deferred allocation) -static SuperSlab* superslab_refill(int class_idx) { -#if HAKMEM_DEBUG_COUNTERS - g_superslab_refill_calls_dbg[class_idx]++; -#endif - TinyTLSSlab* tls = &g_tls_slabs[class_idx]; - static int g_ss_adopt_en = -1; // env: HAKMEM_TINY_SS_ADOPT=1; default auto-on if remote seen - if (g_ss_adopt_en == -1) { - char* e = getenv("HAKMEM_TINY_SS_ADOPT"); - if (e) { - g_ss_adopt_en = (*e != '0') ? 1 : 0; - } else { - extern _Atomic int g_ss_remote_seen; - g_ss_adopt_en = (atomic_load_explicit(&g_ss_remote_seen, memory_order_relaxed) != 0) ? 1 : 0; - } - } - extern int g_adopt_cool_period; - extern __thread int g_tls_adopt_cd[]; - if (g_adopt_cool_period == -1) { - char* cd = getenv("HAKMEM_TINY_SS_ADOPT_COOLDOWN"); - int v = (cd ? atoi(cd) : 0); - if (v < 0) v = 0; if (v > 1024) v = 1024; - g_adopt_cool_period = v; - } - - static int g_superslab_refill_debug_once = 0; - SuperSlab* prev_ss = tls->ss; - TinySlabMeta* prev_meta = tls->meta; - uint8_t prev_slab_idx = tls->slab_idx; - uint8_t prev_active = prev_ss ? prev_ss->active_slabs : 0; - uint32_t prev_bitmap = prev_ss ? prev_ss->slab_bitmap : 0; - uint32_t prev_meta_used = prev_meta ? prev_meta->used : 0; - uint32_t prev_meta_cap = prev_meta ? 
prev_meta->capacity : 0; - int free_idx_attempted = -2; // -2 = not evaluated, -1 = none, >=0 = chosen - int reused_slabs = 0; - - // Optional: Mid-size simple refill to avoid multi-layer scans (class>=4) - do { - static int g_mid_simple_warn = 0; - if (class_idx >= 4 && tiny_mid_refill_simple_enabled()) { - // If current TLS has a SuperSlab, prefer taking a virgin slab directly - if (tls->ss) { - int tls_cap = ss_slabs_capacity(tls->ss); - if (tls->ss->active_slabs < tls_cap) { - int free_idx = superslab_find_free_slab(tls->ss); - if (free_idx >= 0) { - uint32_t my_tid = tiny_self_u32(); - superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); - tiny_tls_bind_slab(tls, tls->ss, free_idx); - return tls->ss; - } - } - } - // Otherwise allocate a fresh SuperSlab and bind first slab - SuperSlab* ssn = superslab_allocate((uint8_t)class_idx); - if (!ssn) { - if (!g_superslab_refill_debug_once && g_mid_simple_warn < 2) { - g_mid_simple_warn++; - int err = errno; - fprintf(stderr, "[DEBUG] mid_simple_refill OOM class=%d errno=%d\n", class_idx, err); - } - return NULL; - } - uint32_t my_tid = tiny_self_u32(); - superslab_init_slab(ssn, 0, g_tiny_class_sizes[class_idx], my_tid); - SuperSlab* old = tls->ss; - tiny_tls_bind_slab(tls, ssn, 0); - superslab_ref_inc(ssn); - if (old && old != ssn) { superslab_ref_dec(old); } - return ssn; - } - } while (0); - - - // First, try to adopt a published partial SuperSlab for this class - if (g_ss_adopt_en) { - if (g_adopt_cool_period > 0) { - if (g_tls_adopt_cd[class_idx] > 0) { - g_tls_adopt_cd[class_idx]--; - } else { - // eligible to adopt - } - } - if (g_adopt_cool_period == 0 || g_tls_adopt_cd[class_idx] == 0) { - SuperSlab* adopt = ss_partial_adopt(class_idx); - if (adopt && adopt->magic == SUPERSLAB_MAGIC) { - // ======================================================================== - // Quick Win #2: First-Fit Adopt (vs Best-Fit scoring all 32 slabs) - // For Larson, any slab with freelist works - no need to score all 32! - // Expected improvement: -3,000 cycles (from 32 atomic loads + 32 scores) - // ======================================================================== - int adopt_cap = ss_slabs_capacity(adopt); - int best = -1; - for (int s = 0; s < adopt_cap; s++) { - TinySlabMeta* m = &adopt->slabs[s]; - // Quick check: Does this slab have a freelist? - if (m->freelist) { - // Yes! Try to acquire it immediately (first-fit) - best = s; - break; // ✅ OPTIMIZATION: Stop at first slab with freelist! - } - // Optional: Also check remote_heads if we want to prioritize those - // (But for Larson, freelist is sufficient) - } - if (best >= 0) { - // Box: Try to acquire ownership atomically - uint32_t self = tiny_self_u32(); - SlabHandle h = slab_try_acquire(adopt, best, self); - if (slab_is_valid(&h)) { - slab_drain_remote_full(&h); - if (slab_remote_pending(&h)) { - if (__builtin_expect(g_debug_remote_guard, 0)) { - uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); - tiny_remote_watch_note("adopt_remote_pending", - h.ss, - h.slab_idx, - (void*)head, - 0xA255u, - self, - 0); - } - // Remote still pending; give up adopt path and fall through to normal refill. 
- slab_release(&h); - } - - // Box 4 Boundary: bind は remote_head==0 を保証する必要がある - // slab_is_safe_to_bind() で TOCTOU-safe にチェック - if (slab_is_safe_to_bind(&h)) { - // Optional: move a few nodes to Front SLL to boost next hits - tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); - // 安全に bind 可能(freelist 存在 && remote_head==0 保証) - tiny_tls_bind_slab(tls, h.ss, h.slab_idx); - if (g_adopt_cool_period > 0) { - g_tls_adopt_cd[class_idx] = g_adopt_cool_period; - } - return h.ss; - } - // Safe to bind 失敗(freelist なしor remote pending)→ adopt 中止 - slab_release(&h); - } - // Failed to acquire or no freelist - continue searching - } - // If no freelist found, ignore and continue (optional: republish) - } - } - } - - // Phase 7.6 Step 4: Check existing SuperSlab with priority order - if (tls->ss) { - // Priority 1: Reuse slabs with freelist (already freed blocks) - int tls_cap = ss_slabs_capacity(tls->ss); - uint32_t nonempty_mask = 0; - do { - static int g_mask_en = -1; - if (__builtin_expect(g_mask_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); - g_mask_en = (e && *e && *e != '0') ? 1 : 0; - } - if (__builtin_expect(g_mask_en, 0)) { - nonempty_mask = atomic_load_explicit(&tls->ss->freelist_mask, memory_order_acquire); - break; - } - for (int i = 0; i < tls_cap; i++) { - if (tls->ss->slabs[i].freelist) nonempty_mask |= (1u << i); - } - } while (0); - - // O(1) lookup: scan mask with ctz (1 instruction!) - while (__builtin_expect(nonempty_mask != 0, 1)) { - int i = __builtin_ctz(nonempty_mask); // Find first non-empty slab (O(1)) - nonempty_mask &= ~(1u << i); // Clear bit for next iteration - - // FIX #1 DELETED (Race condition fix): - // Previous drain without ownership caused concurrent freelist corruption. - // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). - // Remote frees will be drained when the slab is adopted (see tiny_refill.h paths). - - uint32_t self_tid = tiny_self_u32(); - SlabHandle h = slab_try_acquire(tls->ss, i, self_tid); - if (slab_is_valid(&h)) { - if (slab_remote_pending(&h)) { - slab_drain_remote_full(&h); - if (__builtin_expect(g_debug_remote_guard, 0)) { - uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); - tiny_remote_watch_note("reuse_remote_pending", - h.ss, - h.slab_idx, - (void*)head, - 0xA254u, - self_tid, - 0); - } - slab_release(&h); - continue; - } - // Box 4 Boundary: bind は remote_head==0 を保証する必要がある - if (slab_is_safe_to_bind(&h)) { - // Optional: move a few nodes to Front SLL to boost next hits - tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); - reused_slabs = 1; - tiny_tls_bind_slab(tls, h.ss, h.slab_idx); - return h.ss; - } - // Safe to bind 失敗 → 次の slab を試す - slab_release(&h); - } - } - - // Priority 2: Use unused slabs (virgin slabs) - if (tls->ss->active_slabs < tls_cap) { - // Find next free slab - int free_idx = superslab_find_free_slab(tls->ss); - free_idx_attempted = free_idx; - if (free_idx >= 0) { - // Initialize this slab - uint32_t my_tid = tiny_self_u32(); - superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); - - // Update TLS cache (unified update) - tiny_tls_bind_slab(tls, tls->ss, free_idx); - - return tls->ss; - } - } - } - - // Try to adopt a partial SuperSlab from registry (one-shot, cheap scan) - // This reduces pressure to allocate new SS when other threads freed blocks. 
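// --- Illustrative sketch (not part of the patch): the registry adoption scan that follows
// is a bounded first-fit search. Its shape, under assumed types — a per-class array of
// candidate SuperSlabs plus an opaque "try to claim" callback standing in for the real
// acquire/drain/bind protocol; every name here is a placeholder.
#include <stdbool.h>
#include <stddef.h>

typedef struct SketchSS { bool has_free_block; } SketchSS;
typedef bool (*sketch_try_claim_fn)(SketchSS* ss);

// Scan at most scan_max registry slots and return the first claimable candidate.
// Bounding the scan keeps refill cost O(scan_max) instead of O(registry size).
static SketchSS* sketch_adopt_first_fit(SketchSS** reg, int reg_size,
                                        int scan_max, sketch_try_claim_fn try_claim) {
    int limit = (scan_max < reg_size) ? scan_max : reg_size;
    for (int i = 0; i < limit; i++) {
        SketchSS* ss = reg[i];
        if (!ss || !ss->has_free_block) continue;  // cheap pre-filter
        if (try_claim(ss)) return ss;              // first fit: stop at the first success
    }
    return NULL;                                   // caller falls back to a fresh SuperSlab
}
// --- End sketch.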
- // Phase 6: Registry Optimization - Use per-class registry for O(class_size) scan - if (!tls->ss) { - // Phase 6: Use per-class registry (262K → ~10-100 entries per class!) - extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; - extern int g_super_reg_class_size[TINY_NUM_CLASSES]; - - const int scan_max = tiny_reg_scan_max(); - int reg_size = g_super_reg_class_size[class_idx]; - int scan_limit = (scan_max < reg_size) ? scan_max : reg_size; - - for (int i = 0; i < scan_limit; i++) { - SuperSlab* ss = g_super_reg_by_class[class_idx][i]; - if (!ss || ss->magic != SUPERSLAB_MAGIC) continue; - // Note: class_idx check is not needed (per-class registry!) - - // Pick first slab with freelist (Box 4: 所有権取得 + remote check) - int reg_cap = ss_slabs_capacity(ss); - uint32_t self_tid = tiny_self_u32(); - for (int s = 0; s < reg_cap; s++) { - if (ss->slabs[s].freelist) { - SlabHandle h = slab_try_acquire(ss, s, self_tid); - if (slab_is_valid(&h)) { - slab_drain_remote_full(&h); - if (slab_is_safe_to_bind(&h)) { - tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); - tiny_tls_bind_slab(tls, ss, s); - return ss; - } - slab_release(&h); - } - } - } - } - } - - // Must-adopt-before-mmap gate: attempt sticky/hot/bench/mailbox/registry small-window - { - SuperSlab* gate_ss = tiny_must_adopt_gate(class_idx, tls); - if (gate_ss) return gate_ss; - } - - // Allocate new SuperSlab - SuperSlab* ss = superslab_allocate((uint8_t)class_idx); - if (!ss) { - if (!g_superslab_refill_debug_once) { - g_superslab_refill_debug_once = 1; - int err = errno; - fprintf(stderr, - "[DEBUG] superslab_refill NULL detail: class=%d prev_ss=%p active=%u bitmap=0x%08x prev_meta=%p used=%u cap=%u slab_idx=%u reused_freelist=%d free_idx=%d errno=%d\n", - class_idx, - (void*)prev_ss, - (unsigned)prev_active, - prev_bitmap, - (void*)prev_meta, - (unsigned)prev_meta_used, - (unsigned)prev_meta_cap, - (unsigned)prev_slab_idx, - reused_slabs, - free_idx_attempted, - err); - } - return NULL; // OOM - } - - // Initialize first slab - uint32_t my_tid = tiny_self_u32(); - superslab_init_slab(ss, 0, g_tiny_class_sizes[class_idx], my_tid); - - // Cache in unified TLS(前のSS参照を解放) - SuperSlab* old = tls->ss; - tiny_tls_bind_slab(tls, ss, 0); - // Maintain refcount(将来の空回収に備え、TLS参照をカウント) - superslab_ref_inc(ss); - if (old && old != ss) { - superslab_ref_dec(old); - } - - return ss; -} - -// Phase 6.24: SuperSlab-based allocation (TLS unified, Medium fix) -static inline void* hak_tiny_alloc_superslab(int class_idx) { - // DEBUG: Function entry trace - tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, 0x01, (void*)(uintptr_t)class_idx, 0); - - // MidTC fast path: 128..1024B(class>=4)はTLS tcacheを最優先 - do { - void* mp = midtc_pop(class_idx); - if (mp) { - HAK_RET_ALLOC(class_idx, mp); - } - } while (0); - - // Phase 6.24: 1 TLS read (down from 3) - TinyTLSSlab* tls = &g_tls_slabs[class_idx]; - - TinySlabMeta* meta = tls->meta; - int slab_idx = tls->slab_idx; - if (meta && slab_idx >= 0 && tls->ss) { - // A/B: Relaxed read for remote head presence check - static int g_alloc_remote_relax = -1; // env: HAKMEM_TINY_ALLOC_REMOTE_RELAX=1 → relaxed - if (__builtin_expect(g_alloc_remote_relax == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_ALLOC_REMOTE_RELAX"); - g_alloc_remote_relax = (e && *e && *e != '0') ? 1 : 0; - } - uintptr_t pending = atomic_load_explicit(&tls->ss->remote_heads[slab_idx], - g_alloc_remote_relax ? 
memory_order_relaxed - : memory_order_acquire); - if (__builtin_expect(pending != 0, 0)) { - uint32_t self_tid = tiny_self_u32(); - if (ss_owner_try_acquire(meta, self_tid)) { - _ss_remote_drain_to_freelist_unsafe(tls->ss, slab_idx, meta); - } - } - } - - // FIX #2 DELETED (Race condition fix): - // Previous drain-all-slabs without ownership caused concurrent freelist corruption. - // Problem: Thread A owns slab 5, Thread B drains all slabs including 5 → both modify freelist → crash. - // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). - // Remote frees will be drained when the slab is adopted via refill paths. - - // Fast path: Direct metadata access (no repeated TLS reads!) - if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { - // Linear allocation (lazy init) - size_t block_size = g_tiny_class_sizes[tls->ss->size_class]; - void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); - meta->used++; - // Track active blocks in SuperSlab for conservative reclamation - ss_active_inc(tls->ss); - HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead - } - - if (meta && meta->freelist) { - // Freelist allocation - void* block = meta->freelist; - // Safety: bounds/alignment check (debug) - if (__builtin_expect(g_tiny_safe_free, 0)) { - size_t blk = g_tiny_class_sizes[tls->ss->size_class]; - uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx); - uintptr_t delta = (uintptr_t)block - (uintptr_t)base; - int align_ok = ((delta % blk) == 0); - int range_ok = (delta / blk) < meta->capacity; - if (!align_ok || !range_ok) { - uintptr_t info = ((uintptr_t)(align_ok ? 1u : 0u) << 32) | (uint32_t)(range_ok ? 1u : 0u); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)tls->ss->size_class, block, info | 0xA100u); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return NULL; } - return NULL; - } - } - void* next = *(void**)block; - meta->freelist = next; - meta->used++; - // Optional: clear freelist bit when becomes empty - do { - static int g_mask_en = -1; - if (__builtin_expect(g_mask_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); - g_mask_en = (e && *e && *e != '0') ? 1 : 0; - } - if (__builtin_expect(g_mask_en, 0) && next == NULL) { - uint32_t bit = (1u << slab_idx); - atomic_fetch_and_explicit(&tls->ss->freelist_mask, ~bit, memory_order_release); - } - } while (0); - // Track active blocks in SuperSlab for conservative reclamation - ss_active_inc(tls->ss); - HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead - } - - // Slow path: Refill TLS slab - SuperSlab* ss = superslab_refill(class_idx); - if (!ss) { - static int log_oom = 0; - if (log_oom < 2) { fprintf(stderr, "[DEBUG] superslab_refill returned NULL (OOM)\n"); log_oom++; } - return NULL; // OOM - } - - // Retry allocation (metadata already cached in superslab_refill) - meta = tls->meta; - - // DEBUG: Check each condition (disabled for benchmarks) - // static int log_retry = 0; - // if (log_retry < 2) { - // fprintf(stderr, "[DEBUG] Retry alloc: meta=%p, freelist=%p, used=%u, capacity=%u, slab_base=%p\n", - // (void*)meta, meta ? meta->freelist : NULL, - // meta ? meta->used : 0, meta ? 
meta->capacity : 0, - // (void*)tls->slab_base); - // log_retry++; - // } - - if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { - size_t block_size = g_tiny_class_sizes[ss->size_class]; - void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); - - // Disabled for benchmarks - // static int log_success = 0; - // if (log_success < 2) { - // fprintf(stderr, "[DEBUG] Superslab alloc SUCCESS: ptr=%p, class=%d, used=%u->%u\n", - // block, class_idx, meta->used, meta->used + 1); - // log_success++; - // } - - meta->used++; - // Track active blocks in SuperSlab for conservative reclamation - ss_active_inc(ss); - HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead - } - - // Disabled for benchmarks - // static int log_fail = 0; - // if (log_fail < 2) { - // fprintf(stderr, "[DEBUG] Retry alloc FAILED - returning NULL\n"); - // log_fail++; - // } - return NULL; -} - -// Phase 6.22-B: SuperSlab fast free path -static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) { - HAK_DBG_INC(g_superslab_free_count); // Phase 7.6: Track SuperSlab frees - // Get slab index (supports 1MB/2MB SuperSlabs) - int slab_idx = slab_index_for(ss, ptr); - size_t ss_size = (size_t)1ULL << ss->lg_size; - uintptr_t ss_base = (uintptr_t)ss; - if (__builtin_expect(slab_idx < 0, 0)) { - uintptr_t aux = tiny_remote_pack_diag(0xBAD1u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - TinySlabMeta* meta = &ss->slabs[slab_idx]; - if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { - tiny_remote_watch_note("free_enter", ss, slab_idx, ptr, 0xA240u, tiny_self_u32(), 0); - extern __thread TinyTLSSlab g_tls_slabs[]; - tiny_alloc_dump_tls_state(ss->size_class, "watch_free_enter", &g_tls_slabs[ss->size_class]); -#if !HAKMEM_BUILD_RELEASE - extern __thread TinyTLSMag g_tls_mags[]; - TinyTLSMag* watch_mag = &g_tls_mags[ss->size_class]; - fprintf(stderr, - "[REMOTE_WATCH_MAG] cls=%u mag_top=%d cap=%d\n", - ss->size_class, - watch_mag->top, - watch_mag->cap); -#endif - } - if (__builtin_expect(g_tiny_safe_free, 0)) { - size_t blk = g_tiny_class_sizes[ss->size_class]; - uint8_t* base = tiny_slab_base_for(ss, slab_idx); - uintptr_t delta = (uintptr_t)ptr - (uintptr_t)base; - int cap_ok = (meta->capacity > 0) ? 
1 : 0; - int align_ok = (delta % blk) == 0; - int range_ok = cap_ok && (delta / blk) < meta->capacity; - if (!align_ok || !range_ok) { - uint32_t code = 0xA100u; - if (align_ok) code |= 0x2u; - if (range_ok) code |= 0x1u; - uintptr_t aux = tiny_remote_pack_diag(code, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - // Duplicate in freelist (best-effort scan up to 64) - void* scan = meta->freelist; int scanned = 0; int dup = 0; - while (scan && scanned < 64) { if (scan == ptr) { dup = 1; break; } scan = *(void**)scan; scanned++; } - if (dup) { - uintptr_t aux = tiny_remote_pack_diag(0xDFu, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - } - - // Phase 6.23: Same-thread check - uint32_t my_tid = tiny_self_u32(); - const int debug_guard = g_debug_remote_guard; - static __thread int g_debug_free_count = 0; - if (!g_tiny_force_remote && meta->owner_tid != 0 && meta->owner_tid == my_tid) { - // Fast path: Direct freelist push (same-thread) - if (g_debug_free_count < 1) { - fprintf(stderr, "[FREE_SS] SAME-THREAD: owner=%u my=%u\n", - meta->owner_tid, my_tid); - g_debug_free_count++; - } - if (__builtin_expect(meta->used == 0, 0)) { - uintptr_t aux = tiny_remote_pack_diag(0x00u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "local_free_enter", my_tid); - if (!tiny_remote_guard_allow_local_push(ss, slab_idx, meta, ptr, "local_free", my_tid)) { - int transitioned = ss_remote_push(ss, slab_idx, ptr); - meta->used--; - ss_active_dec_one(ss); - if (transitioned) { - ss_partial_publish((int)ss->size_class, ss); - } - return; - } - // Optional: MidTC (TLS tcache for 128..1024B) - do { - int cls = (int)ss->size_class; - if (midtc_enabled() && cls >= 4) { - if (midtc_push(cls, ptr)) { - // Treat as returned to TLS cache (not SS freelist) - meta->used--; - ss_active_dec_one(ss); - return; - } - } - } while (0); - - void* prev = meta->freelist; - *(void**)ptr = prev; - meta->freelist = ptr; - do { - static int g_mask_en = -1; - if (__builtin_expect(g_mask_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); - g_mask_en = (e && *e && *e != '0') ? 
1 : 0; - } - if (__builtin_expect(g_mask_en, 0) && prev == NULL) { - uint32_t bit = (1u << slab_idx); - atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); - } - } while (0); - tiny_remote_track_on_local_free(ss, slab_idx, ptr, "local_free", my_tid); - meta->used--; - // Decrement SuperSlab active counter (actual return to SS) - ss_active_dec_one(ss); - if (prev == NULL) { - ss_partial_publish((int)ss->size_class, ss); - } - - if (__builtin_expect(debug_guard, 0)) { - fprintf(stderr, "[REMOTE_LOCAL] cls=%u slab=%d owner=%u my=%u ptr=%p prev=%p used=%u\n", - ss->size_class, slab_idx, meta->owner_tid, my_tid, ptr, prev, meta->used); - } - - // 空検出は別途(ホットパス除外) - } else { - if (__builtin_expect(meta->owner_tid == my_tid && meta->owner_tid == 0, 0)) { - uintptr_t aux = tiny_remote_pack_diag(0xA300u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (debug_guard) { - fprintf(stderr, "[REMOTE_OWNER_ZERO] cls=%u slab=%d ptr=%p my=%u used=%u\n", - ss->size_class, slab_idx, ptr, my_tid, (unsigned)meta->used); - } - } - tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "remote_free_enter", my_tid); - // Slow path: Remote free (cross-thread) - if (g_debug_free_count < 5) { - fprintf(stderr, "[FREE_SS] CROSS-THREAD: owner=%u my=%u slab_idx=%d\n", - meta->owner_tid, my_tid, slab_idx); - g_debug_free_count++; - } - if (__builtin_expect(g_tiny_safe_free, 0)) { - // Best-effort duplicate scan in remote stack (up to 64 nodes) - uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); - uintptr_t base = ss_base; - int scanned = 0; int dup = 0; - uintptr_t cur = head; - while (cur && scanned < 64) { - if ((cur < base) || (cur >= base + ss_size)) { - uintptr_t aux = tiny_remote_pack_diag(0xA200u, base, ss_size, cur); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - break; - } - if ((void*)cur == ptr) { dup = 1; break; } - if (__builtin_expect(g_remote_side_enable, 0)) { - if (!tiny_remote_sentinel_ok((void*)cur)) { - uintptr_t aux = tiny_remote_pack_diag(0xA202u, base, ss_size, cur); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); - uintptr_t observed = atomic_load_explicit((_Atomic uintptr_t*)(void*)cur, memory_order_relaxed); - tiny_remote_report_corruption("scan", (void*)cur, observed); - fprintf(stderr, - "[REMOTE_SENTINEL] cls=%u slab=%d cur=%p head=%p ptr=%p scanned=%d observed=0x%016" PRIxPTR " owner=%u used=%u freelist=%p remote_head=%p\n", - ss->size_class, - slab_idx, - (void*)cur, - (void*)head, - ptr, - scanned, - observed, - meta->owner_tid, - (unsigned)meta->used, - meta->freelist, - (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed)); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - break; - } - cur = tiny_remote_side_get(ss, slab_idx, (void*)cur); - } else { - if ((cur & (uintptr_t)(sizeof(void*) - 1)) != 0) { - uintptr_t aux = tiny_remote_pack_diag(0xA201u, base, ss_size, cur); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - break; - } - cur = (uintptr_t)(*(void**)(void*)cur); - } - scanned++; - } - if (dup) { - uintptr_t aux = tiny_remote_pack_diag(0xD1u, ss_base, ss_size, (uintptr_t)ptr); - 
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - } - if (__builtin_expect(meta->used == 0, 0)) { - uintptr_t aux = tiny_remote_pack_diag(0x01u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - static int g_ss_adopt_en2 = -1; // env cached - if (g_ss_adopt_en2 == -1) { - char* e = getenv("HAKMEM_TINY_SS_ADOPT"); - // 既定: Remote Queueを使う(1)。env指定時のみ上書き。 - g_ss_adopt_en2 = (e == NULL) ? 1 : ((*e != '0') ? 1 : 0); - if (__builtin_expect(debug_guard, 0)) { - fprintf(stderr, "[FREE_SS] g_ss_adopt_en2=%d (env='%s')\n", g_ss_adopt_en2, e ? e : "(null)"); - } - } - if (g_ss_adopt_en2) { - // Use remote queue - uintptr_t head_word = __atomic_load_n((uintptr_t*)ptr, __ATOMIC_RELAXED); - fprintf(stderr, "[REMOTE_PUSH_CALL] cls=%u slab=%d owner=%u my=%u ptr=%p used=%u remote_count=%u head=%p word=0x%016" PRIxPTR "\n", - ss->size_class, - slab_idx, - meta->owner_tid, - my_tid, - ptr, - (unsigned)meta->used, - atomic_load_explicit(&ss->remote_counts[slab_idx], memory_order_relaxed), - (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed), - head_word); - int dup_remote = tiny_remote_queue_contains_guard(ss, slab_idx, ptr); - if (!dup_remote && __builtin_expect(g_remote_side_enable, 0)) { - dup_remote = (head_word == TINY_REMOTE_SENTINEL) || tiny_remote_side_contains(ss, slab_idx, ptr); - } - if (__builtin_expect(head_word == TINY_REMOTE_SENTINEL && !dup_remote && g_debug_remote_guard, 0)) { - tiny_remote_watch_note("dup_scan_miss", ss, slab_idx, ptr, 0xA215u, my_tid, 0); - } - if (dup_remote) { - uintptr_t aux = tiny_remote_pack_diag(0xA214u, ss_base, ss_size, (uintptr_t)ptr); - tiny_remote_watch_mark(ptr, "dup_prevent", my_tid); - tiny_remote_watch_note("dup_prevent", ss, slab_idx, ptr, 0xA214u, my_tid, 0); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - if (__builtin_expect(g_remote_side_enable && (head_word & 0xFFFFu) == 0x6261u, 0)) { - // TLS guard scribble detected on the node's first word → same-pointer double free across routes - uintptr_t aux = tiny_remote_pack_diag(0xA213u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - tiny_remote_watch_mark(ptr, "pre_push", my_tid); - tiny_remote_watch_note("pre_push", ss, slab_idx, ptr, 0xA231u, my_tid, 0); - tiny_remote_report_corruption("pre_push", ptr, head_word); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { - tiny_remote_watch_note("free_remote", ss, slab_idx, ptr, 0xA232u, my_tid, 0); - } - int was_empty = ss_remote_push(ss, slab_idx, ptr); - meta->used--; - ss_active_dec_one(ss); - if (was_empty) { - ss_partial_publish((int)ss->size_class, ss); - } - } else { - // Fallback: direct freelist push (legacy) - fprintf(stderr, "[FREE_SS] Using LEGACY freelist push (not remote queue)\n"); - void* prev = meta->freelist; - *(void**)ptr = prev; - meta->freelist = ptr; - do { - static int g_mask_en = -1; - if (__builtin_expect(g_mask_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); - g_mask_en = (e && *e && *e != '0') ? 
1 : 0; - } - if (__builtin_expect(g_mask_en, 0) && prev == NULL) { - uint32_t bit = (1u << slab_idx); - atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); - } - } while (0); - meta->used--; - ss_active_dec_one(ss); - if (prev == NULL) { - ss_partial_publish((int)ss->size_class, ss); - } - } - - // 空検出は別途(ホットパス除外) - } -} +#include "tiny_superslab_alloc.inc.h" +#include "tiny_superslab_free.inc.h" void hak_tiny_free(void* ptr) { if (!ptr || !g_tiny_initialized) return; @@ -1474,6 +276,13 @@ void hak_tiny_free(void* ptr) { fast_ss = hak_super_lookup(ptr); if (fast_ss && fast_ss->magic == SUPERSLAB_MAGIC) { fast_class_idx = fast_ss->size_class; + // BUGFIX: Validate size_class before using as array index (prevents OOB = 85% of FREE_TO_SS SEGV) + if (__builtin_expect(fast_class_idx < 0 || fast_class_idx >= TINY_NUM_CLASSES, 0)) { + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, 0xF0, ptr, (uintptr_t)fast_class_idx); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + fast_ss = NULL; + fast_class_idx = -1; + } } else { fast_ss = NULL; } @@ -1515,6 +324,12 @@ void hak_tiny_free(void* ptr) { } } if (ss && ss->magic == SUPERSLAB_MAGIC) { + // BUGFIX: Validate size_class before using as array index (prevents OOB) + if (__builtin_expect(ss->size_class < 0 || ss->size_class >= TINY_NUM_CLASSES, 0)) { + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, 0xF2, ptr, (uintptr_t)ss->size_class); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } // Direct SuperSlab free (avoid second lookup TOCTOU) hak_tiny_free_superslab(ptr, ss); HAK_STAT_FREE(ss->size_class); diff --git a/core/tiny_api.h b/core/tiny_api.h new file mode 100644 index 00000000..353e042e --- /dev/null +++ b/core/tiny_api.h @@ -0,0 +1,12 @@ +// tiny_api.h - API headers for Tiny allocator +// Consolidates Phase 2B API modules + +#ifndef TINY_API_H +#define TINY_API_H + +#include "hakmem_tiny_stats_api.h" // Phase 2B: Stats API +#include "hakmem_tiny_query_api.h" // Phase 2B-1: Query API +#include "hakmem_tiny_rss_api.h" // Phase 2B-2: RSS Utils +#include "hakmem_tiny_registry_api.h" // Phase 2B-3: Registry + +#endif // TINY_API_H diff --git a/core/tiny_free_magazine.inc.h b/core/tiny_free_magazine.inc.h new file mode 100644 index 00000000..85358c36 --- /dev/null +++ b/core/tiny_free_magazine.inc.h @@ -0,0 +1,420 @@ +// tiny_free_magazine.inc.h - Magazine Layer for hak_tiny_free_with_slab() +// Purpose: TLS caching (TinyQuickSlot, TLS SLL, Magazine) and spill logic +// Extracted from: hakmem_tiny_free.inc lines 208-620 +// Box Theory: Box 5 (TLS Cache) integration +// +// Context: This file is #included within hak_tiny_free_with_slab() function body +// Prerequisites: ss, meta, class_idx, ptr variables must be defined in calling scope + +#if !HAKMEM_BUILD_RELEASE + // SuperSlab uses Magazine for TLS caching (same as TinySlab) + tiny_small_mags_init_once(); + if (class_idx > 3) tiny_mag_init_if_needed(class_idx); + TinyTLSMag* mag = &g_tls_mags[class_idx]; + int cap = mag->cap; + + // 32/64B: SLL優先(mag優先は無効化) + // Prefer TinyQuickSlot (compile-out if HAKMEM_TINY_NO_QUICK) +#if !defined(HAKMEM_TINY_NO_QUICK) + if (g_quick_enable && class_idx <= 4) { + TinyQuickSlot* qs = &g_tls_quick[class_idx]; + if (__builtin_expect(qs->top < QUICK_CAP, 1)) { + qs->items[qs->top++] = ptr; + HAK_STAT_FREE(class_idx); + return; + } + } +#endif + + // Fast path: TLS SLL push for hottest classes + if (!g_tls_list_enable && g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, 
(uint32_t)cap)) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + // BUGFIX: Decrement used counter (was missing, causing Fail-Fast on next free) + meta->used--; + // Active → Inactive: count down immediately (TLS保管中は"使用中"ではない) + ss_active_dec_one(ss); + HAK_TP1(sll_push, class_idx); + tiny_debug_ring_record(TINY_RING_EVENT_FREE_LOCAL, (uint16_t)class_idx, ptr, 3); + HAK_STAT_FREE(class_idx); + return; + } + + // Next: Magazine push(必要ならmag→SLLへバルク転送で空きを作る) + // Hysteresis: allow slight overfill before deciding to spill under lock + if (mag->top >= cap && g_spill_hyst > 0) { + (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); + } + if (mag->top < cap + g_spill_hyst) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = NULL; // SuperSlab owner not a TinySlab; leave NULL +#endif + mag->top++; +#if HAKMEM_DEBUG_COUNTERS + g_magazine_push_count++; // Phase 7.6: Track pushes +#endif + // Active → Inactive: decrement now(アプリ解放時に非アクティブ扱い) + ss_active_dec_one(ss); + HAK_TP1(mag_push, class_idx); + tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 2); + HAK_STAT_FREE(class_idx); + return; + } + + // Background spill: queue to BG thread instead of locking (when enabled) + if (g_bg_spill_enable) { + uint32_t qlen = atomic_load_explicit(&g_bg_spill_len[class_idx], memory_order_relaxed); + if ((int)qlen < g_bg_spill_target) { + // Build a small chain: include current ptr and pop from mag up to limit + int limit = g_bg_spill_max_batch; + if (limit > cap/2) limit = cap/2; + if (limit > 32) limit = 32; // keep free-path bounded + void* head = ptr; + *(void**)head = NULL; + void* tail = head; // current tail + int taken = 1; + while (taken < limit && mag->top > 0) { + void* p2 = mag->items[--mag->top].ptr; + *(void**)p2 = head; + head = p2; + taken++; + } + // Push chain to spill queue (single CAS) + bg_spill_push_chain(class_idx, head, tail, taken); + tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 3); + HAK_STAT_FREE(class_idx); + return; + } + } + + // Spill half (SuperSlab version - simpler than TinySlab) + pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; + hkm_prof_begin(NULL); + pthread_mutex_lock(lock); + // Batch spill: reduce lock frequency and work per call + int spill = cap / 2; + int over = mag->top - (cap + g_spill_hyst); + if (over > 0 && over < spill) spill = over; + + for (int i = 0; i < spill && mag->top > 0; i++) { + TinyMagItem it = mag->items[--mag->top]; + + // Phase 7.6: SuperSlab spill - return to freelist + SuperSlab* owner_ss = hak_super_lookup(it.ptr); + if (owner_ss && owner_ss->magic == SUPERSLAB_MAGIC) { + // Direct freelist push (same as old hak_tiny_free_superslab) + int slab_idx = slab_index_for(owner_ss, it.ptr); + // BUGFIX: Validate slab_idx before array access (prevents OOB) + if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(owner_ss)) { + continue; // Skip invalid index + } + TinySlabMeta* meta = &owner_ss->slabs[slab_idx]; + *(void**)it.ptr = meta->freelist; + meta->freelist = it.ptr; + meta->used--; + // Decrement SuperSlab active counter (spill returns blocks to SS) + ss_active_dec_one(owner_ss); + + // Phase 8.4: Empty SuperSlab detection (will use meta->used scan) + // TODO: Implement scan-based empty detection + // Empty SuperSlab detection/munmapは別途フラッシュAPIで実施(ホットパスから除外) + } + } + + pthread_mutex_unlock(lock); + hkm_prof_end(ss_time, HKP_TINY_SPILL, &tss); + + // Adaptive 
increase of cap after spill + int max_cap = tiny_cap_max_for_class(class_idx); + if (mag->cap < max_cap) { + int new_cap = mag->cap + (mag->cap / 2); + if (new_cap > max_cap) new_cap = max_cap; + if (new_cap > TINY_TLS_MAG_CAP) new_cap = TINY_TLS_MAG_CAP; + mag->cap = new_cap; + } + + // Finally, try FastCache push first (≤128B) — compile-out if HAKMEM_TINY_NO_FRONT_CACHE +#if !defined(HAKMEM_TINY_NO_FRONT_CACHE) + if (g_fastcache_enable && class_idx <= 4) { + if (fastcache_push(class_idx, ptr)) { + HAK_TP1(front_push, class_idx); + HAK_STAT_FREE(class_idx); + return; + } + } +#endif + // Then TLS SLL if room, else magazine + if (g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, (uint32_t)mag->cap)) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + } else { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + +#if HAKMEM_DEBUG_COUNTERS + g_magazine_push_count++; // Phase 7.6: Track pushes +#endif + HAK_STAT_FREE(class_idx); + return; +#endif // HAKMEM_BUILD_RELEASE + } + + // Phase 7.6: TinySlab path (original) + //g_tiny_free_with_slab_count++; // Phase 7.6: Track calls - DISABLED due to segfault + // Same-thread → TLS magazine; remote-thread → MPSC stack + if (pthread_equal(slab->owner_tid, tiny_self_pt())) { + int class_idx = slab->class_idx; + + if (g_tls_list_enable) { + TinyTLSList* tls = &g_tls_lists[class_idx]; + uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); + if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { + tiny_tls_refresh_params(class_idx, tls); + } + // TinyHotMag front push(8/16/32B, A/B) + if (__builtin_expect(g_hotmag_enable && class_idx <= 2, 1)) { + if (hotmag_push(class_idx, ptr)) { + HAK_STAT_FREE(class_idx); + return; + } + } + if (tls->count < tls->cap) { + tiny_tls_list_guard_push(class_idx, tls, ptr); + tls_list_push(tls, ptr); + HAK_STAT_FREE(class_idx); + return; + } + seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); + if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { + tiny_tls_refresh_params(class_idx, tls); + } + tiny_tls_list_guard_push(class_idx, tls, ptr); + tls_list_push(tls, ptr); + if (tls_list_should_spill(tls)) { + tls_list_spill_excess(class_idx, tls); + } + HAK_STAT_FREE(class_idx); + return; + } + + tiny_mag_init_if_needed(class_idx); + TinyTLSMag* mag = &g_tls_mags[class_idx]; + int cap = mag->cap; + // 32/64B: SLL優先(mag優先は無効化) + // Fast path: FastCache push (preferred for ≤128B), then TLS SLL + if (g_fastcache_enable && class_idx <= 4) { + if (fastcache_push(class_idx, ptr)) { + HAK_STAT_FREE(class_idx); + return; + } + } + // Fast path: TLS SLL push (preferred) + if (!g_tls_list_enable && g_tls_sll_enable && class_idx <= 5) { + uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)cap); + if (g_tls_sll_count[class_idx] < sll_cap) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + HAK_STAT_FREE(class_idx); + return; + } + } + // Next: if magazine has room, push immediately and return(満杯ならmag→SLLへバルク) + if (mag->top >= cap) { + (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); + } + // Remote-drain can be handled opportunistically on future calls. 
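// --- Illustrative sketch (not part of the patch): bulk_mag_to_sll_if_room() above makes
// room in the array-based TLS magazine by moving a batch of pointers into the TLS
// singly-linked list, without taking the class lock. A condensed version under assumed
// types; capacities and names are placeholders.
#include <stddef.h>
#include <stdint.h>

typedef struct { void* items[256]; int top; } SketchMag;
typedef struct { void* head; uint32_t count; uint32_t cap; } SketchSLL;

// Move at most 'want' entries from the magazine to the SLL; returns the number moved.
static int sketch_bulk_mag_to_sll(SketchMag* mag, SketchSLL* sll, int want) {
    int moved = 0;
    while (moved < want && mag->top > 0 && sll->count < sll->cap) {
        void* p = mag->items[--mag->top];
        *(void**)p = sll->head;   // the block's first word is the SLL link
        sll->head = p;
        sll->count++;
        moved++;
    }
    return moved;
}
// --- End sketch.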
+ if (mag->top < cap) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + +#if HAKMEM_DEBUG_COUNTERS + g_magazine_push_count++; // Phase 7.6: Track pushes +#endif + // Note: SuperSlab uses separate path (slab == NULL branch above) + HAK_STAT_FREE(class_idx); // Phase 3 + return; + } + // Magazine full: before spilling, opportunistically drain remotes once under lock. + if (atomic_load_explicit(&slab->remote_count, memory_order_relaxed) >= (unsigned)g_remote_drain_thresh_per_class[class_idx] || atomic_load_explicit(&slab->remote_head, memory_order_acquire)) { + pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; + pthread_mutex_lock(lock); + HAK_TP1(remote_drain, class_idx); + tiny_remote_drain_locked(slab); + pthread_mutex_unlock(lock); + } + // Spill half under class lock + pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; + pthread_mutex_lock(lock); + int spill = cap / 2; + + // Phase 4.2: High-water threshold for gating Phase 4 logic + int high_water = (cap * 3) / 4; // 75% of capacity + + for (int i = 0; i < spill && mag->top > 0; i++) { + TinyMagItem it = mag->items[--mag->top]; + + // Phase 7.6: Check for SuperSlab first (mixed Magazine support) + SuperSlab* ss_owner = hak_super_lookup(it.ptr); + if (ss_owner && ss_owner->magic == SUPERSLAB_MAGIC) { + // SuperSlab spill - return to freelist + int slab_idx = slab_index_for(ss_owner, it.ptr); + // BUGFIX: Validate slab_idx before array access (prevents OOB) + if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss_owner)) { + HAK_STAT_FREE(class_idx); + continue; // Skip invalid index + } + TinySlabMeta* meta = &ss_owner->slabs[slab_idx]; + *(void**)it.ptr = meta->freelist; + meta->freelist = it.ptr; + meta->used--; + // 空SuperSlab処理はフラッシュ/バックグラウンドで対応(ホットパス除外) + HAK_STAT_FREE(class_idx); + continue; // Skip TinySlab processing + } + + TinySlab* owner = +#if HAKMEM_TINY_MAG_OWNER + it.owner; +#else + NULL; +#endif + if (!owner) { + owner = tls_active_owner_for_ptr(class_idx, it.ptr); + } + if (!owner) { + owner = hak_tiny_owner_slab(it.ptr); + } + if (!owner) continue; + + // Phase 4.2: Adaptive gating - skip Phase 4 when TLS Magazine is high-water + // Rationale: When mag->top >= 75%, next alloc will come from TLS anyway + // so pushing to mini-mag is wasted work + int is_high_water = (mag->top >= high_water); + + if (!is_high_water) { + // Low-water: Phase 4.1 logic (try mini-magazine first) + uint8_t cidx = owner->class_idx; // Option A: 1回だけ読む + TinySlab* tls_a = g_tls_active_slab_a[cidx]; + TinySlab* tls_b = g_tls_active_slab_b[cidx]; + + // Option B: Branch prediction hint (spill → TLS-active への戻りが likely) + if (__builtin_expect((owner == tls_a || owner == tls_b) && + !mini_mag_is_full(&owner->mini_mag), 1)) { + // Fast path: mini-magazineに戻す(bitmap触らない) + mini_mag_push(&owner->mini_mag, it.ptr); + HAK_TP1(spill_tiny, cidx); + HAK_STAT_FREE(cidx); + continue; // bitmap操作スキップ + } + } + // High-water or Phase 4.1 mini-mag full: fall through to bitmap + + // Slow path: bitmap直接書き込み(既存ロジック) + size_t bs = g_tiny_class_sizes[owner->class_idx]; + int idx = ((uintptr_t)it.ptr - (uintptr_t)owner->base) / bs; + if (hak_tiny_is_used(owner, idx)) { + hak_tiny_set_free(owner, idx); + int was_full = (owner->free_count == 0); + owner->free_count++; + if (was_full) move_to_free_list(owner->class_idx, owner); + if (owner->free_count == owner->total_count) { + // If this slab is TLS-active for this thread, clear the pointer before releasing + if 
(g_tls_active_slab_a[owner->class_idx] == owner) g_tls_active_slab_a[owner->class_idx] = NULL; + if (g_tls_active_slab_b[owner->class_idx] == owner) g_tls_active_slab_b[owner->class_idx] = NULL; + TinySlab** headp = &g_tiny_pool.free_slabs[owner->class_idx]; + TinySlab* prev = NULL; + for (TinySlab* s = *headp; s; prev = s, s = s->next) { + if (s == owner) { if (prev) prev->next = s->next; else *headp = s->next; break; } + } + release_slab(owner); + } + HAK_TP1(spill_tiny, owner->class_idx); + HAK_STAT_FREE(owner->class_idx); + } + } + pthread_mutex_unlock(lock); + hkm_prof_end(ss, HKP_TINY_SPILL, &tss); + // Adaptive increase of cap after spill + int max_cap = tiny_cap_max_for_class(class_idx); + if (mag->cap < max_cap) { + int new_cap = mag->cap + (mag->cap / 2); + if (new_cap > max_cap) new_cap = max_cap; + if (new_cap > TINY_TLS_MAG_CAP) new_cap = TINY_TLS_MAG_CAP; + mag->cap = new_cap; + } + // Finally: prefer TinyQuickSlot → SLL → UltraFront → HotMag → Magazine(順序で局所性を確保) +#if !HAKMEM_BUILD_RELEASE && !defined(HAKMEM_TINY_NO_QUICK) + if (g_quick_enable && class_idx <= 4) { + TinyQuickSlot* qs = &g_tls_quick[class_idx]; + if (__builtin_expect(qs->top < QUICK_CAP, 1)) { + qs->items[qs->top++] = ptr; + } else if (g_tls_sll_enable) { + uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); + if (g_tls_sll_count[class_idx] < sll_cap2) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + } else if (!tiny_optional_push(class_idx, ptr)) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + } else { + if (!tiny_optional_push(class_idx, ptr)) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + } + } else +#endif + { + if (g_tls_sll_enable && class_idx <= 5) { + uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); + if (g_tls_sll_count[class_idx] < sll_cap2) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + } else if (!tiny_optional_push(class_idx, ptr)) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + } else { + if (!tiny_optional_push(class_idx, ptr)) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + } + } + +#if HAKMEM_DEBUG_COUNTERS + g_magazine_push_count++; // Phase 7.6: Track pushes +#endif + // Note: SuperSlab uses separate path (slab == NULL branch above) + HAK_STAT_FREE(class_idx); // Phase 3 + return; + } else { + tiny_remote_push(slab, ptr); + } +} diff --git a/core/tiny_superslab_alloc.inc.h b/core/tiny_superslab_alloc.inc.h new file mode 100644 index 00000000..65a953c6 --- /dev/null +++ b/core/tiny_superslab_alloc.inc.h @@ -0,0 +1,558 @@ +// tiny_superslab_alloc.inc.h - SuperSlab Allocation Layer +// Purpose: Slab allocation, refill, and adoption logic +// Extracted from: hakmem_tiny_free.inc lines 626-1170 +// Box Theory: Box 4 (Refill/Adoption) integration +// +// Public functions: +// - superslab_alloc_from_slab(): Allocate from specific slab (linear or freelist) +// - superslab_refill(): Refill TLS slab (adoption, registry scan, fresh alloc) +// - hak_tiny_alloc_superslab(): Main SuperSlab allocation entry point + +// ============================================================================ +// Phase 6.23: SuperSlab 
Allocation Helpers +// ============================================================================ + +// Phase 6.24: Allocate from SuperSlab slab (lazy freelist + linear allocation) +static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) { + TinySlabMeta* meta = &ss->slabs[slab_idx]; + + // Ensure remote queue is drained before handing blocks back to TLS + if (atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0) { + uint32_t self_tid = tiny_self_u32(); + SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid); + if (slab_is_valid(&h)) { + slab_drain_remote_full(&h); + int pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0; + if (__builtin_expect(pending, 0)) { + if (__builtin_expect(g_debug_remote_guard, 0)) { + uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed); + tiny_remote_watch_note("alloc_pending_remote", + ss, + slab_idx, + (void*)head, + 0xA243u, + self_tid, + 0); + } + slab_release(&h); + return NULL; + } + slab_release(&h); + } else { + if (__builtin_expect(g_debug_remote_guard, 0)) { + tiny_remote_watch_note("alloc_acquire_fail", + ss, + slab_idx, + meta, + 0xA244u, + self_tid, + 0); + } + return NULL; + } + } + + if (__builtin_expect(g_debug_remote_guard, 0)) { + uintptr_t head_pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); + if (head_pending != 0) { + tiny_remote_watch_note("alloc_remote_pending", + ss, + slab_idx, + (void*)head_pending, + 0xA247u, + tiny_self_u32(), + 1); + return NULL; + } + } + + // Phase 6.24: Linear allocation mode (freelist == NULL) + // This avoids the 4000-8000 cycle cost of building freelist on init + if (meta->freelist == NULL && meta->used < meta->capacity) { + // Linear allocation: sequential memory access (cache-friendly!) + size_t block_size = g_tiny_class_sizes[ss->size_class]; + void* slab_start = slab_data_start(ss, slab_idx); + + // First slab: skip SuperSlab header + if (slab_idx == 0) { + slab_start = (char*)slab_start + 1024; + } + + void* block = (char*)slab_start + (meta->used * block_size); + meta->used++; + tiny_remote_track_on_alloc(ss, slab_idx, block, "linear_alloc", 0); + tiny_remote_assert_not_remote(ss, slab_idx, block, "linear_alloc_ret", 0); + return block; // Fast path: O(1) pointer arithmetic + } + + // Freelist mode (after first free()) + if (meta->freelist) { + void* block = meta->freelist; + meta->freelist = *(void**)block; // Pop from freelist + meta->used++; + tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0); + tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0); + return block; + } + + return NULL; // Slab is full +} + +// Phase 6.24 & 7.6: Refill TLS SuperSlab (with unified TLS cache + deferred allocation) +static SuperSlab* superslab_refill(int class_idx) { +#if HAKMEM_DEBUG_COUNTERS + g_superslab_refill_calls_dbg[class_idx]++; +#endif + TinyTLSSlab* tls = &g_tls_slabs[class_idx]; + static int g_ss_adopt_en = -1; // env: HAKMEM_TINY_SS_ADOPT=1; default auto-on if remote seen + if (g_ss_adopt_en == -1) { + char* e = getenv("HAKMEM_TINY_SS_ADOPT"); + if (e) { + g_ss_adopt_en = (*e != '0') ? 1 : 0; + } else { + extern _Atomic int g_ss_remote_seen; + g_ss_adopt_en = (atomic_load_explicit(&g_ss_remote_seen, memory_order_relaxed) != 0) ? 1 : 0; + } + } + extern int g_adopt_cool_period; + extern __thread int g_tls_adopt_cd[]; + if (g_adopt_cool_period == -1) { + char* cd = getenv("HAKMEM_TINY_SS_ADOPT_COOLDOWN"); + int v = (cd ? 
atoi(cd) : 0); + if (v < 0) v = 0; if (v > 1024) v = 1024; + g_adopt_cool_period = v; + } + + static int g_superslab_refill_debug_once = 0; + SuperSlab* prev_ss = tls->ss; + TinySlabMeta* prev_meta = tls->meta; + uint8_t prev_slab_idx = tls->slab_idx; + uint8_t prev_active = prev_ss ? prev_ss->active_slabs : 0; + uint32_t prev_bitmap = prev_ss ? prev_ss->slab_bitmap : 0; + uint32_t prev_meta_used = prev_meta ? prev_meta->used : 0; + uint32_t prev_meta_cap = prev_meta ? prev_meta->capacity : 0; + int free_idx_attempted = -2; // -2 = not evaluated, -1 = none, >=0 = chosen + int reused_slabs = 0; + + // Optional: Mid-size simple refill to avoid multi-layer scans (class>=4) + do { + static int g_mid_simple_warn = 0; + if (class_idx >= 4 && tiny_mid_refill_simple_enabled()) { + // If current TLS has a SuperSlab, prefer taking a virgin slab directly + if (tls->ss) { + int tls_cap = ss_slabs_capacity(tls->ss); + if (tls->ss->active_slabs < tls_cap) { + int free_idx = superslab_find_free_slab(tls->ss); + if (free_idx >= 0) { + uint32_t my_tid = tiny_self_u32(); + superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); + tiny_tls_bind_slab(tls, tls->ss, free_idx); + return tls->ss; + } + } + } + // Otherwise allocate a fresh SuperSlab and bind first slab + SuperSlab* ssn = superslab_allocate((uint8_t)class_idx); + if (!ssn) { + if (!g_superslab_refill_debug_once && g_mid_simple_warn < 2) { + g_mid_simple_warn++; + int err = errno; + fprintf(stderr, "[DEBUG] mid_simple_refill OOM class=%d errno=%d\n", class_idx, err); + } + return NULL; + } + uint32_t my_tid = tiny_self_u32(); + superslab_init_slab(ssn, 0, g_tiny_class_sizes[class_idx], my_tid); + SuperSlab* old = tls->ss; + tiny_tls_bind_slab(tls, ssn, 0); + superslab_ref_inc(ssn); + if (old && old != ssn) { superslab_ref_dec(old); } + return ssn; + } + } while (0); + + + // First, try to adopt a published partial SuperSlab for this class + if (g_ss_adopt_en) { + if (g_adopt_cool_period > 0) { + if (g_tls_adopt_cd[class_idx] > 0) { + g_tls_adopt_cd[class_idx]--; + } else { + // eligible to adopt + } + } + if (g_adopt_cool_period == 0 || g_tls_adopt_cd[class_idx] == 0) { + SuperSlab* adopt = ss_partial_adopt(class_idx); + if (adopt && adopt->magic == SUPERSLAB_MAGIC) { + // ======================================================================== + // Quick Win #2: First-Fit Adopt (vs Best-Fit scoring all 32 slabs) + // For Larson, any slab with freelist works - no need to score all 32! + // Expected improvement: -3,000 cycles (from 32 atomic loads + 32 scores) + // ======================================================================== + int adopt_cap = ss_slabs_capacity(adopt); + int best = -1; + for (int s = 0; s < adopt_cap; s++) { + TinySlabMeta* m = &adopt->slabs[s]; + // Quick check: Does this slab have a freelist? + if (m->freelist) { + // Yes! Try to acquire it immediately (first-fit) + best = s; + break; // ✅ OPTIMIZATION: Stop at first slab with freelist! 
+ } + // Optional: Also check remote_heads if we want to prioritize those + // (But for Larson, freelist is sufficient) + } + if (best >= 0) { + // Box: Try to acquire ownership atomically + uint32_t self = tiny_self_u32(); + SlabHandle h = slab_try_acquire(adopt, best, self); + if (slab_is_valid(&h)) { + slab_drain_remote_full(&h); + if (slab_remote_pending(&h)) { + if (__builtin_expect(g_debug_remote_guard, 0)) { + uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); + tiny_remote_watch_note("adopt_remote_pending", + h.ss, + h.slab_idx, + (void*)head, + 0xA255u, + self, + 0); + } + // Remote still pending; give up adopt path and fall through to normal refill. + slab_release(&h); + } + + // Box 4 Boundary: bind must guarantee remote_head==0 + // slab_is_safe_to_bind() checks this in a TOCTOU-safe way + if (slab_is_safe_to_bind(&h)) { + // Optional: move a few nodes to Front SLL to boost next hits + tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); + // Safe to bind (freelist present && remote_head==0 guaranteed) + tiny_tls_bind_slab(tls, h.ss, h.slab_idx); + if (g_adopt_cool_period > 0) { + g_tls_adopt_cd[class_idx] = g_adopt_cool_period; + } + return h.ss; + } + // Not safe to bind (no freelist or remote pending) → abort adopt + slab_release(&h); + } + // Failed to acquire or no freelist - continue searching + } + // If no freelist found, ignore and continue (optional: republish) + } + } + } + + // Phase 7.6 Step 4: Check existing SuperSlab with priority order + if (tls->ss) { + // Priority 1: Reuse slabs with freelist (already freed blocks) + int tls_cap = ss_slabs_capacity(tls->ss); + uint32_t nonempty_mask = 0; + do { + static int g_mask_en = -1; + if (__builtin_expect(g_mask_en == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); + g_mask_en = (e && *e && *e != '0') ? 1 : 0; + } + if (__builtin_expect(g_mask_en, 0)) { + nonempty_mask = atomic_load_explicit(&tls->ss->freelist_mask, memory_order_acquire); + break; + } + for (int i = 0; i < tls_cap; i++) { + if (tls->ss->slabs[i].freelist) nonempty_mask |= (1u << i); + } + } while (0); + + // O(1) lookup: scan mask with ctz (1 instruction!) + while (__builtin_expect(nonempty_mask != 0, 1)) { + int i = __builtin_ctz(nonempty_mask); // Find first non-empty slab (O(1)) + nonempty_mask &= ~(1u << i); // Clear bit for next iteration + + // FIX #1 DELETED (Race condition fix): + // Previous drain without ownership caused concurrent freelist corruption. + // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). + // Remote frees will be drained when the slab is adopted (see tiny_refill.h paths).
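The Priority 1 reuse path builds (or, when HAKMEM_TINY_FREELIST_MASK is enabled, loads) a bitmask of slabs whose freelist is non-empty and walks it with __builtin_ctz rather than rescanning every TinySlabMeta. Below is a small standalone sketch of that mask walk; the try_slab callback is a stand-in for the real acquire/drain/bind sequence, not part of this patch.

#include <stdbool.h>
#include <stdint.h>

/* Walk the set bits of 'mask' from lowest to highest and hand each candidate
 * slab index to 'try_slab'. Stops at the first accepted candidate. */
static bool scan_nonempty_mask(uint32_t mask,
                               bool (*try_slab)(int slab_idx, void* ctx),
                               void* ctx) {
    while (mask != 0) {
        int idx = __builtin_ctz(mask);   /* lowest set bit, O(1) */
        mask &= mask - 1;                /* clear it for the next round */
        if (try_slab(idx, ctx))
            return true;                 /* a slab was bound; stop scanning */
    }
    return false;                        /* nothing usable; fall through to other refill paths */
}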
+ + uint32_t self_tid = tiny_self_u32(); + SlabHandle h = slab_try_acquire(tls->ss, i, self_tid); + if (slab_is_valid(&h)) { + if (slab_remote_pending(&h)) { + slab_drain_remote_full(&h); + if (__builtin_expect(g_debug_remote_guard, 0)) { + uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); + tiny_remote_watch_note("reuse_remote_pending", + h.ss, + h.slab_idx, + (void*)head, + 0xA254u, + self_tid, + 0); + } + slab_release(&h); + continue; + } + // Box 4 Boundary: bind must guarantee remote_head==0 + if (slab_is_safe_to_bind(&h)) { + // Optional: move a few nodes to Front SLL to boost next hits + tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); + reused_slabs = 1; + tiny_tls_bind_slab(tls, h.ss, h.slab_idx); + return h.ss; + } + // Not safe to bind → try the next slab + slab_release(&h); + } + } + + // Priority 2: Use unused slabs (virgin slabs) + if (tls->ss->active_slabs < tls_cap) { + // Find next free slab + int free_idx = superslab_find_free_slab(tls->ss); + free_idx_attempted = free_idx; + if (free_idx >= 0) { + // Initialize this slab + uint32_t my_tid = tiny_self_u32(); + superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); + + // Update TLS cache (unified update) + tiny_tls_bind_slab(tls, tls->ss, free_idx); + + return tls->ss; + } + } + } + + // Try to adopt a partial SuperSlab from registry (one-shot, cheap scan) + // This reduces pressure to allocate new SS when other threads freed blocks. + // Phase 6: Registry Optimization - Use per-class registry for O(class_size) scan + if (!tls->ss) { + // Phase 6: Use per-class registry (262K → ~10-100 entries per class!) + extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; + extern int g_super_reg_class_size[TINY_NUM_CLASSES]; + + const int scan_max = tiny_reg_scan_max(); + int reg_size = g_super_reg_class_size[class_idx]; + int scan_limit = (scan_max < reg_size) ? scan_max : reg_size; + + for (int i = 0; i < scan_limit; i++) { + SuperSlab* ss = g_super_reg_by_class[class_idx][i]; + if (!ss || ss->magic != SUPERSLAB_MAGIC) continue; + // Note: class_idx check is not needed (per-class registry!)
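Both the adopt path and the reuse loop above follow the same ownership discipline: acquire the slab, drain its remote queue while owning it, and bind it to TLS only if the remote head is then observed empty; otherwise release and move on. The sketch below condenses that ordering with a simplified slab type and hypothetical drain/bind callbacks; it is not the real SlabHandle API, and the CAS-from-zero ownership model is an assumption made for the example.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for the slab metadata; the real TinySlabMeta/SlabHandle
 * layout is richer. 'drain' and 'bind' are placeholder callbacks. */
typedef struct {
    _Atomic uint32_t  owner_tid;     /* 0 = unowned */
    _Atomic uintptr_t remote_head;   /* MPSC stack of cross-thread frees */
    void*             freelist;      /* owner-private freelist */
} ExampleSlab;

static bool adopt_slab(ExampleSlab* s, uint32_t my_tid,
                       void (*drain)(ExampleSlab*),
                       void (*bind)(ExampleSlab*)) {
    uint32_t expected = 0;
    if (!atomic_compare_exchange_strong(&s->owner_tid, &expected, my_tid))
        return false;                                   /* owned elsewhere: skip this slab */
    drain(s);                                           /* owner-only: move remote frees into the freelist */
    if (s->freelist != NULL &&
        atomic_load_explicit(&s->remote_head, memory_order_acquire) == 0) {
        bind(s);                                        /* safe: freelist present, no pending remotes */
        return true;
    }
    atomic_store_explicit(&s->owner_tid, 0, memory_order_release);  /* not usable: give ownership back */
    return false;
}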
+ + // Pick first slab with freelist (Box 4: acquire ownership + remote check) + int reg_cap = ss_slabs_capacity(ss); + uint32_t self_tid = tiny_self_u32(); + for (int s = 0; s < reg_cap; s++) { + if (ss->slabs[s].freelist) { + SlabHandle h = slab_try_acquire(ss, s, self_tid); + if (slab_is_valid(&h)) { + slab_drain_remote_full(&h); + if (slab_is_safe_to_bind(&h)) { + tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); + tiny_tls_bind_slab(tls, ss, s); + return ss; + } + slab_release(&h); + } + } + } + } + } + + // Must-adopt-before-mmap gate: attempt sticky/hot/bench/mailbox/registry small-window + { + SuperSlab* gate_ss = tiny_must_adopt_gate(class_idx, tls); + if (gate_ss) return gate_ss; + } + + // Allocate new SuperSlab + SuperSlab* ss = superslab_allocate((uint8_t)class_idx); + if (!ss) { + if (!g_superslab_refill_debug_once) { + g_superslab_refill_debug_once = 1; + int err = errno; + fprintf(stderr, + "[DEBUG] superslab_refill NULL detail: class=%d prev_ss=%p active=%u bitmap=0x%08x prev_meta=%p used=%u cap=%u slab_idx=%u reused_freelist=%d free_idx=%d errno=%d\n", + class_idx, + (void*)prev_ss, + (unsigned)prev_active, + prev_bitmap, + (void*)prev_meta, + (unsigned)prev_meta_used, + (unsigned)prev_meta_cap, + (unsigned)prev_slab_idx, + reused_slabs, + free_idx_attempted, + err); + } + return NULL; // OOM + } + + // Initialize first slab + uint32_t my_tid = tiny_self_u32(); + superslab_init_slab(ss, 0, g_tiny_class_sizes[class_idx], my_tid); + + // Cache in unified TLS (and release the previous SS reference) + SuperSlab* old = tls->ss; + tiny_tls_bind_slab(tls, ss, 0); + // Maintain refcount (count the TLS reference so empty SuperSlabs can be reclaimed later) + superslab_ref_inc(ss); + if (old && old != ss) { + superslab_ref_dec(old); + } + + return ss; +} + +// Phase 6.24: SuperSlab-based allocation (TLS unified, Medium fix) +static inline void* hak_tiny_alloc_superslab(int class_idx) { + // DEBUG: Function entry trace (gated to avoid ring spam) + do { + static int g_alloc_ring = -1; + if (__builtin_expect(g_alloc_ring == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_ALLOC_RING"); + g_alloc_ring = (e && *e && *e != '0') ? 1 : 0; + } + if (g_alloc_ring) { + tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, 0x01, (void*)(uintptr_t)class_idx, 0); + } + } while (0); + + // MidTC fast path: for 128..1024B (class>=4), prefer the TLS tcache first + do { + void* mp = midtc_pop(class_idx); + if (mp) { + HAK_RET_ALLOC(class_idx, mp); + } + } while (0); + + // Phase 6.24: 1 TLS read (down from 3) + TinyTLSSlab* tls = &g_tls_slabs[class_idx]; + + TinySlabMeta* meta = tls->meta; + int slab_idx = tls->slab_idx; + if (meta && slab_idx >= 0 && tls->ss) { + // A/B: Relaxed read for remote head presence check + static int g_alloc_remote_relax = -1; // env: HAKMEM_TINY_ALLOC_REMOTE_RELAX=1 → relaxed + if (__builtin_expect(g_alloc_remote_relax == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_ALLOC_REMOTE_RELAX"); + g_alloc_remote_relax = (e && *e && *e != '0') ? 1 : 0; + } + uintptr_t pending = atomic_load_explicit(&tls->ss->remote_heads[slab_idx], + g_alloc_remote_relax ? memory_order_relaxed + : memory_order_acquire); + if (__builtin_expect(pending != 0, 0)) { + uint32_t self_tid = tiny_self_u32(); + if (ss_owner_try_acquire(meta, self_tid)) { + _ss_remote_drain_to_freelist_unsafe(tls->ss, slab_idx, meta); + } + } + } + + // FIX #2 DELETED (Race condition fix): + // Previous drain-all-slabs without ownership caused concurrent freelist corruption. + // Problem: Thread A owns slab 5, Thread B drains all slabs including 5 → both modify freelist → crash.
+ // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). + // Remote frees will be drained when the slab is adopted via refill paths. + + // Fast path: Direct metadata access (no repeated TLS reads!) + if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { + // Linear allocation (lazy init) + size_t block_size = g_tiny_class_sizes[tls->ss->size_class]; + void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); + meta->used++; + // Track active blocks in SuperSlab for conservative reclamation + ss_active_inc(tls->ss); + // Route: slab linear + ROUTE_MARK(11); ROUTE_COMMIT(class_idx, 0x60); + HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead + } + + if (meta && meta->freelist) { + // Freelist allocation + void* block = meta->freelist; + // Safety: bounds/alignment check (debug) + if (__builtin_expect(g_tiny_safe_free, 0)) { + size_t blk = g_tiny_class_sizes[tls->ss->size_class]; + uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx); + uintptr_t delta = (uintptr_t)block - (uintptr_t)base; + int align_ok = ((delta % blk) == 0); + int range_ok = (delta / blk) < meta->capacity; + if (!align_ok || !range_ok) { + uintptr_t info = ((uintptr_t)(align_ok ? 1u : 0u) << 32) | (uint32_t)(range_ok ? 1u : 0u); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)tls->ss->size_class, block, info | 0xA100u); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return NULL; } + return NULL; + } + } + void* next = *(void**)block; + meta->freelist = next; + meta->used++; + // Optional: clear freelist bit when becomes empty + do { + static int g_mask_en = -1; + if (__builtin_expect(g_mask_en == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); + g_mask_en = (e && *e && *e != '0') ? 1 : 0; + } + if (__builtin_expect(g_mask_en, 0) && next == NULL) { + uint32_t bit = (1u << slab_idx); + atomic_fetch_and_explicit(&tls->ss->freelist_mask, ~bit, memory_order_release); + } + } while (0); + // Track active blocks in SuperSlab for conservative reclamation + ss_active_inc(tls->ss); + // Route: slab freelist + ROUTE_MARK(12); ROUTE_COMMIT(class_idx, 0x61); + HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead + } + + // Slow path: Refill TLS slab + SuperSlab* ss = superslab_refill(class_idx); + if (!ss) { + static int log_oom = 0; + if (log_oom < 2) { fprintf(stderr, "[DEBUG] superslab_refill returned NULL (OOM)\n"); log_oom++; } + return NULL; // OOM + } + + // Retry allocation (metadata already cached in superslab_refill) + meta = tls->meta; + + // DEBUG: Check each condition (disabled for benchmarks) + // static int log_retry = 0; + // if (log_retry < 2) { + // fprintf(stderr, "[DEBUG] Retry alloc: meta=%p, freelist=%p, used=%u, capacity=%u, slab_base=%p\n", + // (void*)meta, meta ? meta->freelist : NULL, + // meta ? meta->used : 0, meta ? 
meta->capacity : 0, + // (void*)tls->slab_base); + // log_retry++; + // } + + if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { + size_t block_size = g_tiny_class_sizes[ss->size_class]; + void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); + + // Disabled for benchmarks + // static int log_success = 0; + // if (log_success < 2) { + // fprintf(stderr, "[DEBUG] Superslab alloc SUCCESS: ptr=%p, class=%d, used=%u->%u\n", + // block, class_idx, meta->used, meta->used + 1); + // log_success++; + // } + + meta->used++; + // Track active blocks in SuperSlab for conservative reclamation + ss_active_inc(ss); + HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead + } + + // Disabled for benchmarks + // static int log_fail = 0; + // if (log_fail < 2) { + // fprintf(stderr, "[DEBUG] Retry alloc FAILED - returning NULL\n"); + // log_fail++; + // } + return NULL; +} diff --git a/core/tiny_superslab_free.inc.h b/core/tiny_superslab_free.inc.h new file mode 100644 index 00000000..5331444c --- /dev/null +++ b/core/tiny_superslab_free.inc.h @@ -0,0 +1,313 @@ +// tiny_superslab_free.inc.h - SuperSlab Free Layer +// Purpose: Same-thread and cross-thread free handling +// Extracted from: hakmem_tiny_free.inc lines 1171-1475 +// Box Theory: Box 6 (Free Fast Path) + Box 2 (Remote Queue) integration +// +// Public functions: +// - hak_tiny_free_superslab(): Main SuperSlab free entry point + +// Phase 6.22-B: SuperSlab fast free path +static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) { + ROUTE_MARK(16); // free_enter + HAK_DBG_INC(g_superslab_free_count); // Phase 7.6: Track SuperSlab frees + // Get slab index (supports 1MB/2MB SuperSlabs) + int slab_idx = slab_index_for(ss, ptr); + size_t ss_size = (size_t)1ULL << ss->lg_size; + uintptr_t ss_base = (uintptr_t)ss; + if (__builtin_expect(slab_idx < 0, 0)) { + uintptr_t aux = tiny_remote_pack_diag(0xBAD1u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + TinySlabMeta* meta = &ss->slabs[slab_idx]; + if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { + tiny_remote_watch_note("free_enter", ss, slab_idx, ptr, 0xA240u, tiny_self_u32(), 0); + extern __thread TinyTLSSlab g_tls_slabs[]; + tiny_alloc_dump_tls_state(ss->size_class, "watch_free_enter", &g_tls_slabs[ss->size_class]); +#if !HAKMEM_BUILD_RELEASE + extern __thread TinyTLSMag g_tls_mags[]; + TinyTLSMag* watch_mag = &g_tls_mags[ss->size_class]; + fprintf(stderr, + "[REMOTE_WATCH_MAG] cls=%u mag_top=%d cap=%d\n", + ss->size_class, + watch_mag->top, + watch_mag->cap); +#endif + } + // BUGFIX: Validate size_class before using as array index (prevents OOB) + if (__builtin_expect(ss->size_class < 0 || ss->size_class >= TINY_NUM_CLASSES, 0)) { + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, 0xF1, ptr, (uintptr_t)ss->size_class); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + if (__builtin_expect(g_tiny_safe_free, 0)) { + size_t blk = g_tiny_class_sizes[ss->size_class]; + uint8_t* base = tiny_slab_base_for(ss, slab_idx); + uintptr_t delta = (uintptr_t)ptr - (uintptr_t)base; + int cap_ok = (meta->capacity > 0) ? 
1 : 0; + int align_ok = (delta % blk) == 0; + int range_ok = cap_ok && (delta / blk) < meta->capacity; + if (!align_ok || !range_ok) { + uint32_t code = 0xA100u; + if (align_ok) code |= 0x2u; + if (range_ok) code |= 0x1u; + uintptr_t aux = tiny_remote_pack_diag(code, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + // Duplicate in freelist (best-effort scan up to 64) + void* scan = meta->freelist; int scanned = 0; int dup = 0; + while (scan && scanned < 64) { if (scan == ptr) { dup = 1; break; } scan = *(void**)scan; scanned++; } + if (dup) { + uintptr_t aux = tiny_remote_pack_diag(0xDFu, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + } + + // Phase 6.23: Same-thread check + uint32_t my_tid = tiny_self_u32(); + const int debug_guard = g_debug_remote_guard; + static __thread int g_debug_free_count = 0; + if (!g_tiny_force_remote && meta->owner_tid != 0 && meta->owner_tid == my_tid) { + ROUTE_MARK(17); // free_same_thread + // Fast path: Direct freelist push (same-thread) + if (0 && debug_guard && g_debug_free_count < 1) { + fprintf(stderr, "[FREE_SS] SAME-THREAD: owner=%u my=%u\n", + meta->owner_tid, my_tid); + g_debug_free_count++; + } + if (__builtin_expect(meta->used == 0, 0)) { + uintptr_t aux = tiny_remote_pack_diag(0x00u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "local_free_enter", my_tid); + if (!tiny_remote_guard_allow_local_push(ss, slab_idx, meta, ptr, "local_free", my_tid)) { + #include "box/free_remote_box.h" + int transitioned = tiny_free_remote_box(ss, slab_idx, meta, ptr, my_tid); + if (transitioned) { + extern unsigned long long g_remote_free_transitions[]; + g_remote_free_transitions[ss->size_class]++; + // Free-side route: remote transition observed + do { + static int g_route_free = -1; if (__builtin_expect(g_route_free == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_ROUTE_FREE"); + g_route_free = (e && *e && *e != '0') ? 1 : 0; } + if (g_route_free) route_free_commit((int)ss->size_class, (1ull<<18), 0xE2); + } while (0); + } + return; + } + // Optional: MidTC (TLS tcache for 128..1024B) — allow bypass via env HAKMEM_TINY_FREE_TO_SS=1 + do { + static int g_free_to_ss = -1; + if (__builtin_expect(g_free_to_ss == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREE_TO_SS"); + g_free_to_ss = (e && *e && *e != '0') ? 
1 : 0; // default OFF + } + if (!g_free_to_ss) { + int cls = (int)ss->size_class; + if (midtc_enabled() && cls >= 4) { + if (midtc_push(cls, ptr)) { + // Treat as returned to TLS cache (not SS freelist) + meta->used--; + ss_active_dec_one(ss); + return; + } + } + } + } while (0); + + #include "box/free_local_box.h" + // Perform freelist push (+first-free publish if applicable) + void* prev_before = meta->freelist; + tiny_free_local_box(ss, slab_idx, meta, ptr, my_tid); + if (prev_before == NULL) { + ROUTE_MARK(19); // first_free_transition + extern unsigned long long g_first_free_transitions[]; + g_first_free_transitions[ss->size_class]++; + ROUTE_MARK(20); // mailbox_publish + // Free-side route commit (one-shot) + do { + static int g_route_free = -1; if (__builtin_expect(g_route_free == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_ROUTE_FREE"); + g_route_free = (e && *e && *e != '0') ? 1 : 0; } + int cls = (int)ss->size_class; + if (g_route_free) route_free_commit(cls, (1ull<<19) | (1ull<<20), 0xE1); + } while (0); + } + + if (__builtin_expect(debug_guard, 0)) { + fprintf(stderr, "[REMOTE_LOCAL] cls=%u slab=%d owner=%u my=%u ptr=%p prev=%p used=%u\n", + ss->size_class, slab_idx, meta->owner_tid, my_tid, ptr, prev_before, meta->used); + } + + // Empty-slab detection is handled elsewhere (kept off the hot path) + } else { + ROUTE_MARK(18); // free_remote_transition + if (__builtin_expect(meta->owner_tid == my_tid && meta->owner_tid == 0, 0)) { + uintptr_t aux = tiny_remote_pack_diag(0xA300u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (debug_guard) { + fprintf(stderr, "[REMOTE_OWNER_ZERO] cls=%u slab=%d ptr=%p my=%u used=%u\n", + ss->size_class, slab_idx, ptr, my_tid, (unsigned)meta->used); + } + } + tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "remote_free_enter", my_tid); + // Slow path: Remote free (cross-thread) + if (0 && debug_guard && g_debug_free_count < 5) { + fprintf(stderr, "[FREE_SS] CROSS-THREAD: owner=%u my=%u slab_idx=%d\n", + meta->owner_tid, my_tid, slab_idx); + g_debug_free_count++; + } + if (__builtin_expect(g_tiny_safe_free, 0)) { + // Best-effort duplicate scan in remote stack (up to 64 nodes) + uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); + uintptr_t base = ss_base; + int scanned = 0; int dup = 0; + uintptr_t cur = head; + while (cur && scanned < 64) { + if ((cur < base) || (cur >= base + ss_size)) { + uintptr_t aux = tiny_remote_pack_diag(0xA200u, base, ss_size, cur); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + break; + } + if ((void*)cur == ptr) { dup = 1; break; } + if (__builtin_expect(g_remote_side_enable, 0)) { + if (!tiny_remote_sentinel_ok((void*)cur)) { + uintptr_t aux = tiny_remote_pack_diag(0xA202u, base, ss_size, cur); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); + uintptr_t observed = atomic_load_explicit((_Atomic uintptr_t*)(void*)cur, memory_order_relaxed); + tiny_remote_report_corruption("scan", (void*)cur, observed); + fprintf(stderr, + "[REMOTE_SENTINEL] cls=%u slab=%d cur=%p head=%p ptr=%p scanned=%d observed=0x%016" PRIxPTR " owner=%u used=%u freelist=%p remote_head=%p\n", + ss->size_class, + slab_idx, + (void*)cur, + (void*)head, + ptr, + scanned, + observed, + meta->owner_tid, + (unsigned)meta->used, + meta->freelist, + (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], 
memory_order_relaxed)); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + break; + } + cur = tiny_remote_side_get(ss, slab_idx, (void*)cur); + } else { + if ((cur & (uintptr_t)(sizeof(void*) - 1)) != 0) { + uintptr_t aux = tiny_remote_pack_diag(0xA201u, base, ss_size, cur); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + break; + } + cur = (uintptr_t)(*(void**)(void*)cur); + } + scanned++; + } + if (dup) { + uintptr_t aux = tiny_remote_pack_diag(0xD1u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + } + if (__builtin_expect(meta->used == 0, 0)) { + uintptr_t aux = tiny_remote_pack_diag(0x01u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + static int g_ss_adopt_en2 = -1; // env cached + if (g_ss_adopt_en2 == -1) { + char* e = getenv("HAKMEM_TINY_SS_ADOPT"); + // Default: use the remote queue (1). Only an explicit env setting overrides this. + g_ss_adopt_en2 = (e == NULL) ? 1 : ((*e != '0') ? 1 : 0); + if (__builtin_expect(debug_guard, 0)) { + fprintf(stderr, "[FREE_SS] g_ss_adopt_en2=%d (env='%s')\n", g_ss_adopt_en2, e ? e : "(null)"); + } + } + if (g_ss_adopt_en2) { + // Use remote queue + uintptr_t head_word = __atomic_load_n((uintptr_t*)ptr, __ATOMIC_RELAXED); + if (debug_guard) fprintf(stderr, "[REMOTE_PUSH_CALL] cls=%u slab=%d owner=%u my=%u ptr=%p used=%u remote_count=%u head=%p word=0x%016" PRIxPTR "\n", + ss->size_class, + slab_idx, + meta->owner_tid, + my_tid, + ptr, + (unsigned)meta->used, + atomic_load_explicit(&ss->remote_counts[slab_idx], memory_order_relaxed), + (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed), + head_word); + int dup_remote = tiny_remote_queue_contains_guard(ss, slab_idx, ptr); + if (!dup_remote && __builtin_expect(g_remote_side_enable, 0)) { + dup_remote = (head_word == TINY_REMOTE_SENTINEL) || tiny_remote_side_contains(ss, slab_idx, ptr); + } + if (__builtin_expect(head_word == TINY_REMOTE_SENTINEL && !dup_remote && g_debug_remote_guard, 0)) { + tiny_remote_watch_note("dup_scan_miss", ss, slab_idx, ptr, 0xA215u, my_tid, 0); + } + if (dup_remote) { + uintptr_t aux = tiny_remote_pack_diag(0xA214u, ss_base, ss_size, (uintptr_t)ptr); + tiny_remote_watch_mark(ptr, "dup_prevent", my_tid); + tiny_remote_watch_note("dup_prevent", ss, slab_idx, ptr, 0xA214u, my_tid, 0); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + if (__builtin_expect(g_remote_side_enable && (head_word & 0xFFFFu) == 0x6261u, 0)) { + // TLS guard scribble detected on the node's first word → same-pointer double free across routes + uintptr_t aux = tiny_remote_pack_diag(0xA213u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + tiny_remote_watch_mark(ptr, "pre_push", my_tid); + tiny_remote_watch_note("pre_push", ss, slab_idx, ptr, 0xA231u, my_tid, 0); + tiny_remote_report_corruption("pre_push", ptr, head_word); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { + tiny_remote_watch_note("free_remote", ss, 
slab_idx, ptr, 0xA232u, my_tid, 0); + } + int was_empty = ss_remote_push(ss, slab_idx, ptr); + meta->used--; + ss_active_dec_one(ss); + if (was_empty) { + extern unsigned long long g_remote_free_transitions[]; + g_remote_free_transitions[ss->size_class]++; + ss_partial_publish((int)ss->size_class, ss); + } + } else { + // Fallback: direct freelist push (legacy) + if (debug_guard) fprintf(stderr, "[FREE_SS] Using LEGACY freelist push (not remote queue)\n"); + void* prev = meta->freelist; + *(void**)ptr = prev; + meta->freelist = ptr; + do { + static int g_mask_en = -1; + if (__builtin_expect(g_mask_en == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); + g_mask_en = (e && *e && *e != '0') ? 1 : 0; + } + if (__builtin_expect(g_mask_en, 0) && prev == NULL) { + uint32_t bit = (1u << slab_idx); + atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); + } + } while (0); + meta->used--; + ss_active_dec_one(ss); + if (prev == NULL) { + ss_partial_publish((int)ss->size_class, ss); + } + } + + // Empty-slab detection is handled elsewhere (kept off the hot path) + } +} diff --git a/core/tiny_system.h b/core/tiny_system.h new file mode 100644 index 00000000..aabbbc08 --- /dev/null +++ b/core/tiny_system.h @@ -0,0 +1,18 @@ +// tiny_system.h - System includes for Tiny allocator +// Consolidates all standard library includes to reduce clutter + +#ifndef TINY_SYSTEM_H +#define TINY_SYSTEM_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#endif // TINY_SYSTEM_H
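The cross-thread branch above relies on ss_remote_push() reporting whether the per-slab remote stack was empty before the push, which is what gates ss_partial_publish(). Below is a minimal sketch of such a push on an MPSC stack; the node layout (next pointer stored in the freed block's first word) and the exact return convention are assumptions inferred from how the call is used above, not the allocator's actual implementation.

#include <stdatomic.h>
#include <stdint.h>

/* Push a freed block onto a per-slab remote stack (MPSC Treiber stack).
 * Returns 1 when the stack was empty before this push, i.e. the slab just
 * transitioned from "no pending remote frees" to "has pending remote frees". */
static int example_remote_push(_Atomic uintptr_t* head, void* block) {
    uintptr_t old = atomic_load_explicit(head, memory_order_relaxed);
    for (;;) {
        *(uintptr_t*)block = old;        /* link via the block's first word */
        if (atomic_compare_exchange_weak_explicit(head, &old, (uintptr_t)block,
                                                  memory_order_release,
                                                  memory_order_relaxed))
            return old == 0;             /* empty → non-empty transition */
        /* CAS failure reloads 'old'; retry with the fresh head */
    }
}

Under this convention only the empty-to-non-empty transition publishes the SuperSlab, so repeated remote frees to a slab that already has pending nodes stay O(1) and generate no extra publication traffic.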