hakmem/core/hakmem_pool.c.bak2
Moe Charm (CI) 1da8754d45 CRITICAL FIX: Completely eliminate the 4T SEGV caused by uninitialized TLS
**Problem:**
- Larson with 4 threads: 100% SEGV (1 thread completes at 2.09M ops/s)
- System malloc / mimalloc run fine at 4T: 33.52M ops/s
- SEGV still occurs at 4T even with SS OFF + Remote OFF

**Root cause (Task agent ultrathink investigation):**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (garbage value from uninitialized TLS)
```

The worker threads' TLS variables were uninitialized (sketched below):
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← no initializer
- Threads created with pthread_create() did not get it zero-initialized
- The NULL check passed (0x6261 != NULL) → dereference → SEGV
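
A minimal sketch of the failing pattern, assuming the report above (the helper name `tiny_sll_pop` is hypothetical; only `g_tls_sll_head` and `TINY_NUM_CLASSES` are taken from the code):

```c
__thread void* g_tls_sll_head[TINY_NUM_CLASSES];   /* pre-fix: no initializer */

static void* tiny_sll_pop(int cls) {               /* hypothetical helper */
    void* head = g_tls_sll_head[cls];
    if (head != NULL) {              /* garbage such as 0x6261 passes the check */
        void* next = *(void**)head;  /* crash site: mov (%r15),%r13 */
        g_tls_sll_head[cls] = next;
        return head;
    }
    return NULL;                     /* fall through to the slow path */
}
```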

**Fix:**
Added an explicit `= {0}` initializer to every TLS array (pattern sketched after the list):

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`
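
The change itself is mechanical; a sketch of the pattern (only the `g_tls_sll_head` declaration is quoted verbatim above, the other arrays follow the same shape with their own element types):

```c
/* post-fix: explicit zero initializer on every TLS array */
__thread void* g_tls_sll_head[TINY_NUM_CLASSES] = {0};
/* ...and likewise for g_tls_sll_count, g_tls_live_ss, g_tls_bcur, g_tls_bend,
   and the tiny_fastcache / magazine / sticky arrays listed above. */
```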

**Result:**
```
Before: 1T: 2.09M   |  4T: SEGV 💀
After:  1T: 2.41M   |  4T: 4.19M   (+15% 1T, SEGV eliminated)
```

**Tests:**
```bash
# 1 thread: completes
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s 

# 4 threads: completes (previously SEGV)
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s 
```

**Investigation credit:** Task agent (ultrathink mode), for pinpointing the root cause

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00

// ============================================================================
// hakmem_pool.c - L2 Hybrid Pool Implementation (Mid-Size: 2-32KiB)
// ============================================================================
//
// Size class definitions:
// ┌──────────┬──────────┬──────────────┬─────────────┐
// │ Class    │ Size     │ Initial CAP  │ Page layout │
// ├──────────┼──────────┼──────────────┼─────────────┤
// │ Class 0  │ 2 KiB    │ 64 pages     │ 32 blocks/p │
// │ Class 1  │ 4 KiB    │ 64 pages     │ 16 blocks/p │
// │ Class 2  │ 8 KiB    │ 64 pages     │ 8 blocks/p  │
// │ Class 3  │ 16 KiB   │ 32 pages     │ 4 blocks/p  │
// │ Class 4  │ 32 KiB   │ 16 pages     │ 2 blocks/p  │
// │ DYN1     │ 6 KiB*   │ 0 (disabled) │ variable    │
// │ DYN2     │ (unused) │ 0 (disabled) │ variable    │
// └──────────┴──────────┴──────────────┴─────────────┘
// * DYN1 is a dynamic class meant to fill the 8-16KB gap
//
// W_MAX (round-up tolerance factor; sketched just below):
// - Meaning: how many times the requested size a class may be and still be used
// - Default: 1.40 (allows rounding up by as much as 40%)
// - Example: a 3 KiB request may use the 4 KiB class (1.33x < 1.40)
// - Env var: HAKMEM_WMAX_MID=1.6 to override
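//
// A minimal sketch of the gate (illustrative; not the implementation in this
// file, and w_max / req_size are placeholder names):
//   accept class i when (double)g_class_sizes[i] / (double)req_size <= w_max
//   e.g. req = 3072, class = 4096: 4096.0 / 3072.0 ≈ 1.33 <= 1.40 → accept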
//
// CAP (inventory):
// - Meaning: maximum number of pages kept per class
// - Initial: {64,64,64,32,16} - conservative (footprint first)
// - Recommended: {256,256,256,128,64} - performance first
// - Env var: HAKMEM_CAP_MID=256,256,256,128,64
// - Learning mode: HAKMEM_LEARN=1 enables automatic tuning
//
// TLS ring structure:
// - POOL_L2_RING_CAP: ring buffer capacity (default 16)
// - ActivePage A/B: bump-run, lock-free
// - LIFO overflow: holds whatever spills out of the ring
//
// Performance tuning (example invocation after the list):
// 1. Quadruple the initial CAP: HAKMEM_CAP_MID=256,256,256,128,64
// 2. Relax W_MAX: HAKMEM_WMAX_MID=1.6
// 3. Enable DYN1: HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64
// 4. Learning mode: HAKMEM_LEARN=1
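//
// Example invocation combining the knobs above (illustrative; "./app" stands in
// for the target binary):
//   HAKMEM_CAP_MID=256,256,256,128,64 HAKMEM_WMAX_MID=1.6 \
//   HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64 HAKMEM_LEARN=1 ./app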
//
// License: MIT
// Last Updated: 2025-10-26 (code cleanup complete)
#include "hakmem_pool.h"
#include "hakmem_config.h"
#include "hakmem_internal.h" // For AllocHeader and HAKMEM_MAGIC
#include "hakmem_syscall.h" // Box 3 syscall layer (bypasses LD_PRELOAD)
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <pthread.h>
#include <stdatomic.h>
#include "hakmem_prof.h"
#include "hakmem_policy.h" // FrozenPolicy caps (Soft CAP gating)
#include "hakmem_debug.h"
// False sharing mitigation: padded mutex type (64B)
typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64)]; } PaddedMutex;
// ===========================================================================
// Internal Data Structures
// ===========================================================================
#include "box/pool_tls_types.inc.h"
// Mid page descriptor registry (64KiB pages → {class_idx, owner_tid})
#include "box/pool_mid_desc.inc.h"
// ---------------- Transfer Cache (per-thread per-class inbox) --------------
#include "box/pool_mid_tc.inc.h"
// ===========================================================================
// MF2 Per-Page Sharding: Mimalloc-Inspired Architecture
// ===========================================================================
//
// Key idea: Each 64KB page has independent freelist (no sharing!)
// - O(1) page lookup from block address: (addr & ~0xFFFF)
// - Owner thread: fast path (no locks, no atomics)
// - Cross-thread free: lock-free remote stack
// - Expected: +50% (13.78 → 20.7 M/s, 60-75% of mimalloc)
// MF2 Configuration Constants (Quick Win #5)
#define MF2_PENDING_QUEUE_BUDGET 4 // Max pages to drain from pending queue
#define MF2_DEBUG_SAMPLE_COUNT 20 // Number of debug samples to log
#define MF2_TSC_CYCLES_PER_US 3000 // Estimated TSC cycles per microsecond
#define MF2_PAGE_SIZE_SHIFT 16 // log2(64KB) for fast division
#define MF2_PAGE_ALIGNMENT 65536 // 64KB alignment for mmap
// Debug Logging Macros (Quick Win #6)
// Conditional compilation for debug logs - set HAKMEM_DEBUG_MF2=1 to enable
#ifdef HAKMEM_DEBUG_MF2
#define MF2_DEBUG_LOG(fmt, ...) fprintf(stderr, "[MF2] " fmt "\n", ##__VA_ARGS__)
#define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__)
#else
#define MF2_DEBUG_LOG(fmt, ...) ((void)0)
#define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__)
#endif
// Forward declarations
static size_t g_class_sizes[POOL_NUM_CLASSES];
// MF2 Page descriptor: per-page metadata (one per 64KB page)
typedef struct MidPage {
// Page identity
void* base; // Page base address (64KB aligned)
uint8_t class_idx; // Size class index (0-6)
uint8_t flags; // Page flags (reserved for future use)
uint16_t _pad0;
// Ownership
pthread_t owner_tid; // Owner thread ID (for fast-path check)
struct MF2_ThreadPages* owner_tp; // Owner thread's heap (for pending queue access)
uint64_t last_transfer_time; // rdtsc() timestamp of last ownership transfer (lease mechanism)
// Page-local freelist (owner-only, NO LOCK!)
PoolBlock* freelist; // Local freelist head
uint16_t free_count; // Number of free blocks
uint16_t capacity; // Total blocks per page
// Remote frees (cross-thread, lock-free MPSC stack)
atomic_uintptr_t remote_head; // Lock-free remote free stack
atomic_uint remote_count; // Remote free count (for quick check)
// Lifecycle
atomic_int in_use; // Live allocations on this page
atomic_int pending_dn; // DONTNEED enqueued flag
// Linkage (thread-local page lists)
struct MidPage* next_page; // Next page in thread's list
struct MidPage* prev_page; // Previous page in thread's list
// Pending queue (remote drain notification)
_Atomic(_Bool) in_remote_pending; // Is this page in pending queue?
struct MidPage* next_pending; // Next page in pending queue
// Padding to cache line boundary (avoid false sharing)
char _pad[64 - ((sizeof(void*) * 5 + sizeof(PoolBlock*) + sizeof(uint16_t) * 2 +
sizeof(atomic_uintptr_t) + sizeof(atomic_uint) +
sizeof(atomic_int) * 2 + sizeof(pthread_t) +
sizeof(_Atomic(_Bool)) + 4) % 64)];
} MidPage;
// Page registry: O(1) lookup from block address
// Use direct indexing: (addr >> 16) & MASK
#define MF2_PAGE_REGISTRY_BITS 16 // 64K entries (4GB address space with 64KB pages)
#define MF2_PAGE_REGISTRY_SIZE (1 << MF2_PAGE_REGISTRY_BITS)
#define MF2_PAGE_REGISTRY_MASK (MF2_PAGE_REGISTRY_SIZE - 1)
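// Worked example (illustrative): a block at 0x7f1234568abc maps as
//   page_base = addr & ~0xFFFF                             = 0x7f1234560000
//   idx       = (page_base >> 16) & MF2_PAGE_REGISTRY_MASK = 0x3456
// Page bases an exact multiple of 4 GiB apart alias to the same slot, which is
// why lookup and registration below re-check page->base.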
typedef struct {
// Direct-mapped page table (no hash collisions!)
MidPage* pages[MF2_PAGE_REGISTRY_SIZE];
// Coarse-grained locks for rare updates (page alloc/free)
// 256 locks = 256-way parallelism for page registration
pthread_mutex_t locks[256];
// Statistics
atomic_uint_fast64_t total_pages; // Total pages allocated
atomic_uint_fast64_t active_pages; // Pages with live allocations
} MF2_PageRegistry;
// Thread-local page lists (one list per size class)
typedef struct MF2_ThreadPages {
// Active pages (have free blocks)
MidPage* active_page[POOL_NUM_CLASSES];
// Partial pages (drained pages with free blocks, LIFO for cache locality)
// Checked before allocating new pages (fast reuse path)
MidPage* partial_pages[POOL_NUM_CLASSES];
// Full pages (no free blocks, but may receive remote frees)
// TODO: Gradually deprecate in favor of partial_pages
MidPage* full_pages[POOL_NUM_CLASSES];
// Pending queue (pages with remote frees, MPSC lock-free stack)
atomic_uintptr_t pages_remote_pending[POOL_NUM_CLASSES];
// Pending claim flags (prevent multi-consumer CAS thrashing)
// One adopter at a time per queue (test_and_set to claim, clear to release)
atomic_flag pending_claim[POOL_NUM_CLASSES];
// Page ownership count (for statistics)
uint32_t page_count[POOL_NUM_CLASSES];
// Thread identity (cached for fast comparison)
pthread_t my_tid;
// Route P: Activity tracking for idle-based adoption
// Updated on every allocation (mf2_alloc_fast)
// Read by adopters to check if owner is idle
atomic_uint_fast64_t last_alloc_tsc;
} MF2_ThreadPages;
// Global page registry (shared, rarely accessed)
static MF2_PageRegistry g_mf2_page_registry;
// Thread-local page lists (hot path, no sharing!)
static __thread MF2_ThreadPages* t_mf2_pages = NULL;
// ===========================================================================
// MF2 Global State (Quick Win #3b - Structured Globals)
// ===========================================================================
// Individual globals replaced with structured state below.
// Old declarations removed, replaced with macro-mapped struct instances.
//
// Benefits:
// - Logical grouping (config, registry, stats)
// - Better documentation
// - Easier to extend or refactor
// - Single source of truth for each category
#define MF2_MAX_THREADS 256
// MF2 Configuration (environment variables)
typedef struct {
int enabled; // HAKMEM_MF2_ENABLE (0=disabled, 1=enabled)
int max_queues; // HAKMEM_MF2_MAX_QUEUES (default: 2)
int lease_ms; // HAKMEM_MF2_LEASE_MS (default: 10ms, 0=disabled)
int idle_threshold_us; // HAKMEM_MF2_IDLE_THRESHOLD_US (default: 150µs)
} MF2_Config;
// MF2 Thread Registry (cross-thread coordination)
typedef struct {
MF2_ThreadPages* all_thread_pages[MF2_MAX_THREADS]; // Global registry
_Atomic int num_thread_pages; // Active thread count
_Atomic int adoptable_count[POOL_NUM_CLASSES]; // Non-empty pending queues
pthread_key_t tls_key; // Thread-local storage key
pthread_once_t key_once; // TLS initialization guard
} MF2_Registry;
// MF2 Statistics (debug instrumentation)
typedef struct {
// Allocation path
atomic_uint_fast64_t alloc_fast_hit;
atomic_uint_fast64_t alloc_slow_hit;
atomic_uint_fast64_t page_reuse_count;
atomic_uint_fast64_t new_page_count;
// Free path
atomic_uint_fast64_t free_owner_count;
atomic_uint_fast64_t free_remote_count;
// Drain operations
atomic_uint_fast64_t drain_count;
atomic_uint_fast64_t drain_blocks;
atomic_uint_fast64_t drain_attempts;
atomic_uint_fast64_t drain_success;
atomic_uint_fast64_t slow_checked_drain;
atomic_uint_fast64_t slow_found_remote;
// Full page scan (obsolete, kept for historical tracking)
atomic_uint_fast64_t full_scan_checked;
atomic_uint_fast64_t full_scan_found_remote;
atomic_uint_fast64_t eager_drain_scanned;
atomic_uint_fast64_t eager_drain_found;
// Pending queue
atomic_uint_fast64_t pending_enqueued;
atomic_uint_fast64_t pending_drained;
atomic_uint_fast64_t pending_requeued;
} MF2_Stats;
// Instantiate structured global state (Quick Win #3b)
static MF2_Config g_mf2_config = {
.enabled = 0, // Will be set by env var
.max_queues = 2,
.lease_ms = 10,
.idle_threshold_us = 150
};
static MF2_Registry g_mf2_registry = {
.all_thread_pages = {0},
.num_thread_pages = 0,
.adoptable_count = {0},
.tls_key = 0,
.key_once = PTHREAD_ONCE_INIT
};
static MF2_Stats g_mf2_stats = {
// All fields initialized to 0 (atomic zero-initialization is valid)
.alloc_fast_hit = 0,
.alloc_slow_hit = 0,
.page_reuse_count = 0,
.new_page_count = 0,
.free_owner_count = 0,
.free_remote_count = 0,
.drain_count = 0,
.drain_blocks = 0,
.drain_attempts = 0,
.drain_success = 0,
.slow_checked_drain = 0,
.slow_found_remote = 0,
.full_scan_checked = 0,
.full_scan_found_remote = 0,
.eager_drain_scanned = 0,
.eager_drain_found = 0,
.pending_enqueued = 0,
.pending_drained = 0,
.pending_requeued = 0
};
// Compatibility macros: Map old global names to struct fields
// This allows existing code to work unchanged while using structured state
#define g_mf2_enabled (g_mf2_config.enabled)
#define g_mf2_max_queues (g_mf2_config.max_queues)
#define g_mf2_lease_ms (g_mf2_config.lease_ms)
#define g_mf2_idle_threshold_us (g_mf2_config.idle_threshold_us)
#define g_all_thread_pages (g_mf2_registry.all_thread_pages)
#define g_num_thread_pages (g_mf2_registry.num_thread_pages)
#define g_adoptable_count (g_mf2_registry.adoptable_count)
#define g_mf2_tls_key (g_mf2_registry.tls_key)
#define g_mf2_key_once (g_mf2_registry.key_once)
#define g_mf2_alloc_fast_hit (g_mf2_stats.alloc_fast_hit)
#define g_mf2_alloc_slow_hit (g_mf2_stats.alloc_slow_hit)
#define g_mf2_page_reuse_count (g_mf2_stats.page_reuse_count)
#define g_mf2_new_page_count (g_mf2_stats.new_page_count)
#define g_mf2_free_owner_count (g_mf2_stats.free_owner_count)
#define g_mf2_free_remote_count (g_mf2_stats.free_remote_count)
#define g_mf2_drain_count (g_mf2_stats.drain_count)
#define g_mf2_drain_blocks (g_mf2_stats.drain_blocks)
#define g_mf2_drain_attempts (g_mf2_stats.drain_attempts)
#define g_mf2_drain_success (g_mf2_stats.drain_success)
#define g_mf2_slow_checked_drain (g_mf2_stats.slow_checked_drain)
#define g_mf2_slow_found_remote (g_mf2_stats.slow_found_remote)
#define g_mf2_full_scan_checked (g_mf2_stats.full_scan_checked)
#define g_mf2_full_scan_found_remote (g_mf2_stats.full_scan_found_remote)
#define g_mf2_eager_drain_scanned (g_mf2_stats.eager_drain_scanned)
#define g_mf2_eager_drain_found (g_mf2_stats.eager_drain_found)
#define g_mf2_pending_enqueued (g_mf2_stats.pending_enqueued)
#define g_mf2_pending_drained (g_mf2_stats.pending_drained)
#define g_mf2_pending_requeued (g_mf2_stats.pending_requeued)
// ===========================================================================
// End of MF2 Data Structures
// ===========================================================================
// --- MF2 Initialization Functions ---
// Thread-safe initialization using pthread_once
static pthread_once_t mf2_page_registry_init_control = PTHREAD_ONCE_INIT;
static void mf2_page_registry_init_impl(void) {
// Initialize all page slots to NULL
memset(&g_mf2_page_registry, 0, sizeof(g_mf2_page_registry));
// Initialize 256 coarse-grained locks for registry updates
for (int i = 0; i < 256; i++) {
pthread_mutex_init(&g_mf2_page_registry.locks[i], NULL);
}
// Initialize counters
atomic_store(&g_mf2_page_registry.total_pages, 0);
atomic_store(&g_mf2_page_registry.active_pages, 0);
}
static void mf2_page_registry_init(void) {
pthread_once(&mf2_page_registry_init_control, mf2_page_registry_init_impl);
}
// Strategy A: ThreadPages destructor (cleanup on thread exit)
static void mf2_thread_pages_destructor(void* arg) {
MF2_ThreadPages* tp = (MF2_ThreadPages*)arg;
if (!tp) return;
// SAFETY: Don't remove from global registry or free memory
// Reason: Causes "malloc(): unsorted double linked list corrupted" crashes
// Tradeoff: Small memory leak (one ThreadPages struct per thread lifetime)
// TODO: Investigate safe cleanup mechanism
// Remove from global registry (DISABLED for safety)
// for (int i = 0; i < atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); i++) {
// if (atomic_load_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], memory_order_acquire) == (uintptr_t)tp) {
// atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], 0, memory_order_release);
// break;
// }
// }
// Free all pages owned by this thread (DISABLED for safety)
// hkm_libc_free(tp);
(void)tp; // Suppress unused warning
}
// Strategy A: Initialize pthread_key (once only)
static void mf2_init_tls_key(void) {
pthread_key_create(&g_mf2_tls_key, mf2_thread_pages_destructor);
}
// Helper: rdtsc() - Read CPU timestamp counter (for Route P idle detection)
static inline uint64_t mf2_rdtsc(void) {
#if defined(__x86_64__) || defined(__i386__)
uint32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)hi << 32) | lo;
#else
// Fallback for non-x86 architectures (use clock_gettime approximation)
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
#endif
}
static MF2_ThreadPages* mf2_thread_pages_get(void) {
if (t_mf2_pages) return t_mf2_pages;
// Initialize pthread_key (once only)
pthread_once(&g_mf2_key_once, mf2_init_tls_key);
// Allocate thread-local page lists
MF2_ThreadPages* tp = (MF2_ThreadPages*)hkm_libc_calloc(1, sizeof(MF2_ThreadPages));
if (!tp) return NULL;
// Initialize with current thread ID
tp->my_tid = pthread_self();
// All page lists start empty (NULL)
for (int c = 0; c < POOL_NUM_CLASSES; c++) {
tp->active_page[c] = NULL;
tp->full_pages[c] = NULL;
atomic_store_explicit(&tp->pages_remote_pending[c], 0, memory_order_relaxed);
atomic_flag_clear_explicit(&tp->pending_claim[c], memory_order_relaxed);
tp->page_count[c] = 0;
}
// Route P: Initialize activity tracking
atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed);
// Strategy A: Register in global array for round-robin drain
int idx = atomic_fetch_add_explicit(&g_num_thread_pages, 1, memory_order_acq_rel);
if (idx < MF2_MAX_THREADS) {
atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[idx], (uintptr_t)tp, memory_order_release);
// DEBUG: Log first 10 thread registrations - Disabled for performance
// static _Atomic int reg_samples = 0;
// int rs = atomic_fetch_add_explicit(&reg_samples, 1, memory_order_relaxed);
// if (rs < 10) {
// fprintf(stderr, "[TLS_REGISTER %d] tid=%lu, tp=%p, idx=%d\n",
// rs, (unsigned long)tp->my_tid, tp, idx);
// }
} else {
MF2_ERROR_LOG("Too many threads! MAX=%d", MF2_MAX_THREADS);
}
// Set pthread-specific data for destructor
pthread_setspecific(g_mf2_tls_key, tp);
t_mf2_pages = tp;
return tp;
}
// --- MF2 Page Allocation & Lookup ---
// O(1) page lookup from block address (mimalloc's secret sauce!)
static inline MidPage* mf2_addr_to_page(void* addr) {
// Step 1: Get page base address (64KB aligned)
// 0xFFFF = 65535, ~0xFFFF clears bottom 16 bits
void* page_base = (void*)((uintptr_t)addr & ~0xFFFFULL);
// Step 2: Index into registry (direct-mapped, 64K entries)
// (addr >> 16) extracts page index, & 0xFFFF wraps to registry size
size_t idx = ((uintptr_t)page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
// Step 3: Direct lookup (no hash collision handling needed with 64K entries)
MidPage* page = g_mf2_page_registry.pages[idx];
// ALIGNMENT VERIFICATION (Step 3) - Sample first 100 lookups
static _Atomic int lookup_count = 0;
// DEBUG: Disabled for performance
// int count = atomic_fetch_add_explicit(&lookup_count, 1, memory_order_relaxed);
// if (count < 100) {
// int found = (page != NULL);
// int match = (page && page->base == page_base);
// fprintf(stderr, "[LOOKUP %d] addr=%p → page_base=%p → idx=%zu → found=%s",
// count, addr, page_base, idx, found ? "YES" : "NO");
// if (page) {
// fprintf(stderr, ", page->base=%p, match=%s",
// page->base, match ? "YES" : "NO");
// }
// fprintf(stderr, "\n");
// }
// Validation: Ensure page base matches (handles potential collisions)
if (page && page->base == page_base) {
return page;
}
// Collision or not registered (shouldn't happen in normal operation)
return NULL;
}
// Register a page in the global registry (called once per page allocation)
static void mf2_register_page(MidPage* page) {
if (!page) return;
// Calculate registry index from page base
size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
// ALIGNMENT VERIFICATION (Step 2) - DEBUG: Disabled for performance
// static int register_count = 0;
// if (register_count < 10) {
// fprintf(stderr, "[REGISTER %d] Page %p → idx %zu (aligned=%s)\n",
// register_count, page->base, idx,
// (((uintptr_t)page->base & 0xFFFF) == 0) ? "YES" : "NO");
// register_count++;
// }
// Coarse-grained lock (256 locks for 64K entries = 256 entries/lock)
int lock_idx = idx % 256;
pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]);
// Check for collision (should be rare with 64K entries)
if (g_mf2_page_registry.pages[idx] != NULL) {
// Collision detected - this is a problem!
// For MVP, we'll just log and overwrite (TODO: handle collisions properly)
HAKMEM_LOG("[MF2] WARNING: Page registry collision at index %zu\n", idx);
}
// Register the page
g_mf2_page_registry.pages[idx] = page;
// Update counters
atomic_fetch_add_explicit(&g_mf2_page_registry.total_pages, 1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed);
pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]);
}
// Unregister a page from the global registry (called when returning page to OS)
__attribute__((unused)) static void mf2_unregister_page(MidPage* page) {
if (!page) return;
size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
int lock_idx = idx % 256;
pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]);
if (g_mf2_page_registry.pages[idx] == page) {
g_mf2_page_registry.pages[idx] = NULL;
atomic_fetch_sub_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed);
}
pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]);
}
// Allocate and initialize a new 64KB page for given size class
static MidPage* mf2_alloc_new_page(int class_idx) {
if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return NULL;
// Get user size class (2KB, 4KB, 8KB, 16KB, 32KB)
size_t user_size = g_class_sizes[class_idx];
if (user_size == 0) return NULL; // Dynamic class disabled
// CRITICAL FIX: Each block needs HEADER_SIZE + user_size
// The header stores metadata (AllocHeader), user_size is the usable space
size_t block_size = HEADER_SIZE + user_size;
// Step 1: Allocate 64KB page (aligned to 64KB boundary)
// CRITICAL FIX #4: Must ensure 64KB alignment!
// mmap() only guarantees 4KB alignment, breaking addr_to_page() lookup.
// This caused 97% of frees to fail silently (fatal bug!)
//
// CRITICAL FIX: Use mmap() + alignment adjustment to avoid recursion!
// Using wrapped posix_memalign with WRAP_L2=1 causes infinite recursion.
// Allocate 2x size to allow alignment adjustment
size_t alloc_size = POOL_PAGE_SIZE * 2; // 128KB
void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (raw == MAP_FAILED) {
return NULL; // OOM
}
// Find 64KB aligned address within allocation
uintptr_t addr = (uintptr_t)raw;
uintptr_t aligned = (addr + 0xFFFF) & ~0xFFFFULL; // Round up to 64KB boundary
void* page_base = (void*)aligned;
// Free unused prefix (if any)
size_t prefix_size = aligned - addr;
if (prefix_size > 0) {
munmap(raw, prefix_size);
}
// Free unused suffix
size_t suffix_offset = prefix_size + POOL_PAGE_SIZE;
if (suffix_offset < alloc_size) {
munmap((char*)raw + suffix_offset, alloc_size - suffix_offset);
}
// DEBUG: Log first few allocations
static _Atomic int mmap_count = 0;
int mc = atomic_fetch_add_explicit(&mmap_count, 1, memory_order_relaxed);
if (mc < 5) {
MF2_DEBUG_LOG("MMAP_ALLOC %d: raw=%p, aligned=%p, prefix=%zu, suffix=%zu",
mc, raw, page_base, prefix_size, alloc_size - suffix_offset);
}
// ALIGNMENT VERIFICATION (Step 1)
if (((uintptr_t)page_base & 0xFFFF) != 0) {
MF2_ERROR_LOG("ALIGNMENT BUG: Page %p not 64KB aligned! (offset=%zu)",
page_base, ((uintptr_t)page_base & 0xFFFF));
}
// Zero-fill the page. Anonymous mmap memory is already zeroed by the kernel;
// the explicit memset is kept from the earlier posix_memalign path (~15μs overhead).
memset(page_base, 0, POOL_PAGE_SIZE);
// Step 2: Allocate MidPage descriptor
MidPage* page = (MidPage*)hkm_libc_calloc(1, sizeof(MidPage));
if (!page) {
// CRITICAL FIX: Use munmap for mmap-allocated memory
munmap(page_base, POOL_PAGE_SIZE);
return NULL;
}
// Step 3: Initialize page descriptor
page->base = page_base;
page->class_idx = (uint8_t)class_idx;
page->flags = 0;
page->owner_tid = pthread_self();
page->owner_tp = mf2_thread_pages_get(); // Store owner's ThreadPages for pending queue
page->last_transfer_time = 0; // No transfer yet (lease mechanism)
// Step 4: Build freelist chain (walk through page and link blocks)
// Calculate how many blocks fit in 64KB page (including header overhead)
size_t usable_size = POOL_PAGE_SIZE;
size_t num_blocks = usable_size / block_size;
page->capacity = (uint16_t)num_blocks;
page->free_count = (uint16_t)num_blocks;
// Build linked list of free blocks
PoolBlock* freelist_head = NULL;
PoolBlock* freelist_tail = NULL;
for (size_t i = 0; i < num_blocks; i++) {
char* block_addr = (char*)page_base + (i * block_size);
PoolBlock* block = (PoolBlock*)block_addr;
block->next = NULL;
if (freelist_head == NULL) {
freelist_head = block;
freelist_tail = block;
} else {
freelist_tail->next = block;
freelist_tail = block;
}
}
page->freelist = freelist_head;
// Step 5: Initialize remote stack (for cross-thread frees)
atomic_store(&page->remote_head, (uintptr_t)0);
atomic_store(&page->remote_count, 0);
// Step 6: Initialize lifecycle counters
atomic_store(&page->in_use, 0); // No blocks allocated yet
atomic_store(&page->pending_dn, 0);
// Step 7: Initialize linkage
page->next_page = NULL;
page->prev_page = NULL;
// Initialize pending queue fields
atomic_store_explicit(&page->in_remote_pending, false, memory_order_relaxed);
page->next_pending = NULL;
// Step 8: Register page in global registry
mf2_register_page(page);
return page;
}
// --- MF2 Allocation & Free Operations ---
// Forward declarations
static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page);
// Drain remote frees (cross-thread) into page's local freelist
// Called by owner thread when local freelist is empty
static int mf2_drain_remote_frees(MidPage* page) {
if (!page) return 0;
atomic_fetch_add(&g_mf2_drain_attempts, 1);
// Check if there are any remote frees (FIX #6: use seq_cst to ensure total ordering - DEBUG)
unsigned int remote_count = atomic_load_explicit(&page->remote_count, memory_order_seq_cst);
if (remote_count == 0) {
return 0; // Nothing to drain
}
// Atomically swap remote stack head with NULL (lock-free pop all)
uintptr_t head = atomic_exchange_explicit(&page->remote_head, (uintptr_t)0,
memory_order_acq_rel);
if (!head) {
atomic_store_explicit(&page->remote_count, 0, memory_order_release);
return 0; // Race: someone else drained it
}
// Reset remote count (FIX #6: use release for future drain checks to see)
atomic_store_explicit(&page->remote_count, 0, memory_order_release);
// Walk the remote stack and count blocks
int drained = 0;
PoolBlock* cur = (PoolBlock*)head;
PoolBlock* tail = NULL;
while (cur) {
drained++;
tail = cur;
cur = cur->next;
}
// Append remote stack to local freelist (splice in front for simplicity)
if (tail) {
tail->next = page->freelist;
page->freelist = (PoolBlock*)head;
page->free_count += drained;
}
atomic_fetch_add(&g_mf2_drain_count, 1);
atomic_fetch_add(&g_mf2_drain_blocks, drained);
// CRITICAL FIX: Check if new remotes arrived DURING drain
// If so, re-enqueue to owner's pending queue (avoid losing remotes!)
unsigned int post_drain_count = atomic_load_explicit(&page->remote_count, memory_order_acquire);
if (post_drain_count >= 1 && page->owner_tp) { // Use same threshold as initial enqueue
// New remotes arrived during drain, re-enqueue for next round
// Note: This is safe because flag was cleared earlier
mf2_enqueue_pending(page->owner_tp, page);
}
return drained;
}
// ===========================================================================
// Pending Queue Operations (MPSC Lock-Free Stack)
// ===========================================================================
// Enqueue page to owner's pending queue (called by remote threads)
// MPSC: Multiple producers (remote free threads), single consumer (owner)
static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page) {
if (!owner_tp || !page) return;
// Already in pending? Skip (avoid duplicate enqueue)
_Bool was_pending = atomic_exchange_explicit(&page->in_remote_pending, true, memory_order_acq_rel);
if (was_pending) {
return; // Already enqueued, nothing to do
}
atomic_fetch_add(&g_mf2_pending_enqueued, 1);
// Push to owner's pending stack (Treiber stack algorithm)
uintptr_t old_head;
do {
old_head = atomic_load_explicit(&owner_tp->pages_remote_pending[page->class_idx], memory_order_relaxed);
page->next_pending = (MidPage*)old_head;
} while (!atomic_compare_exchange_weak_explicit(
&owner_tp->pages_remote_pending[page->class_idx],
&old_head, (uintptr_t)page,
memory_order_release, // Publish page
memory_order_relaxed));
// 0→1 detection: Increment adoptable count for this class
// This enables O(1) early return in try_adopt (if count==0, no scan needed)
if (old_head == 0) {
atomic_fetch_add_explicit(&g_adoptable_count[page->class_idx], 1, memory_order_relaxed);
}
}
// Dequeue one page from pending queue (called by owner thread or adopter)
// Uses CAS for correctness (multi-consumer in adoption path)
static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) {
if (!tp) return NULL;
uintptr_t old_head;
do {
old_head = atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_acquire);
if (old_head == 0) {
return NULL; // Queue empty
}
MidPage* page = (MidPage*)old_head;
// CAS to pop head
if (atomic_compare_exchange_weak_explicit(
&tp->pages_remote_pending[class_idx],
&old_head, (uintptr_t)page->next_pending,
memory_order_acq_rel, memory_order_relaxed)) {
// Successfully dequeued
MidPage* next = page->next_pending;
page->next_pending = NULL; // Clear link
// If queue became empty (next==NULL), decrement adoptable count
// This enables O(1) early return in try_adopt when all queues empty
if (next == NULL) {
atomic_fetch_sub_explicit(&g_adoptable_count[class_idx], 1, memory_order_relaxed);
}
return page;
}
} while (1);
}
// ===========================================================================
// End of Pending Queue Operations
// ===========================================================================
// Forward declarations
static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id);
// ===========================================================================
// Helper Functions (Clean & Modular)
// ===========================================================================
// Helper: Make page active (move old active to full_pages)
static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) {
if (!tp || !page) return;
// Move old active page to full_pages (if any)
if (tp->active_page[class_idx]) {
MidPage* old_active = tp->active_page[class_idx];
old_active->next_page = tp->full_pages[class_idx];
tp->full_pages[class_idx] = old_active;
}
// Set new page as active
tp->active_page[class_idx] = page;
page->next_page = NULL;
}
// Helper: Drain page and add to partial list (LIFO for cache locality)
// Returns true if page has free blocks after drain
static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) {
if (!tp || !page) return false;
// Drain remote frees
int drained = mf2_drain_remote_frees(page);
// If page has freelist after drain, add to partial list (LIFO)
if (page->freelist) {
atomic_fetch_add(&g_mf2_page_reuse_count, 1);
page->next_page = tp->partial_pages[class_idx];
tp->partial_pages[class_idx] = page;
return true;
}
// No freelist, return to full_pages
page->next_page = tp->full_pages[class_idx];
tp->full_pages[class_idx] = page;
return false;
}
// Helper: Drain page and activate if successful (Direct Handoff - backward compat)
// Returns true if page was activated
static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) {
if (!tp || !page) return false;
// Drain remote frees
int drained = mf2_drain_remote_frees(page);
// If page has freelist after drain, make it active immediately
if (page->freelist) {
atomic_fetch_add(&g_mf2_page_reuse_count, 1);
mf2_make_page_active(tp, class_idx, page);
return true;
}
// No freelist, return to full_pages
page->next_page = tp->full_pages[class_idx];
tp->full_pages[class_idx] = page;
return false;
}
// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1)
// Returns true if a page was successfully drained and activated
static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) {
if (!tp) return false;
// Budget: Process up to N pages to avoid blocking
for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) {
MidPage* pending_page = mf2_dequeue_pending(tp, class_idx);
if (!pending_page) break; // Queue empty
atomic_fetch_add(&g_mf2_pending_drained, 1);
// Clear pending flag (no longer in queue)
atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release);
// DIRECT HANDOFF: Drain and activate if successful
if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) {
return true; // Success! Page is now active
}
// No freelist after drain, page returned to full_pages by helper
}
return false; // No pages available for reuse
}
// Helper: Try to drain remotes from active page (must-reuse gate part 2)
// Returns true if active page has freelist after drain
static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) {
if (!tp) return false;
MidPage* page = tp->active_page[class_idx];
if (!page) return false;
atomic_fetch_add(&g_mf2_slow_checked_drain, 1);
unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst);
if (remote_cnt > 0) {
atomic_fetch_add(&g_mf2_slow_found_remote, 1);
int drained = mf2_drain_remote_frees(page);
if (drained > 0 && page->freelist) {
atomic_fetch_add(&g_mf2_drain_success, 1);
return true; // Success! Active page now has freelist
}
}
return false; // No remotes or drain failed
}
// Helper: Allocate new page and make it active
// Returns the newly allocated page (or NULL on OOM)
static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) {
if (!tp) return NULL;
atomic_fetch_add(&g_mf2_new_page_count, 1);
// DEBUG: Log why we're allocating new page (first N samples)
static _Atomic int new_page_samples = 0;
int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed);
if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) {
// Count adoptable pages across all threads
int total_adoptable = 0;
for (int i = 0; i < POOL_NUM_CLASSES; i++) {
total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed);
}
MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p",
sample_idx, class_idx,
(void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed),
total_adoptable,
tp->active_page[class_idx],
tp->full_pages[class_idx]);
}
MidPage* page = mf2_alloc_new_page(class_idx);
if (!page) {
return NULL; // OOM
}
// Move current active page to full list (if any)
if (tp->active_page[class_idx]) {
MidPage* old_page = tp->active_page[class_idx];
old_page->next_page = tp->full_pages[class_idx];
tp->full_pages[class_idx] = old_page;
}
// Set new page as active
tp->active_page[class_idx] = page;
tp->page_count[class_idx]++;
return page;
}
// ===========================================================================
// End of Helper Functions
// ===========================================================================
// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue
// Returns true if a page was successfully adopted and activated
// Called from alloc_slow when allocating thread needs memory
static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) {
if (!me) return false;
// IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating)
// Avoids scanning empty queues (major performance win!)
int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed);
if (adoptable == 0) return false; // All queues empty, no scan needed
// Get global thread registry
int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire);
if (num_tp == 0) return false;
// IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES)
// Prevents excessive scanning overhead (2-8 threads is usually enough)
int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues;
// Round-robin scan (limited number of threads, not ALL!)
static _Atomic uint64_t adopt_counter = 0;
uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed);
for (int i = 0; i < scan_limit; i++) {
int tp_idx = (start_idx + i) % num_tp;
MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit(
(atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire);
if (!other_tp) continue;
// Route P: Idle Detection - Only adopt from idle owners
// Check if owner is still actively allocating (threshold configurable via env var)
uint64_t now_tsc = mf2_rdtsc();
uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed);
uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US;
if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) {
continue; // Owner still active, skip adoption
}
// IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!)
// Only one thread scans each queue at a time → eliminates CAS contention
if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) {
continue; // Another thread is already scanning this queue, skip
}
// Try to dequeue a pending page from this thread
MidPage* page = mf2_dequeue_pending(other_tp, class_idx);
if (!page) {
// Queue empty, release claim and try next thread
atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
continue;
}
// Clear pending flag (no longer in queue)
atomic_store_explicit(&page->in_remote_pending, false, memory_order_release);
// Check lease: Has enough time passed since last transfer? (configurable via HAKMEM_MF2_LEASE_MS)
// 0ms = disabled (no lease check), >0 = lease period in milliseconds
uint64_t now = mf2_rdtsc();
uint64_t last_transfer = page->last_transfer_time;
if (g_mf2_lease_ms > 0 && last_transfer != 0) {
// Calculate lease cycles from ms (approx 3GHz CPU)
uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL);
if ((now - last_transfer) < lease_cycles) {
// Lease still active, return page to full_pages (don't thrash ownership)
page->next_page = other_tp->full_pages[class_idx];
other_tp->full_pages[class_idx] = page;
// Release claim before continuing
atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
continue; // Try next thread
}
}
// Try to transfer ownership using CAS
pthread_t old_owner = page->owner_tid;
pthread_t new_owner = pthread_self();
// Note: pthread_t may not be atomic-compatible on all platforms
// For now, we'll use a simple write (ownership transfer is rare)
// TODO: If thrashing is observed, add atomic CAS with serialization
page->owner_tid = new_owner;
page->owner_tp = me;
page->last_transfer_time = now;
// DEBUG: Log drain state
static _Atomic int adopt_samples = 0;
int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed);
unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed);
unsigned int pre_free = page->free_count;
PoolBlock* pre_freelist = page->freelist;
// Drain remote frees
int drained = mf2_drain_remote_frees(page);
// DEBUG: Log result (first 10 samples)
if (sample_idx < 10) {
MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p",
sample_idx, class_idx, pre_remote, drained,
pre_free, page->free_count, pre_freelist, page->freelist);
}
// Make adopted page ACTIVE immediately (not partial!)
// Adoption needs immediate activation for caller's mf2_alloc_fast()
// Partial list is only for own pending queue drains
if (page->freelist) {
atomic_fetch_add(&g_mf2_page_reuse_count, 1);
atomic_fetch_add(&g_mf2_pending_drained, 1);
atomic_fetch_add(&g_mf2_drain_success, 1);
// Make it active (move old active to full_pages)
mf2_make_page_active(me, class_idx, page);
// Release claim before returning SUCCESS
atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
return true; // SUCCESS! Page adopted and activated
}
// No freelist after drain, return to MY full_pages (I'm the new owner!)
page->next_page = me->full_pages[class_idx];
me->full_pages[class_idx] = page;
// Release claim before continuing search
atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
// Continue searching for a better page
}
return false; // No adoptable pages found
}
// Fast allocation path (owner thread, NO LOCK!)
static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) {
// Get thread-local page lists
MF2_ThreadPages* tp = mf2_thread_pages_get();
if (!tp) return NULL;
// Get active page for this class
MidPage* page = tp->active_page[class_idx];
if (!page) {
// No active page, go to slow path
return mf2_alloc_slow(class_idx, size, site_id);
}
// FAST PATH: Pop from page-local freelist (NO LOCK!)
if (page->freelist) {
atomic_fetch_add(&g_mf2_alloc_fast_hit, 1);
// Route P: Update activity tracking for idle detection
atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed);
PoolBlock* block = page->freelist;
page->freelist = block->next;
page->free_count--;
// Increment in-use count (atomic for cross-thread visibility)
atomic_fetch_add_explicit(&page->in_use, 1, memory_order_relaxed);
// Return user pointer (skip header)
return (char*)block + HEADER_SIZE;
}
// Local freelist empty, go to slow path
return mf2_alloc_slow(class_idx, size, site_id);
}
// Slow allocation path (drain remote or allocate new page)
static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id) {
(void)site_id; // Unused for now
atomic_fetch_add(&g_mf2_alloc_slow_hit, 1);
// Get thread-local page lists
MF2_ThreadPages* tp = mf2_thread_pages_get();
if (!tp) return NULL;
// ===========================================================================
// Allocation Strategy (Must-Reuse Order)
// ===========================================================================
// 1. MUST-REUSE GATE (Part 1): Drain own pending queue
// - Process up to 4 pages to avoid blocking
// - Direct handoff: activate first successful drain immediately
if (mf2_try_reuse_own_pending(tp, class_idx)) {
return mf2_alloc_fast(class_idx, size, site_id);
}
// 2. MUST-REUSE GATE (Part 2): Drain active page remotes
// - Check if current active page has remote frees
// - Drain and retry allocation if successful
if (mf2_try_drain_active_remotes(tp, class_idx)) {
return mf2_alloc_fast(class_idx, size, site_id);
}
// HISTORICAL NOTE: full_pages scan removed
// Old approach: Scan full_pages looking for pages with remotes
// Problem: Drained pages consumed before owner can scan them
// New approach: Direct Handoff immediately activates drained pages
// Result: full_pages scan always finds 0 pages (100% waste)
//
// Benchmark evidence (before removal):
// - Full scan checked: 1,879,484 pages
// - Full scan found: 0 pages (0% success rate!)
// 3. Consumer-Driven Adoption (Route P with idle detection)
// - Only adopt from idle owners (haven't allocated in >150µs)
// - Prevents "adoption stealing" from active owners
if (mf2_try_adopt_pending(tp, class_idx)) {
return mf2_alloc_fast(class_idx, size, site_id);
}
// 4. MUST-REUSE GATE (Final): Allocate new page (last resort)
// - Only reached after exhausting all reuse opportunities
// - Order: pending queue → active drain → adoption → NEW
MidPage* page = mf2_alloc_and_activate_new_page(tp, class_idx);
if (!page) {
return NULL; // OOM
}
// Retry allocation from new page
return mf2_alloc_fast(class_idx, size, site_id);
}
// Forward declaration of slow free path
static void mf2_free_slow(MidPage* page, void* ptr);
// Strategy A: Global Round-Robin Drain (Cross-Thread Pending Queue)
// Fast free path (owner thread, NO LOCK!)
static inline void mf2_free_fast(MidPage* page, void* ptr) {
if (!page || !ptr) return;
atomic_fetch_add(&g_mf2_free_owner_count, 1);
// Get block pointer (rewind to header)
PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE);
// FAST PATH: Push to page-local freelist (NO LOCK!)
block->next = page->freelist;
page->freelist = block;
page->free_count++;
// Decrement in-use count (atomic for cross-thread visibility)
int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release);
// Check if page is now empty (all blocks free)
if (old_in_use == 1 && page->free_count == page->capacity) {
// Memory efficiency: Return empty pages to OS via MADV_DONTNEED
// Keeps VA mapped (no munmap), but releases physical memory
hak_batch_add_page(page->base, POOL_PAGE_SIZE);
}
}
// Slow free path (cross-thread free to remote stack)
static void mf2_free_slow(MidPage* page, void* ptr) {
if (!page || !ptr) return;
atomic_fetch_add(&g_mf2_free_remote_count, 1);
// Get block pointer
PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE);
// Push to page's remote stack (lock-free MPSC)
uintptr_t old_head;
do {
old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire);
block->next = (PoolBlock*)old_head;
} while (!atomic_compare_exchange_weak_explicit(
&page->remote_head, &old_head, (uintptr_t)block,
memory_order_release, memory_order_relaxed));
// Increment remote count and detect threshold for enqueueing
unsigned int old_count = atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_seq_cst);
// CRITICAL FIX: Use threshold-based enqueueing instead of 0→1 edge
// Problem: 0→1 causes ping-pong (drain 1 block → next free triggers 0→1 again)
// Solution: Only enqueue when remotes accumulate to threshold (better batching)
//
// Threshold values (intended to be configurable via HAKMEM_MF2_ENQUEUE_THRESHOLD;
// this path currently hardcodes the value below):
// 1 = immediate (0→1 edge, causes ping-pong)
// 4 = balanced (batch 4 blocks before notifying owner)
// 8 = aggressive batching (higher latency, but better efficiency)
//
// We enqueue on transitions TO the threshold (old_count == threshold-1)
static int g_enqueue_threshold = 1; // 1=immediate (0→1 edge), 2=batch-2, 4=batch-4
if (old_count + 1 == (unsigned int)g_enqueue_threshold) {
// Remote count just reached threshold, notify owner
if (page->owner_tp) {
mf2_enqueue_pending(page->owner_tp, page);
}
}
// DEBUG: Sample first 10 remote frees - Disabled for performance
// static _Atomic int remote_free_samples = 0;
// int sample = atomic_fetch_add_explicit(&remote_free_samples, 1, memory_order_relaxed);
// if (sample < 10) {
// fprintf(stderr, "[REMOTE_FREE %d] ptr=%p → page=%p (base=%p), remote_count=%u (was %u), EDGE=%s\n",
// sample, ptr, page, page->base, old_count + 1, old_count, (old_count == 0) ? "YES" : "NO");
// }
// Decrement in-use count
int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release);
// Check if page is now empty (FIX #6: acquire to see all remote frees)
if (old_in_use == 1 && page->free_count + atomic_load_explicit(&page->remote_count, memory_order_acquire) >= page->capacity) {
// Memory efficiency: Return empty pages to OS via MADV_DONTNEED
// Keeps VA mapped (no munmap), but releases physical memory
hak_batch_add_page(page->base, POOL_PAGE_SIZE);
}
}
// Top-level free dispatcher
static void mf2_free(void* ptr) {
if (!ptr) return;
// O(1) page lookup (mimalloc's magic!)
MidPage* page = mf2_addr_to_page(ptr);
if (!page) {
// Not a MF2 page (shouldn't happen if MF2 is enabled properly)
return;
}
// Check if we're the owner (fast path)
MF2_ThreadPages* tp = mf2_thread_pages_get();
if (tp && page->owner_tid == tp->my_tid) {
// Fast: Owner thread, push to local freelist (NO LOCK!)
mf2_free_fast(page, ptr);
} else {
// Slow: Cross-thread free, push to remote stack (lock-free)
mf2_free_slow(page, ptr);
}
}
// ===========================================================================
// Global pool state (simplified: single-threaded for MVP)
static struct {
PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
// Locks: per (class, shard) freelist to allow concurrent operations
PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
// Non-empty bitmap (O(1) empty class skip)
// Bit i = 1 if freelist[class][shard] is non-empty
// Use atomic to avoid class-wide locks
atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES]; // 1 bit per shard
// Remote-free MPSC stacks per (class, shard): lock-free producers, drained under lock on alloc
atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
atomic_uint remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
// Statistics
uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t total_bytes_allocated __attribute__((aligned(64)));
uint64_t total_pages_allocated __attribute__((aligned(64)));
// Per-class page accounting (for Soft CAP guidance)
uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64)));
// ACE: per-class bundle factor for refill (1..4) + last snapshot
int bundle_factor[POOL_NUM_CLASSES];
uint64_t last_hits[POOL_NUM_CLASSES];
uint64_t last_misses[POOL_NUM_CLASSES];
int initialized;
int tls_free_enabled; // env: HAKMEM_POOL_TLS_FREE (default: 1)
// Extra metrics (for learner logging): all relaxed atomics
atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64)));
atomic_uint_fast64_t trylock_success __attribute__((aligned(64)));
atomic_uint_fast64_t ring_underflow __attribute__((aligned(64)));
} g_pool;
static int g_wrap_l2_enabled = 0; // env: HAKMEM_WRAP_L2=1 to allow in wrappers
static int g_shard_mix_enabled = 0; // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing
static int g_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING=1 to enable TLS ring
static int g_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES (1..8)
static int g_ring_return_div = 2; // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third)
static int g_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX (LIFO size cap)
int g_hdr_light_enabled = 0; // env: HAKMEM_HDR_LIGHT=1 (minimize extra fields), =2 (no header writes/validation)
static int g_pool_min_bundle = 2; // env: HAKMEM_POOL_MIN_BUNDLE (default 2)
// Sampled counter updates to reduce hot-path stores: 1/2^k
static int g_count_sample_exp = 10; // env: HAKMEM_POOL_COUNT_SAMPLE (0..16)
static __thread uint32_t t_pool_rng = 0x243f6a88u; // per-thread RNG for sampling
// Size class table (for O(1) lookup). Index 5/6 are Bridge classes for 32-64KB gap.
// 7 classes including Bridge classes (40KB, 52KB) to fill 32-64KB gap
static size_t g_class_sizes[POOL_NUM_CLASSES] = {
POOL_CLASS_2KB, // 2 KB
POOL_CLASS_4KB, // 4 KB
POOL_CLASS_8KB, // 8 KB
POOL_CLASS_16KB, // 16 KB
POOL_CLASS_32KB, // 32 KB
POOL_CLASS_40KB, // 40 KB (Bridge class 0)
POOL_CLASS_52KB // 52 KB (Bridge class 1)
};
// Blocks per page (for each class)
__attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = {
POOL_PAGE_SIZE / POOL_CLASS_2KB, // 32 blocks (2KiB)
POOL_PAGE_SIZE / POOL_CLASS_4KB, // 16 blocks (4KiB)
POOL_PAGE_SIZE / POOL_CLASS_8KB, // 8 blocks (8KiB)
POOL_PAGE_SIZE / POOL_CLASS_16KB, // 4 blocks (16KiB)
POOL_PAGE_SIZE / POOL_CLASS_32KB, // 2 blocks (32KiB)
POOL_PAGE_SIZE / POOL_CLASS_40KB, // 1 block (40KiB Bridge)
POOL_PAGE_SIZE / POOL_CLASS_52KB // 1 block (52KiB Bridge)
};
// ===========================================================================
// Helper Functions
// ===========================================================================
// Write minimal header for Mid allocation (fast-return friendly)
static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) {
// For Mid, prefer headerless operation when HDR_LIGHT>=1.
// Debug or non-Mid callers can still write full headers elsewhere.
if (g_hdr_light_enabled >= 1) return; // skip header on alloc hot path
hdr->magic = HAKMEM_MAGIC;
hdr->method = ALLOC_METHOD_POOL;
hdr->size = class_sz;
if (!g_hdr_light_enabled) {
hdr->alloc_site = site_id;
hdr->class_bytes = 0;
hdr->owner_tid = (uintptr_t)pthread_self();
}
}
// Branchless LUT (Lookup Table) for O(1) class determination
// Expanded to 53 entries for Bridge classes (40KB, 52KB)
static const uint8_t SIZE_TO_CLASS[53] = {
0,0,0, // 0-2KB → Class 0
1,1, // 3-4KB → Class 1
2,2,2,2, // 5-8KB → Class 2
3,3,3,3,3,3,3,3, // 9-16KB → Class 3
4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, // 17-32KB → Class 4
5,5,5,5,5,5,5,5, // 33-40KB → Class 5 (Bridge class 0)
6,6,6,6,6,6,6,6,6,6,6,6 // 41-52KB → Class 6 (Bridge class 1)
};
// Get size class index from size (0-6, or -1 if out of range)
// Updated range check for Bridge classes (0-52KB)
static inline int hak_pool_get_class_index(size_t size) {
// Fast path: exact match against configured class sizes (covers Bridge classes)
// Note: size passed here should already be a rounded class size from ACE.
for (int i = 0; i < POOL_NUM_CLASSES; i++) {
size_t cs = g_class_sizes[i];
if (cs != 0 && size == cs) return i;
}
// Fallback: map arbitrary size to nearest fixed class range via LUT (legacy behavior)
uint32_t kb = (uint32_t)((size + 1023) >> 10); // Round up to KB units
return (kb < 53) ? SIZE_TO_CLASS[kb] : -1; // LUT covers up to 52KB (Bridge classes)
}
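// Worked example (illustrative): size = 12000 bytes (not an exact class size)
//   kb = (12000 + 1023) >> 10 = 12  →  SIZE_TO_CLASS[12] = 3  →  16 KiB class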
// Get shard index from site_id (0-63)
int hak_pool_get_shard_index(uintptr_t site_id) {
if (!g_shard_mix_enabled) {
// Legacy: Shift by 4 to reduce collision (instruction alignment)
return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1));
}
// SplitMix64-like mixer with thread id salt for better dispersion
uint64_t x = (uint64_t)site_id;
uint64_t tid = (uint64_t)(uintptr_t)pthread_self();
x ^= (tid << 1);
x += 0x9e3779b97f4a7c15ULL;
x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
x = (x ^ (x >> 31));
return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1));
}
// TLS helpers
#include "box/pool_tls_core.inc.h"
// Refill/ACE (boxed)
#include "box/pool_refill.inc.h"
// Init/Shutdown + MF2 debug (boxed)
#include "box/pool_init_api.inc.h"
// Pool statistics (boxed)
#include "box/pool_stats.inc.h"
// Public API (boxed): alloc/free/lookup/free_fast
#include "box/pool_api.inc.h"