hakmem/core/hakmem_pool.c.bak3
Moe Charm (CI) 1da8754d45 CRITICAL FIX: Eliminate the 4T SEGV caused by uninitialized TLS
**Problem:**
- Larson 4T: 100% SEGV (1T completes at 2.09M ops/s)
- System/mimalloc runs fine at 4T with 33.52M ops/s
- SEGV at 4T even with SS OFF + Remote OFF

**Root cause (from the Task agent ultrathink investigation):**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (garbage value from uninitialized TLS)
```

The worker threads' TLS variables were uninitialized:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← no initializer
- Threads created via pthread_create() did not see these arrays zero-initialized
- The NULL check passed (0x6261 != NULL) → dereference → SEGV (see the sketch below)
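
The failure mode, as an illustrative sketch (the function and variable names here are invented, not taken from the code):

```c
/* Pops from a TLS singly-linked freelist; shows how a garbage head such as 0x6261
 * slips past the NULL check and faults on the dereference. */
static void* pop_from_tls_list(void** tls_sll_head, int cls) {
    void* head = tls_sll_head[cls];   /* garbage if the TLS array were uninitialized */
    if (head != NULL) {               /* 0x6261 != NULL, so the guard passes...      */
        void* next = *(void**)head;   /* ...and this load is the reported SEGV       */
        tls_sll_head[cls] = next;
        return head;
    }
    return NULL;
}
```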

**Fix:**
Added an explicit `= {0}` initializer to every TLS array (a minimal sketch follows the list):

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`
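
A minimal sketch of the fix pattern (illustrative only: the TINY_NUM_CLASSES value and the count array's element type are assumptions; the real declarations live in the files listed above):

```c
#include <stdint.h>

#define TINY_NUM_CLASSES 8            /* illustrative value */

/* Before (no initializer):
 *   __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
 * After: every TLS array carries an explicit zero initializer. */
__thread void*    g_tls_sll_head[TINY_NUM_CLASSES]  = {0};
__thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0};   /* element type assumed */
```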

**Results:**
```
Before: 1T: 2.09M   |  4T: SEGV 💀
After:  1T: 2.41M   |  4T: 4.19M   (+15% at 1T, SEGV resolved)
```

**Tests:**
```bash
# 1 thread: completes
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s

# 4 threads: completes (previously SEGV)
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s
```

**Investigation credit:** Root cause pinpointed by the Task agent (ultrathink mode)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00


// ============================================================================
// hakmem_pool.c - L2 Hybrid Pool Implementation (Mid-Size: 2-32KiB)
// ============================================================================
//
// Size class definitions:
// ┌──────────┬──────────┬──────────────┬──────────────┐
// │ Class    │ Size     │ Initial CAP  │ Page layout  │
// ├──────────┼──────────┼──────────────┼──────────────┤
// │ Class 0  │  2 KiB   │ 64 pages     │ 32 blocks/p  │
// │ Class 1  │  4 KiB   │ 64 pages     │ 16 blocks/p  │
// │ Class 2  │  8 KiB   │ 64 pages     │  8 blocks/p  │
// │ Class 3  │ 16 KiB   │ 32 pages     │  4 blocks/p  │
// │ Class 4  │ 32 KiB   │ 16 pages     │  2 blocks/p  │
// │ DYN1     │  6 KiB*  │ 0 (disabled) │ variable     │
// │ DYN2     │ (unused) │ 0 (disabled) │ variable     │
// └──────────┴──────────┴──────────────┴──────────────┘
// * DYN1 is a dynamic class meant to fill the 8-16KB gap
//
// W_MAX (round-up tolerance factor):
// - Meaning: how many times larger than the request a class may be and still be used
// - Default: 1.40 (allows rounding up by as much as 40%)
// - Example: a 3 KiB request → the 4 KiB class is OK (1.33x < 1.40)
// - Environment variable: HAKMEM_WMAX_MID=1.6 to change it (illustrative check below)
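// - Illustrative acceptance check (sketch only; these names are not from the code):
//     accept = ((double)class_size / (double)req_size) <= w_max;
//   e.g. a 9 KiB request against the 16 KiB class is 1.78x > 1.40, so this gate rejects it.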
//
// CAP (inventory):
// - Meaning: maximum number of pages kept per class
// - Initial values: {64,64,64,32,16} - conservative (footprint first)
// - Recommended:    {256,256,256,128,64} - performance first
// - Environment variable: HAKMEM_CAP_MID=256,256,256,128,64
// - Learning mode: auto-tuned with HAKMEM_LEARN=1
//
// TLS ring structure:
// - POOL_L2_RING_CAP: ring buffer capacity, default 16
// - ActivePage A/B: bump-run style, lock-free
// - LIFO overflow: holds entries that spill out of the ring
//
// Performance tuning (a combined example follows this list):
// 1. Quadruple the initial CAP: HAKMEM_CAP_MID=256,256,256,128,64
// 2. Relax W_MAX:               HAKMEM_WMAX_MID=1.6
// 3. Enable DYN1:               HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64
// 4. Learning mode:             HAKMEM_LEARN=1
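//
// Combined invocation example (hedged; "./your_app" is a placeholder binary, the variables are the ones listed above):
//   HAKMEM_CAP_MID=256,256,256,128,64 HAKMEM_WMAX_MID=1.6 \
//   HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64 HAKMEM_LEARN=1 ./your_app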
//
// License: MIT
// Last Updated: 2025-10-26 (code cleanup complete)
#include "hakmem_pool.h"
#include "hakmem_config.h"
#include "hakmem_internal.h" // For AllocHeader and HAKMEM_MAGIC
#include "hakmem_syscall.h" // Box 3 syscall layer (bypasses LD_PRELOAD)
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <pthread.h>
#include <stdatomic.h>
#include <time.h>    // clock_gettime() fallback in mf2_rdtsc() on non-x86
#include "hakmem_prof.h"
#include "hakmem_policy.h" // FrozenPolicy caps (Soft CAP gating)
#include "hakmem_debug.h"
// False sharing mitigation: padded mutex type (64B)
typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64)]; } PaddedMutex;
// ===========================================================================
// Internal Data Structures
// ===========================================================================
#include "box/pool_tls_types.inc.h"
// Mid page descriptor registry (64KiB pages → {class_idx, owner_tid})
#include "box/pool_mid_desc.inc.h"
// ---------------- Transfer Cache (per-thread per-class inbox) --------------
#include "box/pool_mid_tc.inc.h"
#include "box/pool_mf2_types.inc.h"
// --- MF2 Initialization Functions ---
// Thread-safe initialization using pthread_once
static pthread_once_t mf2_page_registry_init_control = PTHREAD_ONCE_INIT;
static void mf2_page_registry_init_impl(void) {
// Initialize all page slots to NULL
memset(&g_mf2_page_registry, 0, sizeof(g_mf2_page_registry));
// Initialize 256 coarse-grained locks for registry updates
for (int i = 0; i < 256; i++) {
pthread_mutex_init(&g_mf2_page_registry.locks[i], NULL);
}
// Initialize counters
atomic_store(&g_mf2_page_registry.total_pages, 0);
atomic_store(&g_mf2_page_registry.active_pages, 0);
}
static void mf2_page_registry_init(void) {
pthread_once(&mf2_page_registry_init_control, mf2_page_registry_init_impl);
}
// Strategy A: ThreadPages destructor (cleanup on thread exit)
static void mf2_thread_pages_destructor(void* arg) {
MF2_ThreadPages* tp = (MF2_ThreadPages*)arg;
if (!tp) return;
// SAFETY: Don't remove from global registry or free memory
// Reason: Causes "malloc(): unsorted double linked list corrupted" crashes
// Tradeoff: Small memory leak (one ThreadPages struct per thread lifetime)
// TODO: Investigate safe cleanup mechanism
// Remove from global registry (DISABLED for safety)
// for (int i = 0; i < atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); i++) {
// if (atomic_load_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], memory_order_acquire) == (uintptr_t)tp) {
// atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], 0, memory_order_release);
// break;
// }
// }
// Free all pages owned by this thread (DISABLED for safety)
// hkm_libc_free(tp);
(void)tp; // Suppress unused warning
}
// Strategy A: Initialize pthread_key (once only)
static void mf2_init_tls_key(void) {
pthread_key_create(&g_mf2_tls_key, mf2_thread_pages_destructor);
}
// Helper: rdtsc() - Read CPU timestamp counter (for Route P idle detection)
static inline uint64_t mf2_rdtsc(void) {
#if defined(__x86_64__) || defined(__i386__)
uint32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)hi << 32) | lo;
#else
// Fallback for non-x86 architectures (use clock_gettime approximation)
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
#endif
}
static MF2_ThreadPages* mf2_thread_pages_get(void) {
if (t_mf2_pages) return t_mf2_pages;
// Initialize pthread_key (once only)
pthread_once(&g_mf2_key_once, mf2_init_tls_key);
// Allocate thread-local page lists
MF2_ThreadPages* tp = (MF2_ThreadPages*)hkm_libc_calloc(1, sizeof(MF2_ThreadPages));
if (!tp) return NULL;
// Initialize with current thread ID
tp->my_tid = pthread_self();
// All page lists start empty (NULL)
for (int c = 0; c < POOL_NUM_CLASSES; c++) {
tp->active_page[c] = NULL;
tp->full_pages[c] = NULL;
atomic_store_explicit(&tp->pages_remote_pending[c], 0, memory_order_relaxed);
atomic_flag_clear_explicit(&tp->pending_claim[c], memory_order_relaxed);
tp->page_count[c] = 0;
}
// Route P: Initialize activity tracking
atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed);
// Strategy A: Register in global array for round-robin drain
int idx = atomic_fetch_add_explicit(&g_num_thread_pages, 1, memory_order_acq_rel);
if (idx < MF2_MAX_THREADS) {
atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[idx], (uintptr_t)tp, memory_order_release);
// DEBUG: Log first 10 thread registrations - Disabled for performance
// static _Atomic int reg_samples = 0;
// int rs = atomic_fetch_add_explicit(&reg_samples, 1, memory_order_relaxed);
// if (rs < 10) {
// fprintf(stderr, "[TLS_REGISTER %d] tid=%lu, tp=%p, idx=%d\n",
// rs, (unsigned long)tp->my_tid, tp, idx);
// }
} else {
MF2_ERROR_LOG("Too many threads! MAX=%d", MF2_MAX_THREADS);
}
// Set pthread-specific data for destructor
pthread_setspecific(g_mf2_tls_key, tp);
t_mf2_pages = tp;
return tp;
}
// --- MF2 Page Allocation & Lookup ---
// O(1) page lookup from block address (mimalloc's secret sauce!)
static inline MidPage* mf2_addr_to_page(void* addr) {
// Step 1: Get page base address (64KB aligned)
// 0xFFFF = 65535, ~0xFFFF clears bottom 16 bits
void* page_base = (void*)((uintptr_t)addr & ~0xFFFFULL);
// Step 2: Index into registry (direct-mapped, 64K entries)
// (addr >> 16) extracts page index, & 0xFFFF wraps to registry size
size_t idx = ((uintptr_t)page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
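// Worked example (hypothetical address): addr = 0x7f3a12345678
//   → page_base = 0x7f3a12340000 (low 16 bits cleared)
//   → idx       = (0x7f3a12340000 >> 16) & 0xFFFF = 0x1234 (with a 64K-entry registry)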
// Step 3: Direct lookup (no hash collision handling needed with 64K entries)
MidPage* page = g_mf2_page_registry.pages[idx];
// ALIGNMENT VERIFICATION (Step 3) - Sample first 100 lookups
// DEBUG: Disabled for performance (sampling counter commented out along with the code below)
// static _Atomic int lookup_count = 0;
// int count = atomic_fetch_add_explicit(&lookup_count, 1, memory_order_relaxed);
// if (count < 100) {
// int found = (page != NULL);
// int match = (page && page->base == page_base);
// fprintf(stderr, "[LOOKUP %d] addr=%p → page_base=%p → idx=%zu → found=%s",
// count, addr, page_base, idx, found ? "YES" : "NO");
// if (page) {
// fprintf(stderr, ", page->base=%p, match=%s",
// page->base, match ? "YES" : "NO");
// }
// fprintf(stderr, "\n");
// }
// Validation: Ensure page base matches (handles potential collisions)
if (page && page->base == page_base) {
return page;
}
// Collision or not registered (shouldn't happen in normal operation)
return NULL;
}
// Register a page in the global registry (called once per page allocation)
static void mf2_register_page(MidPage* page) {
if (!page) return;
// Calculate registry index from page base
size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
// ALIGNMENT VERIFICATION (Step 2) - DEBUG: Disabled for performance
// static int register_count = 0;
// if (register_count < 10) {
// fprintf(stderr, "[REGISTER %d] Page %p → idx %zu (aligned=%s)\n",
// register_count, page->base, idx,
// (((uintptr_t)page->base & 0xFFFF) == 0) ? "YES" : "NO");
// register_count++;
// }
// Coarse-grained lock (256 locks for 64K entries = 256 entries/lock)
int lock_idx = idx % 256;
pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]);
// Check for collision (should be rare with 64K entries)
if (g_mf2_page_registry.pages[idx] != NULL) {
// Collision detected - this is a problem!
// For MVP, we'll just log and overwrite (TODO: handle collisions properly)
HAKMEM_LOG("[MF2] WARNING: Page registry collision at index %zu\n", idx);
}
// Register the page
g_mf2_page_registry.pages[idx] = page;
// Update counters
atomic_fetch_add_explicit(&g_mf2_page_registry.total_pages, 1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed);
pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]);
}
// Unregister a page from the global registry (called when returning page to OS)
__attribute__((unused)) static void mf2_unregister_page(MidPage* page) {
if (!page) return;
size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
int lock_idx = idx % 256;
pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]);
if (g_mf2_page_registry.pages[idx] == page) {
g_mf2_page_registry.pages[idx] = NULL;
atomic_fetch_sub_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed);
}
pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]);
}
// Allocate and initialize a new 64KB page for given size class
static MidPage* mf2_alloc_new_page(int class_idx) {
if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return NULL;
// Get user size class (2KB, 4KB, 8KB, 16KB, 32KB)
size_t user_size = g_class_sizes[class_idx];
if (user_size == 0) return NULL; // Dynamic class disabled
// CRITICAL FIX: Each block needs HEADER_SIZE + user_size
// The header stores metadata (AllocHeader), user_size is the usable space
size_t block_size = HEADER_SIZE + user_size;
// Step 1: Allocate 64KB page (aligned to 64KB boundary)
// CRITICAL FIX #4: Must ensure 64KB alignment!
// mmap() only guarantees 4KB alignment, breaking addr_to_page() lookup.
// This caused 97% of frees to fail silently (fatal bug!)
//
// CRITICAL FIX: Use mmap() + alignment adjustment to avoid recursion!
// Using wrapped posix_memalign with WRAP_L2=1 causes infinite recursion.
// Allocate 2x size to allow alignment adjustment
size_t alloc_size = POOL_PAGE_SIZE * 2; // 128KB
void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (raw == MAP_FAILED) {
return NULL; // OOM
}
// Find 64KB aligned address within allocation
uintptr_t addr = (uintptr_t)raw;
uintptr_t aligned = (addr + 0xFFFF) & ~0xFFFFULL; // Round up to 64KB boundary
void* page_base = (void*)aligned;
// Free unused prefix (if any)
size_t prefix_size = aligned - addr;
if (prefix_size > 0) {
munmap(raw, prefix_size);
}
// Free unused suffix
size_t suffix_offset = prefix_size + POOL_PAGE_SIZE;
if (suffix_offset < alloc_size) {
munmap((char*)raw + suffix_offset, alloc_size - suffix_offset);
}
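// Worked example (hypothetical addresses): raw = 0x7f0000013000 (4 KiB aligned from mmap)
//   → aligned = 0x7f0000020000, the 0xD000-byte prefix is unmapped,
//   → the suffix [raw+0x1D000, raw+0x20000) (12 KiB) is unmapped,
//   leaving exactly one 64 KiB-aligned, 64 KiB page mapped.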
// DEBUG: Log first few allocations
static _Atomic int mmap_count = 0;
int mc = atomic_fetch_add_explicit(&mmap_count, 1, memory_order_relaxed);
if (mc < 5) {
MF2_DEBUG_LOG("MMAP_ALLOC %d: raw=%p, aligned=%p, prefix=%zu, suffix=%zu",
mc, raw, page_base, prefix_size, alloc_size - suffix_offset);
}
// ALIGNMENT VERIFICATION (Step 1)
if (((uintptr_t)page_base & 0xFFFF) != 0) {
MF2_ERROR_LOG("ALIGNMENT BUG: Page %p not 64KB aligned! (offset=%zu)",
page_base, ((uintptr_t)page_base & 0xFFFF));
}
// Zero-fill. Note: MAP_ANONYMOUS memory is already zeroed by the kernel, so this
// memset (~15μs) is redundant defensive work retained from the old posix_memalign path.
memset(page_base, 0, POOL_PAGE_SIZE);
// Step 2: Allocate MidPage descriptor
MidPage* page = (MidPage*)hkm_libc_calloc(1, sizeof(MidPage));
if (!page) {
// CRITICAL FIX: Use munmap for mmap-allocated memory
munmap(page_base, POOL_PAGE_SIZE);
return NULL;
}
// Step 3: Initialize page descriptor
page->base = page_base;
page->class_idx = (uint8_t)class_idx;
page->flags = 0;
page->owner_tid = pthread_self();
page->owner_tp = mf2_thread_pages_get(); // Store owner's ThreadPages for pending queue
page->last_transfer_time = 0; // No transfer yet (lease mechanism)
// Step 4: Build freelist chain (walk through page and link blocks)
// Calculate how many blocks fit in 64KB page (including header overhead)
size_t usable_size = POOL_PAGE_SIZE;
size_t num_blocks = usable_size / block_size;
page->capacity = (uint16_t)num_blocks;
page->free_count = (uint16_t)num_blocks;
// Build linked list of free blocks
PoolBlock* freelist_head = NULL;
PoolBlock* freelist_tail = NULL;
for (size_t i = 0; i < num_blocks; i++) {
char* block_addr = (char*)page_base + (i * block_size);
PoolBlock* block = (PoolBlock*)block_addr;
block->next = NULL;
if (freelist_head == NULL) {
freelist_head = block;
freelist_tail = block;
} else {
freelist_tail->next = block;
freelist_tail = block;
}
}
page->freelist = freelist_head;
// Step 5: Initialize remote stack (for cross-thread frees)
atomic_store(&page->remote_head, (uintptr_t)0);
atomic_store(&page->remote_count, 0);
// Step 6: Initialize lifecycle counters
atomic_store(&page->in_use, 0); // No blocks allocated yet
atomic_store(&page->pending_dn, 0);
// Step 7: Initialize linkage
page->next_page = NULL;
page->prev_page = NULL;
// Initialize pending queue fields
atomic_store_explicit(&page->in_remote_pending, false, memory_order_relaxed);
page->next_pending = NULL;
// Step 8: Register page in global registry
mf2_register_page(page);
return page;
}
// --- MF2 Allocation & Free Operations ---
// Forward declarations
static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page);
// Drain remote frees (cross-thread) into page's local freelist
// Called by owner thread when local freelist is empty
static int mf2_drain_remote_frees(MidPage* page) {
if (!page) return 0;
atomic_fetch_add(&g_mf2_drain_attempts, 1);
// Check if there are any remote frees (FIX #6: use seq_cst to ensure total ordering - DEBUG)
unsigned int remote_count = atomic_load_explicit(&page->remote_count, memory_order_seq_cst);
if (remote_count == 0) {
return 0; // Nothing to drain
}
// Atomically swap remote stack head with NULL (lock-free pop all)
uintptr_t head = atomic_exchange_explicit(&page->remote_head, (uintptr_t)0,
memory_order_acq_rel);
if (!head) {
atomic_store_explicit(&page->remote_count, 0, memory_order_release);
return 0; // Race: someone else drained it
}
// Reset remote count (FIX #6: use release for future drain checks to see)
atomic_store_explicit(&page->remote_count, 0, memory_order_release);
// Walk the remote stack and count blocks
int drained = 0;
PoolBlock* cur = (PoolBlock*)head;
PoolBlock* tail = NULL;
while (cur) {
drained++;
tail = cur;
cur = cur->next;
}
// Append remote stack to local freelist (splice in front for simplicity)
if (tail) {
tail->next = page->freelist;
page->freelist = (PoolBlock*)head;
page->free_count += drained;
}
atomic_fetch_add(&g_mf2_drain_count, 1);
atomic_fetch_add(&g_mf2_drain_blocks, drained);
// CRITICAL FIX: Check if new remotes arrived DURING drain
// If so, re-enqueue to owner's pending queue (avoid losing remotes!)
unsigned int post_drain_count = atomic_load_explicit(&page->remote_count, memory_order_acquire);
if (post_drain_count >= 1 && page->owner_tp) { // Use same threshold as initial enqueue
// New remotes arrived during drain, re-enqueue for next round
// Note: This is safe because flag was cleared earlier
mf2_enqueue_pending(page->owner_tp, page);
}
return drained;
}
// ===========================================================================
// Pending Queue Operations (MPSC Lock-Free Stack)
// ===========================================================================
// Enqueue page to owner's pending queue (called by remote threads)
// MPSC: Multiple producers (remote free threads), single consumer (owner)
static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page) {
if (!owner_tp || !page) return;
// Already in pending? Skip (avoid duplicate enqueue)
_Bool was_pending = atomic_exchange_explicit(&page->in_remote_pending, true, memory_order_acq_rel);
if (was_pending) {
return; // Already enqueued, nothing to do
}
atomic_fetch_add(&g_mf2_pending_enqueued, 1);
// Push to owner's pending stack (Treiber stack algorithm)
uintptr_t old_head;
do {
old_head = atomic_load_explicit(&owner_tp->pages_remote_pending[page->class_idx], memory_order_relaxed);
page->next_pending = (MidPage*)old_head;
} while (!atomic_compare_exchange_weak_explicit(
&owner_tp->pages_remote_pending[page->class_idx],
&old_head, (uintptr_t)page,
memory_order_release, // Publish page
memory_order_relaxed));
// 0→1 detection: Increment adoptable count for this class
// This enables O(1) early return in try_adopt (if count==0, no scan needed)
if (old_head == 0) {
atomic_fetch_add_explicit(&g_adoptable_count[page->class_idx], 1, memory_order_relaxed);
}
}
// Dequeue one page from pending queue (called by owner thread or adopter)
// Uses CAS for correctness (multi-consumer in adoption path)
static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) {
if (!tp) return NULL;
uintptr_t old_head;
do {
old_head = atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_acquire);
if (old_head == 0) {
return NULL; // Queue empty
}
MidPage* page = (MidPage*)old_head;
// CAS to pop head
if (atomic_compare_exchange_weak_explicit(
&tp->pages_remote_pending[class_idx],
&old_head, (uintptr_t)page->next_pending,
memory_order_acq_rel, memory_order_relaxed)) {
// Successfully dequeued
MidPage* next = page->next_pending;
page->next_pending = NULL; // Clear link
// If queue became empty (next==NULL), decrement adoptable count
// This enables O(1) early return in try_adopt when all queues empty
if (next == NULL) {
atomic_fetch_sub_explicit(&g_adoptable_count[class_idx], 1, memory_order_relaxed);
}
return page;
}
} while (1);
}
// ===========================================================================
// End of Pending Queue Operations
// ===========================================================================
// Forward declarations
static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id);
// ===========================================================================
// Helper Functions (Clean & Modular)
// ===========================================================================
// Helper: Make page active (move old active to full_pages)
static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) {
if (!tp || !page) return;
// Move old active page to full_pages (if any)
if (tp->active_page[class_idx]) {
MidPage* old_active = tp->active_page[class_idx];
old_active->next_page = tp->full_pages[class_idx];
tp->full_pages[class_idx] = old_active;
}
// Set new page as active
tp->active_page[class_idx] = page;
page->next_page = NULL;
}
// Helper: Drain page and add to partial list (LIFO for cache locality)
// Returns true if page has free blocks after drain
static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) {
if (!tp || !page) return false;
// Drain remote frees (the drained count is not needed; only the freelist is checked below)
(void)mf2_drain_remote_frees(page);
// If page has freelist after drain, add to partial list (LIFO)
if (page->freelist) {
atomic_fetch_add(&g_mf2_page_reuse_count, 1);
page->next_page = tp->partial_pages[class_idx];
tp->partial_pages[class_idx] = page;
return true;
}
// No freelist, return to full_pages
page->next_page = tp->full_pages[class_idx];
tp->full_pages[class_idx] = page;
return false;
}
// Helper: Drain page and activate if successful (Direct Handoff - backward compat)
// Returns true if page was activated
static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) {
if (!tp || !page) return false;
// Drain remote frees (the drained count is not needed; only the freelist is checked below)
(void)mf2_drain_remote_frees(page);
// If page has freelist after drain, make it active immediately
if (page->freelist) {
atomic_fetch_add(&g_mf2_page_reuse_count, 1);
mf2_make_page_active(tp, class_idx, page);
return true;
}
// No freelist, return to full_pages
page->next_page = tp->full_pages[class_idx];
tp->full_pages[class_idx] = page;
return false;
}
// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1)
// Returns true if a page was successfully drained and activated
static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) {
if (!tp) return false;
// Budget: Process up to N pages to avoid blocking
for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) {
MidPage* pending_page = mf2_dequeue_pending(tp, class_idx);
if (!pending_page) break; // Queue empty
atomic_fetch_add(&g_mf2_pending_drained, 1);
// Clear pending flag (no longer in queue)
atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release);
// DIRECT HANDOFF: Drain and activate if successful
if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) {
return true; // Success! Page is now active
}
// No freelist after drain, page returned to full_pages by helper
}
return false; // No pages available for reuse
}
// Helper: Try to drain remotes from active page (must-reuse gate part 2)
// Returns true if active page has freelist after drain
static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) {
if (!tp) return false;
MidPage* page = tp->active_page[class_idx];
if (!page) return false;
atomic_fetch_add(&g_mf2_slow_checked_drain, 1);
unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst);
if (remote_cnt > 0) {
atomic_fetch_add(&g_mf2_slow_found_remote, 1);
int drained = mf2_drain_remote_frees(page);
if (drained > 0 && page->freelist) {
atomic_fetch_add(&g_mf2_drain_success, 1);
return true; // Success! Active page now has freelist
}
}
return false; // No remotes or drain failed
}
// Helper: Allocate new page and make it active
// Returns the newly allocated page (or NULL on OOM)
static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) {
if (!tp) return NULL;
atomic_fetch_add(&g_mf2_new_page_count, 1);
// DEBUG: Log why we're allocating new page (first N samples)
static _Atomic int new_page_samples = 0;
int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed);
if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) {
// Count adoptable pages across all threads
int total_adoptable = 0;
for (int i = 0; i < POOL_NUM_CLASSES; i++) {
total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed);
}
MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p",
sample_idx, class_idx,
(void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed),
total_adoptable,
tp->active_page[class_idx],
tp->full_pages[class_idx]);
}
MidPage* page = mf2_alloc_new_page(class_idx);
if (!page) {
return NULL; // OOM
}
// Move current active page to full list (if any)
if (tp->active_page[class_idx]) {
MidPage* old_page = tp->active_page[class_idx];
old_page->next_page = tp->full_pages[class_idx];
tp->full_pages[class_idx] = old_page;
}
// Set new page as active
tp->active_page[class_idx] = page;
tp->page_count[class_idx]++;
return page;
}
// ===========================================================================
// End of Helper Functions
// ===========================================================================
// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue
// Returns true if a page was successfully adopted and activated
// Called from alloc_slow when allocating thread needs memory
static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) {
if (!me) return false;
// IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating)
// Avoids scanning empty queues (major performance win!)
int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed);
if (adoptable == 0) return false; // All queues empty, no scan needed
// Get global thread registry
int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire);
if (num_tp == 0) return false;
// IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES)
// Prevents excessive scanning overhead (2-8 threads is usually enough)
int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues;
// Round-robin scan (limited number of threads, not ALL!)
static _Atomic uint64_t adopt_counter = 0;
uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed);
for (int i = 0; i < scan_limit; i++) {
int tp_idx = (start_idx + i) % num_tp;
MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit(
(atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire);
if (!other_tp) continue;
// Route P: Idle Detection - Only adopt from idle owners
// Check if owner is still actively allocating (threshold configurable via env var)
uint64_t now_tsc = mf2_rdtsc();
uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed);
uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US;
if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) {
continue; // Owner still active, skip adoption
}
// IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!)
// Only one thread scans each queue at a time → eliminates CAS contention
if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) {
continue; // Another thread is already scanning this queue, skip
}
// Try to dequeue a pending page from this thread
MidPage* page = mf2_dequeue_pending(other_tp, class_idx);
if (!page) {
// Queue empty, release claim and try next thread
atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
continue;
}
// Clear pending flag (no longer in queue)
atomic_store_explicit(&page->in_remote_pending, false, memory_order_release);
// Check lease: Has enough time passed since last transfer? (configurable via HAKMEM_MF2_LEASE_MS)
// 0ms = disabled (no lease check), >0 = lease period in milliseconds
uint64_t now = mf2_rdtsc();
uint64_t last_transfer = page->last_transfer_time;
if (g_mf2_lease_ms > 0 && last_transfer != 0) {
// Calculate lease cycles from ms (approx 3GHz CPU)
uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL);
if ((now - last_transfer) < lease_cycles) {
// Lease still active, return page to full_pages (don't thrash ownership)
page->next_page = other_tp->full_pages[class_idx];
other_tp->full_pages[class_idx] = page;
// Release claim before continuing
atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
continue; // Try next thread
}
}
// Try to transfer ownership (plain write below; old_owner is kept only for the CAS TODO)
pthread_t old_owner = page->owner_tid; (void)old_owner;
pthread_t new_owner = pthread_self();
// Note: pthread_t may not be atomic-compatible on all platforms
// For now, we'll use a simple write (ownership transfer is rare)
// TODO: If thrashing is observed, add atomic CAS with serialization
page->owner_tid = new_owner;
page->owner_tp = me;
page->last_transfer_time = now;
// DEBUG: Log drain state
static _Atomic int adopt_samples = 0;
int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed);
unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed);
unsigned int pre_free = page->free_count;
PoolBlock* pre_freelist = page->freelist;
// Drain remote frees
int drained = mf2_drain_remote_frees(page);
// DEBUG: Log result (first 10 samples)
if (sample_idx < 10) {
MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p",
sample_idx, class_idx, pre_remote, drained,
pre_free, page->free_count, pre_freelist, page->freelist);
}
// Make adopted page ACTIVE immediately (not partial!)
// Adoption needs immediate activation for caller's mf2_alloc_fast()
// Partial list is only for own pending queue drains
if (page->freelist) {
atomic_fetch_add(&g_mf2_page_reuse_count, 1);
atomic_fetch_add(&g_mf2_pending_drained, 1);
atomic_fetch_add(&g_mf2_drain_success, 1);
// Make it active (move old active to full_pages)
mf2_make_page_active(me, class_idx, page);
// Release claim before returning SUCCESS
atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
return true; // SUCCESS! Page adopted and activated
}
// No freelist after drain, return to MY full_pages (I'm the new owner!)
page->next_page = me->full_pages[class_idx];
me->full_pages[class_idx] = page;
// Release claim before continuing search
atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
// Continue searching for a better page
}
return false; // No adoptable pages found
}
// Fast allocation path (owner thread, NO LOCK!)
static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) {
// Get thread-local page lists
MF2_ThreadPages* tp = mf2_thread_pages_get();
if (!tp) return NULL;
// Get active page for this class
MidPage* page = tp->active_page[class_idx];
if (!page) {
// No active page, go to slow path
return mf2_alloc_slow(class_idx, size, site_id);
}
// FAST PATH: Pop from page-local freelist (NO LOCK!)
if (page->freelist) {
atomic_fetch_add(&g_mf2_alloc_fast_hit, 1);
// Route P: Update activity tracking for idle detection
atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed);
PoolBlock* block = page->freelist;
page->freelist = block->next;
page->free_count--;
// Increment in-use count (atomic for cross-thread visibility)
atomic_fetch_add_explicit(&page->in_use, 1, memory_order_relaxed);
// Return user pointer (skip header)
return (char*)block + HEADER_SIZE;
}
// Local freelist empty, go to slow path
return mf2_alloc_slow(class_idx, size, site_id);
}
// Slow allocation path (drain remote or allocate new page)
static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id) {
(void)site_id; // Unused for now
atomic_fetch_add(&g_mf2_alloc_slow_hit, 1);
// Get thread-local page lists
MF2_ThreadPages* tp = mf2_thread_pages_get();
if (!tp) return NULL;
// ===========================================================================
// Allocation Strategy (Must-Reuse Order)
// ===========================================================================
// 1. MUST-REUSE GATE (Part 1): Drain own pending queue
// - Process up to 4 pages to avoid blocking
// - Direct handoff: activate first successful drain immediately
if (mf2_try_reuse_own_pending(tp, class_idx)) {
return mf2_alloc_fast(class_idx, size, site_id);
}
// 2. MUST-REUSE GATE (Part 2): Drain active page remotes
// - Check if current active page has remote frees
// - Drain and retry allocation if successful
if (mf2_try_drain_active_remotes(tp, class_idx)) {
return mf2_alloc_fast(class_idx, size, site_id);
}
// HISTORICAL NOTE: full_pages scan removed
// Old approach: Scan full_pages looking for pages with remotes
// Problem: Drained pages consumed before owner can scan them
// New approach: Direct Handoff immediately activates drained pages
// Result: full_pages scan always finds 0 pages (100% waste)
//
// Benchmark evidence (before removal):
// - Full scan checked: 1,879,484 pages
// - Full scan found: 0 pages (0% success rate!)
// 3. Consumer-Driven Adoption (Route P with idle detection)
// - Only adopt from idle owners (haven't allocated in >150µs)
// - Prevents "adoption stealing" from active owners
if (mf2_try_adopt_pending(tp, class_idx)) {
return mf2_alloc_fast(class_idx, size, site_id);
}
// 4. MUST-REUSE GATE (Final): Allocate new page (last resort)
// - Only reached after exhausting all reuse opportunities
// - Order: pending queue → active drain → adoption → NEW
MidPage* page = mf2_alloc_and_activate_new_page(tp, class_idx);
if (!page) {
return NULL; // OOM
}
// Retry allocation from new page
return mf2_alloc_fast(class_idx, size, site_id);
}
// Forward declaration of slow free path
static void mf2_free_slow(MidPage* page, void* ptr);
// Strategy A: Global Round-Robin Drain (Cross-Thread Pending Queue)
// Fast free path (owner thread, NO LOCK!)
static inline void mf2_free_fast(MidPage* page, void* ptr) {
if (!page || !ptr) return;
atomic_fetch_add(&g_mf2_free_owner_count, 1);
// Get block pointer (rewind to header)
PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE);
// FAST PATH: Push to page-local freelist (NO LOCK!)
block->next = page->freelist;
page->freelist = block;
page->free_count++;
// Decrement in-use count (atomic for cross-thread visibility)
int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release);
// Check if page is now empty (all blocks free)
if (old_in_use == 1 && page->free_count == page->capacity) {
// Memory efficiency: Return empty pages to OS via MADV_DONTNEED
// Keeps VA mapped (no munmap), but releases physical memory
hak_batch_add_page(page->base, POOL_PAGE_SIZE);
}
}
// Slow free path (cross-thread free to remote stack)
static void mf2_free_slow(MidPage* page, void* ptr) {
if (!page || !ptr) return;
atomic_fetch_add(&g_mf2_free_remote_count, 1);
// Get block pointer
PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE);
// Push to page's remote stack (lock-free MPSC)
uintptr_t old_head;
do {
old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire);
block->next = (PoolBlock*)old_head;
} while (!atomic_compare_exchange_weak_explicit(
&page->remote_head, &old_head, (uintptr_t)block,
memory_order_release, memory_order_relaxed));
// Increment remote count and detect threshold for enqueueing
unsigned int old_count = atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_seq_cst);
// CRITICAL FIX: Use threshold-based enqueueing instead of 0→1 edge
// Problem: 0→1 causes ping-pong (drain 1 block → next free triggers 0→1 again)
// Solution: Only enqueue when remotes accumulate to a threshold (better batching)
//
// Threshold values (hard-coded below; the HAKMEM_MF2_ENQUEUE_THRESHOLD env hook is not wired up here):
//   1 = immediate (0→1 edge, can cause ping-pong)
//   4 = balanced (batch 4 blocks before notifying owner)
//   8 = aggressive batching (higher latency, but better efficiency)
//
// We enqueue on the transition TO the threshold (old_count == threshold-1)
static int g_enqueue_threshold = 1; // 1=immediate (0→1 edge), 2=batch-2, 4=batch-4
if (old_count + 1 == (unsigned int)g_enqueue_threshold) {
// Remote count just reached threshold, notify owner
if (page->owner_tp) {
mf2_enqueue_pending(page->owner_tp, page);
}
}
// DEBUG: Sample first 10 remote frees - Disabled for performance
// static _Atomic int remote_free_samples = 0;
// int sample = atomic_fetch_add_explicit(&remote_free_samples, 1, memory_order_relaxed);
// if (sample < 10) {
// fprintf(stderr, "[REMOTE_FREE %d] ptr=%p → page=%p (base=%p), remote_count=%u (was %u), EDGE=%s\n",
// sample, ptr, page, page->base, old_count + 1, old_count, (old_count == 0) ? "YES" : "NO");
// }
// Decrement in-use count
int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release);
// Check if page is now empty (FIX #6: acquire to see all remote frees)
if (old_in_use == 1 && page->free_count + atomic_load_explicit(&page->remote_count, memory_order_acquire) >= page->capacity) {
// Memory efficiency: Return empty pages to OS via MADV_DONTNEED
// Keeps VA mapped (no munmap), but releases physical memory
hak_batch_add_page(page->base, POOL_PAGE_SIZE);
}
}
// Top-level free dispatcher
static void mf2_free(void* ptr) {
if (!ptr) return;
// O(1) page lookup (mimalloc's magic!)
MidPage* page = mf2_addr_to_page(ptr);
if (!page) {
// Not a MF2 page (shouldn't happen if MF2 is enabled properly)
return;
}
// Check if we're the owner (fast path); the direct pthread_t compare assumes an integral pthread_t (true on Linux)
MF2_ThreadPages* tp = mf2_thread_pages_get();
if (tp && page->owner_tid == tp->my_tid) {
// Fast: Owner thread, push to local freelist (NO LOCK!)
mf2_free_fast(page, ptr);
} else {
// Slow: Cross-thread free, push to remote stack (lock-free)
mf2_free_slow(page, ptr);
}
}
// ===========================================================================
// Global pool state (simplified: single-threaded for MVP)
static struct {
PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
// Locks: per (class, shard) freelist to allow concurrent operations
PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
// Non-empty bitmap (O(1) empty class skip)
// Bit i = 1 if freelist[class][shard] is non-empty
// Use atomic to avoid class-wide locks
atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES]; // 1 bit per shard
// Remote-free MPSC stacks per (class, shard): lock-free producers, drained under lock on alloc
atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
atomic_uint remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
// Statistics
uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t total_bytes_allocated __attribute__((aligned(64)));
uint64_t total_pages_allocated __attribute__((aligned(64)));
// Per-class page accounting (for Soft CAP guidance)
uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64)));
// ACE: per-class bundle factor for refill (1..4) + last snapshot
int bundle_factor[POOL_NUM_CLASSES];
uint64_t last_hits[POOL_NUM_CLASSES];
uint64_t last_misses[POOL_NUM_CLASSES];
int initialized;
int tls_free_enabled; // env: HAKMEM_POOL_TLS_FREE (default: 1)
// Extra metrics (for learner logging): all relaxed atomics
atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64)));
atomic_uint_fast64_t trylock_success __attribute__((aligned(64)));
atomic_uint_fast64_t ring_underflow __attribute__((aligned(64)));
} g_pool;
static int g_wrap_l2_enabled = 0; // env: HAKMEM_WRAP_L2=1 to allow in wrappers
static int g_shard_mix_enabled = 0; // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing
static int g_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING=1 to enable TLS ring
static int g_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES (1..8)
static int g_ring_return_div = 2; // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third)
static int g_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX (LIFO size cap)
int g_hdr_light_enabled = 0; // env: HAKMEM_HDR_LIGHT=1 (minimize extra fields), =2 (no header writes/validation)
static int g_pool_min_bundle = 2; // env: HAKMEM_POOL_MIN_BUNDLE (default 2)
// Sampled counter updates to reduce hot-path stores: 1/2^k
static int g_count_sample_exp = 10; // env: HAKMEM_POOL_COUNT_SAMPLE (0..16)
static __thread uint32_t t_pool_rng = 0x243f6a88u; // per-thread RNG for sampling
// Size class table (for O(1) lookup). 7 classes; indexes 5/6 are Bridge classes (40KB, 52KB) filling the 32-64KB gap.
static size_t g_class_sizes[POOL_NUM_CLASSES] = {
POOL_CLASS_2KB, // 2 KB
POOL_CLASS_4KB, // 4 KB
POOL_CLASS_8KB, // 8 KB
POOL_CLASS_16KB, // 16 KB
POOL_CLASS_32KB, // 32 KB
POOL_CLASS_40KB, // 40 KB (Bridge class 0)
POOL_CLASS_52KB // 52 KB (Bridge class 1)
};
// Blocks per page (for each class)
__attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = {
POOL_PAGE_SIZE / POOL_CLASS_2KB, // 32 blocks (2KiB)
POOL_PAGE_SIZE / POOL_CLASS_4KB, // 16 blocks (4KiB)
POOL_PAGE_SIZE / POOL_CLASS_8KB, // 8 blocks (8KiB)
POOL_PAGE_SIZE / POOL_CLASS_16KB, // 4 blocks (16KiB)
POOL_PAGE_SIZE / POOL_CLASS_32KB, // 2 blocks (32KiB)
POOL_PAGE_SIZE / POOL_CLASS_40KB, // 1 block (40KiB Bridge)
POOL_PAGE_SIZE / POOL_CLASS_52KB // 1 block (52KiB Bridge)
};
// ===========================================================================
// Helper Functions
// ===========================================================================
// Write minimal header for Mid allocation (fast-return friendly)
static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) {
// For Mid, prefer headerless operation when HDR_LIGHT>=1.
// Debug or non-Mid callers can still write full headers elsewhere.
if (g_hdr_light_enabled >= 1) return; // skip header on alloc hot path
hdr->magic = HAKMEM_MAGIC;
hdr->method = ALLOC_METHOD_POOL;
hdr->size = class_sz;
if (!g_hdr_light_enabled) {
hdr->alloc_site = site_id;
hdr->class_bytes = 0;
hdr->owner_tid = (uintptr_t)pthread_self();
}
}
// Branchless LUT (Lookup Table) for O(1) class determination
// Expanded to 53 entries for Bridge classes (40KB, 52KB)
static const uint8_t SIZE_TO_CLASS[53] = {
0,0,0, // 0-2KB → Class 0
1,1, // 3-4KB → Class 1
2,2,2,2, // 5-8KB → Class 2
3,3,3,3,3,3,3,3, // 9-16KB → Class 3
4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, // 17-32KB → Class 4
5,5,5,5,5,5,5,5, // 33-40KB → Class 5 (Bridge class 0)
6,6,6,6,6,6,6,6,6,6,6,6 // 41-52KB → Class 6 (Bridge class 1)
};
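// Worked example of the LUT fallback in hak_pool_get_class_index() below: a 5000-byte
// request gives kb = (5000 + 1023) >> 10 = 5, and SIZE_TO_CLASS[5] = 2 → the 8 KiB class.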
// Get size class index from size (0-6, or -1 if out of range)
// Updated range check for Bridge classes (0-52KB)
static inline int hak_pool_get_class_index(size_t size) {
// Fast path: exact match against configured class sizes (covers Bridge classes)
// Note: size passed here should already be a rounded class size from ACE.
for (int i = 0; i < POOL_NUM_CLASSES; i++) {
size_t cs = g_class_sizes[i];
if (cs != 0 && size == cs) return i;
}
// Fallback: map arbitrary size to nearest fixed class range via LUT (legacy behavior)
uint32_t kb = (uint32_t)((size + 1023) >> 10); // Round up to KB units
return (kb < 53) ? SIZE_TO_CLASS[kb] : -1; // LUT has 53 entries (0-52KB), covering the Bridge classes
}
// Get shard index from site_id (0-63)
int hak_pool_get_shard_index(uintptr_t site_id) {
if (!g_shard_mix_enabled) {
// Legacy: Shift by 4 to reduce collision (instruction alignment)
return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1));
}
// SplitMix64-like mixer with thread id salt for better dispersion
uint64_t x = (uint64_t)site_id;
uint64_t tid = (uint64_t)(uintptr_t)pthread_self();
x ^= (tid << 1);
x += 0x9e3779b97f4a7c15ULL;
x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
x = (x ^ (x >> 31));
return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1));
}
// TLS helpers
#include "box/pool_tls_core.inc.h"
// Refill/ACE (boxed)
#include "box/pool_refill.inc.h"
// Init/Shutdown + MF2 debug (boxed)
#include "box/pool_init_api.inc.h"
// Pool statistics (boxed)
#include "box/pool_stats.inc.h"
// Public API (boxed): alloc/free/lookup/free_fast
#include "box/pool_api.inc.h"