// ============================================================================
// hakmem_pool.c - L2 Hybrid Pool Implementation (Mid-Size: 2-32KiB)
// ============================================================================
//
// Size class definitions:
// ┌──────────┬──────────┬──────────────┬─────────────┐
// │ Class    │ Size     │ Initial CAP  │ Page layout │
// ├──────────┼──────────┼──────────────┼─────────────┤
// │ Class 0  │ 2 KiB    │ 64 pages     │ 32 blocks/p │
// │ Class 1  │ 4 KiB    │ 64 pages     │ 16 blocks/p │
// │ Class 2  │ 8 KiB    │ 64 pages     │ 8 blocks/p  │
// │ Class 3  │ 16 KiB   │ 32 pages     │ 4 blocks/p  │
// │ Class 4  │ 32 KiB   │ 16 pages     │ 2 blocks/p  │
// │ DYN1     │ 6 KiB*   │ 0 (disabled) │ variable    │
// │ DYN2     │ (unused) │ 0 (disabled) │ variable    │
// └──────────┴──────────┴──────────────┴─────────────┘
// * DYN1 is a dynamic class used to fill the gap (8-16KB)
//
// W_MAX (round-up tolerance factor):
// - Meaning: how far above the requested size a class may be and still be used
// - Default: 1.40 (up to 40% round-up is accepted)
// - Example: a 3 KiB request may use the 4 KiB class (1.33x < 1.40)
// - Environment variable: HAKMEM_WMAX_MID=1.6 overrides the default
//
// CAP (inventory):
// - Meaning: maximum number of pages kept per class
// - Initial values: {64,64,64,32,16} - conservative (footprint first)
// - Recommended: {256,256,256,128,64} - performance first
// - Environment variable: HAKMEM_CAP_MID=256,256,256,128,64
// - Learning mode: HAKMEM_LEARN=1 enables automatic tuning
//
// TLS ring structure:
// - POOL_L2_RING_CAP: ring buffer capacity (default 48, see define below)
// - ActivePage A/B/C: bump-run allocation (lock-free)
// - LIFO overflow: blocks that spill out of the ring
//
// Performance tuning:
// 1. Quadruple the initial CAP: HAKMEM_CAP_MID=256,256,256,128,64
// 2. Relax W_MAX:               HAKMEM_WMAX_MID=1.6
// 3. Enable DYN1:               HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64
// 4. Learning mode:             HAKMEM_LEARN=1
//
// License: MIT
// Last Updated: 2025-10-26 (code cleanup complete)
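
// Illustrative sketch (not part of the build): how the W_MAX round-up gate
// described above would accept or reject a size class. The helper name is
// hypothetical; the real class-selection logic lives elsewhere in this file.
#if 0
static int example_wmax_accepts(size_t req_size, size_t class_size, double w_max) {
    // Accept the class only if the round-up factor stays within W_MAX.
    // e.g. req=3072, class=4096, w_max=1.40 -> 4096 <= 4300.8 -> accepted
    return (double)class_size <= w_max * (double)req_size;
}
#endif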

#include "hakmem_pool.h"
#include "hakmem_config.h"
#include "hakmem_internal.h"   // For AllocHeader and HAKMEM_MAGIC
#include "hakmem_syscall.h"    // Box 3 syscall layer (bypasses LD_PRELOAD)
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>            // INT32_MAX, fixed-width types used below
#include <sys/mman.h>
#include <pthread.h>
#include <stdatomic.h>
#include <time.h>              // clock_gettime fallback in mf2_rdtsc()
#include "hakmem_prof.h"
#include "hakmem_policy.h"     // FrozenPolicy caps (Soft CAP gating)
#include "hakmem_debug.h"

// False sharing mitigation: padded mutex type (64B)
typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64)]; } PaddedMutex;
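
// Illustrative check (kept out of the build): the padding arithmetic above is
// meant to round each mutex up to a whole number of 64-byte cache lines so
// neighbouring locks never share a line.
#if 0
_Static_assert(sizeof(PaddedMutex) % 64 == 0,
               "PaddedMutex should occupy a whole number of cache lines");
#endif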

// ===========================================================================
// Internal Data Structures
// ===========================================================================

// Freelist block header (embedded in allocated block)
typedef struct PoolBlock {
    struct PoolBlock* next;   // Next free block in freelist
} PoolBlock;

// TLS cache: one block per class to avoid frequent locks (legacy single-slot)
__thread PoolBlock* tls_pool_cache[POOL_NUM_CLASSES] = {NULL};

// TLS ring buffer to further reduce lock traffic (configurable capacity)
// Separate ring size for L2 Pool (mid/large allocations: 8-32KB)
#ifndef POOL_L2_RING_CAP
#define POOL_L2_RING_CAP 48   // Optimized for L1 cache efficiency (384B, 6 cache lines)
#endif
typedef struct { PoolBlock* items[POOL_L2_RING_CAP]; int top; } PoolTLSRing;
typedef struct { PoolTLSRing ring; PoolBlock* lo_head; size_t lo_count; } PoolTLSBin;
static __thread PoolTLSBin g_tls_bin[POOL_NUM_CLASSES];
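
// Illustrative sketch (not compiled): how a per-class TLS bin absorbs a freed
// block, bounded ring first and LIFO overflow second. This mirrors the policy
// used by mid_tc_drain_into_tls() below; the helper name here is hypothetical.
#if 0
static void example_tls_bin_push(PoolTLSBin* bin, PoolBlock* b) {
    if (bin->ring.top < POOL_L2_RING_CAP) {
        bin->ring.items[bin->ring.top++] = b;   // fast path: bounded ring
    } else {
        b->next = bin->lo_head;                 // overflow: unbounded LIFO list
        bin->lo_head = b;
        bin->lo_count++;
    }
}
#endif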

// TLS active pages (per class): bump-run (no per-block links) from privately owned pages (max 3)
typedef struct {
    void* page;    // page base
    char* bump;    // next raw allocation (header start)
    char* end;     // page end (bump-run limit)
    int   count;   // remaining blocks (for quick checks)
} PoolTLSPage;
static __thread PoolTLSPage g_tls_active_page_a[POOL_NUM_CLASSES];
static __thread PoolTLSPage g_tls_active_page_b[POOL_NUM_CLASSES];
static __thread PoolTLSPage g_tls_active_page_c[POOL_NUM_CLASSES];   // QW2-adjusted: 3 slots (was 4)

// Mid page descriptor registry (64KiB pages → {class_idx, owner_tid})
#define MID_DESC_BUCKETS 2048
typedef struct MidPageDesc {
    void* page;
    uint8_t class_idx;
    uint8_t _pad0;
    uint16_t _pad1;
    uint64_t owner_tid;
    atomic_int in_use;        // live allocations on this page
    int blocks_per_page;      // total blocks on this page
    atomic_int pending_dn;    // background DONTNEED enqueued
    struct MidPageDesc* next;
} MidPageDesc;
static pthread_mutex_t g_mid_desc_mu[MID_DESC_BUCKETS];
static MidPageDesc* g_mid_desc_head[MID_DESC_BUCKETS];

static inline uint32_t mid_desc_hash(void* page) {
    uintptr_t x = (uintptr_t)page >> 16;   // 64KiB alignment granularity
    // mix
    x ^= x >> 33; x *= 0xff51afd7ed558ccdULL; x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL; x ^= x >> 33;
    return (uint32_t)(x & (MID_DESC_BUCKETS - 1));
}

// Thread-safe initialization using pthread_once
static pthread_once_t mid_desc_init_once_control = PTHREAD_ONCE_INIT;
static void mid_desc_init_impl(void) {
    for (int i = 0; i < MID_DESC_BUCKETS; i++) {
        pthread_mutex_init(&g_mid_desc_mu[i], NULL);
        g_mid_desc_head[i] = NULL;
    }
}
static void mid_desc_init_once(void) {
    pthread_once(&mid_desc_init_once_control, mid_desc_init_impl);
}

static void mid_desc_register(void* page, int class_idx, uint64_t owner_tid) {
    mid_desc_init_once();
    uint32_t h = mid_desc_hash(page);
    pthread_mutex_lock(&g_mid_desc_mu[h]);
    MidPageDesc* d = (MidPageDesc*)hkm_libc_malloc(sizeof(MidPageDesc));   // P0 Fix: Use libc malloc
    if (d) {
        d->page = page; d->class_idx = (uint8_t)class_idx; d->owner_tid = owner_tid; d->next = g_mid_desc_head[h];
        atomic_store(&d->in_use, 0);
        d->blocks_per_page = 0;   // optional; not used for emptiness in P0
        atomic_store(&d->pending_dn, 0);
        g_mid_desc_head[h] = d;
    }
    pthread_mutex_unlock(&g_mid_desc_mu[h]);
}

static MidPageDesc* mid_desc_lookup(void* addr) {
    mid_desc_init_once();
    void* page = (void*)((uintptr_t)addr & ~((uintptr_t)POOL_PAGE_SIZE - 1));
    uint32_t h = mid_desc_hash(page);
    for (MidPageDesc* d = g_mid_desc_head[h]; d; d = d->next) {
        if (d->page == page) return d;
    }
    return NULL;
}

static void mid_desc_adopt(void* addr, int class_idx, uint64_t owner_tid) {
    if (owner_tid == 0) return;
    void* page = (void*)((uintptr_t)addr & ~((uintptr_t)POOL_PAGE_SIZE - 1));
    uint32_t h = mid_desc_hash(page);
    pthread_mutex_lock(&g_mid_desc_mu[h]);
    MidPageDesc* d = g_mid_desc_head[h];
    while (d) { if (d->page == page) break; d = d->next; }
    if (d) {
        if (d->owner_tid == 0) d->owner_tid = owner_tid;
    } else {
        MidPageDesc* nd = (MidPageDesc*)hkm_libc_malloc(sizeof(MidPageDesc));   // P0 Fix: Use libc malloc
        if (nd) { nd->page = page; nd->class_idx = (uint8_t)class_idx; nd->owner_tid = owner_tid; nd->next = g_mid_desc_head[h]; g_mid_desc_head[h] = nd; }
    }
    pthread_mutex_unlock(&g_mid_desc_mu[h]);
}

// Increment page in-use counter for given raw block pointer
static inline void mid_page_inuse_inc(void* raw) {
    MidPageDesc* d = mid_desc_lookup(raw);
    if (d) atomic_fetch_add_explicit(&d->in_use, 1, memory_order_relaxed);
}

// Decrement page in-use counter and enqueue DONTNEED when it drops to 0
extern int hak_batch_add_page(void* page, size_t size);
static inline void mid_page_inuse_dec_and_maybe_dn(void* raw) {
    MidPageDesc* d = mid_desc_lookup(raw);
    if (!d) return;
    int nv = atomic_fetch_sub_explicit(&d->in_use, 1, memory_order_relaxed) - 1;
    if (nv <= 0) {
        // Fire once per empty transition
        if (atomic_exchange_explicit(&d->pending_dn, 1, memory_order_acq_rel) == 0) {
            hak_batch_add_page(d->page, POOL_PAGE_SIZE);
        }
    }
}
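
// Illustrative sketch (not compiled): how the two counters above are meant to
// bracket a block's lifetime. The wrapper below is hypothetical; the real call
// sites are in the allocation and free paths later in this file.
#if 0
static void example_track_block_lifetime(void* raw_block) {
    mid_page_inuse_inc(raw_block);              // on allocation: page gains one live block
    /* ... block is handed to the caller and later returned ... */
    mid_page_inuse_dec_and_maybe_dn(raw_block); // on free: last block enqueues the DONTNEED batch
}
#endif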

// ---------------- Transfer Cache (per-thread per-class inbox) --------------
typedef struct MidTC {
    atomic_uintptr_t inbox[POOL_NUM_CLASSES];
} MidTC;

#define MID_TC_BUCKETS 1024
typedef struct MidTCEntry { uint64_t tid; MidTC* tc; struct MidTCEntry* next; } MidTCEntry;
static pthread_mutex_t g_mid_tc_mu[MID_TC_BUCKETS];
static MidTCEntry* g_mid_tc_head[MID_TC_BUCKETS];
static __thread MidTC* t_mid_tc = NULL;
static int g_tc_enabled = 1;           // env: HAKMEM_TC_ENABLE (default 1)
static int g_tc_drain_unbounded = 1;   // env: HAKMEM_TC_UNBOUNDED (default 1)
static int g_tc_drain_max = 0;         // env: HAKMEM_TC_DRAIN_MAX (0=unbounded)
static int g_tc_drain_trigger = 2;     // env: HAKMEM_TC_DRAIN_TRIGGER (ring->top < trigger)

static inline uint32_t mid_tc_hash(uint64_t tid) {
    tid ^= tid >> 33; tid *= 0xff51afd7ed558ccdULL; tid ^= tid >> 33; tid *= 0xc4ceb9fe1a85ec53ULL; tid ^= tid >> 33;
    return (uint32_t)(tid & (MID_TC_BUCKETS - 1));
}

// Thread-safe initialization using pthread_once
static pthread_once_t mid_tc_init_once_control = PTHREAD_ONCE_INIT;
static void mid_tc_init_impl(void) {
    for (int i = 0; i < MID_TC_BUCKETS; i++) {
        pthread_mutex_init(&g_mid_tc_mu[i], NULL);
        g_mid_tc_head[i] = NULL;
    }
}
static void mid_tc_init_once(void) {
    pthread_once(&mid_tc_init_once_control, mid_tc_init_impl);
}

static MidTC* mid_tc_get(void) {
    if (t_mid_tc) return t_mid_tc;
    mid_tc_init_once();
    MidTC* tc = (MidTC*)hkm_libc_calloc(1, sizeof(MidTC));   // P0 Fix: Use libc malloc
    if (!tc) return NULL;
    uint64_t tid = (uint64_t)(uintptr_t)pthread_self();
    uint32_t h = mid_tc_hash(tid);
    pthread_mutex_lock(&g_mid_tc_mu[h]);
    MidTCEntry* e = (MidTCEntry*)hkm_libc_malloc(sizeof(MidTCEntry));   // P0 Fix: Use libc malloc
    if (e) { e->tid = tid; e->tc = tc; e->next = g_mid_tc_head[h]; g_mid_tc_head[h] = e; }
    pthread_mutex_unlock(&g_mid_tc_mu[h]);
    t_mid_tc = tc;
    return tc;
}

static MidTC* mid_tc_lookup_by_tid(uint64_t tid) {
    mid_tc_init_once();
    uint32_t h = mid_tc_hash(tid);
    MidTCEntry* e = g_mid_tc_head[h];
    while (e) { if (e->tid == tid) return e->tc; e = e->next; }
    return NULL;
}

static inline void mid_tc_push(MidTC* tc, int class_idx, PoolBlock* b) {
    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&tc->inbox[class_idx], memory_order_acquire);
        b->next = (PoolBlock*)old_head;
    } while (!atomic_compare_exchange_weak_explicit(&tc->inbox[class_idx], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed));
}

static inline int mid_tc_drain_into_tls(int class_idx, PoolTLSRing* ring, PoolTLSBin* bin) {
    MidTC* tc = mid_tc_get();
    if (!tc) return 0;
    HKM_TIME_START(t_tc);
    uintptr_t head = atomic_exchange_explicit(&tc->inbox[class_idx], (uintptr_t)0, memory_order_acq_rel);
    if (!head) { HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc); return 0; }
    int moved = 0;
    int limit = (g_tc_drain_unbounded || g_tc_drain_max <= 0) ? INT32_MAX : g_tc_drain_max;
    PoolBlock* cur = (PoolBlock*)head;
    while (cur && moved < limit) {
        PoolBlock* nxt = cur->next;
        if (ring->top < POOL_L2_RING_CAP) {
            ring->items[ring->top++] = cur; moved++;
        } else {
            cur->next = bin->lo_head; bin->lo_head = cur; bin->lo_count++; moved++;
        }
        cur = nxt;
    }
    while (cur) { PoolBlock* nxt = cur->next; mid_tc_push(tc, class_idx, cur); cur = nxt; }
    HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc);
    return moved;
}

static inline int mid_tc_has_items(int class_idx) {
    MidTC* tc = t_mid_tc;   // do not allocate on peek
    if (!tc) return 0;
    return atomic_load_explicit(&tc->inbox[class_idx], memory_order_relaxed) != 0;
}
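
// Illustrative sketch (not compiled): how a cross-thread free could hand a
// block back to its owner through the transfer cache. The routing wrapper is
// hypothetical; the real remote-free path appears later in this file.
#if 0
static int example_route_remote_free(uint64_t owner_tid, int class_idx, PoolBlock* b) {
    MidTC* owner_tc = mid_tc_lookup_by_tid(owner_tid);
    if (!owner_tc) return 0;              // owner unknown: caller falls back to the shared pool
    mid_tc_push(owner_tc, class_idx, b);  // lock-free push into the owner's inbox
    return 1;                             // owner later drains it via mid_tc_drain_into_tls()
}
#endif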

// ===========================================================================
// MF2 Per-Page Sharding: Mimalloc-Inspired Architecture
// ===========================================================================
//
// Key idea: Each 64KB page has independent freelist (no sharing!)
// - O(1) page lookup from block address: (addr & ~0xFFFF)
// - Owner thread: fast path (no locks, no atomics)
// - Cross-thread free: lock-free remote stack
// - Expected: +50% (13.78 → 20.7 M/s, 60-75% of mimalloc)
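
// Worked example (illustrative, not compiled): the O(1) lookup promised above
// is pure address arithmetic. The block address below is hypothetical.
#if 0
static void example_page_math(void) {
    uintptr_t addr = 0x7f3a12345678ULL;                            // hypothetical block address
    uintptr_t base = addr & ~0xFFFFULL;                            // 0x7f3a12340000: 64KB page base
    size_t    idx  = (base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);  // direct-mapped registry slot
    (void)idx;   // mf2_addr_to_page() validates pages[idx]->base == (void*)base
}
#endif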

// MF2 Configuration Constants (Quick Win #5)
#define MF2_PENDING_QUEUE_BUDGET 4    // Max pages to drain from pending queue
#define MF2_DEBUG_SAMPLE_COUNT 20     // Number of debug samples to log
#define MF2_TSC_CYCLES_PER_US 3000    // Estimated TSC cycles per microsecond
#define MF2_PAGE_SIZE_SHIFT 16        // log2(64KB) for fast division
#define MF2_PAGE_ALIGNMENT 65536      // 64KB alignment for mmap

// Debug Logging Macros (Quick Win #6)
// Conditional compilation for debug logs - build with -DHAKMEM_DEBUG_MF2 to enable
#ifdef HAKMEM_DEBUG_MF2
#define MF2_DEBUG_LOG(fmt, ...) fprintf(stderr, "[MF2] " fmt "\n", ##__VA_ARGS__)
#define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__)
#else
#define MF2_DEBUG_LOG(fmt, ...) ((void)0)
#define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__)
#endif

// Forward declarations
static size_t g_class_sizes[POOL_NUM_CLASSES];

// MF2 Page descriptor: per-page metadata (one per 64KB page)
typedef struct MidPage {
    // Page identity
    void* base;           // Page base address (64KB aligned)
    uint8_t class_idx;    // Size class index (0-6)
    uint8_t flags;        // Page flags (reserved for future use)
    uint16_t _pad0;

    // Ownership
    pthread_t owner_tid;                // Owner thread ID (for fast-path check)
    struct MF2_ThreadPages* owner_tp;   // Owner thread's heap (for pending queue access)
    uint64_t last_transfer_time;        // rdtsc() timestamp of last ownership transfer (lease mechanism)

    // Page-local freelist (owner-only, NO LOCK!)
    PoolBlock* freelist;   // Local freelist head
    uint16_t free_count;   // Number of free blocks
    uint16_t capacity;     // Total blocks per page

    // Remote frees (cross-thread, lock-free MPSC stack)
    atomic_uintptr_t remote_head;   // Lock-free remote free stack
    atomic_uint remote_count;       // Remote free count (for quick check)

    // Lifecycle
    atomic_int in_use;        // Live allocations on this page
    atomic_int pending_dn;    // DONTNEED enqueued flag

    // Linkage (thread-local page lists)
    struct MidPage* next_page;   // Next page in thread's list
    struct MidPage* prev_page;   // Previous page in thread's list

    // Pending queue (remote drain notification)
    _Atomic(_Bool) in_remote_pending;   // Is this page in pending queue?
    struct MidPage* next_pending;       // Next page in pending queue

    // Padding to cache line boundary (avoid false sharing)
    char _pad[64 - ((sizeof(void*) * 5 + sizeof(PoolBlock*) + sizeof(uint16_t) * 2 +
                     sizeof(atomic_uintptr_t) + sizeof(atomic_uint) +
                     sizeof(atomic_int) * 2 + sizeof(pthread_t) +
                     sizeof(_Atomic(_Bool)) + 4) % 64)];
} MidPage;
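
// Diagnostic sketch (illustrative, not compiled): the manual byte count inside
// _pad is easy to get out of sync with the real layout, so a quick size dump
// like this can be used to confirm the descriptor still sits on cache-line
// multiples. The helper name is hypothetical.
#if 0
#include <stddef.h>
static void example_dump_midpage_layout(void) {
    printf("sizeof(MidPage)=%zu, remote_head offset=%zu\n",
           sizeof(MidPage), offsetof(MidPage, remote_head));
}
#endif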

// Page registry: O(1) lookup from block address
// Use direct indexing: (addr >> 16) & MASK
#define MF2_PAGE_REGISTRY_BITS 16   // 64K entries (4GB address space with 64KB pages)
#define MF2_PAGE_REGISTRY_SIZE (1 << MF2_PAGE_REGISTRY_BITS)
#define MF2_PAGE_REGISTRY_MASK (MF2_PAGE_REGISTRY_SIZE - 1)

typedef struct {
    // Direct-mapped page table (no hash collisions!)
    MidPage* pages[MF2_PAGE_REGISTRY_SIZE];

    // Coarse-grained locks for rare updates (page alloc/free)
    // 256 locks = 256-way parallelism for page registration
    pthread_mutex_t locks[256];

    // Statistics
    atomic_uint_fast64_t total_pages;    // Total pages allocated
    atomic_uint_fast64_t active_pages;   // Pages with live allocations
} MF2_PageRegistry;

// Thread-local page lists (one list per size class)
typedef struct MF2_ThreadPages {
    // Active pages (have free blocks)
    MidPage* active_page[POOL_NUM_CLASSES];

    // Partial pages (drained pages with free blocks, LIFO for cache locality)
    // Checked before allocating new pages (fast reuse path)
    MidPage* partial_pages[POOL_NUM_CLASSES];

    // Full pages (no free blocks, but may receive remote frees)
    // TODO: Gradually deprecate in favor of partial_pages
    MidPage* full_pages[POOL_NUM_CLASSES];

    // Pending queue (pages with remote frees, MPSC lock-free stack)
    atomic_uintptr_t pages_remote_pending[POOL_NUM_CLASSES];

    // Pending claim flags (prevent multi-consumer CAS thrashing)
    // One adopter at a time per queue (test_and_set to claim, clear to release)
    atomic_flag pending_claim[POOL_NUM_CLASSES];

    // Page ownership count (for statistics)
    uint32_t page_count[POOL_NUM_CLASSES];

    // Thread identity (cached for fast comparison)
    pthread_t my_tid;

    // Route P: Activity tracking for idle-based adoption
    // Updated on every allocation (mf2_alloc_fast)
    // Read by adopters to check if owner is idle
    atomic_uint_fast64_t last_alloc_tsc;
} MF2_ThreadPages;

// Global page registry (shared, rarely accessed)
static MF2_PageRegistry g_mf2_page_registry;

// Thread-local page lists (hot path, no sharing!)
static __thread MF2_ThreadPages* t_mf2_pages = NULL;

// ===========================================================================
// MF2 Global State (Quick Win #3b - Structured Globals)
// ===========================================================================
// Individual globals replaced with structured state below.
// Old declarations removed, replaced with macro-mapped struct instances.
//
// Benefits:
// - Logical grouping (config, registry, stats)
// - Better documentation
// - Easier to extend or refactor
// - Single source of truth for each category

#define MF2_MAX_THREADS 256

// MF2 Configuration (environment variables)
typedef struct {
    int enabled;              // HAKMEM_MF2_ENABLE (0=disabled, 1=enabled)
    int max_queues;           // HAKMEM_MF2_MAX_QUEUES (default: 2)
    int lease_ms;             // HAKMEM_MF2_LEASE_MS (default: 10ms, 0=disabled)
    int idle_threshold_us;    // HAKMEM_MF2_IDLE_THRESHOLD_US (default: 150µs)
} MF2_Config;
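
// Illustrative sketch (not compiled): one way the four knobs above could be
// read from the environment. The real parsing happens in the pool's init code
// elsewhere in this file; the helper name is hypothetical.
#if 0
static void example_load_mf2_config(MF2_Config* cfg) {
    const char* s;
    if ((s = getenv("HAKMEM_MF2_ENABLE")))            cfg->enabled = atoi(s);
    if ((s = getenv("HAKMEM_MF2_MAX_QUEUES")))        cfg->max_queues = atoi(s);
    if ((s = getenv("HAKMEM_MF2_LEASE_MS")))          cfg->lease_ms = atoi(s);
    if ((s = getenv("HAKMEM_MF2_IDLE_THRESHOLD_US"))) cfg->idle_threshold_us = atoi(s);
}
#endif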

// MF2 Thread Registry (cross-thread coordination)
typedef struct {
    MF2_ThreadPages* all_thread_pages[MF2_MAX_THREADS];   // Global registry
    _Atomic int num_thread_pages;                          // Active thread count
    _Atomic int adoptable_count[POOL_NUM_CLASSES];         // Non-empty pending queues
    pthread_key_t tls_key;                                 // Thread-local storage key
    pthread_once_t key_once;                               // TLS initialization guard
} MF2_Registry;

// MF2 Statistics (debug instrumentation)
typedef struct {
    // Allocation path
    atomic_uint_fast64_t alloc_fast_hit;
    atomic_uint_fast64_t alloc_slow_hit;
    atomic_uint_fast64_t page_reuse_count;
    atomic_uint_fast64_t new_page_count;

    // Free path
    atomic_uint_fast64_t free_owner_count;
    atomic_uint_fast64_t free_remote_count;

    // Drain operations
    atomic_uint_fast64_t drain_count;
    atomic_uint_fast64_t drain_blocks;
    atomic_uint_fast64_t drain_attempts;
    atomic_uint_fast64_t drain_success;
    atomic_uint_fast64_t slow_checked_drain;
    atomic_uint_fast64_t slow_found_remote;

    // Full page scan (obsolete, kept for historical tracking)
    atomic_uint_fast64_t full_scan_checked;
    atomic_uint_fast64_t full_scan_found_remote;
    atomic_uint_fast64_t eager_drain_scanned;
    atomic_uint_fast64_t eager_drain_found;

    // Pending queue
    atomic_uint_fast64_t pending_enqueued;
    atomic_uint_fast64_t pending_drained;
    atomic_uint_fast64_t pending_requeued;
} MF2_Stats;

// Instantiate structured global state (Quick Win #3b)
static MF2_Config g_mf2_config = {
    .enabled = 0,   // Will be set by env var
    .max_queues = 2,
    .lease_ms = 10,
    .idle_threshold_us = 150
};

static MF2_Registry g_mf2_registry = {
    .all_thread_pages = {0},
    .num_thread_pages = 0,
    .adoptable_count = {0},
    .tls_key = 0,
    .key_once = PTHREAD_ONCE_INIT
};

static MF2_Stats g_mf2_stats = {
    // All fields initialized to 0 (atomic zero-initialization is valid)
    .alloc_fast_hit = 0,
    .alloc_slow_hit = 0,
    .page_reuse_count = 0,
    .new_page_count = 0,
    .free_owner_count = 0,
    .free_remote_count = 0,
    .drain_count = 0,
    .drain_blocks = 0,
    .drain_attempts = 0,
    .drain_success = 0,
    .slow_checked_drain = 0,
    .slow_found_remote = 0,
    .full_scan_checked = 0,
    .full_scan_found_remote = 0,
    .eager_drain_scanned = 0,
    .eager_drain_found = 0,
    .pending_enqueued = 0,
    .pending_drained = 0,
    .pending_requeued = 0
};

// Compatibility macros: Map old global names to struct fields
// This allows existing code to work unchanged while using structured state
#define g_mf2_enabled            (g_mf2_config.enabled)
#define g_mf2_max_queues         (g_mf2_config.max_queues)
#define g_mf2_lease_ms           (g_mf2_config.lease_ms)
#define g_mf2_idle_threshold_us  (g_mf2_config.idle_threshold_us)

#define g_all_thread_pages   (g_mf2_registry.all_thread_pages)
#define g_num_thread_pages   (g_mf2_registry.num_thread_pages)
#define g_adoptable_count    (g_mf2_registry.adoptable_count)
#define g_mf2_tls_key        (g_mf2_registry.tls_key)
#define g_mf2_key_once       (g_mf2_registry.key_once)

#define g_mf2_alloc_fast_hit          (g_mf2_stats.alloc_fast_hit)
#define g_mf2_alloc_slow_hit          (g_mf2_stats.alloc_slow_hit)
#define g_mf2_page_reuse_count        (g_mf2_stats.page_reuse_count)
#define g_mf2_new_page_count          (g_mf2_stats.new_page_count)
#define g_mf2_free_owner_count        (g_mf2_stats.free_owner_count)
#define g_mf2_free_remote_count       (g_mf2_stats.free_remote_count)
#define g_mf2_drain_count             (g_mf2_stats.drain_count)
#define g_mf2_drain_blocks            (g_mf2_stats.drain_blocks)
#define g_mf2_drain_attempts          (g_mf2_stats.drain_attempts)
#define g_mf2_drain_success           (g_mf2_stats.drain_success)
#define g_mf2_slow_checked_drain      (g_mf2_stats.slow_checked_drain)
#define g_mf2_slow_found_remote       (g_mf2_stats.slow_found_remote)
#define g_mf2_full_scan_checked       (g_mf2_stats.full_scan_checked)
#define g_mf2_full_scan_found_remote  (g_mf2_stats.full_scan_found_remote)
#define g_mf2_eager_drain_scanned     (g_mf2_stats.eager_drain_scanned)
#define g_mf2_eager_drain_found       (g_mf2_stats.eager_drain_found)
#define g_mf2_pending_enqueued        (g_mf2_stats.pending_enqueued)
#define g_mf2_pending_drained         (g_mf2_stats.pending_drained)
#define g_mf2_pending_requeued        (g_mf2_stats.pending_requeued)

// ===========================================================================
// End of MF2 Data Structures
// ===========================================================================

// --- MF2 Initialization Functions ---

// Thread-safe initialization using pthread_once
static pthread_once_t mf2_page_registry_init_control = PTHREAD_ONCE_INIT;
static void mf2_page_registry_init_impl(void) {
    // Initialize all page slots to NULL
    memset(&g_mf2_page_registry, 0, sizeof(g_mf2_page_registry));

    // Initialize 256 coarse-grained locks for registry updates
    for (int i = 0; i < 256; i++) {
        pthread_mutex_init(&g_mf2_page_registry.locks[i], NULL);
    }

    // Initialize counters
    atomic_store(&g_mf2_page_registry.total_pages, 0);
    atomic_store(&g_mf2_page_registry.active_pages, 0);
}
static void mf2_page_registry_init(void) {
    pthread_once(&mf2_page_registry_init_control, mf2_page_registry_init_impl);
}

// Strategy A: ThreadPages destructor (cleanup on thread exit)
static void mf2_thread_pages_destructor(void* arg) {
    MF2_ThreadPages* tp = (MF2_ThreadPages*)arg;
    if (!tp) return;

    // SAFETY: Don't remove from global registry or free memory
    // Reason: Causes "malloc(): unsorted double linked list corrupted" crashes
    // Tradeoff: Small memory leak (one ThreadPages struct per thread lifetime)
    // TODO: Investigate safe cleanup mechanism

    // Remove from global registry (DISABLED for safety)
    // for (int i = 0; i < atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); i++) {
    //     if (atomic_load_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], memory_order_acquire) == (uintptr_t)tp) {
    //         atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], 0, memory_order_release);
    //         break;
    //     }
    // }

    // Free all pages owned by this thread (DISABLED for safety)
    // hkm_libc_free(tp);

    (void)tp;   // Suppress unused warning
}

// Strategy A: Initialize pthread_key (once only)
static void mf2_init_tls_key(void) {
    pthread_key_create(&g_mf2_tls_key, mf2_thread_pages_destructor);
}

// Helper: rdtsc() - Read CPU timestamp counter (for Route P idle detection)
static inline uint64_t mf2_rdtsc(void) {
#if defined(__x86_64__) || defined(__i386__)
    uint32_t lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)hi << 32) | lo;
#else
    // Fallback for non-x86 architectures (use clock_gettime approximation)
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
#endif
}
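
// Worked numbers (illustrative, not compiled): the idle threshold and the
// lease period are both compared in TSC cycles using the rough
// MF2_TSC_CYCLES_PER_US = 3000 figure (about a 3 GHz core); on other clock
// rates these budgets stretch or shrink accordingly.
#if 0
static void example_tsc_budgets(void) {
    uint64_t idle_cycles  = 150ULL * MF2_TSC_CYCLES_PER_US;             //    450,000 cycles ~= 150 us
    uint64_t lease_cycles = 10ULL  * (MF2_TSC_CYCLES_PER_US * 1000ULL); // 30,000,000 cycles ~= 10 ms
    (void)idle_cycles; (void)lease_cycles;
}
#endif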

static MF2_ThreadPages* mf2_thread_pages_get(void) {
    if (t_mf2_pages) return t_mf2_pages;

    // Initialize pthread_key (once only)
    pthread_once(&g_mf2_key_once, mf2_init_tls_key);

    // Allocate thread-local page lists
    MF2_ThreadPages* tp = (MF2_ThreadPages*)hkm_libc_calloc(1, sizeof(MF2_ThreadPages));
    if (!tp) return NULL;

    // Initialize with current thread ID
    tp->my_tid = pthread_self();

    // All page lists start empty (NULL)
    for (int c = 0; c < POOL_NUM_CLASSES; c++) {
        tp->active_page[c] = NULL;
        tp->full_pages[c] = NULL;
        atomic_store_explicit(&tp->pages_remote_pending[c], 0, memory_order_relaxed);
        atomic_flag_clear_explicit(&tp->pending_claim[c], memory_order_relaxed);
        tp->page_count[c] = 0;
    }

    // Route P: Initialize activity tracking
    atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed);

    // Strategy A: Register in global array for round-robin drain
    int idx = atomic_fetch_add_explicit(&g_num_thread_pages, 1, memory_order_acq_rel);
    if (idx < MF2_MAX_THREADS) {
        atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[idx], (uintptr_t)tp, memory_order_release);

        // DEBUG: Log first 10 thread registrations - Disabled for performance
        // static _Atomic int reg_samples = 0;
        // int rs = atomic_fetch_add_explicit(&reg_samples, 1, memory_order_relaxed);
        // if (rs < 10) {
        //     fprintf(stderr, "[TLS_REGISTER %d] tid=%lu, tp=%p, idx=%d\n",
        //             rs, (unsigned long)tp->my_tid, tp, idx);
        // }
    } else {
        MF2_ERROR_LOG("Too many threads! MAX=%d", MF2_MAX_THREADS);
    }

    // Set pthread-specific data for destructor
    pthread_setspecific(g_mf2_tls_key, tp);

    t_mf2_pages = tp;
    return tp;
}

// --- MF2 Page Allocation & Lookup ---

// O(1) page lookup from block address (mimalloc's secret sauce!)
static inline MidPage* mf2_addr_to_page(void* addr) {
    // Step 1: Get page base address (64KB aligned)
    // 0xFFFF = 65535, ~0xFFFF clears bottom 16 bits
    void* page_base = (void*)((uintptr_t)addr & ~0xFFFFULL);

    // Step 2: Index into registry (direct-mapped, 64K entries)
    // (addr >> 16) extracts page index, & 0xFFFF wraps to registry size
    size_t idx = ((uintptr_t)page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);

    // Step 3: Direct lookup (no hash collision handling needed with 64K entries)
    MidPage* page = g_mf2_page_registry.pages[idx];

    // ALIGNMENT VERIFICATION (Step 3) - Sample first 100 lookups
    static _Atomic int lookup_count = 0;
    // DEBUG: Disabled for performance
    // int count = atomic_fetch_add_explicit(&lookup_count, 1, memory_order_relaxed);
    // if (count < 100) {
    //     int found = (page != NULL);
    //     int match = (page && page->base == page_base);
    //     fprintf(stderr, "[LOOKUP %d] addr=%p → page_base=%p → idx=%zu → found=%s",
    //             count, addr, page_base, idx, found ? "YES" : "NO");
    //     if (page) {
    //         fprintf(stderr, ", page->base=%p, match=%s",
    //                 page->base, match ? "YES" : "NO");
    //     }
    //     fprintf(stderr, "\n");
    // }
    (void)lookup_count;   // kept only for the disabled sampling block above

    // Validation: Ensure page base matches (handles potential collisions)
    if (page && page->base == page_base) {
        return page;
    }

    // Collision or not registered (shouldn't happen in normal operation)
    return NULL;
}

// Register a page in the global registry (called once per page allocation)
static void mf2_register_page(MidPage* page) {
    if (!page) return;

    // Calculate registry index from page base
    size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);

    // ALIGNMENT VERIFICATION (Step 2) - DEBUG: Disabled for performance
    // static int register_count = 0;
    // if (register_count < 10) {
    //     fprintf(stderr, "[REGISTER %d] Page %p → idx %zu (aligned=%s)\n",
    //             register_count, page->base, idx,
    //             (((uintptr_t)page->base & 0xFFFF) == 0) ? "YES" : "NO");
    //     register_count++;
    // }

    // Coarse-grained lock (256 locks for 64K entries = 256 entries/lock)
    int lock_idx = idx % 256;
    pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]);

    // Check for collision (should be rare with 64K entries)
    if (g_mf2_page_registry.pages[idx] != NULL) {
        // Collision detected - this is a problem!
        // For MVP, we'll just log and overwrite (TODO: handle collisions properly)
        HAKMEM_LOG("[MF2] WARNING: Page registry collision at index %zu\n", idx);
    }

    // Register the page
    g_mf2_page_registry.pages[idx] = page;

    // Update counters
    atomic_fetch_add_explicit(&g_mf2_page_registry.total_pages, 1, memory_order_relaxed);
    atomic_fetch_add_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed);

    pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]);
}

// Unregister a page from the global registry (called when returning page to OS)
__attribute__((unused)) static void mf2_unregister_page(MidPage* page) {
    if (!page) return;

    size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
    int lock_idx = idx % 256;

    pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]);

    if (g_mf2_page_registry.pages[idx] == page) {
        g_mf2_page_registry.pages[idx] = NULL;
        atomic_fetch_sub_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed);
    }

    pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]);
}

// Allocate and initialize a new 64KB page for given size class
static MidPage* mf2_alloc_new_page(int class_idx) {
    if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return NULL;

    // Get user size class (2KB, 4KB, 8KB, 16KB, 32KB)
    size_t user_size = g_class_sizes[class_idx];
    if (user_size == 0) return NULL;   // Dynamic class disabled

    // CRITICAL FIX: Each block needs HEADER_SIZE + user_size
    // The header stores metadata (AllocHeader), user_size is the usable space
    size_t block_size = HEADER_SIZE + user_size;

    // Step 1: Allocate 64KB page (aligned to 64KB boundary)
    // CRITICAL FIX #4: Must ensure 64KB alignment!
    // mmap() only guarantees 4KB alignment, breaking addr_to_page() lookup.
    // This caused 97% of frees to fail silently (fatal bug!)
    //
    // CRITICAL FIX: Use mmap() + alignment adjustment to avoid recursion!
    // Using wrapped posix_memalign with WRAP_L2=1 causes infinite recursion.

    // Allocate 2x size to allow alignment adjustment
    size_t alloc_size = POOL_PAGE_SIZE * 2;   // 128KB
    void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (raw == MAP_FAILED) {
        return NULL;   // OOM
    }

    // Find 64KB aligned address within allocation
    uintptr_t addr = (uintptr_t)raw;
    uintptr_t aligned = (addr + 0xFFFF) & ~0xFFFFULL;   // Round up to 64KB boundary
    void* page_base = (void*)aligned;

    // Free unused prefix (if any)
    size_t prefix_size = aligned - addr;
    if (prefix_size > 0) {
        munmap(raw, prefix_size);
    }

    // Free unused suffix
    size_t suffix_offset = prefix_size + POOL_PAGE_SIZE;
    if (suffix_offset < alloc_size) {
        munmap((char*)raw + suffix_offset, alloc_size - suffix_offset);
    }

    // DEBUG: Log first few allocations
    static _Atomic int mmap_count = 0;
    int mc = atomic_fetch_add_explicit(&mmap_count, 1, memory_order_relaxed);
    if (mc < 5) {
        MF2_DEBUG_LOG("MMAP_ALLOC %d: raw=%p, aligned=%p, prefix=%zu, suffix=%zu",
                      mc, raw, page_base, prefix_size, alloc_size - suffix_offset);
    }

    // ALIGNMENT VERIFICATION (Step 1)
    if (((uintptr_t)page_base & 0xFFFF) != 0) {
        MF2_ERROR_LOG("ALIGNMENT BUG: Page %p not 64KB aligned! (offset=%zu)",
                      page_base, ((uintptr_t)page_base & 0xFFFF));
    }

    // Zero-fill the page. Anonymous mmap memory is already zeroed by the kernel,
    // so this is defensive; it adds ~15μs per page.
    memset(page_base, 0, POOL_PAGE_SIZE);

    // Step 2: Allocate MidPage descriptor
    MidPage* page = (MidPage*)hkm_libc_calloc(1, sizeof(MidPage));
    if (!page) {
        // CRITICAL FIX: Use munmap for mmap-allocated memory
        munmap(page_base, POOL_PAGE_SIZE);
        return NULL;
    }

    // Step 3: Initialize page descriptor
    page->base = page_base;
    page->class_idx = (uint8_t)class_idx;
    page->flags = 0;
    page->owner_tid = pthread_self();
    page->owner_tp = mf2_thread_pages_get();   // Store owner's ThreadPages for pending queue
    page->last_transfer_time = 0;              // No transfer yet (lease mechanism)

    // Step 4: Build freelist chain (walk through page and link blocks)
    // Calculate how many blocks fit in 64KB page (including header overhead)
    size_t usable_size = POOL_PAGE_SIZE;
    size_t num_blocks = usable_size / block_size;

    page->capacity = (uint16_t)num_blocks;
    page->free_count = (uint16_t)num_blocks;

    // Build linked list of free blocks
    PoolBlock* freelist_head = NULL;
    PoolBlock* freelist_tail = NULL;

    for (size_t i = 0; i < num_blocks; i++) {
        char* block_addr = (char*)page_base + (i * block_size);
        PoolBlock* block = (PoolBlock*)block_addr;

        block->next = NULL;

        if (freelist_head == NULL) {
            freelist_head = block;
            freelist_tail = block;
        } else {
            freelist_tail->next = block;
            freelist_tail = block;
        }
    }

    page->freelist = freelist_head;

    // Step 5: Initialize remote stack (for cross-thread frees)
    atomic_store(&page->remote_head, (uintptr_t)0);
    atomic_store(&page->remote_count, 0);

    // Step 6: Initialize lifecycle counters
    atomic_store(&page->in_use, 0);   // No blocks allocated yet
    atomic_store(&page->pending_dn, 0);

    // Step 7: Initialize linkage
    page->next_page = NULL;
    page->prev_page = NULL;

    // Initialize pending queue fields
    atomic_store_explicit(&page->in_remote_pending, false, memory_order_relaxed);
    page->next_pending = NULL;

    // Step 8: Register page in global registry
    mf2_register_page(page);

    return page;
}
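
// Worked example (illustrative, not compiled): the 2x-mmap-and-trim trick in
// mf2_alloc_new_page(), with a hypothetical raw mapping address, showing what
// is kept and what is unmapped.
#if 0
static void example_alignment_trim(void) {
    uintptr_t raw     = 0x7f3a1234A000ULL;                  // hypothetical 4KB-aligned mmap result
    uintptr_t aligned = (raw + 0xFFFF) & ~0xFFFFULL;        // 0x7f3a12350000: next 64KB boundary
    size_t prefix = aligned - raw;                          // 0x6000 bytes unmapped before the page
    size_t suffix = (POOL_PAGE_SIZE * 2) - prefix - POOL_PAGE_SIZE; // remainder unmapped after it
    (void)prefix; (void)suffix;                             // kept region is [aligned, aligned + 64KB)
}
#endif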

// --- MF2 Allocation & Free Operations ---

// Forward declarations
static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page);

// Drain remote frees (cross-thread) into page's local freelist
// Called by owner thread when local freelist is empty
static int mf2_drain_remote_frees(MidPage* page) {
    if (!page) return 0;

    atomic_fetch_add(&g_mf2_drain_attempts, 1);

    // Check if there are any remote frees (FIX #6: use seq_cst to ensure total ordering - DEBUG)
    unsigned int remote_count = atomic_load_explicit(&page->remote_count, memory_order_seq_cst);
    if (remote_count == 0) {
        return 0;   // Nothing to drain
    }

    // Atomically swap remote stack head with NULL (lock-free pop all)
    uintptr_t head = atomic_exchange_explicit(&page->remote_head, (uintptr_t)0,
                                              memory_order_acq_rel);
    if (!head) {
        atomic_store_explicit(&page->remote_count, 0, memory_order_release);
        return 0;   // Race: someone else drained it
    }

    // Reset remote count (FIX #6: use release for future drain checks to see)
    atomic_store_explicit(&page->remote_count, 0, memory_order_release);

    // Walk the remote stack and count blocks
    int drained = 0;
    PoolBlock* cur = (PoolBlock*)head;
    PoolBlock* tail = NULL;

    while (cur) {
        drained++;
        tail = cur;
        cur = cur->next;
    }

    // Append remote stack to local freelist (splice in front for simplicity)
    if (tail) {
        tail->next = page->freelist;
        page->freelist = (PoolBlock*)head;
        page->free_count += drained;
    }

    atomic_fetch_add(&g_mf2_drain_count, 1);
    atomic_fetch_add(&g_mf2_drain_blocks, drained);

    // CRITICAL FIX: Check if new remotes arrived DURING drain
    // If so, re-enqueue to owner's pending queue (avoid losing remotes!)
    unsigned int post_drain_count = atomic_load_explicit(&page->remote_count, memory_order_acquire);
    if (post_drain_count >= 1 && page->owner_tp) {   // Use same threshold as initial enqueue
        // New remotes arrived during drain, re-enqueue for next round
        // Note: This is safe because flag was cleared earlier
        mf2_enqueue_pending(page->owner_tp, page);
    }

    return drained;
}
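
// Illustrative sketch (not compiled): the producer half of the MPSC stack that
// mf2_drain_remote_frees() consumes. The real cross-thread free path (which
// also decides when to enqueue the page into the owner's pending queue) lives
// later in this file; this shows only the core push.
#if 0
static void example_remote_free_push(MidPage* page, PoolBlock* b) {
    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire);
        b->next = (PoolBlock*)old_head;
    } while (!atomic_compare_exchange_weak_explicit(&page->remote_head, &old_head, (uintptr_t)b,
                                                    memory_order_release, memory_order_relaxed));
    atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_release);
}
#endif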

// ===========================================================================
// Pending Queue Operations (MPSC Lock-Free Stack)
// ===========================================================================

// Enqueue page to owner's pending queue (called by remote threads)
// MPSC: Multiple producers (remote free threads), single consumer (owner)
static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page) {
    if (!owner_tp || !page) return;

    // Already in pending? Skip (avoid duplicate enqueue)
    _Bool was_pending = atomic_exchange_explicit(&page->in_remote_pending, true, memory_order_acq_rel);
    if (was_pending) {
        return;   // Already enqueued, nothing to do
    }

    atomic_fetch_add(&g_mf2_pending_enqueued, 1);

    // Push to owner's pending stack (Treiber stack algorithm)
    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&owner_tp->pages_remote_pending[page->class_idx], memory_order_relaxed);
        page->next_pending = (MidPage*)old_head;
    } while (!atomic_compare_exchange_weak_explicit(
        &owner_tp->pages_remote_pending[page->class_idx],
        &old_head, (uintptr_t)page,
        memory_order_release,   // Publish page
        memory_order_relaxed));

    // 0→1 detection: Increment adoptable count for this class
    // This enables O(1) early return in try_adopt (if count==0, no scan needed)
    if (old_head == 0) {
        atomic_fetch_add_explicit(&g_adoptable_count[page->class_idx], 1, memory_order_relaxed);
    }
}

// Dequeue one page from pending queue (called by owner thread or adopter)
// Uses CAS for correctness (multi-consumer in adoption path)
static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) {
    if (!tp) return NULL;

    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_acquire);
        if (old_head == 0) {
            return NULL;   // Queue empty
        }
        MidPage* page = (MidPage*)old_head;

        // CAS to pop head
        if (atomic_compare_exchange_weak_explicit(
                &tp->pages_remote_pending[class_idx],
                &old_head, (uintptr_t)page->next_pending,
                memory_order_acq_rel, memory_order_relaxed)) {
            // Successfully dequeued
            MidPage* next = page->next_pending;
            page->next_pending = NULL;   // Clear link

            // If queue became empty (next==NULL), decrement adoptable count
            // This enables O(1) early return in try_adopt when all queues empty
            if (next == NULL) {
                atomic_fetch_sub_explicit(&g_adoptable_count[class_idx], 1, memory_order_relaxed);
            }

            return page;
        }
    } while (1);
}

// ===========================================================================
// End of Pending Queue Operations
// ===========================================================================

// Forward declarations
static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id);

// ===========================================================================
// Helper Functions (Clean & Modular)
// ===========================================================================

// Helper: Make page active (move old active to full_pages)
static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) {
    if (!tp || !page) return;

    // Move old active page to full_pages (if any)
    if (tp->active_page[class_idx]) {
        MidPage* old_active = tp->active_page[class_idx];
        old_active->next_page = tp->full_pages[class_idx];
        tp->full_pages[class_idx] = old_active;
    }

    // Set new page as active
    tp->active_page[class_idx] = page;
    page->next_page = NULL;
}

// Helper: Drain page and add to partial list (LIFO for cache locality)
// Returns true if page has free blocks after drain
static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) {
    if (!tp || !page) return false;

    // Drain remote frees
    int drained = mf2_drain_remote_frees(page);

    // If page has freelist after drain, add to partial list (LIFO)
    if (page->freelist) {
        atomic_fetch_add(&g_mf2_page_reuse_count, 1);
        page->next_page = tp->partial_pages[class_idx];
        tp->partial_pages[class_idx] = page;
        return true;
    }

    // No freelist, return to full_pages
    page->next_page = tp->full_pages[class_idx];
    tp->full_pages[class_idx] = page;
    return false;
}

// Helper: Drain page and activate if successful (Direct Handoff - backward compat)
// Returns true if page was activated
static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) {
    if (!tp || !page) return false;

    // Drain remote frees
    int drained = mf2_drain_remote_frees(page);

    // If page has freelist after drain, make it active immediately
    if (page->freelist) {
        atomic_fetch_add(&g_mf2_page_reuse_count, 1);
        mf2_make_page_active(tp, class_idx, page);
        return true;
    }

    // No freelist, return to full_pages
    page->next_page = tp->full_pages[class_idx];
    tp->full_pages[class_idx] = page;
    return false;
}

// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1)
// Returns true if a page was successfully drained and activated
static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) {
    if (!tp) return false;

    // Budget: Process up to N pages to avoid blocking
    for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) {
        MidPage* pending_page = mf2_dequeue_pending(tp, class_idx);
        if (!pending_page) break;   // Queue empty

        atomic_fetch_add(&g_mf2_pending_drained, 1);

        // Clear pending flag (no longer in queue)
        atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release);

        // DIRECT HANDOFF: Drain and activate if successful
        if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) {
            return true;   // Success! Page is now active
        }
        // No freelist after drain, page returned to full_pages by helper
    }
    return false;   // No pages available for reuse
}

// Helper: Try to drain remotes from active page (must-reuse gate part 2)
// Returns true if active page has freelist after drain
static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) {
    if (!tp) return false;

    MidPage* page = tp->active_page[class_idx];
    if (!page) return false;

    atomic_fetch_add(&g_mf2_slow_checked_drain, 1);
    unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst);

    if (remote_cnt > 0) {
        atomic_fetch_add(&g_mf2_slow_found_remote, 1);
        int drained = mf2_drain_remote_frees(page);
        if (drained > 0 && page->freelist) {
            atomic_fetch_add(&g_mf2_drain_success, 1);
            return true;   // Success! Active page now has freelist
        }
    }
    return false;   // No remotes or drain failed
}

// Helper: Allocate new page and make it active
// Returns the newly allocated page (or NULL on OOM)
static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) {
    if (!tp) return NULL;

    atomic_fetch_add(&g_mf2_new_page_count, 1);

    // DEBUG: Log why we're allocating new page (first N samples)
    static _Atomic int new_page_samples = 0;
    int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed);
    if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) {
        // Count adoptable pages across all threads
        int total_adoptable = 0;
        for (int i = 0; i < POOL_NUM_CLASSES; i++) {
            total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed);
        }
        MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p",
                      sample_idx, class_idx,
                      (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed),
                      total_adoptable,
                      tp->active_page[class_idx],
                      tp->full_pages[class_idx]);
    }

    MidPage* page = mf2_alloc_new_page(class_idx);
    if (!page) {
        return NULL;   // OOM
    }

    // Move current active page to full list (if any)
    if (tp->active_page[class_idx]) {
        MidPage* old_page = tp->active_page[class_idx];
        old_page->next_page = tp->full_pages[class_idx];
        tp->full_pages[class_idx] = old_page;
    }

    // Set new page as active
    tp->active_page[class_idx] = page;
    tp->page_count[class_idx]++;

    return page;
}

// ===========================================================================
// End of Helper Functions
// ===========================================================================
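
// Illustrative sketch (not compiled): the order in which the helpers above and
// the adoption path below are intended to be tried, as described by the
// "must-reuse gate" comments. The real mf2_alloc_slow() is defined later in
// this file; the helper name here is hypothetical.
#if 0
static MidPage* example_pick_page(MF2_ThreadPages* tp, int class_idx) {
    if (mf2_try_reuse_own_pending(tp, class_idx))    return tp->active_page[class_idx]; // 1. own pending queue
    if (mf2_try_drain_active_remotes(tp, class_idx)) return tp->active_page[class_idx]; // 2. drain active page
    if (mf2_try_adopt_pending(tp, class_idx))        return tp->active_page[class_idx]; // 3. adopt from idle peers
    return mf2_alloc_and_activate_new_page(tp, class_idx);                              // 4. fresh 64KB page
}
#endif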
|
|||
|
|
// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue.
// Returns true if a page was successfully adopted and activated.
// Called from alloc_slow when the allocating thread needs memory.
static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) {
    if (!me) return false;

    // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating)
    // Avoids scanning empty queues (major performance win!)
    int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed);
    if (adoptable == 0) return false; // All queues empty, no scan needed

    // Get global thread registry
    int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire);
    if (num_tp == 0) return false;

    // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES)
    // Prevents excessive scanning overhead (2-8 threads is usually enough)
    int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues;

    // Round-robin scan (limited number of threads, not ALL!)
    static _Atomic uint64_t adopt_counter = 0;
    uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed);

    for (int i = 0; i < scan_limit; i++) {
        int tp_idx = (start_idx + i) % num_tp;
        MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit(
            (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire);

        if (!other_tp) continue;

        // Route P: Idle Detection - only adopt from idle owners.
        // Check whether the owner is still actively allocating (threshold configurable via env var).
        uint64_t now_tsc = mf2_rdtsc();
        uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed);
        uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US;

        if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) {
            continue; // Owner still active, skip adoption
        }

        // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!)
        // Only one thread scans each queue at a time → eliminates CAS contention
        if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) {
            continue; // Another thread is already scanning this queue, skip
        }

        // Try to dequeue a pending page from this thread
        MidPage* page = mf2_dequeue_pending(other_tp, class_idx);
        if (!page) {
            // Queue empty, release claim and try next thread
            atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
            continue;
        }

        // Clear pending flag (no longer in queue)
        atomic_store_explicit(&page->in_remote_pending, false, memory_order_release);

        // Check lease: has enough time passed since the last transfer? (configurable via HAKMEM_MF2_LEASE_MS)
        // 0ms = disabled (no lease check), >0 = lease period in milliseconds
        uint64_t now = mf2_rdtsc();
        uint64_t last_transfer = page->last_transfer_time;
        if (g_mf2_lease_ms > 0 && last_transfer != 0) {
            // Calculate lease cycles from ms (approx 3GHz CPU)
            uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL);
            if ((now - last_transfer) < lease_cycles) {
                // Lease still active, return page to full_pages (don't thrash ownership)
                page->next_page = other_tp->full_pages[class_idx];
                other_tp->full_pages[class_idx] = page;
                // Release claim before continuing
                atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
                continue; // Try next thread
            }
        }

        // Try to transfer ownership
        pthread_t old_owner = page->owner_tid;
        pthread_t new_owner = pthread_self();
        (void)old_owner; // kept for a future CAS-based transfer; unused today

        // Note: pthread_t may not be atomic-compatible on all platforms.
        // For now, we use a simple write (ownership transfer is rare).
        // TODO: If thrashing is observed, add atomic CAS with serialization.
        page->owner_tid = new_owner;
        page->owner_tp = me;
        page->last_transfer_time = now;

        // DEBUG: Log drain state
        static _Atomic int adopt_samples = 0;
        int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed);
        unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed);
        unsigned int pre_free = page->free_count;
        PoolBlock* pre_freelist = page->freelist;

        // Drain remote frees
        int drained = mf2_drain_remote_frees(page);

        // DEBUG: Log result (first 10 samples)
        if (sample_idx < 10) {
            MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p",
                          sample_idx, class_idx, pre_remote, drained,
                          pre_free, page->free_count, pre_freelist, page->freelist);
        }

        // Make the adopted page ACTIVE immediately (not partial!)
        // Adoption needs immediate activation for the caller's mf2_alloc_fast().
        // The partial list is only for own pending queue drains.
        if (page->freelist) {
            atomic_fetch_add(&g_mf2_page_reuse_count, 1);
            atomic_fetch_add(&g_mf2_pending_drained, 1);
            atomic_fetch_add(&g_mf2_drain_success, 1);

            // Make it active (move old active to full_pages)
            mf2_make_page_active(me, class_idx, page);

            // Release claim before returning SUCCESS
            atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
            return true; // SUCCESS! Page adopted and activated
        }

        // No freelist after drain, return it to MY full_pages (I'm the new owner!)
        page->next_page = me->full_pages[class_idx];
        me->full_pages[class_idx] = page;
        // Release claim before continuing the search
        atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release);
        // Continue searching for a better page
    }

    return false; // No adoptable pages found
}

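// ---------------------------------------------------------------------------
// Illustration (not part of the allocator): how the idle-detection gate above
// turns a microsecond budget into a TSC comparison. A minimal sketch, assuming
// a fixed cycles-per-microsecond constant similar to MF2_TSC_CYCLES_PER_US
// (the real constant is defined elsewhere in this codebase). The example_*
// names and the HAKMEM_POOL_EXAMPLES guard are hypothetical and normally
// undefined, so this block is not compiled into the allocator.
// ---------------------------------------------------------------------------
#ifdef HAKMEM_POOL_EXAMPLES
#include <stdint.h>
#include <stdbool.h>

// Returns true when the owner has not allocated for at least idle_us
// microseconds, mirroring the (now_tsc - owner_last_alloc) check above.
static bool example_owner_is_idle(uint64_t now_tsc, uint64_t last_alloc_tsc,
                                  int idle_us) {
    const uint64_t cycles_per_us = 3000; // assumption: ~3 GHz TSC
    uint64_t threshold = (uint64_t)idle_us * cycles_per_us;
    return (now_tsc - last_alloc_tsc) >= threshold;
}
// With idle_us = 150 (the default mentioned in mf2_alloc_slow), the gate opens
// once roughly 450,000 TSC cycles have passed since the owner's last allocation.
#endif // HAKMEM_POOL_EXAMPLES
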
// Fast allocation path (owner thread, NO LOCK!)
static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) {
    // Get thread-local page lists
    MF2_ThreadPages* tp = mf2_thread_pages_get();
    if (!tp) return NULL;

    // Get active page for this class
    MidPage* page = tp->active_page[class_idx];
    if (!page) {
        // No active page, go to slow path
        return mf2_alloc_slow(class_idx, size, site_id);
    }

    // FAST PATH: Pop from page-local freelist (NO LOCK!)
    if (page->freelist) {
        atomic_fetch_add(&g_mf2_alloc_fast_hit, 1);

        // Route P: Update activity tracking for idle detection
        atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed);

        PoolBlock* block = page->freelist;
        page->freelist = block->next;
        page->free_count--;

        // Increment in-use count (atomic for cross-thread visibility)
        atomic_fetch_add_explicit(&page->in_use, 1, memory_order_relaxed);

        // Return user pointer (skip header)
        return (char*)block + HEADER_SIZE;
    }

    // Local freelist empty, go to slow path
    return mf2_alloc_slow(class_idx, size, site_id);
}

// Slow allocation path (drain remote or allocate new page)
static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id) {
    (void)site_id; // Unused for now

    atomic_fetch_add(&g_mf2_alloc_slow_hit, 1);

    // Get thread-local page lists
    MF2_ThreadPages* tp = mf2_thread_pages_get();
    if (!tp) return NULL;

    // ===========================================================================
    // Allocation Strategy (Must-Reuse Order)
    // ===========================================================================
    // 1. MUST-REUSE GATE (Part 1): Drain own pending queue
    //    - Process up to 4 pages to avoid blocking
    //    - Direct handoff: activate the first successful drain immediately
    if (mf2_try_reuse_own_pending(tp, class_idx)) {
        return mf2_alloc_fast(class_idx, size, site_id);
    }

    // 2. MUST-REUSE GATE (Part 2): Drain active page remotes
    //    - Check if the current active page has remote frees
    //    - Drain and retry allocation if successful
    if (mf2_try_drain_active_remotes(tp, class_idx)) {
        return mf2_alloc_fast(class_idx, size, site_id);
    }

    // HISTORICAL NOTE: full_pages scan removed.
    // Old approach: scan full_pages looking for pages with remotes.
    // Problem: drained pages were consumed before the owner could scan them.
    // New approach: Direct Handoff immediately activates drained pages.
    // Result: the full_pages scan always found 0 pages (100% waste).
    //
    // Benchmark evidence (before removal):
    //   - Full scan checked: 1,879,484 pages
    //   - Full scan found: 0 pages (0% success rate!)

    // 3. Consumer-Driven Adoption (Route P with idle detection)
    //    - Only adopt from idle owners (no allocation in >150µs)
    //    - Prevents "adoption stealing" from active owners
    if (mf2_try_adopt_pending(tp, class_idx)) {
        return mf2_alloc_fast(class_idx, size, site_id);
    }

    // 4. MUST-REUSE GATE (Final): Allocate a new page (last resort)
    //    - Only reached after exhausting all reuse opportunities
    //    - Order: pending queue → active drain → adoption → NEW
    MidPage* page = mf2_alloc_and_activate_new_page(tp, class_idx);
    if (!page) {
        return NULL; // OOM
    }

    // Retry allocation from the new page
    return mf2_alloc_fast(class_idx, size, site_id);
}

// Forward declaration of slow free path
static void mf2_free_slow(MidPage* page, void* ptr);

// Strategy A: Global Round-Robin Drain (Cross-Thread Pending Queue)

// Fast free path (owner thread, NO LOCK!)
static inline void mf2_free_fast(MidPage* page, void* ptr) {
    if (!page || !ptr) return;

    atomic_fetch_add(&g_mf2_free_owner_count, 1);

    // Get block pointer (rewind to header)
    PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE);

    // FAST PATH: Push to page-local freelist (NO LOCK!)
    block->next = page->freelist;
    page->freelist = block;
    page->free_count++;

    // Decrement in-use count (atomic for cross-thread visibility)
    int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release);

    // Check if the page is now empty (all blocks free)
    if (old_in_use == 1 && page->free_count == page->capacity) {
        // Memory efficiency: Return empty pages to the OS via MADV_DONTNEED.
        // Keeps VA mapped (no munmap), but releases physical memory.
        hak_batch_add_page(page->base, POOL_PAGE_SIZE);
    }
}

// Slow free path (cross-thread free to remote stack)
static void mf2_free_slow(MidPage* page, void* ptr) {
    if (!page || !ptr) return;

    atomic_fetch_add(&g_mf2_free_remote_count, 1);

    // Get block pointer
    PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE);

    // Push to the page's remote stack (lock-free MPSC)
    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire);
        block->next = (PoolBlock*)old_head;
    } while (!atomic_compare_exchange_weak_explicit(
        &page->remote_head, &old_head, (uintptr_t)block,
        memory_order_release, memory_order_relaxed));

    // Increment remote count and detect the threshold for enqueueing
    unsigned int old_count = atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_seq_cst);

    // CRITICAL FIX: Use threshold-based enqueueing instead of the 0→1 edge.
    // Problem: 0→1 causes ping-pong (drain 1 block → the next free triggers 0→1 again).
    // Solution: Only enqueue once remotes accumulate to a threshold (better batching).
    //
    // Threshold semantics (the value below is currently hardcoded; no
    // HAKMEM_MF2_ENQUEUE_THRESHOLD environment knob is read on this path yet):
    //   1 = immediate (0→1 edge, causes ping-pong)
    //   4 = balanced (batch 4 blocks before notifying the owner)
    //   8 = aggressive batching (higher latency, but better efficiency)
    //
    // We enqueue on the transition TO the threshold (old_count == threshold-1).
    static int g_enqueue_threshold = 1; // 1=immediate (0→1 edge), 2=batch-2, 4=batch-4
    if (old_count + 1 == (unsigned int)g_enqueue_threshold) {
        // Remote count just reached the threshold, notify the owner
        if (page->owner_tp) {
            mf2_enqueue_pending(page->owner_tp, page);
        }
    }

    // DEBUG: Sample first 10 remote frees - disabled for performance
    // static _Atomic int remote_free_samples = 0;
    // int sample = atomic_fetch_add_explicit(&remote_free_samples, 1, memory_order_relaxed);
    // if (sample < 10) {
    //     fprintf(stderr, "[REMOTE_FREE %d] ptr=%p → page=%p (base=%p), remote_count=%u (was %u), EDGE=%s\n",
    //             sample, ptr, page, page->base, old_count + 1, old_count, (old_count == 0) ? "YES" : "NO");
    // }

    // Decrement in-use count
    int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release);

    // Check if the page is now empty (FIX #6: acquire to see all remote frees)
    if (old_in_use == 1 && page->free_count + atomic_load_explicit(&page->remote_count, memory_order_acquire) >= page->capacity) {
        // Memory efficiency: Return empty pages to the OS via MADV_DONTNEED.
        // Keeps VA mapped (no munmap), but releases physical memory.
        hak_batch_add_page(page->base, POOL_PAGE_SIZE);
    }
}

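// ---------------------------------------------------------------------------
// Illustration (not compiled): the lock-free MPSC pattern used by
// mf2_free_slow() above (many remote producers push) and by the owner-side
// drain (a single consumer atomically exchanges the head to claim the whole
// stack at once). A minimal, self-contained sketch under the hypothetical
// HAKMEM_POOL_EXAMPLES guard; ExampleNode and the example_* helpers are
// stand-ins for PoolBlock and the real drain routines, not this allocator's
// API.
// ---------------------------------------------------------------------------
#ifdef HAKMEM_POOL_EXAMPLES
#include <stdatomic.h>
#include <stddef.h>

typedef struct ExampleNode { struct ExampleNode* next; } ExampleNode;

// Producer side: push one node (any thread, no lock).
static void example_mpsc_push(atomic_uintptr_t* head, ExampleNode* n) {
    uintptr_t old = atomic_load_explicit(head, memory_order_acquire);
    do {
        n->next = (ExampleNode*)old;
    } while (!atomic_compare_exchange_weak_explicit(
                 head, &old, (uintptr_t)n,
                 memory_order_release, memory_order_relaxed));
}

// Consumer side: claim the entire stack in one exchange, then walk it.
static size_t example_mpsc_drain(atomic_uintptr_t* head) {
    uintptr_t h = atomic_exchange_explicit(head, (uintptr_t)0,
                                           memory_order_acq_rel);
    size_t drained = 0;
    for (ExampleNode* n = (ExampleNode*)h; n != NULL; n = n->next) drained++;
    return drained;
}
#endif // HAKMEM_POOL_EXAMPLES
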
// Top-level free dispatcher
static void mf2_free(void* ptr) {
    if (!ptr) return;

    // O(1) page lookup (mimalloc's magic!)
    MidPage* page = mf2_addr_to_page(ptr);
    if (!page) {
        // Not an MF2 page (shouldn't happen if MF2 is enabled properly)
        return;
    }

    // Check whether we're the owner (fast path)
    MF2_ThreadPages* tp = mf2_thread_pages_get();

    if (tp && page->owner_tid == tp->my_tid) {
        // Fast: owner thread, push to local freelist (NO LOCK!)
        mf2_free_fast(page, ptr);
    } else {
        // Slow: cross-thread free, push to remote stack (lock-free)
        mf2_free_slow(page, ptr);
    }
}

// ===========================================================================
// Global pool state (legacy shared path; guarded by per-(class, shard) locks)
static struct {
    PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS];

    // Locks: per (class, shard) freelist to allow concurrent operations
    PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS];

    // Non-empty bitmap (O(1) empty class skip)
    // Bit i = 1 if freelist[class][shard] is non-empty
    // Use atomics to avoid class-wide locks
    atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES]; // 1 bit per shard

    // Remote-free MPSC stacks per (class, shard): lock-free producers, drained under lock on alloc
    atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
    atomic_uint remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS];

    // Statistics
    uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64)));
    uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64)));
    uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64)));
    uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64)));
    uint64_t total_bytes_allocated __attribute__((aligned(64)));
    uint64_t total_pages_allocated __attribute__((aligned(64)));

    // Per-class page accounting (for Soft CAP guidance)
    uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64)));

    // ACE: per-class bundle factor for refill (1..4) + last snapshot
    int bundle_factor[POOL_NUM_CLASSES];
    uint64_t last_hits[POOL_NUM_CLASSES];
    uint64_t last_misses[POOL_NUM_CLASSES];

    int initialized;
    int tls_free_enabled; // env: HAKMEM_POOL_TLS_FREE (default: 1)

    // Extra metrics (for learner logging): all relaxed atomics
    atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64)));
    atomic_uint_fast64_t trylock_success __attribute__((aligned(64)));
    atomic_uint_fast64_t ring_underflow __attribute__((aligned(64)));
} g_pool;

static int g_wrap_l2_enabled = 0;    // env: HAKMEM_WRAP_L2=1 to allow in wrappers
static int g_shard_mix_enabled = 0;  // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing
static int g_tls_ring_enabled = 1;   // env: HAKMEM_POOL_TLS_RING=1 to enable TLS ring
static int g_trylock_probes = 3;     // env: HAKMEM_TRYLOCK_PROBES (1..8)
static int g_ring_return_div = 2;    // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third)
static int g_tls_lo_max = 256;       // env: HAKMEM_TLS_LO_MAX (LIFO size cap)
int g_hdr_light_enabled = 0;         // env: HAKMEM_HDR_LIGHT=1 (minimize extra fields), =2 (no header writes/validation)
static int g_pool_min_bundle = 2;    // env: HAKMEM_POOL_MIN_BUNDLE (default 2)
// Sampled counter updates to reduce hot-path stores: 1/2^k
static int g_count_sample_exp = 10;  // env: HAKMEM_POOL_COUNT_SAMPLE (0..16)
static __thread uint32_t t_pool_rng = 0x243f6a88u; // per-thread RNG for sampling

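// ---------------------------------------------------------------------------
// Illustration (not compiled): how the sampled counter updates work. The
// per-thread xorshift32 RNG (t_pool_rng) is advanced on the hot path and the
// hit/miss counters are bumped only when the low g_count_sample_exp bits are
// all zero, i.e. roughly once every 2^k events (1 in 1024 with the default
// k=10). A minimal sketch; the example_* names and the HAKMEM_POOL_EXAMPLES
// guard are hypothetical.
// ---------------------------------------------------------------------------
#ifdef HAKMEM_POOL_EXAMPLES
#include <stdint.h>

static uint32_t example_xorshift32(uint32_t x) {
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    return x;
}

// Returns 1 when this event should be counted (≈ 1 in 2^sample_exp).
static int example_should_count(uint32_t* rng_state, int sample_exp) {
    *rng_state = example_xorshift32(*rng_state);
    return (*rng_state & ((1u << sample_exp) - 1u)) == 0u;
}
#endif // HAKMEM_POOL_EXAMPLES
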
// Size class table (for O(1) lookup). Index 5/6 are Bridge classes for the 32-64KB gap.
// 7 classes including Bridge classes (40KB, 52KB) to fill the 32-64KB gap.
static size_t g_class_sizes[POOL_NUM_CLASSES] = {
    POOL_CLASS_2KB,   // 2 KB
    POOL_CLASS_4KB,   // 4 KB
    POOL_CLASS_8KB,   // 8 KB
    POOL_CLASS_16KB,  // 16 KB
    POOL_CLASS_32KB,  // 32 KB
    POOL_CLASS_40KB,  // 40 KB (Bridge class 0)
    POOL_CLASS_52KB   // 52 KB (Bridge class 1)
};

// Blocks per page (for each class)
__attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = {
    POOL_PAGE_SIZE / POOL_CLASS_2KB,   // 32 blocks (2KiB)
    POOL_PAGE_SIZE / POOL_CLASS_4KB,   // 16 blocks (4KiB)
    POOL_PAGE_SIZE / POOL_CLASS_8KB,   // 8 blocks (8KiB)
    POOL_PAGE_SIZE / POOL_CLASS_16KB,  // 4 blocks (16KiB)
    POOL_PAGE_SIZE / POOL_CLASS_32KB,  // 2 blocks (32KiB)
    POOL_PAGE_SIZE / POOL_CLASS_40KB,  // 1 block (40KiB Bridge)
    POOL_PAGE_SIZE / POOL_CLASS_52KB   // 1 block (52KiB Bridge)
};

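// ---------------------------------------------------------------------------
// Illustration (not compiled): the blocks-per-page arithmetic behind the table
// above, assuming the 64 KiB POOL_PAGE_SIZE described in the comments. Note
// that the runtime paths (alloc_tls_page, refill_freelist) divide the page by
// HEADER_SIZE + class size, so the effective per-class counts can be one block
// lower than the header-less figures in the table comments. A minimal sketch
// with the header size as a parameter; example_* and the guard are
// hypothetical.
// ---------------------------------------------------------------------------
#ifdef HAKMEM_POOL_EXAMPLES
#include <stddef.h>

static int example_blocks_per_page(size_t page_size, size_t header_size,
                                   size_t class_size) {
    size_t block = header_size + class_size;
    return (block == 0) ? 0 : (int)(page_size / block);
}
// e.g. example_blocks_per_page(64 * 1024, 64, 2 * 1024) == 31, one fewer than
// the header-less 32 blocks/page listed for the 2 KiB class (64 here is just a
// sample header size, not necessarily the real HEADER_SIZE).
#endif // HAKMEM_POOL_EXAMPLES
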
// ===========================================================================
// Helper Functions
// ===========================================================================

// Write minimal header for a Mid allocation (fast-return friendly)
static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) {
    // For Mid, prefer headerless operation when HDR_LIGHT>=1.
    // Debug or non-Mid callers can still write full headers elsewhere.
    if (g_hdr_light_enabled >= 1) return; // skip header on the alloc hot path
    hdr->magic = HAKMEM_MAGIC;
    hdr->method = ALLOC_METHOD_POOL;
    hdr->size = class_sz;
    if (!g_hdr_light_enabled) {
        hdr->alloc_site = site_id;
        hdr->class_bytes = 0;
        hdr->owner_tid = (uintptr_t)pthread_self();
    }
}

// Branchless LUT (lookup table) for O(1) class determination.
// Expanded to 53 entries for the Bridge classes (40KB, 52KB).
static const uint8_t SIZE_TO_CLASS[53] = {
    0,0,0,                              // 0-2KB   → Class 0
    1,1,                                // 3-4KB   → Class 1
    2,2,2,2,                            // 5-8KB   → Class 2
    3,3,3,3,3,3,3,3,                    // 9-16KB  → Class 3
    4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,    // 17-32KB → Class 4
    5,5,5,5,5,5,5,5,                    // 33-40KB → Class 5 (Bridge class 0)
    6,6,6,6,6,6,6,6,6,6,6,6             // 41-52KB → Class 6 (Bridge class 1)
};

// Get size class index from size (0-6, or -1 if out of range).
// Range check updated for the Bridge classes (0-52KB).
static inline int hak_pool_get_class_index(size_t size) {
    // Fast path: exact match against configured class sizes (covers Bridge classes).
    // Note: the size passed here should already be a rounded class size from ACE.
    for (int i = 0; i < POOL_NUM_CLASSES; i++) {
        size_t cs = g_class_sizes[i];
        if (cs != 0 && size == cs) return i;
    }
    // Fallback: map an arbitrary size to the nearest fixed class range via the LUT (legacy behavior)
    uint32_t kb = (uint32_t)((size + 1023) >> 10);   // Round up to KB units
    return (kb < 53) ? SIZE_TO_CLASS[kb] : -1;       // Expanded to 53KB for Bridge classes
}

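// ---------------------------------------------------------------------------
// Illustration (not compiled): how the KB-granular LUT fallback above maps a
// request size to a class. Sizes are rounded up to whole KiB and used as an
// index, so e.g. 3 KiB lands in class 1 (4 KiB) and 33 KiB lands in the 40 KiB
// bridge class. A minimal sketch that inlines the same rounding; example_*
// and the guard are hypothetical.
// ---------------------------------------------------------------------------
#ifdef HAKMEM_POOL_EXAMPLES
#include <stddef.h>
#include <stdint.h>

static int example_size_to_kb_index(size_t size) {
    // Same rounding as hak_pool_get_class_index(): ceil(size / 1024).
    uint32_t kb = (uint32_t)((size + 1023) >> 10);
    return (kb < 53) ? (int)kb : -1;   // -1: outside the 0..52 KiB LUT range
}
// example_size_to_kb_index(3 * 1024)  == 3  -> SIZE_TO_CLASS[3]  == 1 (4 KiB class)
// example_size_to_kb_index(33 * 1024) == 33 -> SIZE_TO_CLASS[33] == 5 (40 KiB bridge)
#endif // HAKMEM_POOL_EXAMPLES
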
// Get shard index from site_id (0-63)
int hak_pool_get_shard_index(uintptr_t site_id) {
    if (!g_shard_mix_enabled) {
        // Legacy: shift by 4 to reduce collisions (instruction alignment)
        return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1));
    }
    // SplitMix64-like mixer with a thread-id salt for better dispersion
    uint64_t x = (uint64_t)site_id;
    uint64_t tid = (uint64_t)(uintptr_t)pthread_self();
    x ^= (tid << 1);
    x += 0x9e3779b97f4a7c15ULL;
    x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
    x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
    x = (x ^ (x >> 31));
    return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1));
}

// Bitmap helpers (O(1) empty class detection)
static inline void set_nonempty_bit(int class_idx, int shard_idx) {
    // Set bit: freelist[class][shard] is non-empty (atomic OR)
    atomic_fetch_or(&g_pool.nonempty_mask[class_idx], (uint64_t)(1ULL << shard_idx));
}

static inline void clear_nonempty_bit(int class_idx, int shard_idx) {
    // Clear bit: freelist[class][shard] is empty (atomic AND)
    atomic_fetch_and(&g_pool.nonempty_mask[class_idx], ~(uint64_t)(1ULL << shard_idx));
}

static inline int is_shard_nonempty(int class_idx, int shard_idx) {
    // Check whether the shard has blocks (atomic load)
    uint64_t mask = atomic_load(&g_pool.nonempty_mask[class_idx]);
    return (mask & (1ULL << shard_idx)) != 0;
}

// Drain the remote-free MPSC stack into the freelist under the shard lock
static inline void drain_remote_locked(int class_idx, int shard_idx) {
    uintptr_t head = atomic_exchange_explicit(&g_pool.remote_head[class_idx][shard_idx], (uintptr_t)0, memory_order_acq_rel);
    unsigned drained = 0;
    while (head) {
        PoolBlock* b = (PoolBlock*)head;
        head = (uintptr_t)b->next; // next pointer stored in the first word
        b->next = g_pool.freelist[class_idx][shard_idx];
        g_pool.freelist[class_idx][shard_idx] = b;
        drained++;
    }
    if (drained) {
        atomic_fetch_sub_explicit(&g_pool.remote_count[class_idx][shard_idx], drained, memory_order_relaxed);
        if (g_pool.freelist[class_idx][shard_idx]) set_nonempty_bit(class_idx, shard_idx);
    }
}

// Choose a non-empty shard near `preferred` using the nonempty mask. If none, return preferred.
static inline int choose_nonempty_shard(int class_idx, int preferred) {
    uint64_t mask = atomic_load_explicit(&g_pool.nonempty_mask[class_idx], memory_order_acquire);
    if (!mask) return preferred;
    // Rotate so that `preferred` becomes bit 0.
    // Guard shift==0: shifting a 64-bit value by 64 bits is undefined behavior in C.
    int shift = preferred & 63;
    uint64_t rot = (shift == 0) ? mask : ((mask >> shift) | (mask << (64 - shift)));
    if (!rot) return preferred;
    int off = __builtin_ctzll(rot);
    return (preferred + off) & (POOL_NUM_SHARDS - 1);
}
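
// ---------------------------------------------------------------------------
// Illustration (not compiled): the bit-rotation trick in choose_nonempty_shard.
// Rotating the shard mask right by `preferred` puts the preferred shard at
// bit 0, so ctz() of the rotated mask is the distance (mod 64) to the nearest
// non-empty shard at or after the preferred one. A minimal sketch; example_*
// and the guard are hypothetical.
// ---------------------------------------------------------------------------
#ifdef HAKMEM_POOL_EXAMPLES
#include <stdint.h>

static int example_nearest_set_shard(uint64_t mask, int preferred) {
    if (mask == 0) return preferred;           // nothing available anywhere
    int shift = preferred & 63;
    uint64_t rot = (shift == 0)
                       ? mask
                       : ((mask >> shift) | (mask << (64 - shift)));
    return (preferred + __builtin_ctzll(rot)) & 63;
}
// mask = (1ULL<<5) | (1ULL<<40), preferred = 7  -> returns 40
// mask = (1ULL<<5) | (1ULL<<40), preferred = 41 -> wraps around and returns 5
#endif // HAKMEM_POOL_EXAMPLES
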
// Allocate a private page for the TLS active page and split it into a local list
static int alloc_tls_page(int class_idx, PoolTLSPage* ap) {
    size_t user_size = g_class_sizes[class_idx];
    size_t block_size = HEADER_SIZE + user_size;
    int blocks_per_page = POOL_PAGE_SIZE / block_size;
    if (blocks_per_page <= 0) return 0;
    void* page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page == MAP_FAILED) return 0; // mmap reports failure as MAP_FAILED, not NULL
    // Bump-run initialization (no per-block linking)
    ap->page = page;
    ap->bump = (char*)page;
    ap->end = (char*)page + POOL_PAGE_SIZE;
    ap->count = blocks_per_page;
    // Register the page with its owner (this thread) for owner-fast free detection
    mid_desc_register(page, class_idx, (uint64_t)(uintptr_t)pthread_self());
    g_pool.refills[class_idx]++;
    g_pool.total_pages_allocated++;
    g_pool.pages_by_class[class_idx]++;
    g_pool.total_bytes_allocated += POOL_PAGE_SIZE;
    return 1;
}

// Refill the TLS ring/LIFO from the active page without building links. Returns the number added.
static inline int refill_tls_from_active_page(int class_idx, PoolTLSRing* ring, PoolTLSBin* bin, PoolTLSPage* ap, int need) {
    if (!ap || !ap->page || ap->count <= 0 || ap->bump >= ap->end) return 0;
    size_t blk = HEADER_SIZE + g_class_sizes[class_idx];
    int moved = 0;
    int to_add = need;
    while (to_add > 0 && ap->bump < ap->end && ap->count > 0) {
        PoolBlock* b = (PoolBlock*)(void*)ap->bump;
        if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) {
            ring->items[ring->top++] = b;
        } else {
            b->next = bin->lo_head; bin->lo_head = b; bin->lo_count++;
        }
        ap->bump += blk;
        ap->count--;
        moved++;
        to_add--;
    }
    if (ap->bump >= ap->end || ap->count <= 0) {
        ap->page = NULL; ap->bump = ap->end; ap->count = 0;
    }
    return moved;
}

// ACE: adjust the per-class bundle factor based on windowed hits/misses
static inline void pool_update_bundle_factor(int class_idx) {
    // Compute deltas since the last snapshot
    uint64_t h = g_pool.hits[class_idx];
    uint64_t m = g_pool.misses[class_idx];
    uint64_t dh = h - g_pool.last_hits[class_idx];
    uint64_t dm = m - g_pool.last_misses[class_idx];
    uint64_t dt = dh + dm;
    if (dt < 256) return; // wait for the window to accumulate

    int bf = g_pool.bundle_factor[class_idx];
    if (bf <= 0) bf = 1;

    // If misses dominate (hit rate < 60% and misses exceed hits by a fixed margin) → bundle more;
    // if hits dominate (hit rate > 90%) → bundle less.
    if (dt > 0) {
        double hit_rate = (double)dh / (double)dt;
        if (hit_rate < 0.60 && dm > (dh + 16)) {
            if (bf < 4) bf++;
        } else if (hit_rate > 0.90 && dh > (dm + 32)) {
            if (bf > 1) bf--;
        }
    }

    g_pool.bundle_factor[class_idx] = bf;
    // Advance the snapshot
    g_pool.last_hits[class_idx] = h;
    g_pool.last_misses[class_idx] = m;
}

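// ---------------------------------------------------------------------------
// Illustration (not compiled): the ACE window decision above on concrete
// numbers. With a window of dh=120 hits and dm=200 misses (dt=320 >= 256),
// hit_rate = 0.375 < 0.60 and dm > dh + 16, so the bundle factor is bumped
// (up to the cap of 4); a window of dh=300, dm=10 gives hit_rate ≈ 0.97 and
// shrinks it back toward 1. A minimal sketch; example_* and the guard are
// hypothetical.
// ---------------------------------------------------------------------------
#ifdef HAKMEM_POOL_EXAMPLES
#include <stdint.h>

static int example_next_bundle_factor(int bf, uint64_t dh, uint64_t dm) {
    uint64_t dt = dh + dm;
    if (dt < 256) return bf;                       // window not full yet
    double hit_rate = (double)dh / (double)dt;
    if (hit_rate < 0.60 && dm > (dh + 16)) {
        if (bf < 4) bf++;                          // misses dominate: bundle more
    } else if (hit_rate > 0.90 && dh > (dm + 32)) {
        if (bf > 1) bf--;                          // hits dominate: bundle less
    }
    return bf;
}
#endif // HAKMEM_POOL_EXAMPLES
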
// Refill the freelist by allocating a new page (64KiB)
// Args: class_idx - size class index (0-6)
//       shard_idx - shard index (0-63)
// Returns: 1 on success, 0 on failure
//
// Each block now includes AllocHeader + user data
static int refill_freelist(int class_idx, int shard_idx) {
    if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return 0;
    if (shard_idx < 0 || shard_idx >= POOL_NUM_SHARDS) return 0;

    size_t user_size = g_class_sizes[class_idx];
    size_t block_size = HEADER_SIZE + user_size; // Header + user data

    // Calculate blocks per page (with header overhead)
    int blocks_per_page = POOL_PAGE_SIZE / block_size;
    if (blocks_per_page == 0) return 0; // Safety: class too large for a 64KiB page

    // Allocate the page via mmap (page-granular, avoids malloc overhead)
    void* page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page == MAP_FAILED) return 0; // mmap reports failure as MAP_FAILED, not NULL

    // Update the bundle factor based on windowed stats
    pool_update_bundle_factor(class_idx);
    int bundles = g_pool.bundle_factor[class_idx];
    if (bundles < 1) bundles = 1;
    if (bundles > 4) bundles = 4;

    // Soft CAP guidance: use FrozenPolicy mid_cap to modulate bundling.
    // Semantics: mid_cap[class] is a soft target (in pages). We do not trim yet.
    // If at/over cap → restrict bundling to 1; if far under cap → allow bundling up to the deficit (max 4).
    const FrozenPolicy* pol = hkm_policy_get();
    if (pol) {
        uint16_t cap = 0;
        if (class_idx < 5) cap = pol->mid_cap[class_idx];
        else if (class_idx == 5 && pol->mid_dyn1_bytes != 0) cap = pol->mid_cap_dyn1;
        else if (class_idx == 6 && pol->mid_dyn2_bytes != 0) cap = pol->mid_cap_dyn2;
        if (cap > 0) {
            uint64_t have = g_pool.pages_by_class[class_idx];
            if (have >= cap) {
                bundles = 1; // over cap: refill minimally
            } else {
                uint64_t deficit = (cap - have);
                if (deficit < (uint64_t)bundles) bundles = (int)deficit; // don't exceed the deficit
                if (bundles < 1) bundles = 1;
                if (bundles > 4) bundles = 4;
                // Ensure at least the minimum bundle under deficit for faster warm-up
                if (deficit >= (uint64_t)g_pool_min_bundle && bundles < g_pool_min_bundle) bundles = g_pool_min_bundle;
            }
        }
    }

    int pages_allocated_this_call = 0;
    for (int b = 0; b < bundles; b++) {
        // Split the page into blocks and link them into the freelist
        PoolBlock* freelist_head = NULL;

        for (int i = 0; i < blocks_per_page; i++) {
            void* raw_block = (char*)page + (i * block_size);
            // Prefetch the next block header to reduce cache misses on link
            __builtin_prefetch((char*)raw_block + block_size, 1, 1);
            // The freelist uses the raw pointer (header start). The header will be
            // constructed after pop in hak_pool_try_alloc.
            PoolBlock* block = (PoolBlock*)raw_block;
            block->next = freelist_head;
            freelist_head = block;
        }

        // Prepend to the existing freelist (if any)
        if (g_pool.freelist[class_idx][shard_idx]) {
            // Find the tail of the new list
            PoolBlock* tail = freelist_head;
            while (tail->next) {
                tail = tail->next;
            }
            tail->next = g_pool.freelist[class_idx][shard_idx];
        }

        g_pool.freelist[class_idx][shard_idx] = freelist_head;
        // Register this 64KiB page (shared owner)
        mid_desc_register(page, class_idx, 0);
        // Count the page we just carved before trying to map the next one,
        // so a failed follow-up mmap does not drop it from the accounting.
        pages_allocated_this_call++;

        // Next page if bundling
        if (b + 1 < bundles) {
            page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (page == MAP_FAILED) break;
        }
    }

    // Set the non-empty bit (the freelist now has blocks)
    set_nonempty_bit(class_idx, shard_idx);

    // Update statistics
    g_pool.refills[class_idx]++;
    g_pool.total_pages_allocated += pages_allocated_this_call;
    g_pool.pages_by_class[class_idx] += pages_allocated_this_call;
    g_pool.total_bytes_allocated += (uint64_t)pages_allocated_this_call * (uint64_t)POOL_PAGE_SIZE;

    return 1;
}

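// ---------------------------------------------------------------------------
// Illustration (not compiled): the Soft CAP gating that refill_freelist()
// applies to the bundle count. At or over the per-class page cap the refill
// shrinks to a single page; under the cap it is clamped to the remaining
// deficit (and raised to the configured minimum bundle when the deficit
// allows). A minimal sketch; example_* and the guard are hypothetical.
// ---------------------------------------------------------------------------
#ifdef HAKMEM_POOL_EXAMPLES
#include <stdint.h>

static int example_apply_soft_cap(int bundles, uint64_t cap, uint64_t have,
                                  int min_bundle) {
    if (cap == 0) return bundles;                  // no cap configured
    if (have >= cap) return 1;                     // over cap: refill minimally
    uint64_t deficit = cap - have;
    if (deficit < (uint64_t)bundles) bundles = (int)deficit;
    if (bundles < 1) bundles = 1;
    if (bundles > 4) bundles = 4;
    if (deficit >= (uint64_t)min_bundle && bundles < min_bundle)
        bundles = min_bundle;
    return bundles;
}
// example_apply_soft_cap(4, 64, 62, 2) == 2  (deficit clamps bundling to 2)
// example_apply_soft_cap(4, 64, 70, 2) == 1  (over cap: minimal refill)
#endif // HAKMEM_POOL_EXAMPLES
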
// ===========================================================================
// Public API
// ===========================================================================

// Thread-safe initialization using pthread_once
static pthread_once_t hak_pool_init_once_control = PTHREAD_ONCE_INIT;
static void hak_pool_init_impl(void) {
    // NOTE: Do NOT use memset() here! It would clobber 448 mutexes during concurrent init.
    // All fields are explicitly initialized below.
    // Configure dynamic Mid classes from FrozenPolicy (index 5/6)
    const FrozenPolicy* pol = hkm_policy_get();
    if (pol && pol->mid_dyn1_bytes >= POOL_MIN_SIZE && pol->mid_dyn1_bytes <= POOL_MAX_SIZE) {
        g_class_sizes[5] = pol->mid_dyn1_bytes;
    } else {
        g_class_sizes[5] = 0; // disabled
    }
    if (pol && pol->mid_dyn2_bytes >= POOL_MIN_SIZE && pol->mid_dyn2_bytes <= POOL_MAX_SIZE) {
        g_class_sizes[6] = pol->mid_dyn2_bytes;
    } else {
        g_class_sizes[6] = 0;
    }
    // Initialize all g_pool fields explicitly (no memset!)
    for (int c = 0; c < POOL_NUM_CLASSES; c++) {
        // Initialize freelists to NULL
        for (int s = 0; s < POOL_NUM_SHARDS; s++) {
            g_pool.freelist[c][s] = NULL;
        }

        // Initialize atomic variables and locks
        atomic_store(&g_pool.nonempty_mask[c], 0);
        for (int s = 0; s < POOL_NUM_SHARDS; s++) {
            pthread_mutex_init(&g_pool.freelist_locks[c][s].m, NULL);
            atomic_store(&g_pool.remote_head[c][s], (uintptr_t)0);
            atomic_store(&g_pool.remote_count[c][s], 0);
        }

        // Initialize per-class statistics
        g_pool.hits[c] = 0;
        g_pool.misses[c] = 0;
        g_pool.refills[c] = 0;
        g_pool.frees[c] = 0;
        g_pool.pages_by_class[c] = 0;

        // Initialize ACE variables
        g_pool.bundle_factor[c] = 1;
        g_pool.last_hits[c] = 0;
        g_pool.last_misses[c] = 0;
    }

    // Initialize global statistics
    g_pool.total_bytes_allocated = 0;
    g_pool.total_pages_allocated = 0;

    // Initialize atomic metrics
    atomic_store(&g_pool.trylock_attempts, 0);
    atomic_store(&g_pool.trylock_success, 0);
    atomic_store(&g_pool.ring_underflow, 0);
    const char* e_tls = getenv("HAKMEM_POOL_TLS_FREE");
    g_pool.tls_free_enabled = (e_tls == NULL) ? 1 : (atoi(e_tls) != 0);
    const char* e_wrap = getenv("HAKMEM_WRAP_L2");
    g_wrap_l2_enabled = (e_wrap && atoi(e_wrap) != 0) ? 1 : 0;
    const char* e_minb = getenv("HAKMEM_POOL_MIN_BUNDLE");
    if (e_minb) { int v = atoi(e_minb); if (v >= 1 && v <= 8) g_pool_min_bundle = v; }
    const char* e_mix = getenv("HAKMEM_SHARD_MIX");
    g_shard_mix_enabled = (e_mix && atoi(e_mix) != 0) ? 1 : 0;
    const char* e_ring = getenv("HAKMEM_POOL_TLS_RING");
    if (e_ring) g_tls_ring_enabled = (atoi(e_ring) != 0);
    const char* e_hdr = getenv("HAKMEM_HDR_LIGHT");
    if (e_hdr) g_hdr_light_enabled = atoi(e_hdr); // 0=full, 1=minimal, 2=skip header writes/validation
    const char* e_probe = getenv("HAKMEM_TRYLOCK_PROBES");
    if (e_probe) { int v = atoi(e_probe); if (v >= 1 && v <= 8) g_trylock_probes = v; }
    const char* e_div = getenv("HAKMEM_RING_RETURN_DIV");
    if (e_div) { int v = atoi(e_div); if (v >= 2 && v <= 4) g_ring_return_div = v; }
    const char* e_lo = getenv("HAKMEM_TLS_LO_MAX");
    if (e_lo) { int v = atoi(e_lo); if (v >= 32 && v <= 16384) g_tls_lo_max = v; }
    const char* e_cs = getenv("HAKMEM_POOL_COUNT_SAMPLE");
    if (e_cs) { int v = atoi(e_cs); if (v >= 0 && v <= 16) g_count_sample_exp = v; }
    const char* e_tc = getenv("HAKMEM_TC_ENABLE");
    if (e_tc) g_tc_enabled = (atoi(e_tc) != 0);
    const char* e_tcu = getenv("HAKMEM_TC_UNBOUNDED");
    if (e_tcu) g_tc_drain_unbounded = (atoi(e_tcu) != 0);
    const char* e_tcm = getenv("HAKMEM_TC_DRAIN_MAX");
    if (e_tcm) { int v = atoi(e_tcm); if (v >= 0 && v <= 65536) g_tc_drain_max = v; }
    const char* e_tct = getenv("HAKMEM_TC_DRAIN_TRIGGER");
    if (e_tct) { int v = atoi(e_tct); if (v >= 0 && v <= POOL_L2_RING_CAP) g_tc_drain_trigger = v; }

    // MF2: Per-Page Sharding
    const char* e_mf2 = getenv("HAKMEM_MF2_ENABLE");
    if (e_mf2 && atoi(e_mf2) != 0) {
        g_mf2_enabled = 1;
        mf2_page_registry_init();

        // MF2 tuning parameters
        const char* e_maxq = getenv("HAKMEM_MF2_MAX_QUEUES");
        if (e_maxq) {
            int v = atoi(e_maxq);
            if (v >= 1 && v <= 256) g_mf2_max_queues = v;
        }
        const char* e_lease = getenv("HAKMEM_MF2_LEASE_MS");
        if (e_lease) {
            int v = atoi(e_lease);
            if (v >= 0 && v <= 1000) g_mf2_lease_ms = v; // 0=disabled, max 1000ms
        }
        const char* e_idle = getenv("HAKMEM_MF2_IDLE_THRESHOLD_US");
        if (e_idle) {
            int v = atoi(e_idle);
            if (v >= 0 && v <= 10000) g_mf2_idle_threshold_us = v; // 0µs~10ms
        }

        HAKMEM_LOG("[Pool] MF2 Per-Page Sharding enabled\n");
        HAKMEM_LOG("[MF2] max_queues=%d, lease_ms=%d, idle_threshold_us=%d\n",
                   g_mf2_max_queues, g_mf2_lease_ms, g_mf2_idle_threshold_us);
    }

    g_pool.initialized = 1;

    HAKMEM_LOG("[Pool] Initialized (L2 Hybrid Pool)\n");
    if (g_class_sizes[5] != 0 || g_class_sizes[6] != 0) {
        HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB (dyn1=%zu B, dyn2=%zu B)\n",
                   g_class_sizes[5], g_class_sizes[6]);
    } else {
        HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB\n");
    }
    HAKMEM_LOG("[Pool] Page size: %d KB\n", POOL_PAGE_SIZE / 1024);
    HAKMEM_LOG("[Pool] Shards: %d (site-based)\n", POOL_NUM_SHARDS);
}

void hak_pool_init(void) {
    pthread_once(&hak_pool_init_once_control, hak_pool_init_impl);
}

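// ---------------------------------------------------------------------------
// Example (illustrative only): enabling the MF2 per-page sharding path with
// the tuning knobs parsed in hak_pool_init_impl() above. The values shown are
// sample settings, not recommended defaults, and "./your_benchmark" is just a
// placeholder for the program being run under this allocator.
//
//   HAKMEM_MF2_ENABLE=1 \
//   HAKMEM_MF2_MAX_QUEUES=8 \
//   HAKMEM_MF2_LEASE_MS=5 \
//   HAKMEM_MF2_IDLE_THRESHOLD_US=150 \
//   ./your_benchmark
//
// Ranges accepted by the parser above: MAX_QUEUES 1..256, LEASE_MS 0..1000
// (0 disables the lease check), IDLE_THRESHOLD_US 0..10000.
// ---------------------------------------------------------------------------
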
static void mf2_print_debug_stats(void) {
    if (!g_mf2_enabled) return;

    fprintf(stderr, "\n[MF2 DEBUG STATS]\n");
    fprintf(stderr, "Alloc fast hits:  %12lu\n", (unsigned long)atomic_load(&g_mf2_alloc_fast_hit));
    fprintf(stderr, "Alloc slow hits:  %12lu\n", (unsigned long)atomic_load(&g_mf2_alloc_slow_hit));
    fprintf(stderr, "Page reuses:      %12lu\n", (unsigned long)atomic_load(&g_mf2_page_reuse_count));
    fprintf(stderr, "New pages:        %12lu\n", (unsigned long)atomic_load(&g_mf2_new_page_count));
    fprintf(stderr, "Owner frees:      %12lu\n", (unsigned long)atomic_load(&g_mf2_free_owner_count));
    fprintf(stderr, "Remote frees:     %12lu\n", (unsigned long)atomic_load(&g_mf2_free_remote_count));
    fprintf(stderr, "Slow checked:     %12lu\n", (unsigned long)atomic_load(&g_mf2_slow_checked_drain));
    fprintf(stderr, "Slow found rem:   %12lu\n", (unsigned long)atomic_load(&g_mf2_slow_found_remote));
    fprintf(stderr, "Full scan chk:    %12lu\n", (unsigned long)atomic_load(&g_mf2_full_scan_checked));
    fprintf(stderr, "Full scan rem:    %12lu\n", (unsigned long)atomic_load(&g_mf2_full_scan_found_remote));
    fprintf(stderr, "Eager scan:       %12lu\n", (unsigned long)atomic_load(&g_mf2_eager_drain_scanned));
    fprintf(stderr, "Eager found:      %12lu\n", (unsigned long)atomic_load(&g_mf2_eager_drain_found));
    fprintf(stderr, "Drain attempts:   %12lu\n", (unsigned long)atomic_load(&g_mf2_drain_attempts));
    fprintf(stderr, "Drain successes:  %12lu\n", (unsigned long)atomic_load(&g_mf2_drain_success));
    fprintf(stderr, "Remote drains:    %12lu (blocks: %lu)\n",
            (unsigned long)atomic_load(&g_mf2_drain_count),
            (unsigned long)atomic_load(&g_mf2_drain_blocks));

    // Pending queue statistics
    fprintf(stderr, "\n[PENDING QUEUE]\n");
    fprintf(stderr, "Pending enqueued: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_enqueued));
    fprintf(stderr, "Pending drained:  %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_drained));
    fprintf(stderr, "Pending requeued: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_requeued));

    // Calculate ratios
    uint64_t total_allocs = atomic_load(&g_mf2_alloc_fast_hit) + atomic_load(&g_mf2_alloc_slow_hit);
    uint64_t total_frees = atomic_load(&g_mf2_free_owner_count) + atomic_load(&g_mf2_free_remote_count);
    if (total_allocs > 0) {
        fprintf(stderr, "\nFast path hit rate: %.2f%%\n",
                100.0 * atomic_load(&g_mf2_alloc_fast_hit) / total_allocs);
    }
    if (total_frees > 0) {
        fprintf(stderr, "Owner free rate:    %.2f%%\n",
                100.0 * atomic_load(&g_mf2_free_owner_count) / total_frees);
    }
    fflush(stderr);
}

__attribute__((destructor))
static void mf2_destructor(void) {
    mf2_print_debug_stats();
}

void hak_pool_shutdown(void) {
    if (!g_pool.initialized) return;

    hak_pool_print_stats();
    mf2_print_debug_stats();

    // Free all pages (walk freelists and free page heads)
    // MVP: Skip for now (pages are mmap'd and reclaimed by the OS at process exit)
    // Future: Track page allocations and munmap explicitly

    g_pool.initialized = 0;
}

void* hak_pool_try_alloc(size_t size, uintptr_t site_id) {
    hak_pool_init(); // pthread_once() ensures thread-safe init (no data race!)
    // P1.7 approach: Avoid using the pool during ALL wrapper calls (conservative but safe)
    extern int hak_in_wrapper(void);
    if (hak_in_wrapper() && !g_wrap_l2_enabled) return NULL;
    if (!hak_pool_is_poolable(size)) return NULL;

    // Get class and shard indices
    int class_idx = hak_pool_get_class_index(size);
    if (class_idx < 0) return NULL;

    // MF2: Per-Page Sharding path
    if (g_mf2_enabled) {
        return mf2_alloc_fast(class_idx, size, site_id);
    }

    // OLD PATH: TLS fast path (ring then local LIFO); drain TC only when needed
    PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
    if (g_tc_enabled && ring->top < g_tc_drain_trigger && mid_tc_has_items(class_idx)) {
        HKM_TIME_START(t_tc_drain);
        if (mid_tc_drain_into_tls(class_idx, ring, &g_tls_bin[class_idx])) {
            HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc_drain);
            if (ring->top > 0) {
                HKM_TIME_START(t_ring_pop0);
                PoolBlock* tlsb = ring->items[--ring->top];
                HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop0);
                void* raw = (void*)tlsb;
                AllocHeader* hdr = (AllocHeader*)raw;
                mid_set_header(hdr, g_class_sizes[class_idx], site_id);
                mid_page_inuse_inc(raw);
                t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
                if ((t_pool_rng & ((1u << g_count_sample_exp) - 1u)) == 0u) g_pool.hits[class_idx]++;
                return (char*)raw + HEADER_SIZE;
            }
        } else { HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc_drain); }
    }
    if (g_tls_ring_enabled) {
        if (ring->top == 0) {
            atomic_fetch_add_explicit(&g_pool.ring_underflow, 1, memory_order_relaxed);
        }
        if (ring->top > 0) {
            HKM_TIME_START(t_ring_pop1);
            PoolBlock* tlsb = ring->items[--ring->top];
            HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop1);
            void* raw = (void*)tlsb;
            AllocHeader* hdr = (AllocHeader*)raw;
            mid_set_header(hdr, g_class_sizes[class_idx], site_id);
            t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
            if ((t_pool_rng & ((1u << g_count_sample_exp) - 1u)) == 0u) g_pool.hits[class_idx]++;
            return (char*)raw + HEADER_SIZE;
        }
    }
    if (g_tls_bin[class_idx].lo_head) {
        HKM_TIME_START(t_lifo_pop0);
        PoolBlock* b = g_tls_bin[class_idx].lo_head;
        g_tls_bin[class_idx].lo_head = b->next;
        if (g_tls_bin[class_idx].lo_count) g_tls_bin[class_idx].lo_count--;
        HKM_TIME_END(HKM_CAT_POOL_TLS_LIFO_POP, t_lifo_pop0);
        void* raw = (void*)b; AllocHeader* hdr = (AllocHeader*)raw;
        mid_set_header(hdr, g_class_sizes[class_idx], site_id);
        mid_page_inuse_inc(raw);
        t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
        if ((t_pool_rng & ((1u << g_count_sample_exp) - 1u)) == 0u) g_pool.hits[class_idx]++;
        return (char*)raw + HEADER_SIZE;
    }

    // Compute the shard only when we need to access shared structures
    int shard_idx = hak_pool_get_shard_index(site_id);

    // Try to batch-pop from a non-empty shard using trylock to fill the TLS ring
    if (g_tls_ring_enabled) {
        int s0 = choose_nonempty_shard(class_idx, shard_idx);
        for (int probe = 0; probe < g_trylock_probes; ++probe) {
            int s = (s0 + probe) & (POOL_NUM_SHARDS - 1);
            pthread_mutex_t* l = &g_pool.freelist_locks[class_idx][s].m;
            atomic_fetch_add_explicit(&g_pool.trylock_attempts, 1, memory_order_relaxed);
            if (pthread_mutex_trylock(l) == 0) {
                atomic_fetch_add_explicit(&g_pool.trylock_success, 1, memory_order_relaxed);
                // First, drain any remote frees into the freelist
                if (atomic_load_explicit(&g_pool.remote_count[class_idx][s], memory_order_relaxed) != 0) {
                    drain_remote_locked(class_idx, s);
                }
                PoolBlock* head = g_pool.freelist[class_idx][s];
                int to_ring = POOL_L2_RING_CAP - ring->top; if (to_ring < 0) to_ring = 0;
                while (head && to_ring-- > 0) { PoolBlock* nxt = head->next; ring->items[ring->top++] = head; head = nxt; }
                while (head) { PoolBlock* nxt = head->next; head->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = head; g_tls_bin[class_idx].lo_count++; head = nxt; }
                g_pool.freelist[class_idx][s] = head;
                if (!head) clear_nonempty_bit(class_idx, s);
                pthread_mutex_unlock(l);
                if (ring->top > 0) {
                    PoolBlock* tlsb = ring->items[--ring->top];
                    void* raw = (void*)tlsb;
                    AllocHeader* hdr = (AllocHeader*)raw;
                    mid_set_header(hdr, g_class_sizes[class_idx], site_id);
                    mid_page_inuse_inc(raw);
                    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
                    if ((t_pool_rng & ((1u << g_count_sample_exp) - 1u)) == 0u) g_pool.hits[class_idx]++;
                    return (char*)raw + HEADER_SIZE;
                }
            }
        }
    }

    // Try TLS active pages (owner-only local bump-run, up to 3) // QW2-adjusted
    PoolTLSPage* ap = NULL;
    if (g_tls_active_page_a[class_idx].page && g_tls_active_page_a[class_idx].count > 0 && g_tls_active_page_a[class_idx].bump < g_tls_active_page_a[class_idx].end) ap = &g_tls_active_page_a[class_idx];
    else if (g_tls_active_page_b[class_idx].page && g_tls_active_page_b[class_idx].count > 0 && g_tls_active_page_b[class_idx].bump < g_tls_active_page_b[class_idx].end) ap = &g_tls_active_page_b[class_idx];
    else if (g_tls_active_page_c[class_idx].page && g_tls_active_page_c[class_idx].count > 0 && g_tls_active_page_c[class_idx].bump < g_tls_active_page_c[class_idx].end) ap = &g_tls_active_page_c[class_idx]; // QW2-adjusted
    if (ap) {
        // Opportunistically fill the TLS ring from the active page as well
        if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) {
            int need = POOL_L2_RING_CAP - ring->top;
            (void)refill_tls_from_active_page(class_idx, ring, &g_tls_bin[class_idx], ap, need);
        }
        PoolBlock* b = NULL;
        if (ring->top > 0) { b = ring->items[--ring->top]; }
        else if (ap->page && ap->count > 0 && ap->bump < ap->end) {
            b = (PoolBlock*)(void*)ap->bump; ap->bump += (HEADER_SIZE + g_class_sizes[class_idx]); ap->count--; if (ap->bump >= ap->end || ap->count <= 0) { ap->page = NULL; ap->count = 0; }
        }
        if (b) {
            void* raw = (void*)b;
            AllocHeader* hdr = (AllocHeader*)raw;
            mid_set_header(hdr, g_class_sizes[class_idx], site_id);
            mid_page_inuse_inc(raw);
            g_pool.hits[class_idx]++;
            return (char*)raw + HEADER_SIZE;
        }
    }

    // Lock the shard freelist for this (class, shard)
    pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx].m;
    HKM_TIME_START(t_lock);
    struct timespec ts_lk1; int lk1 = hkm_prof_begin(&ts_lk1);
    (void)ts_lk1; (void)lk1; // Unused profiling variables
    pthread_mutex_lock(lock);
    HKM_TIME_END(HKM_CAT_POOL_LOCK, t_lock);
    hkm_prof_end(lk1, HKP_POOL_LOCK, &ts_lk1);

    // Try to pop from the freelist
    PoolBlock* block = g_pool.freelist[class_idx][shard_idx];

    if (!block) {
        // Before refilling, try draining the remote stack and a simple shard steal
        int stole = 0;
        const FrozenPolicy* pol = hkm_policy_get();
        if (pol) {
            uint16_t cap = 0;
            if (class_idx < 5) cap = pol->mid_cap[class_idx];
            else if (class_idx == 5 && pol->mid_dyn1_bytes != 0) cap = pol->mid_cap_dyn1;
            else if (class_idx == 6 && pol->mid_dyn2_bytes != 0) cap = pol->mid_cap_dyn2;
            // Drain the remote stack regardless of cap (cheap and helps reuse)
            if (atomic_load_explicit(&g_pool.remote_count[class_idx][shard_idx], memory_order_relaxed) != 0) {
                drain_remote_locked(class_idx, shard_idx);
                block = g_pool.freelist[class_idx][shard_idx];
            }
            if (!block && cap > 0 && g_pool.pages_by_class[class_idx] >= cap) {
                HKM_TIME_START(t_steal);
                for (int d = 1; d <= 4 && !stole; d++) {
                    int s1 = (shard_idx + d) & (POOL_NUM_SHARDS - 1);
                    int s2 = (shard_idx - d) & (POOL_NUM_SHARDS - 1);
                    if (is_shard_nonempty(class_idx, s1)) {
                        pthread_mutex_t* l2 = &g_pool.freelist_locks[class_idx][s1].m;
                        pthread_mutex_lock(l2);
                        PoolBlock* b2 = g_pool.freelist[class_idx][s1];
                        if (b2) {
                            g_pool.freelist[class_idx][s1] = b2->next;
                            if (!g_pool.freelist[class_idx][s1]) clear_nonempty_bit(class_idx, s1);
                            block = b2; stole = 1;
                        }
                        pthread_mutex_unlock(l2);
                    }
                    if (!stole && is_shard_nonempty(class_idx, s2)) {
                        pthread_mutex_t* l3 = &g_pool.freelist_locks[class_idx][s2].m;
                        pthread_mutex_lock(l3);
                        PoolBlock* b3 = g_pool.freelist[class_idx][s2];
                        if (b3) {
                            g_pool.freelist[class_idx][s2] = b3->next;
                            if (!g_pool.freelist[class_idx][s2]) clear_nonempty_bit(class_idx, s2);
                            block = b3; stole = 1;
                        }
                        pthread_mutex_unlock(l3);
                    }
                }
                HKM_TIME_END(HKM_CAT_SHARD_STEAL, t_steal);
            }
        }

        if (!stole && !block) {
            // Freelist empty, refill a page
            {
                // Choose an empty TLS slot for the new page (check all 3 slots) // QW2-adjusted
                PoolTLSPage* tap = NULL;
                if (g_tls_active_page_a[class_idx].page == NULL || g_tls_active_page_a[class_idx].count == 0) tap = &g_tls_active_page_a[class_idx];
                else if (g_tls_active_page_b[class_idx].page == NULL || g_tls_active_page_b[class_idx].count == 0) tap = &g_tls_active_page_b[class_idx];
                else if (g_tls_active_page_c[class_idx].page == NULL || g_tls_active_page_c[class_idx].count == 0) tap = &g_tls_active_page_c[class_idx]; // QW2-adjusted
                else tap = &g_tls_active_page_a[class_idx]; // fallback: overwrite the oldest if all 3 are busy
                HKM_TIME_START(t_alloc_page);
                if (alloc_tls_page(class_idx, tap)) {
                    HKM_TIME_END(HKM_CAT_POOL_ALLOC_TLS_PAGE, t_alloc_page);
                    pthread_mutex_unlock(lock);
                    // Rebind to the page we just allocated and top up the ring from the bump-run
                    ap = tap;
                    if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) {
                        int need = POOL_L2_RING_CAP - ring->top;
                        (void)refill_tls_from_active_page(class_idx, ring, &g_tls_bin[class_idx], ap, need);
                    }
                    PoolBlock* takeb = NULL;
                    if (ring->top > 0) { HKM_TIME_START(t_ring_pop2); takeb = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop2); }
                    else if (ap->page && ap->count > 0 && ap->bump < ap->end) { takeb = (PoolBlock*)(void*)ap->bump; ap->bump += (HEADER_SIZE + g_class_sizes[class_idx]); ap->count--; if (ap->bump >= ap->end || ap->count == 0) { ap->page = NULL; ap->count = 0; } }
                    void* raw2 = (void*)takeb;
                    AllocHeader* hdr2 = (AllocHeader*)raw2;
                    mid_set_header(hdr2, g_class_sizes[class_idx], site_id);
                    mid_page_inuse_inc(raw2);
                    g_pool.hits[class_idx]++;
                    return (char*)raw2 + HEADER_SIZE;
                }
                HKM_TIME_START(t_refill);
                struct timespec ts_rf; int rf = hkm_prof_begin(&ts_rf);
                int ok = refill_freelist(class_idx, shard_idx);
                HKM_TIME_END(HKM_CAT_POOL_REFILL, t_refill);
                hkm_prof_end(rf, HKP_POOL_REFILL, &ts_rf);
                if (!ok) {
                    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
                    if ((t_pool_rng & ((1u << g_count_sample_exp) - 1u)) == 0u) g_pool.misses[class_idx]++;
                    pthread_mutex_unlock(lock);
                    return NULL; // Out of memory
                }
            }

            // Try again after refill
            block = g_pool.freelist[class_idx][shard_idx];
            if (!block) {
                t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
                if ((t_pool_rng & ((1u << g_count_sample_exp) - 1u)) == 0u) g_pool.misses[class_idx]++;
                pthread_mutex_unlock(lock);
                return NULL; // Refill failed
            }
        }
    }

    // Pop the block from the freelist (block points to the raw allocation = header start)
    g_pool.freelist[class_idx][shard_idx] = block->next;
    // Adopt the shared page to this thread (first touch) to improve TC routing
    mid_desc_adopt(block, class_idx, (uint64_t)(uintptr_t)pthread_self());
    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
    if ((t_pool_rng & ((1u << g_count_sample_exp) - 1u)) == 0u) g_pool.hits[class_idx]++;

    // Clear the bit if the freelist becomes empty
    if (g_pool.freelist[class_idx][shard_idx] == NULL) {
        clear_nonempty_bit(class_idx, shard_idx);
    }

    pthread_mutex_unlock(lock);

    // Save to the TLS local cache (ring if possible, else LIFO), then pop
    PoolBlock* take;
    if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { HKM_CNT(HKM_CAT_TLS_FAST); ring->items[ring->top++] = block; HKM_TIME_START(t_ring_pop4); take = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop4); }
    else { block->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = block; g_tls_bin[class_idx].lo_count++;
        if (g_tls_ring_enabled && ring->top > 0) { HKM_CNT(HKM_CAT_TLS_FAST); HKM_TIME_START(t_ring_pop5); take = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop5); }
        else { HKM_TIME_START(t_lifo_pop2); take = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = take->next; if (g_tls_bin[class_idx].lo_count) g_tls_bin[class_idx].lo_count--; HKM_TIME_END(HKM_CAT_POOL_TLS_LIFO_POP, t_lifo_pop2); } }

    // Construct the header fields now (the freelist used the header area for links)
    void* raw = (void*)take;
    AllocHeader* hdr = (AllocHeader*)raw;
    mid_set_header(hdr, g_class_sizes[class_idx], site_id);
    mid_page_inuse_inc(raw);

    // Calculate the user pointer (skip header)
    void* user_ptr = (char*)raw + HEADER_SIZE;

    // Do not zero memory here (only calloc zeroes).
    // Debug builds fill freshly handed-out blocks with a poison pattern instead.
#ifdef HAKMEM_DEBUG_SANITIZE
    memset(user_ptr, 0xA5, g_class_sizes[class_idx]); // poison pattern
#endif
    // Production: no zeroing (15-25% faster)

    return user_ptr;
}

void hak_pool_free(void* ptr, size_t size, uintptr_t site_id) {
|
|||
|
|
if (!ptr) return;
|
|||
|
|
hak_pool_init(); // pthread_once() ensures thread-safe init (no data race!)
|
|||
|
|
if (!hak_pool_is_poolable(size)) return;
|
|||
|
|
|
|||
|
|
// MF2: Per-Page Sharding path
|
|||
|
|
if (g_mf2_enabled) {
|
|||
|
|
mf2_free(ptr);
|
|||
|
|
return;
|
|||
|
|
}

    // OLD PATH: ptr is user pointer, get raw pointer (header start)
    void* raw = (char*)ptr - HEADER_SIZE;

    // Validate header unless we can prove Mid ownership via page descriptor.
    AllocHeader* hdr = (AllocHeader*)raw;
    int mid_by_desc = 0;
    MidPageDesc* d_desc = mid_desc_lookup(ptr);
    if (d_desc) mid_by_desc = 1;
    if (!mid_by_desc && g_hdr_light_enabled < 2) {
        if (hdr->magic != HAKMEM_MAGIC) {
            MF2_ERROR_LOG("Invalid magic 0x%X in pool_free, expected 0x%X",
                          hdr->magic, HAKMEM_MAGIC);
            return;  // Skip free (corruption detected)
        }
        if (hdr->method != ALLOC_METHOD_POOL) {
            MF2_ERROR_LOG("Wrong method %d in pool_free, expected POOL (%d)",
                          hdr->method, ALLOC_METHOD_POOL);
            return;  // Skip free (not a pool allocation)
        }
    }

    // Get class and shard indices
    int class_idx = mid_by_desc ? (int)d_desc->class_idx : hak_pool_get_class_index(size);
    if (class_idx < 0) return;

    int shard_idx = hak_pool_get_shard_index(site_id);
    (void)shard_idx;  // Not used directly below; each path recomputes its own shard

    PoolBlock* block = (PoolBlock*)raw;
    if (g_pool.tls_free_enabled) {
        // Same-thread fast path: prefer TLS caches. If header lacks owner (light),
        // consult page descriptor for TLS-owned pages; otherwise fall back to remote.
        int same_thread = 0;
        if (g_hdr_light_enabled >= 1) {
            MidPageDesc* d = mid_desc_lookup(raw);
            if (d && d->owner_tid != 0 && d->owner_tid == (uint64_t)(uintptr_t)pthread_self()) {
                same_thread = 1;
            }
        } else if (hdr->owner_tid != 0 && hdr->owner_tid == (uint64_t)(uintptr_t)pthread_self()) {
            same_thread = 1;
        }
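        // Ownership resolution: header-light modes (g_hdr_light_enabled >= 1) read the
        // owning thread id from the per-page descriptor, while full-header mode reads
        // it from the AllocHeader. A match against pthread_self() selects the
        // same-thread fast path below.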
        if (same_thread) {
            PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
            if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) {
                ring->items[ring->top++] = block;
            } else {
                // Push to TLS local LIFO; spill a small batch to remote only on overflow
                block->next = g_tls_bin[class_idx].lo_head;
                g_tls_bin[class_idx].lo_head = block;
                g_tls_bin[class_idx].lo_count++;
                if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) {
                    size_t spill = g_tls_bin[class_idx].lo_count / 2;
                    int shard = hak_pool_get_shard_index(site_id);
                    while (spill-- && g_tls_bin[class_idx].lo_head) {
                        PoolBlock* b = g_tls_bin[class_idx].lo_head;
                        g_tls_bin[class_idx].lo_head = b->next;
                        g_tls_bin[class_idx].lo_count--;
                        HKM_TIME_START(t_remote_push1);
                        uintptr_t old_head;
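                        // Lock-free push of b onto the shard's remote stack: a classic
                        // Treiber-stack CAS loop. The release ordering on success
                        // publishes b->next before the new head becomes visible;
                        // relaxed on failure is fine because we simply retry with the
                        // freshly loaded head.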
                        do {
                            old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
                            b->next = (PoolBlock*)old_head;
                        } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed));
                        atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
                        HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1);
                    }
                    set_nonempty_bit(class_idx, shard);
                }
            }

        } else {
            // Cross-thread: remote push to target shard
            if (g_tc_enabled) {
                uint64_t owner_tid = 0;
                if (g_hdr_light_enabled < 2) owner_tid = hdr->owner_tid;
                if (owner_tid == 0) {
                    MidPageDesc* d = mid_desc_lookup(raw);
                    if (d) owner_tid = d->owner_tid;
                }
                if (owner_tid != 0) {
                    MidTC* otc = mid_tc_lookup_by_tid(owner_tid);
                    if (otc) { mid_tc_push(otc, class_idx, block); return; }
                }
            }
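            // If the owner's MidTC could not take the block (TC disabled, owner
            // unknown, or no TC registered for that tid), fall back to the shared
            // per-shard remote stack below.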
            int shard = hak_pool_get_shard_index(site_id);
            uintptr_t old_head;
            HKM_TIME_START(t_remote_push2);
            do {
                old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
                block->next = (PoolBlock*)old_head;
            } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed));
            atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
            HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push2);
            set_nonempty_bit(class_idx, shard);
        }
    } else {
        // Return to global freelist (A/B testing path)
        int shard_idx = hak_pool_get_shard_index(site_id);
        pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx].m;
        pthread_mutex_lock(lock);
        block->next = g_pool.freelist[class_idx][shard_idx];
        g_pool.freelist[class_idx][shard_idx] = block;
        set_nonempty_bit(class_idx, shard_idx);
        pthread_mutex_unlock(lock);
    }
    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
    if ((t_pool_rng & ((1u << g_count_sample_exp) - 1u)) == 0u) g_pool.frees[class_idx]++;
    // Decrement in-use and enqueue DONTNEED if page becomes empty
    mid_page_inuse_dec_and_maybe_dn(raw);
}

void hak_pool_print_stats(void) {
    if (!g_pool.initialized) return;

    printf("\n========================================\n");
    printf("L2 Pool Statistics\n");
    printf("========================================\n");

    uint64_t total_hits = 0, total_misses = 0, total_refills = 0, total_frees = 0;

    for (int i = 0; i < POOL_NUM_CLASSES; i++) {
        if (g_class_sizes[i] == 0) continue;  // skip disabled dynamic class
        total_hits    += g_pool.hits[i];
        total_misses  += g_pool.misses[i];
        total_refills += g_pool.refills[i];
        total_frees   += g_pool.frees[i];

        printf("Class %zu KB:\n", g_class_sizes[i] / 1024);
        printf("  Hits:     %lu\n", (unsigned long)g_pool.hits[i]);
        printf("  Misses:   %lu\n", (unsigned long)g_pool.misses[i]);
        printf("  Refills:  %lu\n", (unsigned long)g_pool.refills[i]);
        printf("  Frees:    %lu\n", (unsigned long)g_pool.frees[i]);

        if (g_pool.hits[i] + g_pool.misses[i] > 0) {
            double hit_rate = (double)g_pool.hits[i] / (g_pool.hits[i] + g_pool.misses[i]) * 100.0;
            printf("  Hit rate: %.1f%%\n", hit_rate);
        }
    }

    printf("\n----------------------------------------\n");
    printf("Summary:\n");
    printf("  Total hits:      %lu\n", (unsigned long)total_hits);
    printf("  Total misses:    %lu\n", (unsigned long)total_misses);
    printf("  Total refills:   %lu\n", (unsigned long)total_refills);
    printf("  Total frees:     %lu\n", (unsigned long)total_frees);
    printf("  Pages allocated: %lu\n", (unsigned long)g_pool.total_pages_allocated);
    printf("  Bytes allocated: %lu KB\n", (unsigned long)(g_pool.total_bytes_allocated / 1024));

    if (total_hits + total_misses > 0) {
        double hit_rate = (double)total_hits / (total_hits + total_misses) * 100.0;
        printf("  Overall hit rate: %.1f%%\n", hit_rate);
    }

    printf("========================================\n");
}

void hak_pool_stats_snapshot(uint64_t hits[], uint64_t misses[], uint64_t refills[], uint64_t frees[]) {
    if (!g_pool.initialized) {
        // Zero out if not initialized
        for (int i = 0; i < POOL_NUM_CLASSES; i++) {
            if (hits)    hits[i] = 0;
            if (misses)  misses[i] = 0;
            if (refills) refills[i] = 0;
            if (frees)   frees[i] = 0;
        }
        return;
    }
    for (int i = 0; i < POOL_NUM_CLASSES; i++) {
        if (hits)    hits[i] = g_pool.hits[i];
        if (misses)  misses[i] = g_pool.misses[i];
        if (refills) refills[i] = g_pool.refills[i];
        if (frees)   frees[i] = g_pool.frees[i];
    }
}
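
/*
 * Usage sketch (illustrative only, not part of the build): a monitor thread can
 * take periodic snapshots and diff them to get per-interval rates. Arrays must
 * hold POOL_NUM_CLASSES entries; NULL may be passed for any counter the caller
 * does not need. Remember that the counters are sampled (see the xorshift note
 * in the alloc/free paths), so treat them as estimates.
 *
 *   uint64_t h0[POOL_NUM_CLASSES], h1[POOL_NUM_CLASSES];
 *   hak_pool_stats_snapshot(h0, NULL, NULL, NULL);
 *   // ... run workload ...
 *   hak_pool_stats_snapshot(h1, NULL, NULL, NULL);
 *   uint64_t class0_hits_delta = h1[0] - h0[0];
 */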

void hak_pool_extra_metrics_snapshot(uint64_t* trylock_attempts, uint64_t* trylock_success, uint64_t* ring_underflow) {
    if (trylock_attempts) {
        *trylock_attempts = atomic_load_explicit(&g_pool.trylock_attempts, memory_order_relaxed);
    }
    if (trylock_success) {
        *trylock_success = atomic_load_explicit(&g_pool.trylock_success, memory_order_relaxed);
    }
    if (ring_underflow) {
        *ring_underflow = atomic_load_explicit(&g_pool.ring_underflow, memory_order_relaxed);
    }
}
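
/*
 * Example reading (illustrative): the trylock counters give a cheap contention
 * signal.
 *
 *   uint64_t att = 0, ok = 0, uf = 0;
 *   hak_pool_extra_metrics_snapshot(&att, &ok, &uf);
 *   double trylock_hit = att ? (double)ok / (double)att : 0.0;  // 1.0 = uncontended
 *
 * A falling ratio suggests growing lock contention; ring_underflow presumably
 * counts pops attempted on an empty TLS ring.
 */
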
int hak_pool_mid_lookup(void* ptr, size_t* out_size) {
    // CRITICAL FIX: Check MF2 registry first if MF2 is enabled
    if (g_mf2_enabled) {
        MidPage* page = mf2_addr_to_page(ptr);
        if (page) {
            int c = (int)page->class_idx;
            if (c < 0 || c >= POOL_NUM_CLASSES) return 0;
            size_t sz = g_class_sizes[c];
            if (sz == 0) return 0;
            if (out_size) *out_size = sz;
            return 1;
        }
        // Not an MF2 page - fall through to old lookup
    }

    // OLD PATH: Use mid_desc lookup
    MidPageDesc* d = mid_desc_lookup(ptr);
    if (!d) return 0;
    int c = (int)d->class_idx;
    if (c < 0 || c >= POOL_NUM_CLASSES) return 0;
    size_t sz = g_class_sizes[c];
    if (sz == 0) return 0;
    if (out_size) *out_size = sz;
    return 1;
}
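
/*
 * Usage sketch (illustrative): a generic free dispatcher that only has the raw
 * pointer can use hak_pool_mid_lookup() to decide whether a block belongs to the
 * L2 pool and to recover its class size before calling hak_pool_free():
 *
 *   size_t sz;
 *   if (hak_pool_mid_lookup(p, &sz)) {
 *       hak_pool_free(p, sz, site_id);
 *   } else {
 *       // not a Mid/L2 block - try another tier
 *   }
 *
 * hak_pool_free_fast() below packages the same pattern for callers that do not
 * track sizes at all.
 */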

void hak_pool_free_fast(void* ptr, uintptr_t site_id) {
    if (!ptr || !g_pool.initialized) return;

    // CRITICAL FIX: If MF2 is enabled, mid_desc_lookup will FAIL because MF2 pages
    // are registered in g_mf2_page_registry, not mid_desc! Route directly to MF2.
    if (g_mf2_enabled) {
        // Check if this is an MF2 page by looking it up in the MF2 registry
        MidPage* page = mf2_addr_to_page(ptr);

        if (page) {
            // MF2 page found - free through MF2 path
            mf2_free(ptr);
            return;
        }
        // Not an MF2 page - might be from old allocator or another tier
        // Fall through to old path (though this shouldn't happen if MF2 is exclusive)
    }

    // OLD PATH: Use mid_desc lookup
    MidPageDesc* d = mid_desc_lookup(ptr);
    if (!d) return;
    size_t sz = g_class_sizes[(int)d->class_idx];
    if (sz == 0) return;
    hak_pool_free(ptr, sz, site_id);
}