hakmem/core/hakmem_l25_pool.c
Moe Charm (CI) 52386401b3 Debug Counters Implementation - Clean History
Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 12:31:14 +09:00


// ============================================================================
// hakmem_l25_pool.c - L2.5 LargePool Implementation (64KB-1MB)
// ============================================================================
//
// Size class definitions:
// ┌──────────┬─────────┬──────────────┬─────────────┐
// │ Class    │ Size    │ Initial CAP  │ Page layout │
// ├──────────┼─────────┼──────────────┼─────────────┤
// │ Class 0  │ 64 KB   │ 8 bundles    │ 1 page/b    │
// │ Class 1  │ 128 KB  │ 8 bundles    │ 2 pages/b   │
// │ Class 2  │ 256 KB  │ 4 bundles    │ 4 pages/b   │
// │ Class 3  │ 512 KB  │ 2 bundles    │ 8 pages/b   │
// │ Class 4  │ 1 MB    │ 1 bundle     │ 16 pages/b  │
// └──────────┴─────────┴──────────────┴─────────────┘
//
// W_MAX_LARGE (round-up tolerance factor):
// - Meaning: how far above the requested size a class may be and still be used
// - Default: 1.30 (allows up to 30% round-up) - **conservative**
// - Recommended: 1.60 (allows up to 60% round-up) - mitigates the size gap
// - Example: 40KB request → 64KB class is allowed (64/40 = 1.6x, within 1.60)
// - Environment variable: HAKMEM_WMAX_LARGE=1.6
//
// Important: closing the 32-64KB gap
// - Requests above 32KB cannot be served by the L2 Mid Pool
// - With W_MAX_LARGE=1.30, a 32KB request is not rounded up to 64KB (2.0x > 1.30)
// - Relaxing to W_MAX_LARGE=1.60 lets requests of 40KB and above use the 64KB class
// - This closes part of the 32-64KB gap
//
// CAP (inventory):
// - Meaning: maximum number of bundles kept per class
// - Initial values: {8,8,4,2,1} - conservative (footprint first)
// - Recommended: {32,32,16,8,4} - 4x larger, performance first
// - Environment variable: HAKMEM_CAP_LARGE=32,32,16,8,4
//
// TLS structure:
// - Ring: POOL_L25_RING_CAP entries (default 16)
// - ActiveRun: bump-run carving out of a contiguous mapping
// - LIFO overflow: blocks that spill out of the ring
// - Remote-free: MPSC queue for cross-thread frees
//
// Performance tuning:
// 1. ⭐⭐⭐ Relax W_MAX_LARGE: HAKMEM_WMAX_LARGE=1.6
// 2. ⭐⭐ Quadruple the initial CAP: HAKMEM_CAP_LARGE=32,32,16,8,4
// 3. Enable BG drain: HAKMEM_L25_BG_DRAIN=1
//
// License: MIT
// Date: 2025-10-24 (Phase 6.x - clean-up campaign)
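//
// Worked example for W_MAX_LARGE (illustrative arithmetic only; the
// W_MAX_LARGE check itself is applied outside this file):
//   40KB request: 64KB / 40KB = 1.60 → within W_MAX_LARGE=1.60, Class 0 allowed
//   36KB request: 64KB / 36KB ≈ 1.78 → exceeds 1.60, not rounded up
//   32KB request: 64KB / 32KB = 2.00 → rejected even at 1.60 (the residual gap)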
#include "hakmem_l25_pool.h"
#include "hakmem_config.h"
#include "hakmem_internal.h" // For AllocHeader and HAKMEM_MAGIC
#include "hakmem_syscall.h" // Phase 6.X P0 Fix: Box 3 syscall layer (bypasses LD_PRELOAD)
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>
#include <sys/mman.h> // mmap / madvise (bump runs, bundle refill, demand-zero)
#include <time.h>     // nanosleep / struct timespec (BG drain thread)
#include "hakmem_prof.h"
#include "hakmem_debug.h"
#include "hakmem_policy.h" // FrozenPolicy caps (Soft CAP guidance)
#include <assert.h>
// False sharing mitigation: padded mutex type (64B)
typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64)]; } PaddedMutex;
// ===========================================================================
// Internal Data Structures
// ===========================================================================
// Freelist node header (embedded in allocated bundle, same as L2 Pool pattern)
typedef struct L25Block {
struct L25Block* next; // Next block in freelist
} L25Block;
// Phase 6.17: TLS two-tier cache (ring + local LIFO)
#ifndef POOL_L25_RING_CAP
#define POOL_L25_RING_CAP 16
#endif
typedef struct { L25Block* items[POOL_L25_RING_CAP]; int top; } L25TLSRing;
typedef struct { L25TLSRing ring; L25Block* lo_head; size_t lo_count; } L25TLSBin;
static __thread L25TLSBin g_l25_tls_bin[L25_NUM_CLASSES];
// TLS ActiveRun (bump-run). We mmap a run that holds multiple class-sized blocks
// and hand out addresses by simple pointer arithmetic (no per-block linking).
typedef struct {
char* base; // start of run (raw, header at base)
char* cursor; // next header address to serve
char* end; // end of run (exclusive)
} L25ActiveRun;
static __thread L25ActiveRun g_l25_active[L25_NUM_CLASSES];
// Global L2.5 pool state (simplified: single-threaded for MVP)
static struct {
L25Block* freelist[L25_NUM_CLASSES][L25_NUM_SHARDS];
// Fine-grained locks per (class, shard) freelist (padded)
PaddedMutex freelist_locks[L25_NUM_CLASSES][L25_NUM_SHARDS];
// Phase 6.10.1 pattern: non-empty bitmap (O(1) empty class skip)
// Use atomic bit operations to avoid class-wide locks
atomic_uint_fast64_t nonempty_mask[L25_NUM_CLASSES]; // 1 bit per shard
// Statistics
uint64_t hits[L25_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t misses[L25_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t refills[L25_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t frees[L25_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t total_bytes_allocated __attribute__((aligned(64)));
uint64_t total_bundles_allocated __attribute__((aligned(64)));
// Per-class bundle accounting (for Soft CAP guidance)
uint64_t bundles_by_class[L25_NUM_CLASSES] __attribute__((aligned(64)));
int initialized;
int demand_zero; // env: HAKMEM_L25_DZ=1
// Remote-free MPSC stacks per (class, shard)
atomic_uintptr_t remote_head[L25_NUM_CLASSES][L25_NUM_SHARDS];
atomic_uint remote_count[L25_NUM_CLASSES][L25_NUM_SHARDS];
} g_l25_pool;
static int g_wrap_l25_enabled = 0; // env: HAKMEM_WRAP_L25=1 to allow in wrappers
static int g_l25_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING
static int g_l25_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES
static int g_l25_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX
static int g_l25_ring_return_div = 3; // env: HAKMEM_RING_RETURN_DIV
static int g_l25_ring_trigger = 2; // env: HAKMEM_L25_RING_TRIGGER
static int g_l25_owner_inbound = 0; // env: HAKMEM_L25_OWNER_INBOUND (0/1)
static int g_l25_in_slots = 512; // env: HAKMEM_L25_INBOUND_SLOTS (<= compiled max)
extern int g_hdr_light_enabled; // shared with Mid pool
static int g_l25_run_blocks_override = 0; // env: HAKMEM_L25_RUN_BLOCKS (0=per-class defaults)
static int g_l25_shard_mix = 1; // env: HAKMEM_SHARD_MIX (0/1, default ON)
static int g_l25_pref_remote_first = 1; // env: HAKMEM_L25_PREF=remote|run (default remote)
static int g_l25_tc_spill = 32; // env: HAKMEM_L25_TC_SPILL (spill threshold)
static int g_l25_bg_drain_enabled = 0; // env: HAKMEM_L25_BG_DRAIN=1 to enable BG drain of remote
static int g_l25_bg_interval_ms = 5; // env: HAKMEM_L25_BG_MS
static int g_l25_bg_remote_enable = 0; // env: HAKMEM_L25_BG_REMOTE
static int g_l25_probe_auto = 0; // env: HAKMEM_L25_PROBE_AUTO
static int g_l25_remote_threshold = 32; // env: HAKMEM_L25_REMOTE_THRESHOLD
static int g_l25_bg_remote_batch = 64; // env: HAKMEM_L25_BG_REMOTE_BATCH
static pthread_t g_l25_bg_thread;
// Size class table (for reference)
static const size_t g_class_sizes[L25_NUM_CLASSES] = {
L25_CLASS_64KB,
L25_CLASS_128KB,
L25_CLASS_256KB,
L25_CLASS_512KB,
L25_CLASS_1MB
};
// Phase 6.11.5 P0: Pre-initialized header templates for fast allocation
// Reduces AllocHeader reconstruction from 100-150 cycles to 40-50 cycles
static const AllocHeader g_header_templates[L25_NUM_CLASSES] = {
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_64KB, 0, 0, 0}, // 64KB
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_128KB, 0, 0, 0}, // 128KB
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_256KB, 0, 0, 0}, // 256KB
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_512KB, 0, 0, 0}, // 512KB
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_1MB, 0, 0, 0} // 1MB
};
// Pages per bundle (for each class)
static const int g_pages_per_bundle[L25_NUM_CLASSES] = {
1, // 64KB = 1 × 64KB page
2, // 128KB = 2 × 64KB pages
4, // 256KB = 4 × 64KB pages
8, // 512KB = 8 × 64KB pages
16 // 1MB = 16 × 64KB pages
};
// Default blocks per bump-run per class (≈2MB per run)
static const int g_blocks_per_run_default[L25_NUM_CLASSES] = { 32, 16, 8, 4, 2 };
static inline size_t l25_stride_bytes(int class_idx) {
return HEADER_SIZE + g_class_sizes[class_idx];
}
static int g_l25_run_factor = 1; // env: HAKMEM_L25_RUN_FACTOR (1..8)
static inline int l25_blocks_per_run(int class_idx) {
if (g_l25_run_blocks_override > 0) return g_l25_run_blocks_override;
int base = g_blocks_per_run_default[class_idx];
long long val = (long long)base * (long long)g_l25_run_factor;
if (val < 1) val = 1;
if (val > 1024) val = 1024;
return (int)val;
}
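// Example (illustrative): Class 0 defaults to 32 blocks per run, i.e. one run
// maps roughly 32 x (HEADER_SIZE + 64KB) ≈ 2MB. HAKMEM_L25_RUN_FACTOR=4 scales
// that to 128 blocks, while HAKMEM_L25_RUN_BLOCKS=16 bypasses the computation
// and forces 16 blocks per run for every class.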
static inline void l25_write_header(AllocHeader* hdr, int class_idx, uintptr_t site_id) {
if (g_hdr_light_enabled >= 2) {
return; // no writes
} else if (g_hdr_light_enabled >= 1) {
hdr->magic = HAKMEM_MAGIC;
hdr->method = ALLOC_METHOD_L25_POOL;
hdr->size = g_class_sizes[class_idx];
hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self();
} else {
memcpy(hdr, &g_header_templates[class_idx], sizeof(AllocHeader));
hdr->alloc_site = site_id;
hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self();
}
}
// ===========================================================================
// Helper Functions - inline clean-up pass
// ===========================================================================
// Phase 6.10.1 pattern: branchless LUT (Lookup Table) for O(1) class determination
// SIZE_TO_CLASS[i] = class for (i * 64KB)
// Index 0: invalid, 1: 64KB (class 0), 2: 128KB (class 1), 4: 256KB (class 2), etc.
static const int8_t SIZE_TO_CLASS[] = {
-1, // index 0: 0KB - invalid
0, // index 1: 64KB → Class 0
1, // index 2: 128KB → Class 1
-1, // index 3: 192KB (between 128KB and 256KB)
2, // index 4: 256KB → Class 2
-1, -1, -1, // index 5-7: 320KB-448KB
3, // index 8: 512KB → Class 3
-1, -1, -1, -1, -1, -1, -1, // index 9-15: 576KB-960KB
4 // index 16: 1MB → Class 4
};
// Get size class index from size (0-4, or -1 if out of range)
// Inlined: O(1) branchless lookup, zero function-call overhead
static inline int hak_l25_pool_get_class_index(size_t size) {
// Round to 64KB units
size_t kb64 = (size + L25_PAGE_SIZE - 1) / L25_PAGE_SIZE;
// Direct LUT lookup (O(1), branchless)
if (kb64 == 0 || kb64 > 16) return -1;
return SIZE_TO_CLASS[kb64];
}
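// Examples (illustrative): 64KB → 1 page → Class 0; 100KB → 2 pages → Class 1
// (128KB); 200KB → 4 pages → Class 2 (256KB). Sizes that round to 3, 5-7 or
// 9-15 pages (e.g. 150KB → 3 pages) hit a -1 entry and are rejected here.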
// Get shard index from site_id (0-63)
// Inlined: same pattern as L2 Pool
static inline uint64_t splitmix64(uint64_t x) {
x += 0x9e3779b97f4a7c15ULL;
x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
return x ^ (x >> 31);
}
int hak_l25_pool_get_shard_index(uintptr_t site_id) {
if (g_l25_shard_mix) {
uint64_t h = splitmix64((uint64_t)site_id);
return (int)(h & (L25_NUM_SHARDS - 1));
}
// Shift by 4 to reduce collisions (call-site addresses are instruction-aligned)
return (int)((site_id >> 4) & (L25_NUM_SHARDS - 1));
}
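// Example (illustrative): with HAKMEM_SHARD_MIX=0, site_id 0x401234 maps to
// shard (0x401234 >> 4) & 63 = 35; with mixing on (the default) the site_id is
// first scrambled by splitmix64, spreading nearby call sites across shards.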
// Phase 6.10.1 pattern: Bitmap helpers (O(1) empty class detection)
// Inlined: zero overhead, suitable for the hot path
static inline void set_nonempty_bit(int class_idx, int shard_idx) {
// Atomic OR with release semantics (ensures freelist write is visible)
atomic_fetch_or_explicit(&g_l25_pool.nonempty_mask[class_idx],
(uint64_t)(1ULL << shard_idx),
memory_order_release);
}
static inline void clear_nonempty_bit(int class_idx, int shard_idx) {
// Atomic AND with release semantics (ensures freelist clear is visible)
atomic_fetch_and_explicit(&g_l25_pool.nonempty_mask[class_idx],
~(uint64_t)(1ULL << shard_idx),
memory_order_release);
}
static inline int is_shard_nonempty(int class_idx, int shard_idx) {
// Atomic load with acquire semantics (ensures freelist read is valid)
uint64_t mask = atomic_load_explicit(&g_l25_pool.nonempty_mask[class_idx],
memory_order_acquire);
return (mask & (1ULL << shard_idx)) != 0;
}
// Choose a non-empty shard near preferred using the nonempty mask. If none, return preferred.
static inline int l25_choose_nonempty_shard(int class_idx, int preferred) {
uint64_t mask = atomic_load_explicit(&g_l25_pool.nonempty_mask[class_idx], memory_order_acquire);
if (!mask) return preferred;
int shift = preferred & 63;
// Guard shift==0: a 64-bit shift by 64 is undefined behaviour
uint64_t rot = (shift == 0) ? mask : ((mask >> shift) | (mask << (64 - shift)));
if (!rot) return preferred;
int off = __builtin_ctzll(rot);
return (preferred + off) & (L25_NUM_SHARDS - 1);
}
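// Worked example (illustrative): preferred=5 and only shard 9 is non-empty
// (mask = 1<<9). Rotating the mask right by 5 leaves bit 4 set, ctz returns 4,
// and (5 + 4) & 63 = 9: the nearest non-empty shard at or after the preferred
// one, wrapping around when necessary.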
// Drain remote-free MPSC stack into freelist under the shard lock
static inline void l25_drain_remote_locked(int class_idx, int shard_idx) {
uintptr_t head = atomic_exchange_explicit(&g_l25_pool.remote_head[class_idx][shard_idx], (uintptr_t)0, memory_order_acq_rel);
int drained = 0;
L25Block* list = (L25Block*)head;
if (list) {
// Tail-link pattern: find tail, then link entire chain at once (MT-safe)
L25Block* tail = list;
int count = 1;
while (tail->next) {
tail = tail->next;
count++;
}
// Single atomic write to freelist (prevents race with concurrent alloc)
tail->next = g_l25_pool.freelist[class_idx][shard_idx];
g_l25_pool.freelist[class_idx][shard_idx] = list;
drained = count;
}
if (drained) {
atomic_fetch_sub_explicit(&g_l25_pool.remote_count[class_idx][shard_idx], drained, memory_order_relaxed);
if (g_l25_pool.freelist[class_idx][shard_idx]) set_nonempty_bit(class_idx, shard_idx);
}
}
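// Example (illustrative): if remote threads pushed blocks A, B, C (C last),
// the exchange grabs the whole chain C->B->A in one shot, the walk finds the
// tail A, and A->next is spliced onto the current freelist head, so all three
// blocks join the freelist under a single lock acquisition.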
// =====================
// Bump-run TLS helpers
// =====================
static inline int l25_refill_tls_from_active(int class_idx, L25TLSRing* ring, int need) {
if (need <= 0) need = POOL_L25_RING_CAP;
L25ActiveRun* ar = &g_l25_active[class_idx];
if (!ar->base) return 0;
size_t stride = l25_stride_bytes(class_idx);
size_t avail = (size_t)((ar->end - ar->cursor) / (ptrdiff_t)stride);
if (avail == 0) { ar->base = ar->cursor = ar->end = NULL; return 0; }
int k = (int)((size_t)need < avail ? (size_t)need : avail);
int pushed = 0;
while (pushed < k && ring->top < POOL_L25_RING_CAP) {
L25Block* b = (L25Block*)ar->cursor;
ring->items[ring->top++] = b;
ar->cursor += stride;
pushed++;
}
if (ar->cursor >= ar->end) { ar->base = ar->cursor = ar->end = NULL; }
return pushed;
}
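// Example (illustrative): for Class 0 the stride is HEADER_SIZE + 64KB. If the
// active run has 10 blocks left and the ring wants 16, only 10 are pushed and
// the exhausted run is dropped (nothing is unmapped here; its pages are
// already tracked by the TLS caches and the page descriptors below).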
// Forward decl for descriptor registration
static void l25_desc_insert_range(void* base, void* end, int class_idx);
static inline int l25_alloc_new_run(int class_idx) {
int blocks = l25_blocks_per_run(class_idx);
size_t stride = l25_stride_bytes(class_idx);
size_t run_bytes = (size_t)blocks * stride;
void* raw = mmap(NULL, run_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (raw == MAP_FAILED || raw == NULL) return 0;
L25ActiveRun* ar = &g_l25_active[class_idx];
ar->base = (char*)raw;
ar->cursor = (char*)raw;
ar->end = ar->base + run_bytes;
// Register page descriptors for headerless free
l25_desc_insert_range(ar->base, ar->end, class_idx);
// Stats (best-effort)
g_l25_pool.total_bytes_allocated += run_bytes;
g_l25_pool.total_bundles_allocated += blocks;
return 1;
}
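// Example (illustrative): Class 0 with the default 32 blocks per run mmaps
// 32 x (HEADER_SIZE + 65536) bytes in a single call, then registers one
// L25PageDesc per 64KB page so headerless free can recover the class later.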
// =====================
// L2.5 Page Descriptors
// =====================
typedef struct L25PageDesc {
void* page; // 64KB-aligned page base
int class_idx; // L2.5 class index (0..4)
uintptr_t owner_tid; // Hint: owning thread (at run allocation)
struct L25PageDesc* next;
} L25PageDesc;
#define L25_DESC_BUCKETS 4096
static L25PageDesc* g_l25_desc_head[L25_DESC_BUCKETS];
static inline size_t l25_desc_hash(void* page) {
return ((((uintptr_t)page) >> 16) & (L25_DESC_BUCKETS - 1));
}
static inline void* l25_page_base(void* addr) {
return (void*)((uintptr_t)addr & ~((uintptr_t)L25_PAGE_SIZE - 1));
}
static void l25_desc_insert_range(void* base, void* end, int class_idx) {
char* p = (char*)(((uintptr_t)base) & ~((uintptr_t)L25_PAGE_SIZE - 1));
char* e = (char*)end;
uintptr_t owner = (uintptr_t)(uintptr_t)pthread_self();
for (; p < e; p += L25_PAGE_SIZE) {
size_t h = l25_desc_hash(p);
L25PageDesc* d = (L25PageDesc*)hkm_libc_malloc(sizeof(L25PageDesc)); // Phase 6.X P0 Fix
if (!d) continue; // best-effort
d->page = p;
d->class_idx = class_idx;
d->owner_tid = owner;
d->next = g_l25_desc_head[h];
g_l25_desc_head[h] = d;
}
}
static inline L25PageDesc* l25_desc_lookup_ptr(void* ptr) {
void* page = l25_page_base(ptr);
size_t h = l25_desc_hash(page);
for (L25PageDesc* d = g_l25_desc_head[h]; d; d = d->next) {
if (d->page == page) return d;
}
return NULL;
}
static inline void l25_desc_update_owner(void* ptr, uintptr_t owner) {
L25PageDesc* d = l25_desc_lookup_ptr(ptr);
if (d) d->owner_tid = owner;
}
// ------------------------------
// Owner inbound registry (per-owner MPSC stacks)
// ------------------------------
#ifndef L25_INBOUND_SLOTS
#define L25_INBOUND_SLOTS 512
#endif
typedef struct {
atomic_uintptr_t head[L25_NUM_CLASSES];
atomic_uintptr_t tid; // 0 = empty
} L25InboundSlot;
static L25InboundSlot g_l25_inbound[L25_INBOUND_SLOTS];
static inline size_t inb_hash(uintptr_t tid) {
return (size_t)((tid ^ (tid >> 17) ^ (tid << 9)));
}
static int inbound_get_slot(uintptr_t tid) {
if (tid == 0) return -1;
int limit = g_l25_in_slots;
if (limit <= 0 || limit > L25_INBOUND_SLOTS) limit = L25_INBOUND_SLOTS;
size_t h = inb_hash(tid) % (size_t)limit;
for (int i = 0; i < limit; i++) {
int idx = (int)((h + (size_t)i) % (size_t)limit);
uintptr_t cur = atomic_load_explicit(&g_l25_inbound[idx].tid, memory_order_acquire);
if (cur == tid) return idx;
if (cur == 0) {
uintptr_t zero = 0;
if (atomic_compare_exchange_weak_explicit(&g_l25_inbound[idx].tid, &zero, tid, memory_order_acq_rel, memory_order_relaxed)) {
// initialize heads lazily (they start at 0 by BSS)
return idx;
}
}
}
return -1;
}
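// Example (illustrative): a freeing thread locates the owner tid's slot by
// hashing the tid and probing linearly until it either finds the tid or claims
// an empty slot via CAS; if the table is full (-1), callers fall back to the
// transfer-cache / remote-free path instead.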
static inline void inbound_push_block(int slot, int class_idx, L25Block* b) {
if (slot < 0) return;
uintptr_t old_head;
do {
old_head = atomic_load_explicit(&g_l25_inbound[slot].head[class_idx], memory_order_acquire);
b->next = (L25Block*)old_head;
} while (!atomic_compare_exchange_weak_explicit(&g_l25_inbound[slot].head[class_idx], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed));
}
static inline int inbound_drain_to_tls(uintptr_t self_tid, int class_idx, L25TLSRing* ring) {
if (!g_l25_owner_inbound) return 0;
int slot = inbound_get_slot(self_tid);
if (slot < 0) return 0;
uintptr_t head = atomic_exchange_explicit(&g_l25_inbound[slot].head[class_idx], (uintptr_t)0, memory_order_acq_rel);
int moved = 0;
L25Block* cur = (L25Block*)head;
while (cur) {
L25Block* nxt = cur->next;
if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) { ring->items[ring->top++] = cur; }
else { cur->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = cur; g_l25_tls_bin[class_idx].lo_count++; }
moved++;
cur = nxt;
}
return moved;
}
// Exposed to hak_free_at(): headerless lookup (returns 1 on success)
int hak_l25_lookup(void* user_ptr, size_t* out_size) {
L25PageDesc* d = l25_desc_lookup_ptr(user_ptr);
if (!d) return 0;
if (out_size) *out_size = g_class_sizes[d->class_idx];
return 1;
}
// ------------------------------
// Transfer Cache (per-thread)
// ------------------------------
// Per-thread Transfer Cache as array ring (no block writes on fast-path)
#ifndef L25_TC_CAP
#define L25_TC_CAP 64
#endif
typedef struct {
L25Block* items[L25_TC_CAP];
int count; // 0..cap
} L25TCRing;
static __thread L25TCRing g_l25_tc[L25_NUM_CLASSES];
static int g_l25_tc_cap = L25_TC_CAP; // env: HAKMEM_L25_TC_CAP
static inline void l25_tc_append(int class_idx, L25Block* b) {
L25TCRing* tc = &g_l25_tc[class_idx];
if (tc->count < g_l25_tc_cap) {
tc->items[tc->count++] = b;
} else {
// overflow handled by caller via l25_tc_flush
}
}
static inline int l25_tc_flush(int class_idx, int shard_idx) {
L25TCRing* tc = &g_l25_tc[class_idx];
int n = tc->count;
if (n <= 0) return 0;
// Build a linked list from ring (LIFO order) only during flush
L25Block* head = NULL;
for (int i = 0; i < n; i++) {
L25Block* b = tc->items[i];
b->next = head;
head = b;
}
tc->count = 0;
// CRITICAL FIX: Find tail ONCE before CAS loop (prevents list corruption)
// Bug: Previous code found tail inside CAS loop, overwriting tail->next on each retry
L25Block* tail = head;
while (tail && tail->next) tail = tail->next;
// Single CAS to remote_head
uintptr_t old_head;
HKM_TIME_START(t_l25_remote_push_tc);
do {
old_head = atomic_load_explicit(&g_l25_pool.remote_head[class_idx][shard_idx], memory_order_acquire);
// Link tail to current remote_head (safe to update on each retry)
if (tail) tail->next = (L25Block*)old_head;
} while (!atomic_compare_exchange_weak_explicit(&g_l25_pool.remote_head[class_idx][shard_idx], &old_head, (uintptr_t)head, memory_order_release, memory_order_relaxed));
atomic_fetch_add_explicit(&g_l25_pool.remote_count[class_idx][shard_idx], n, memory_order_relaxed);
HKM_TIME_END(HKM_CAT_L25_REMOTE_PUSH, t_l25_remote_push_tc);
set_nonempty_bit(class_idx, shard_idx);
return n;
}
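// Example (illustrative): with the default HAKMEM_L25_TC_SPILL=32, the 32nd
// cross-thread free of a class triggers a flush: the 32 cached blocks are
// linked into a LIFO chain and published to the shard's remote_head with a
// single CAS, so the owning shard can drain them in one batch later.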
// Exposed to hak_free_at(): headerless fast free using descriptor
void hak_l25_pool_free_fast(void* user_ptr, uintptr_t site_id) {
L25PageDesc* d = l25_desc_lookup_ptr(user_ptr);
if (!d) return; // unknown → drop
int class_idx = d->class_idx;
void* raw = (char*)user_ptr - HEADER_SIZE;
// Optional: demand-zero for larger classes
if (g_l25_pool.demand_zero && class_idx >= 3) {
madvise((char*)raw, HEADER_SIZE + g_class_sizes[class_idx], MADV_DONTNEED);
}
// Same-thread hint: prefer per-block owner if header present (HDR_LIGHT>=1), else page owner
uintptr_t self = (uintptr_t)(uintptr_t)pthread_self();
uintptr_t owner_hint = d->owner_tid;
if (g_hdr_light_enabled >= 1) {
AllocHeader* hdr = (AllocHeader*)raw;
owner_hint = hdr->owner_tid;
}
if (owner_hint == self) {
L25TLSRing* ring = &g_l25_tls_bin[class_idx].ring;
if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) {
ring->items[ring->top++] = (L25Block*)raw;
} else {
L25Block* b = (L25Block*)raw;
b->next = g_l25_tls_bin[class_idx].lo_head;
g_l25_tls_bin[class_idx].lo_head = b;
g_l25_tls_bin[class_idx].lo_count++;
}
} else {
// Remote: push to per-thread TC; spill in batch when threshold reached
int shard = hak_l25_pool_get_shard_index(site_id);
L25Block* block = (L25Block*)raw;
l25_tc_append(class_idx, block);
if (g_l25_tc_spill > 0 && g_l25_tc[class_idx].count >= g_l25_tc_spill) {
l25_tc_flush(class_idx, shard);
}
}
g_l25_pool.frees[class_idx]++;
}
// =====================
// BG Drain (remote → freelist)
// =====================
static void* l25_bg_main(void* arg) {
(void)arg;
struct timespec ts; ts.tv_sec = g_l25_bg_interval_ms / 1000; ts.tv_nsec = (long)(g_l25_bg_interval_ms % 1000) * 1000000L;
while (g_l25_pool.initialized) {
for (int c = 0; c < L25_NUM_CLASSES; c++) {
for (int s = 0; s < L25_NUM_SHARDS; s++) {
if (atomic_load_explicit(&g_l25_pool.remote_count[c][s], memory_order_relaxed) != 0) {
pthread_mutex_t* l = &g_l25_pool.freelist_locks[c][s].m;
pthread_mutex_lock(l);
if (atomic_load_explicit(&g_l25_pool.remote_count[c][s], memory_order_relaxed) != 0) {
l25_drain_remote_locked(c, s);
}
pthread_mutex_unlock(l);
}
}
}
nanosleep(&ts, NULL);
}
return NULL;
}
// ===========================================================================
// Page Bundle Management (L2 Pool pattern)
// ===========================================================================
// Refill freelist by allocating a new page bundle
// Args: class_idx - size class index (0-4)
// shard_idx - shard index (0-63)
// Returns: 1 on success, 0 on failure
//
// Pattern: Same as L2 Pool - allocate raw memory, write header, return to freelist
static int refill_freelist(int class_idx, int shard_idx) {
if (class_idx < 0 || class_idx >= L25_NUM_CLASSES) return 0;
if (shard_idx < 0 || shard_idx >= L25_NUM_SHARDS) return 0;
size_t user_size = g_class_sizes[class_idx];
size_t bundle_size = HEADER_SIZE + user_size; // Header + user data
// Soft CAP guidance: decide how many bundles to allocate (1..2)
int bundles = 1;
const FrozenPolicy* pol = hkm_policy_get();
static int g_l25_min_bundle = -1; // lazy init from env
if (g_l25_min_bundle < 0) {
const char* e = getenv("HAKMEM_L25_MIN_BUNDLE");
int v = (e ? atoi(e) : 1);
if (v < 1) v = 1; if (v > 2) v = 2; // L2.5 is large; keep conservative
g_l25_min_bundle = v;
}
if (pol) {
uint16_t cap = pol->large_cap[class_idx];
if (cap > 0) {
uint64_t have = g_l25_pool.bundles_by_class[class_idx];
if (have >= cap) {
bundles = 1; // over cap: allocate minimally
} else {
uint64_t deficit = cap - have;
bundles = (deficit >= (uint64_t)g_l25_min_bundle) ? g_l25_min_bundle : 1;
if (bundles > 2) bundles = 2;
}
}
}
int ok_any = 0;
for (int b = 0; b < bundles; b++) {
// Allocate bundle via mmap to avoid malloc contention and allow THP policy later
void* raw = mmap(NULL, bundle_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (raw == MAP_FAILED || raw == NULL) {
if (ok_any) break; else return 0;
}
// Write AllocHeader at start
AllocHeader* hdr = (AllocHeader*)raw;
hdr->magic = HAKMEM_MAGIC;
hdr->method = ALLOC_METHOD_L25_POOL;
hdr->size = user_size;
hdr->alloc_site = 0; // Set by hak_l25_pool_try_alloc
hdr->class_bytes = 0; // L2.5 blocks not cacheable
// Freelist uses raw pointer (header start)
L25Block* block = (L25Block*)raw;
block->next = g_l25_pool.freelist[class_idx][shard_idx];
g_l25_pool.freelist[class_idx][shard_idx] = block;
ok_any = 1;
// Set non-empty bit (freelist now has blocks)
set_nonempty_bit(class_idx, shard_idx);
// Update statistics
g_l25_pool.refills[class_idx]++;
g_l25_pool.total_bundles_allocated++;
g_l25_pool.total_bytes_allocated += bundle_size;
g_l25_pool.bundles_by_class[class_idx]++;
}
return ok_any ? 1 : 0;
}
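// Example (illustrative): if the frozen policy caps Class 0 at 8 bundles, the
// pool currently holds 3, and HAKMEM_L25_MIN_BUNDLE=2, the deficit (5) lets
// the refill mmap 2 bundles in one call; at or above the cap it falls back to
// a single bundle per refill.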
// ===========================================================================
// Public API
// ===========================================================================
void hak_l25_pool_init(void) {
if (g_l25_pool.initialized) return;
memset(&g_l25_pool, 0, sizeof(g_l25_pool));
for (int c = 0; c < L25_NUM_CLASSES; c++) {
atomic_store(&g_l25_pool.nonempty_mask[c], 0);
for (int s = 0; s < L25_NUM_SHARDS; s++) {
pthread_mutex_init(&g_l25_pool.freelist_locks[c][s].m, NULL);
atomic_store(&g_l25_pool.remote_head[c][s], (uintptr_t)0);
atomic_store(&g_l25_pool.remote_count[c][s], 0);
}
g_l25_pool.bundles_by_class[c] = 0;
}
// Demand-zero toggle
char* dz = getenv("HAKMEM_L25_DZ");
g_l25_pool.demand_zero = (dz && atoi(dz) != 0) ? 1 : 0;
const char* e_wrap = getenv("HAKMEM_WRAP_L25");
g_wrap_l25_enabled = (e_wrap && atoi(e_wrap) != 0) ? 1 : 0;
const char* e_ring = getenv("HAKMEM_POOL_TLS_RING");
if (e_ring) g_l25_tls_ring_enabled = (atoi(e_ring) != 0);
const char* e_probe = getenv("HAKMEM_TRYLOCK_PROBES");
if (e_probe) { int v = atoi(e_probe); if (v>=1 && v<=8) g_l25_trylock_probes = v; }
const char* e_lo = getenv("HAKMEM_TLS_LO_MAX");
if (e_lo) { int v = atoi(e_lo); if (v>=32 && v<=16384) g_l25_tls_lo_max = v; }
const char* e_div = getenv("HAKMEM_RING_RETURN_DIV");
if (e_div) { int v = atoi(e_div); if (v>=2 && v<=4) g_l25_ring_return_div = v; }
const char* e_run = getenv("HAKMEM_L25_RUN_BLOCKS");
if (e_run) { int v = atoi(e_run); if (v>=1 && v<=1024) g_l25_run_blocks_override = v; }
const char* e_rfac = getenv("HAKMEM_L25_RUN_FACTOR");
if (e_rfac) { int v = atoi(e_rfac); if (v>=1 && v<=8) g_l25_run_factor = v; }
const char* e_mix = getenv("HAKMEM_SHARD_MIX");
if (e_mix) g_l25_shard_mix = (atoi(e_mix) != 0);
const char* e_pref = getenv("HAKMEM_L25_PREF");
if (e_pref) {
if (strcmp(e_pref, "remote") == 0) g_l25_pref_remote_first = 1;
else if (strcmp(e_pref, "run") == 0) g_l25_pref_remote_first = 0;
}
const char* e_tc = getenv("HAKMEM_L25_TC_SPILL");
if (e_tc) { int v = atoi(e_tc); if (v>=0 && v<=4096) g_l25_tc_spill = v; }
const char* e_tcc = getenv("HAKMEM_L25_TC_CAP");
if (e_tcc) { int v = atoi(e_tcc); if (v>=8 && v<=L25_TC_CAP) g_l25_tc_cap = v; }
const char* e_bg = getenv("HAKMEM_L25_BG_DRAIN");
if (e_bg) g_l25_bg_drain_enabled = (atoi(e_bg) != 0);
const char* e_bgms = getenv("HAKMEM_L25_BG_MS");
if (e_bgms) { int v = atoi(e_bgms); if (v>=1 && v<=1000) g_l25_bg_interval_ms = v; }
const char* e_rtr = getenv("HAKMEM_L25_RING_TRIGGER");
if (e_rtr) { int v = atoi(e_rtr); if (v>=0 && v<=POOL_L25_RING_CAP) g_l25_ring_trigger = v; }
const char* e_inb = getenv("HAKMEM_L25_OWNER_INBOUND");
if (e_inb) g_l25_owner_inbound = (atoi(e_inb) != 0);
const char* e_ins = getenv("HAKMEM_L25_INBOUND_SLOTS");
if (e_ins) { int v = atoi(e_ins); if (v>=64 && v<=L25_INBOUND_SLOTS) g_l25_in_slots = v; }
// Safe-mode: disable aggressive remote/inbound features by default.
// Set HAKMEM_L25_REMOTE_SAFE=0 to re-enable legacy behaviour.
const char* e_safe = getenv("HAKMEM_L25_REMOTE_SAFE");
int safe_mode = 1;
if (e_safe && atoi(e_safe) == 0) safe_mode = 0;
if (safe_mode) {
g_l25_owner_inbound = 0;
g_l25_pref_remote_first = 0;
g_l25_trylock_probes = 0;
g_l25_bg_drain_enabled = 0;
g_l25_bg_remote_enable = 0;
g_l25_probe_auto = 0;
}
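// Note: safe mode (the default) force-disables owner-inbound, remote-first
// preference, trylock probes and the BG drain thread even if their individual
// env vars were set above. For example, actually running the drain thread
// requires both HAKMEM_L25_BG_DRAIN=1 and HAKMEM_L25_REMOTE_SAFE=0.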
// init inbound table tid=0
for (int i = 0; i < g_l25_in_slots && i < L25_INBOUND_SLOTS; i++) {
atomic_store(&g_l25_inbound[i].tid, (uintptr_t)0);
for (int c = 0; c < L25_NUM_CLASSES; c++) atomic_store(&g_l25_inbound[i].head[c], (uintptr_t)0);
}
g_l25_pool.initialized = 1;
HAKMEM_LOG("[L2.5] Initialized (LargePool)\n");
HAKMEM_LOG("[L2.5] Classes: 64KB, 128KB, 256KB, 512KB, 1MB\n");
HAKMEM_LOG("[L2.5] Page size: %d KB\n", L25_PAGE_SIZE / 1024);
HAKMEM_LOG("[L2.5] Shards: %d (site-based)\n", L25_NUM_SHARDS);
if (g_l25_bg_drain_enabled) {
pthread_create(&g_l25_bg_thread, NULL, l25_bg_main, NULL);
HAKMEM_LOG("[L2.5] BG drain enabled (interval=%d ms)\n", g_l25_bg_interval_ms);
}
}
void hak_l25_pool_shutdown(void) {
if (!g_l25_pool.initialized) return;
hak_l25_pool_print_stats();
// Free all blocks (L2 Pool pattern: just free raw pointers)
for (int class_idx = 0; class_idx < L25_NUM_CLASSES; class_idx++) {
for (int shard_idx = 0; shard_idx < L25_NUM_SHARDS; shard_idx++) {
L25Block* block = g_l25_pool.freelist[class_idx][shard_idx];
while (block) {
L25Block* next = block->next;
// Blocks were obtained via mmap (refill_freelist / bump runs), not malloc,
// so release them with munmap (best-effort: run-carved blocks may be rejected
// if not page-aligned, in which case they are simply leaked at shutdown).
munmap(block, HEADER_SIZE + g_class_sizes[class_idx]);
block = next;
}
}
}
g_l25_pool.initialized = 0;
}
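// Allocation path overview (descriptive summary of hak_l25_pool_try_alloc):
//   0. Optional owner-inbound drain into TLS when the ring runs low
//   1. TLS fast path: ring pop, then local LIFO pop (no locks)
//   2. HAKMEM_L25_PREF order: either trylock-probe shards to drain remote frees
//      into the TLS ring first ("remote"), or go straight to the bump-run
//      ActiveRun ("run"), mmapping a new run when the current one is exhausted
//   3. Global freelist slow path: trylock batch-steal, then lock the home shard,
//      drain remote frees, steal from neighbour shards or refill via mmap, and
//      batch-pop the shard's freelist into the TLS ring/LIFO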
void* hak_l25_pool_try_alloc(size_t size, uintptr_t site_id) {
if (!g_l25_pool.initialized) hak_l25_pool_init();
// P1.7 approach: Avoid using L2.5 during ALL wrapper calls (conservative but safe)
extern int hak_in_wrapper(void);
if (hak_in_wrapper() && !g_wrap_l25_enabled) return NULL;
if (!hak_l25_pool_is_poolable(size)) return NULL;
// Get class index (inlined O(1) lookup)
int class_idx = hak_l25_pool_get_class_index(size);
if (class_idx < 0) return NULL;
// Inbound drain (owner inbound → TLS) when ring low
if (g_l25_owner_inbound && g_l25_tls_ring_enabled && (&g_l25_tls_bin[class_idx].ring)->top <= g_l25_ring_trigger) {
inbound_drain_to_tls((uintptr_t)(uintptr_t)pthread_self(), class_idx, &g_l25_tls_bin[class_idx].ring);
}
// TLS two-tier fast path
L25TLSRing* ring = &g_l25_tls_bin[class_idx].ring;
if (g_l25_tls_ring_enabled && ring->top > 0) {
HKM_TIME_START(t_l25_ring_pop0);
L25Block* tlsb = ring->items[--ring->top];
HKM_TIME_END(HKM_CAT_L25_TLS_RING_POP, t_l25_ring_pop0);
void* raw = (void*)tlsb;
AllocHeader* hdr = (AllocHeader*)raw;
l25_write_header(hdr, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
L25Block* block = g_l25_tls_bin[class_idx].lo_head;
if (block) {
g_l25_tls_bin[class_idx].lo_head = block->next;
if (g_l25_tls_bin[class_idx].lo_count) g_l25_tls_bin[class_idx].lo_count--;
void* raw = (void*)block; AllocHeader* hdr = (AllocHeader*)raw;
l25_write_header(hdr, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
if (!block) {
// TLS cache empty: choose order by preference (remote-first or run-first)
if (g_l25_pref_remote_first) {
// Remote-first: only if ring below trigger and remote likely non-empty
int shard_idx = hak_l25_pool_get_shard_index(site_id);
if (g_l25_tls_ring_enabled && ring->top <= g_l25_ring_trigger) {
// prefetch remote head
__builtin_prefetch((const void*)&g_l25_pool.remote_head[class_idx][shard_idx], 0, 1);
int s0 = l25_choose_nonempty_shard(class_idx, shard_idx);
for (int probe = 0; probe < g_l25_trylock_probes; ++probe) {
int s = (s0 + probe) & (L25_NUM_SHARDS - 1);
pthread_mutex_t* l = &g_l25_pool.freelist_locks[class_idx][s].m;
if (pthread_mutex_trylock(l) == 0) {
if (atomic_load_explicit(&g_l25_pool.remote_count[class_idx][s], memory_order_relaxed) != 0) {
l25_drain_remote_locked(class_idx, s);
}
L25Block* head = g_l25_pool.freelist[class_idx][s];
int to_ring = POOL_L25_RING_CAP - ring->top; if (to_ring < 0) to_ring = 0;
while (head && to_ring-- > 0) { L25Block* nxt = head->next; ring->items[ring->top++] = head; head = nxt; }
while (head) { L25Block* nxt = head->next; head->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = head; g_l25_tls_bin[class_idx].lo_count++; head = nxt; }
g_l25_pool.freelist[class_idx][s] = head;
if (!head) clear_nonempty_bit(class_idx, s);
pthread_mutex_unlock(l);
if (ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* rawA = (void*)tlsb; AllocHeader* hdrA = (AllocHeader*)rawA;
l25_write_header(hdrA, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)rawA + HEADER_SIZE;
}
}
}
}
// Fall back to bump-run ActiveRun
if (g_l25_tls_ring_enabled) {
HKM_TIME_START(t_l25_alloc_page0);
int need = POOL_L25_RING_CAP - ring->top; if (need < 1) need = POOL_L25_RING_CAP;
int pushed = l25_refill_tls_from_active(class_idx, ring, need);
if (pushed == 0) {
if (l25_alloc_new_run(class_idx)) {
pushed = l25_refill_tls_from_active(class_idx, ring, need);
}
}
HKM_TIME_END(HKM_CAT_L25_ALLOC_TLS_PAGE, t_l25_alloc_page0);
if (g_l25_tls_ring_enabled && ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* raw0 = (void*)tlsb; AllocHeader* hdr0 = (AllocHeader*)raw0;
l25_write_header(hdr0, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw0 + HEADER_SIZE;
}
}
} else {
// Run-first (previous behavior)
if (g_l25_tls_ring_enabled) {
HKM_TIME_START(t_l25_alloc_page0);
int need = POOL_L25_RING_CAP - ring->top; if (need < 1) need = POOL_L25_RING_CAP;
int pushed = l25_refill_tls_from_active(class_idx, ring, need);
if (pushed == 0) {
if (l25_alloc_new_run(class_idx)) {
pushed = l25_refill_tls_from_active(class_idx, ring, need);
}
}
HKM_TIME_END(HKM_CAT_L25_ALLOC_TLS_PAGE, t_l25_alloc_page0);
if (g_l25_tls_ring_enabled && ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* raw0 = (void*)tlsb; AllocHeader* hdr0 = (AllocHeader*)raw0;
l25_write_header(hdr0, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw0 + HEADER_SIZE;
}
}
}
// TLS cache still empty, refill from global freelist (slow path)
int shard_idx = hak_l25_pool_get_shard_index(site_id);
// Try batch-steal via trylock to fill TLS ring; drain remote under lock
if (g_l25_tls_ring_enabled) {
int s0 = l25_choose_nonempty_shard(class_idx, shard_idx);
for (int probe = 0; probe < g_l25_trylock_probes; ++probe) {
int s = (s0 + probe) & (L25_NUM_SHARDS - 1);
pthread_mutex_t* l = &g_l25_pool.freelist_locks[class_idx][s].m;
if (pthread_mutex_trylock(l) == 0) {
if (atomic_load_explicit(&g_l25_pool.remote_count[class_idx][s], memory_order_relaxed) != 0) {
l25_drain_remote_locked(class_idx, s);
}
L25Block* head = g_l25_pool.freelist[class_idx][s];
int to_ring = POOL_L25_RING_CAP - ring->top; if (to_ring < 0) to_ring = 0;
while (head && to_ring-- > 0) { L25Block* nxt = head->next; ring->items[ring->top++] = head; head = nxt; }
while (head) { L25Block* nxt = head->next; head->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = head; g_l25_tls_bin[class_idx].lo_count++; head = nxt; }
g_l25_pool.freelist[class_idx][s] = head;
if (!head) clear_nonempty_bit(class_idx, s);
pthread_mutex_unlock(l);
if (ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* raw = (void*)tlsb; AllocHeader* hdr = (AllocHeader*)raw;
memcpy(hdr, &g_header_templates[class_idx], sizeof(AllocHeader));
if (!g_hdr_light_enabled) { hdr->alloc_site = site_id; hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self(); }
g_l25_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
}
}
}
// Try to pop from global freelist (lock shard)
pthread_mutex_t* lock = &g_l25_pool.freelist_locks[class_idx][shard_idx].m;
struct timespec ts_lk1; int lk1 = hkm_prof_begin(&ts_lk1);
HKM_TIME_START(t_l25_lock);
pthread_mutex_lock(lock);
HKM_TIME_END(HKM_CAT_L25_LOCK, t_l25_lock);
hkm_prof_end(lk1, HKP_L25_LOCK, &ts_lk1);
if (atomic_load_explicit(&g_l25_pool.remote_count[class_idx][shard_idx], memory_order_relaxed) != 0) {
l25_drain_remote_locked(class_idx, shard_idx);
}
block = g_l25_pool.freelist[class_idx][shard_idx];
if (!block) {
// Try simple shard steal if over Soft CAP (avoid over-refill)
int stole = 0;
const FrozenPolicy* pol = hkm_policy_get();
if (pol) {
uint16_t cap = pol->large_cap[class_idx];
if (cap > 0 && g_l25_pool.bundles_by_class[class_idx] >= cap) {
// probe ±1..2 neighboring shards
for (int d = 1; d <= 2 && !stole; d++) {
int s1 = (shard_idx + d) & (L25_NUM_SHARDS - 1);
int s2 = (shard_idx - d) & (L25_NUM_SHARDS - 1);
if (is_shard_nonempty(class_idx, s1)) {
pthread_mutex_t* l2 = &g_l25_pool.freelist_locks[class_idx][s1].m;
pthread_mutex_lock(l2);
L25Block* b2 = g_l25_pool.freelist[class_idx][s1];
if (b2) {
g_l25_pool.freelist[class_idx][s1] = b2->next;
if (!g_l25_pool.freelist[class_idx][s1]) clear_nonempty_bit(class_idx, s1);
block = b2;
stole = 1;
}
pthread_mutex_unlock(l2);
}
if (!stole && is_shard_nonempty(class_idx, s2)) {
pthread_mutex_t* l3 = &g_l25_pool.freelist_locks[class_idx][s2].m;
pthread_mutex_lock(l3);
L25Block* b3 = g_l25_pool.freelist[class_idx][s2];
if (b3) {
g_l25_pool.freelist[class_idx][s2] = b3->next;
if (!g_l25_pool.freelist[class_idx][s2]) clear_nonempty_bit(class_idx, s2);
block = b3;
stole = 1;
}
pthread_mutex_unlock(l3);
}
}
}
}
if (!stole && !block) {
// Global freelist empty or no steal, allocate new bundle
{
struct timespec ts_rf; int rf = hkm_prof_begin(&ts_rf);
HKM_TIME_START(t_l25_refill);
int ok = refill_freelist(class_idx, shard_idx);
HKM_TIME_END(HKM_CAT_L25_REFILL, t_l25_refill);
hkm_prof_end(rf, HKP_L25_REFILL, &ts_rf);
if (!ok) {
g_l25_pool.misses[class_idx]++;
pthread_mutex_unlock(lock);
return NULL; // Out of memory
}
}
// Try again after refill
block = g_l25_pool.freelist[class_idx][shard_idx];
if (!block) {
g_l25_pool.misses[class_idx]++;
pthread_mutex_unlock(lock);
return NULL; // Refill failed
}
}
}
// Batch-pop under lock: move many blocks to TLS (ring first, then LIFO)
L25Block* head2 = g_l25_pool.freelist[class_idx][shard_idx];
if (head2) {
int to_ring2 = POOL_L25_RING_CAP - ring->top; if (to_ring2 < 0) to_ring2 = 0;
L25Block* h = head2;
// Fill ring
while (h && to_ring2-- > 0) {
L25Block* nxt = h->next;
// update owner for same-thread hint
l25_desc_update_owner((void*)h, (uintptr_t)(uintptr_t)pthread_self());
ring->items[ring->top++] = h;
h = nxt;
}
// Fill local LIFO
while (h) { L25Block* nxt = h->next; h->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = h; g_l25_tls_bin[class_idx].lo_count++; h = nxt; }
// Shard freelist becomes empty after batch-pop
g_l25_pool.freelist[class_idx][shard_idx] = NULL;
clear_nonempty_bit(class_idx, shard_idx);
}
pthread_mutex_unlock(lock);
// Fast return if ring gained items
if (g_l25_tls_ring_enabled && ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* raw2 = (void*)tlsb; AllocHeader* hdr2 = (AllocHeader*)raw2;
memcpy(hdr2, &g_header_templates[class_idx], sizeof(AllocHeader));
if (!g_hdr_light_enabled) { hdr2->alloc_site = site_id; hdr2->owner_tid = (uintptr_t)(uintptr_t)pthread_self(); }
g_l25_pool.hits[class_idx]++;
return (char*)raw2 + HEADER_SIZE;
}
// Or pop from local LIFO if available
if (g_l25_tls_bin[class_idx].lo_head) {
L25Block* b2 = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = b2->next; if (g_l25_tls_bin[class_idx].lo_count) g_l25_tls_bin[class_idx].lo_count--;
void* raw3 = (void*)b2; AllocHeader* hdr3 = (AllocHeader*)raw3;
l25_write_header(hdr3, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw3 + HEADER_SIZE;
}
}
// Push to TLS and return one
if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) { ring->items[ring->top++] = block; }
else { block->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = block; g_l25_tls_bin[class_idx].lo_count++; }
L25Block* take;
if (g_l25_tls_ring_enabled && ring->top > 0) { HKM_TIME_START(t_l25_ring_pop1); take = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_L25_TLS_RING_POP, t_l25_ring_pop1); }
else { HKM_TIME_START(t_l25_lifo_pop0); take = g_l25_tls_bin[class_idx].lo_head; if (take) { g_l25_tls_bin[class_idx].lo_head = take->next; if (g_l25_tls_bin[class_idx].lo_count) g_l25_tls_bin[class_idx].lo_count--; } HKM_TIME_END(HKM_CAT_L25_TLS_LIFO_POP, t_l25_lifo_pop0); }
void* raw = (void*)take; AllocHeader* hdr = (AllocHeader*)raw; l25_write_header(hdr, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
void hak_l25_pool_free(void* ptr, size_t size, uintptr_t site_id) {
if (!ptr) return;
if (!g_l25_pool.initialized) return;
if (!hak_l25_pool_is_poolable(size)) return;
// ptr is user pointer, get raw pointer (header start) - L2 Pool pattern
void* raw = (char*)ptr - HEADER_SIZE;
// Validate header
AllocHeader* hdr = (AllocHeader*)raw;
if (hdr->magic != HAKMEM_MAGIC) {
extern int g_invalid_free_log; // from hakmem.c
if (g_invalid_free_log) {
fprintf(stderr, "[L2.5] ERROR: Invalid magic 0x%X in l25_pool_free, expected 0x%X\n",
hdr->magic, HAKMEM_MAGIC);
}
return; // Skip free (corruption detected)
}
if (hdr->method != ALLOC_METHOD_L25_POOL) {
extern int g_invalid_free_log; // from hakmem.c
if (g_invalid_free_log) {
fprintf(stderr, "[L2.5] ERROR: Wrong method %d in l25_pool_free, expected L25_POOL\n",
hdr->method);
}
return; // Skip free (not an L2.5 allocation)
}
// Get class index
int class_idx = hak_l25_pool_get_class_index(size);
if (class_idx < 0) return;
// Optional: demand-zero large classes (512KB/1MB) to reduce future soft-fault cost
if (g_l25_pool.demand_zero && class_idx >= 3) {
madvise((char*)raw, HEADER_SIZE + size, MADV_DONTNEED);
}
// Same-thread hint via header owner (if light header present)
uintptr_t self = (uintptr_t)(uintptr_t)pthread_self();
if (g_hdr_light_enabled >= 1 && hdr->owner_tid == self) {
L25TLSRing* ring = &g_l25_tls_bin[class_idx].ring;
if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) { ring->items[ring->top++] = (L25Block*)raw; }
else { L25Block* b = (L25Block*)raw; b->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = b; g_l25_tls_bin[class_idx].lo_count++; }
} else {
// Cross-thread path: owner inbound or TC
uintptr_t owner = 0;
if (g_hdr_light_enabled >= 1) owner = hdr->owner_tid;
if (g_l25_owner_inbound && owner != 0) {
int slot = inbound_get_slot(owner);
if (slot >= 0) {
inbound_push_block(slot, class_idx, (L25Block*)raw);
} else {
int shard = hak_l25_pool_get_shard_index(site_id);
L25Block* block = (L25Block*)raw;
l25_tc_append(class_idx, block);
if (g_l25_tc_spill > 0 && g_l25_tc[class_idx].count >= g_l25_tc_spill) {
l25_tc_flush(class_idx, shard);
}
}
} else {
int shard = hak_l25_pool_get_shard_index(site_id);
L25Block* block = (L25Block*)raw;
l25_tc_append(class_idx, block);
if (g_l25_tc_spill > 0 && g_l25_tc[class_idx].count >= g_l25_tc_spill) {
l25_tc_flush(class_idx, shard);
}
}
}
g_l25_pool.frees[class_idx]++;
}
// ---------------------------------------------------------------------------
// Runtime tuning setters (exposed via HAKX tuner)
// ---------------------------------------------------------------------------
void hak_l25_set_run_factor(int v) {
if (v >= 1 && v <= 8) g_l25_run_factor = v;
}
void hak_l25_set_remote_threshold(int v) {
if (v >= 1 && v <= 4096) g_l25_remote_threshold = v;
}
void hak_l25_set_bg_remote_batch(int v) {
if (v >= 1 && v <= 4096) g_l25_bg_remote_batch = v;
}
void hak_l25_set_bg_remote_enable(int on) {
g_l25_bg_remote_enable = (on != 0);
}
void hak_l25_set_pref_remote_first(int remote_first) {
g_l25_pref_remote_first = (remote_first != 0);
}
void hak_l25_pool_print_stats(void) {
if (!g_l25_pool.initialized) return;
printf("\n");
printf("========================================\n");
printf("L2.5 Pool Statistics (LargePool)\n");
printf("========================================\n");
const char* class_names[L25_NUM_CLASSES] = {
"64KB", "128KB", "256KB", "512KB", "1MB"
};
for (int i = 0; i < L25_NUM_CLASSES; i++) {
uint64_t total = g_l25_pool.hits[i] + g_l25_pool.misses[i];
double hit_rate = (total > 0) ? (100.0 * g_l25_pool.hits[i] / total) : 0.0;
printf("Class %-6s: hits=%7lu misses=%7lu refills=%7lu frees=%7lu (%.1f%% hit)\n",
class_names[i],
(unsigned long)g_l25_pool.hits[i],
(unsigned long)g_l25_pool.misses[i],
(unsigned long)g_l25_pool.refills[i],
(unsigned long)g_l25_pool.frees[i],
hit_rate);
}
printf("----------------------------------------\n");
printf("Total bytes allocated: %lu MB\n",
(unsigned long)(g_l25_pool.total_bytes_allocated / (1024 * 1024)));
printf("Total bundles allocated: %lu\n",
(unsigned long)g_l25_pool.total_bundles_allocated);
printf("========================================\n");
}
void hak_l25_pool_stats_snapshot(uint64_t hits[], uint64_t misses[], uint64_t refills[], uint64_t frees[]) {
if (!g_l25_pool.initialized) {
for (int i = 0; i < L25_NUM_CLASSES; i++) {
if (hits) hits[i] = 0;
if (misses) misses[i] = 0;
if (refills) refills[i] = 0;
if (frees) frees[i] = 0;
}
return;
}
for (int i = 0; i < L25_NUM_CLASSES; i++) {
if (hits) hits[i] = g_l25_pool.hits[i];
if (misses) misses[i] = g_l25_pool.misses[i];
if (refills) refills[i] = g_l25_pool.refills[i];
if (frees) frees[i] = g_l25_pool.frees[i];
}
}