// ============================================================================
// hakmem_l25_pool.c - L2.5 LargePool Implementation (64KB-1MB)
// ============================================================================
//
// Size class table:
// ┌──────────┬─────────┬──────────────┬─────────────┐
// │ Class    │ Size    │ Initial CAP  │ Page layout │
// ├──────────┼─────────┼──────────────┼─────────────┤
// │ Class 0  │ 64 KB   │ 8 bundles    │ 1 page/b    │
// │ Class 1  │ 128 KB  │ 8 bundles    │ 2 pages/b   │
// │ Class 2  │ 256 KB  │ 4 bundles    │ 4 pages/b   │
// │ Class 3  │ 512 KB  │ 2 bundles    │ 8 pages/b   │
// │ Class 4  │ 1 MB    │ 1 bundle     │ 16 pages/b  │
// └──────────┴─────────┴──────────────┴─────────────┘
//
// W_MAX_LARGE (round-up tolerance factor):
// - Meaning: how far above the requested size a class may be and still be used
// - Default: 1.30 (up to 30% round-up) - conservative
// - Recommended: 1.60 (up to 60% round-up) - closes the size gap
// - Example: a 40 KB request may use the 64 KB class (64/40 = 1.6, within 1.60)
// - Environment variable: HAKMEM_WMAX_LARGE=1.6
//
// Important: covering the 32-64 KB gap
// - Requests above 32 KB cannot be served by the L2 Mid Pool
// - With W_MAX_LARGE=1.30 a 32 KB request is not rounded up to 64 KB (2.0x > 1.30)
// - Relaxing to W_MAX_LARGE=1.60 lets requests of 40 KB and up use the 64 KB class
// - This closes part of the 32-64 KB gap
//
// CAP (inventory):
// - Meaning: maximum number of bundles kept per class
// - Initial: {8,8,4,2,1} - conservative (footprint first)
// - Recommended: {32,32,16,8,4} - performance first (4x)
// - Environment variable: HAKMEM_CAP_LARGE=32,32,16,8,4
//
// TLS structure:
// - Ring: POOL_L25_RING_CAP (default 16)
// - ActiveRun: bump-run (blocks carved from a contiguous region)
// - LIFO overflow: blocks that do not fit in the ring
// - Remote-free: MPSC queue (cross-thread free handling)
//
// Performance tuning (see the setenv sketch below):
// 1. ⭐⭐⭐ Relax W_MAX_LARGE:        HAKMEM_WMAX_LARGE=1.6
// 2. ⭐⭐   Quadruple initial CAP:   HAKMEM_CAP_LARGE=32,32,16,8,4
// 3.        Enable BG drain:        HAKMEM_L25_BG_DRAIN=1
//
// License: MIT
// Date: 2025-10-24 (Phase 6.x - Operation Squeaky-Clean)

#include "hakmem_l25_pool.h"
#include "hakmem_config.h"
#include "hakmem_internal.h"             // For AllocHeader and HAKMEM_MAGIC
#include "box/ss_os_acquire_box.h"
#include "hakmem_syscall.h"              // Phase 6.X P0 Fix: Box 3 syscall layer (bypasses LD_PRELOAD)
#include "box/pagefault_telemetry_box.h" // Box PageFaultTelemetry (PF_BUCKET_L25)
#include "page_arena.h"                  // Phase 24: PageArena integration for L25

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <stdatomic.h>
#include "hakmem_prof.h"
#include "hakmem_debug.h"
#include "hakmem_policy.h"               // FrozenPolicy caps (Soft CAP guidance)
#include <sys/mman.h>
#include <time.h>

// False sharing mitigation: padded mutex type (64B)
typedef struct {
    pthread_mutex_t m;
    char _pad[64 - (sizeof(pthread_mutex_t) % 64)];
} PaddedMutex;

// ===========================================================================
// Internal Data Structures
// ===========================================================================

// Freelist node header (embedded in allocated bundle, same as L2 Pool pattern)
typedef struct L25Block {
    struct L25Block* next;  // Next block in freelist
} L25Block;

// Phase 6.17: TLS two-tier cache (ring + local LIFO)
#ifndef POOL_L25_RING_CAP
#define POOL_L25_RING_CAP 16
#endif

typedef struct {
    L25Block* items[POOL_L25_RING_CAP];
    int top;
} L25TLSRing;

typedef struct {
    L25TLSRing ring;
    L25Block* lo_head;
    size_t lo_count;
} L25TLSBin;

static __thread L25TLSBin g_l25_tls_bin[L25_NUM_CLASSES];
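// Illustrative sketch (guarded out of the build): one way a host process could
// apply the tuning knobs documented in the header above before the pool is
// first used. It assumes HAKMEM_WMAX_LARGE / HAKMEM_CAP_LARGE are parsed by the
// config layer (they are documented here but read outside this file); only
// HAKMEM_L25_BG_DRAIN is consumed by hak_l25_pool_init() in this file.
#if 0
#include <stdlib.h>
static void l25_apply_recommended_tuning_example(void)
{
    setenv("HAKMEM_WMAX_LARGE", "1.6", 1);          // relax round-up tolerance
    setenv("HAKMEM_CAP_LARGE", "32,32,16,8,4", 1);  // 4x initial CAP
    setenv("HAKMEM_L25_BG_DRAIN", "1", 1);          // background remote drain
    hak_l25_pool_init();                            // env is read at first init
}
#endif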
// TLS ActiveRun (bump-run). We mmap a run that holds multiple class-sized blocks
// and hand out addresses by simple pointer arithmetic (no per-block linking).
typedef struct {
    char* base;    // start of run (raw, header at base)
    char* cursor;  // next header address to serve
    char* end;     // end of run (exclusive)
} L25ActiveRun;

static __thread L25ActiveRun g_l25_active[L25_NUM_CLASSES];

// Global L2.5 pool state (simplified: single-threaded for MVP)
static struct {
    L25Block* freelist[L25_NUM_CLASSES][L25_NUM_SHARDS];
    // Fine-grained locks per (class, shard) freelist (padded)
    PaddedMutex freelist_locks[L25_NUM_CLASSES][L25_NUM_SHARDS];

    // Phase 6.10.1 pattern: non-empty bitmap (O(1) empty class skip)
    // Use atomic bit operations to avoid class-wide locks
    atomic_uint_fast64_t nonempty_mask[L25_NUM_CLASSES];  // 1 bit per shard

    // Statistics
    uint64_t hits[L25_NUM_CLASSES]    __attribute__((aligned(64)));
    uint64_t misses[L25_NUM_CLASSES]  __attribute__((aligned(64)));
    uint64_t refills[L25_NUM_CLASSES] __attribute__((aligned(64)));
    uint64_t frees[L25_NUM_CLASSES]   __attribute__((aligned(64)));
    uint64_t total_bytes_allocated    __attribute__((aligned(64)));
    uint64_t total_bundles_allocated  __attribute__((aligned(64)));

    // Per-class bundle accounting (for Soft CAP guidance)
    uint64_t bundles_by_class[L25_NUM_CLASSES] __attribute__((aligned(64)));

    int initialized;
    int demand_zero;  // env: HAKMEM_L25_DZ=1

    // Remote-free MPSC stacks per (class, shard)
    atomic_uintptr_t remote_head[L25_NUM_CLASSES][L25_NUM_SHARDS];
    atomic_uint remote_count[L25_NUM_CLASSES][L25_NUM_SHARDS];
} g_l25_pool;

static int g_wrap_l25_enabled = 1;        // env: HAKMEM_WRAP_L25=0 to disable in wrappers
static int g_l25_tls_ring_enabled = 1;    // env: HAKMEM_POOL_TLS_RING
static int g_l25_trylock_probes = 3;      // env: HAKMEM_TRYLOCK_PROBES
static int g_l25_tls_lo_max = 256;        // env: HAKMEM_TLS_LO_MAX
static int g_l25_ring_return_div = 3;     // env: HAKMEM_RING_RETURN_DIV
static int g_l25_ring_trigger = 2;        // env: HAKMEM_L25_RING_TRIGGER
static int g_l25_owner_inbound = 0;       // env: HAKMEM_L25_OWNER_INBOUND (0/1)
static int g_l25_in_slots = 512;          // env: HAKMEM_L25_INBOUND_SLOTS (<= compiled max)
extern int g_hdr_light_enabled;           // shared with Mid pool
static int g_l25_run_blocks_override = 0; // env: HAKMEM_L25_RUN_BLOCKS (0=per-class defaults)
static int g_l25_shard_mix = 1;           // env: HAKMEM_SHARD_MIX (0/1, default ON)
static int g_l25_pref_remote_first = 1;   // env: HAKMEM_L25_PREF=remote|run (default remote)
static int g_l25_tc_spill = 32;           // env: HAKMEM_L25_TC_SPILL (spill threshold)
static int g_l25_bg_drain_enabled = 0;    // env: HAKMEM_L25_BG_DRAIN=1 to enable BG drain of remote
static int g_l25_bg_interval_ms = 5;      // env: HAKMEM_L25_BG_MS
static int g_l25_bg_remote_enable = 0;    // env: HAKMEM_L25_BG_REMOTE
static int g_l25_probe_auto = 0;          // env: HAKMEM_L25_PROBE_AUTO
static int g_l25_remote_threshold = 32;   // env: HAKMEM_L25_REMOTE_THRESHOLD
static int g_l25_bg_remote_batch = 64;    // env: HAKMEM_L25_BG_REMOTE_BATCH
static pthread_t g_l25_bg_thread;

// Size class table (for reference)
static const size_t g_class_sizes[L25_NUM_CLASSES] = {
    L25_CLASS_64KB,
    L25_CLASS_128KB,
    L25_CLASS_256KB,
    L25_CLASS_512KB,
    L25_CLASS_1MB
};

// Phase 6.11.5 P0: Pre-initialized header templates for fast allocation
// Reduces AllocHeader reconstruction from 100-150 cycles to 40-50 cycles
static const AllocHeader g_header_templates[L25_NUM_CLASSES] = {
    {HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_64KB,  0, 0, 0},  // 64KB
    {HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_128KB, 0, 0, 0},  // 128KB
    {HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_256KB, 0, 0, 0},  // 256KB
    {HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_512KB, 0, 0, 0},  // 512KB
    {HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_1MB,   0, 0, 0}   // 1MB
};

// Pages per bundle (for each class)
static const int g_pages_per_bundle[L25_NUM_CLASSES] = {
    1,   // 64KB  = 1 × 64KB page
    2,   // 128KB = 2 × 64KB pages
    4,   // 256KB = 4 × 64KB pages
    8,   // 512KB = 8 × 64KB pages
    16   // 1MB   = 16 × 64KB pages
};

// Default blocks per bump-run per class (≈2MB per run)
static const int g_blocks_per_run_default[L25_NUM_CLASSES] = { 32, 16, 8, 4, 2 };

static inline size_t l25_stride_bytes(int class_idx) {
    return HEADER_SIZE + g_class_sizes[class_idx];
}

static int g_l25_run_factor = 1;  // env: HAKMEM_L25_RUN_FACTOR (1..8)

static inline int l25_blocks_per_run(int class_idx) {
    if (g_l25_run_blocks_override > 0) return g_l25_run_blocks_override;
    int base = g_blocks_per_run_default[class_idx];
    long long val = (long long)base * (long long)g_l25_run_factor;
    if (val < 1) val = 1;
    if (val > 1024) val = 1024;
    return (int)val;
}

static inline void l25_write_header(AllocHeader* hdr, int class_idx, uintptr_t site_id) {
    if (g_hdr_light_enabled >= 2) {
        return;  // no writes
    } else if (g_hdr_light_enabled >= 1) {
        hdr->magic = HAKMEM_MAGIC;
        hdr->method = ALLOC_METHOD_L25_POOL;
        hdr->size = g_class_sizes[class_idx];
        hdr->owner_tid = (uintptr_t)pthread_self();
    } else {
        memcpy(hdr, &g_header_templates[class_idx], sizeof(AllocHeader));
        hdr->alloc_site = site_id;
        hdr->owner_tid = (uintptr_t)pthread_self();
    }
}
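// Worked example (illustrative, guarded out of the build): the default
// blocks-per-run table above targets roughly 2 MB per bump-run, e.g. class 0
// (64 KB) uses 32 blocks x (HEADER_SIZE + 64 KB); with HAKMEM_L25_RUN_FACTOR=2
// l25_blocks_per_run() would report 64 blocks (~4 MB runs). The exact byte
// count depends on HEADER_SIZE, which is defined in hakmem_internal.h.
#if 0
static void l25_run_sizing_example(void)
{
    for (int c = 0; c < L25_NUM_CLASSES; c++) {
        size_t run_bytes = (size_t)l25_blocks_per_run(c) * l25_stride_bytes(c);
        fprintf(stderr, "class %d: %d blocks/run, %zu bytes/run\n",
                c, l25_blocks_per_run(c), run_bytes);
    }
}
#endif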
// ===========================================================================
// Helper Functions - inline Operation Squeaky-Clean!
// ===========================================================================

// Phase 6.10.1 pattern: branchless LUT (Lookup Table) for O(1) class determination
// SIZE_TO_CLASS[i] = class for (i * 64KB)
// Index 0: invalid, 1: 64KB (class 0), 2: 128KB (class 1), 4: 256KB (class 2), etc.
static const int8_t SIZE_TO_CLASS[] = {
    -1,                          // index 0: 0KB - invalid
     0,                          // index 1: 64KB → Class 0
     1,                          // index 2: 128KB → Class 1
    -1,                          // index 3: 192KB (between 128KB and 256KB)
     2,                          // index 4: 256KB → Class 2
    -1, -1, -1,                  // index 5-7: 320KB-448KB
     3,                          // index 8: 512KB → Class 3
    -1, -1, -1, -1, -1, -1, -1,  // index 9-15: 576KB-960KB
     4                           // index 16: 1MB → Class 4
};

// Get size class index from size (0-4, or -1 if out of range)
// inline squeaky-clean: O(1) branchless lookup, zero function call overhead
static inline int hak_l25_pool_get_class_index(size_t size) {
    // Round to 64KB units
    size_t kb64 = (size + L25_PAGE_SIZE - 1) / L25_PAGE_SIZE;
    // Direct LUT lookup (O(1), branchless)
    if (kb64 == 0 || kb64 > 16) return -1;
    return SIZE_TO_CLASS[kb64];
}

// Get shard index from site_id (0-63)
// inline squeaky-clean: same pattern as L2 Pool
static inline uint64_t splitmix64(uint64_t x) {
    x += 0x9e3779b97f4a7c15ULL;
    x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
    x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
    return x ^ (x >> 31);
}

int hak_l25_pool_get_shard_index(uintptr_t site_id) {
    if (g_l25_shard_mix) {
        uint64_t h = splitmix64((uint64_t)site_id);
        return (int)(h & (L25_NUM_SHARDS - 1));
    }
    // Shift by 4 to reduce collision (instruction alignment)
    return (int)((site_id >> 4) & (L25_NUM_SHARDS - 1));
}

// Phase 6.10.1 pattern: Bitmap helpers (O(1) empty class detection)
// inline squeaky-clean: zero overhead, perfect for hot path
static inline void set_nonempty_bit(int class_idx, int shard_idx) {
    // Atomic OR with release semantics (ensures freelist write is visible)
    atomic_fetch_or_explicit(&g_l25_pool.nonempty_mask[class_idx],
                             (uint64_t)(1ULL << shard_idx), memory_order_release);
}

static inline void clear_nonempty_bit(int class_idx, int shard_idx) {
    // Atomic AND with release semantics (ensures freelist clear is visible)
    atomic_fetch_and_explicit(&g_l25_pool.nonempty_mask[class_idx],
                              ~(uint64_t)(1ULL << shard_idx), memory_order_release);
}

static inline int is_shard_nonempty(int class_idx, int shard_idx) {
    // Atomic load with acquire semantics (ensures freelist read is valid)
    uint64_t mask = atomic_load_explicit(&g_l25_pool.nonempty_mask[class_idx],
                                         memory_order_acquire);
    return (mask & (1ULL << shard_idx)) != 0;
}
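// Worked example (illustrative, guarded out of the build): size-to-class and
// site-to-shard mapping as implemented above, assuming L25_PAGE_SIZE == 64 KB
// and L25_NUM_SHARDS == 64, as the header comment documents. A 40 KB request
// rounds up to one 64 KB unit (class 0); 300 KB rounds to five units, which is
// not an exact class size, so the LUT reports -1 and the caller must fall back
// to another allocator tier.
#if 0
#include <assert.h>
static void l25_class_and_shard_example(void)
{
    assert(hak_l25_pool_get_class_index(40 * 1024)       ==  0);  // 40 KB  -> 64 KB class
    assert(hak_l25_pool_get_class_index(128 * 1024)      ==  1);  // exact  -> 128 KB class
    assert(hak_l25_pool_get_class_index(300 * 1024)      == -1);  // 320 KB unit: no class
    assert(hak_l25_pool_get_class_index(2 * 1024 * 1024) == -1);  // above 1 MB
    // Shard selection hashes the call-site id so hot sites spread across locks.
    int s = hak_l25_pool_get_shard_index((uintptr_t)&l25_class_and_shard_example);
    assert(s >= 0 && s < L25_NUM_SHARDS);
}
#endif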
// Choose a non-empty shard near preferred using the nonempty mask. If none, return preferred.
static inline int l25_choose_nonempty_shard(int class_idx, int preferred) {
    uint64_t mask = atomic_load_explicit(&g_l25_pool.nonempty_mask[class_idx],
                                         memory_order_acquire);
    if (!mask) return preferred;
    int shift = preferred & 63;
    // Rotate right by `shift`; guard shift==0 to avoid the undefined `mask << 64`.
    uint64_t rot = shift ? ((mask >> shift) | (mask << (64 - shift))) : mask;
    if (!rot) return preferred;
    int off = __builtin_ctzll(rot);
    return (preferred + off) & (L25_NUM_SHARDS - 1);
}

// Drain remote-free MPSC stack into freelist under the shard lock
static inline void l25_drain_remote_locked(int class_idx, int shard_idx) {
    uintptr_t head = atomic_exchange_explicit(&g_l25_pool.remote_head[class_idx][shard_idx],
                                              (uintptr_t)0, memory_order_acq_rel);
    int drained = 0;
    L25Block* list = (L25Block*)head;
    if (list) {
        // Tail-link pattern: find tail, then link entire chain at once (MT-safe)
        L25Block* tail = list;
        int count = 1;
        while (tail->next) { tail = tail->next; count++; }
        // Single atomic write to freelist (prevents race with concurrent alloc)
        tail->next = g_l25_pool.freelist[class_idx][shard_idx];
        g_l25_pool.freelist[class_idx][shard_idx] = list;
        drained = count;
    }
    if (drained) {
        atomic_fetch_sub_explicit(&g_l25_pool.remote_count[class_idx][shard_idx],
                                  drained, memory_order_relaxed);
        if (g_l25_pool.freelist[class_idx][shard_idx])
            set_nonempty_bit(class_idx, shard_idx);
    }
}
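// Worked example (illustrative, guarded out of the build): the rotate-and-ctz
// trick in l25_choose_nonempty_shard() finds the first non-empty shard at or
// after the preferred index, wrapping around. The standalone copy below mirrors
// the logic so it can be checked in isolation; it assumes 64 shards.
#if 0
#include <assert.h>
static int l25_choose_demo(uint64_t mask, int preferred)
{
    if (!mask) return preferred;
    int shift = preferred & 63;
    uint64_t rot = shift ? ((mask >> shift) | (mask << (64 - shift))) : mask;
    return (preferred + __builtin_ctzll(rot)) & 63;
}
static void l25_choose_nonempty_shard_example(void)
{
    uint64_t mask = (1ULL << 10) | (1ULL << 40);  // shards 10 and 40 non-empty
    assert(l25_choose_demo(mask, 12) == 40);      // next non-empty at/after 12
    assert(l25_choose_demo(mask,  5) == 10);      // next non-empty at/after 5
    assert(l25_choose_demo(mask, 41) == 10);      // wraps past 63 back to 10
}
#endif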
// =====================
// Bump-run TLS helpers
// =====================
static inline int l25_refill_tls_from_active(int class_idx, L25TLSRing* ring, int need) {
    if (need <= 0) need = POOL_L25_RING_CAP;
    L25ActiveRun* ar = &g_l25_active[class_idx];
    if (!ar->base) return 0;
    size_t stride = l25_stride_bytes(class_idx);
    size_t avail = (size_t)((ar->end - ar->cursor) / (ptrdiff_t)stride);
    if (avail == 0) {
        ar->base = ar->cursor = ar->end = NULL;
        return 0;
    }
    int k = (int)((size_t)need < avail ? (size_t)need : avail);
    int pushed = 0;
    while (pushed < k && ring->top < POOL_L25_RING_CAP) {
        L25Block* b = (L25Block*)ar->cursor;
        ring->items[ring->top++] = b;
        ar->cursor += stride;
        pushed++;
    }
    if (ar->cursor >= ar->end) {
        ar->base = ar->cursor = ar->end = NULL;
    }
    return pushed;
}

// Forward decl for descriptor registration
static void l25_desc_insert_range(void* base, void* end, int class_idx);

static inline int l25_alloc_new_run(int class_idx) {
    int blocks = l25_blocks_per_run(class_idx);
    size_t stride = l25_stride_bytes(class_idx);
    size_t run_bytes = (size_t)blocks * stride;

    // Phase 24: Try PageArena first, fallback to mmap
    if (page_arena_enabled() && g_page_arena.hot.pages == NULL) {
        page_arena_init(&g_page_arena);
    }
    void* raw = page_arena_alloc_aligned(&g_page_arena, run_bytes, L25_PAGE_SIZE);
    if (!raw) {
        // PageArena cache miss → fallback to mmap
        raw = mmap(NULL, run_bytes, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    }
    if (raw == MAP_FAILED || raw == NULL) {
        if (g_hakem_config.ace_trace) {
            fprintf(stderr, "[ACE-FAIL] MapFail: class=%d size=%zu (LargePool)\n",
                    class_idx, run_bytes);
        }
        return 0;
    }

    L25ActiveRun* ar = &g_l25_active[class_idx];
    ar->base = (char*)raw;
    ar->cursor = (char*)raw;
    ar->end = ar->base + run_bytes;

    // Register page descriptors for headerless free
    l25_desc_insert_range(ar->base, ar->end, class_idx);

    // PageFaultTelemetry: mark all backing pages for this run (approximate)
    for (size_t off = 0; off < run_bytes; off += 4096) {
        pagefault_telemetry_touch(PF_BUCKET_L25, ar->base + off);
    }

    // Stats (best-effort)
    g_l25_pool.total_bytes_allocated += run_bytes;
    g_l25_pool.total_bundles_allocated += blocks;
    return 1;
}

// =====================
// L2.5 Page Descriptors
// =====================
typedef struct L25PageDesc {
    void* page;               // 64KB-aligned page base
    int class_idx;            // L2.5 class index (0..4)
    uintptr_t owner_tid;      // Hint: owning thread (at run allocation)
    struct L25PageDesc* next;
} L25PageDesc;

#define L25_DESC_BUCKETS 4096
static L25PageDesc* g_l25_desc_head[L25_DESC_BUCKETS];

static inline size_t l25_desc_hash(void* page) {
    return ((((uintptr_t)page) >> 16) & (L25_DESC_BUCKETS - 1));
}

static inline void* l25_page_base(void* addr) {
    return (void*)((uintptr_t)addr & ~((uintptr_t)L25_PAGE_SIZE - 1));
}

static void l25_desc_insert_range(void* base, void* end, int class_idx) {
    char* p = (char*)(((uintptr_t)base) & ~((uintptr_t)L25_PAGE_SIZE - 1));
    char* e = (char*)end;
    uintptr_t owner = (uintptr_t)pthread_self();
    for (; p < e; p += L25_PAGE_SIZE) {
        size_t h = l25_desc_hash(p);
        L25PageDesc* d = (L25PageDesc*)hkm_libc_malloc(sizeof(L25PageDesc));  // Phase 6.X P0 Fix
        if (!d) continue;  // best-effort
        d->page = p;
        d->class_idx = class_idx;
        d->owner_tid = owner;
        d->next = g_l25_desc_head[h];
        g_l25_desc_head[h] = d;
    }
}

static inline L25PageDesc* l25_desc_lookup_ptr(void* ptr) {
    void* page = l25_page_base(ptr);
    size_t h = l25_desc_hash(page);
    for (L25PageDesc* d = g_l25_desc_head[h]; d; d = d->next) {
        if (d->page == page) return d;
    }
    return NULL;
}

static inline void l25_desc_update_owner(void* ptr, uintptr_t owner) {
    L25PageDesc* d = l25_desc_lookup_ptr(ptr);
    if (d) d->owner_tid = owner;
}

// ------------------------------
// Owner inbound registry (per-owner MPSC stacks)
// ------------------------------
#ifndef L25_INBOUND_SLOTS
#define L25_INBOUND_SLOTS 512
#endif

typedef struct {
    atomic_uintptr_t head[L25_NUM_CLASSES];
    atomic_uintptr_t tid;  // 0 = empty
} L25InboundSlot;

static L25InboundSlot g_l25_inbound[L25_INBOUND_SLOTS];
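// Illustrative sketch (guarded out of the build): how the page-descriptor table
// above supports headerless lookups. Any address inside a registered run is
// masked down to its 64 KB page base, hashed into one of the 4096 buckets, and
// matched against the chained descriptors; hak_l25_lookup() and
// hak_l25_pool_free_fast() below rely on this. Assumes L25_PAGE_SIZE == 64 KB.
#if 0
static int l25_desc_example(void* user_ptr)
{
    // Interior pointer -> page base -> descriptor (NULL if the page was never
    // registered by l25_desc_insert_range()).
    L25PageDesc* d = l25_desc_lookup_ptr(user_ptr);
    if (!d) return -1;    // not an L2.5 page
    return d->class_idx;  // 0..4, i.e. 64 KB .. 1 MB
}
#endif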
static inline size_t inb_hash(uintptr_t tid) {
    return (size_t)((tid ^ (tid >> 17) ^ (tid << 9)));
}

static int inbound_get_slot(uintptr_t tid) {
    if (tid == 0) return -1;
    int limit = g_l25_in_slots;
    if (limit <= 0 || limit > L25_INBOUND_SLOTS) limit = L25_INBOUND_SLOTS;
    size_t h = inb_hash(tid) % (size_t)limit;
    for (int i = 0; i < limit; i++) {
        int idx = (int)((h + (size_t)i) % (size_t)limit);
        uintptr_t cur = atomic_load_explicit(&g_l25_inbound[idx].tid, memory_order_acquire);
        if (cur == tid) return idx;
        if (cur == 0) {
            uintptr_t zero = 0;
            if (atomic_compare_exchange_weak_explicit(&g_l25_inbound[idx].tid, &zero, tid,
                                                      memory_order_acq_rel, memory_order_relaxed)) {
                // initialize heads lazily (they start at 0 by BSS)
                return idx;
            }
        }
    }
    return -1;
}

static inline void inbound_push_block(int slot, int class_idx, L25Block* b) {
    if (slot < 0) return;
    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&g_l25_inbound[slot].head[class_idx], memory_order_acquire);
        b->next = (L25Block*)old_head;
    } while (!atomic_compare_exchange_weak_explicit(&g_l25_inbound[slot].head[class_idx],
                                                    &old_head, (uintptr_t)b,
                                                    memory_order_release, memory_order_relaxed));
}

static inline int inbound_drain_to_tls(uintptr_t self_tid, int class_idx, L25TLSRing* ring) {
    if (!g_l25_owner_inbound) return 0;
    int slot = inbound_get_slot(self_tid);
    if (slot < 0) return 0;
    uintptr_t head = atomic_exchange_explicit(&g_l25_inbound[slot].head[class_idx],
                                              (uintptr_t)0, memory_order_acq_rel);
    int moved = 0;
    L25Block* cur = (L25Block*)head;
    while (cur) {
        L25Block* nxt = cur->next;
        if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) {
            ring->items[ring->top++] = cur;
        } else {
            cur->next = g_l25_tls_bin[class_idx].lo_head;
            g_l25_tls_bin[class_idx].lo_head = cur;
            g_l25_tls_bin[class_idx].lo_count++;
        }
        moved++;
        cur = nxt;
    }
    return moved;
}

// Exposed to hak_free_at(): headerless lookup (returns 1 on success)
int hak_l25_lookup(void* user_ptr, size_t* out_size) {
    L25PageDesc* d = l25_desc_lookup_ptr(user_ptr);
    if (!d) return 0;
    if (out_size) *out_size = g_class_sizes[d->class_idx];
    return 1;
}

// ------------------------------
// Transfer Cache (per-thread)
// ------------------------------
// Per-thread Transfer Cache as array ring (no block writes on fast-path)
#ifndef L25_TC_CAP
#define L25_TC_CAP 64
#endif

typedef struct {
    L25Block* items[L25_TC_CAP];
    int count;  // 0..cap
} L25TCRing;

static __thread L25TCRing g_l25_tc[L25_NUM_CLASSES];
static int g_l25_tc_cap = L25_TC_CAP;  // env: HAKMEM_L25_TC_CAP

static inline void l25_tc_append(int class_idx, L25Block* b) {
    L25TCRing* tc = &g_l25_tc[class_idx];
    if (tc->count < g_l25_tc_cap) {
        tc->items[tc->count++] = b;
    } else {
        // overflow handled by caller via l25_tc_flush
    }
}
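// Illustrative sketch (guarded out of the build): the lock-free MPSC push used
// by inbound_push_block() above and by the transfer-cache flush below. Producer
// threads CAS a node (or a pre-linked chain) onto a shared head; the single
// consumer later detaches the whole stack with one atomic exchange, as
// l25_drain_remote_locked() and inbound_drain_to_tls() do. The demo uses a
// local stack head rather than the pool's globals.
#if 0
static atomic_uintptr_t g_demo_head;  // demo-only stack head

static void mpsc_push_demo(L25Block* b)
{
    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&g_demo_head, memory_order_acquire);
        b->next = (L25Block*)old_head;  // link node to the current head
    } while (!atomic_compare_exchange_weak_explicit(&g_demo_head, &old_head, (uintptr_t)b,
                                                    memory_order_release, memory_order_relaxed));
}

static L25Block* mpsc_take_all_demo(void)
{
    // Consumer side: detach the entire stack in one step.
    return (L25Block*)atomic_exchange_explicit(&g_demo_head, (uintptr_t)0, memory_order_acq_rel);
}
#endif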
static inline int l25_tc_flush(int class_idx, int shard_idx) {
    L25TCRing* tc = &g_l25_tc[class_idx];
    int n = tc->count;
    if (n <= 0) return 0;

    // Build a linked list from ring (LIFO order) only during flush
    L25Block* head = NULL;
    for (int i = 0; i < n; i++) {
        L25Block* b = tc->items[i];
        b->next = head;
        head = b;
    }
    tc->count = 0;

    // CRITICAL FIX: Find tail ONCE before CAS loop (prevents list corruption)
    // Bug: Previous code found tail inside CAS loop, overwriting tail->next on each retry
    L25Block* tail = head;
    while (tail && tail->next) tail = tail->next;

    // Single CAS to remote_head
    uintptr_t old_head;
    HKM_TIME_START(t_l25_remote_push_tc);
    do {
        old_head = atomic_load_explicit(&g_l25_pool.remote_head[class_idx][shard_idx],
                                        memory_order_acquire);
        // Link tail to current remote_head (safe to update on each retry)
        if (tail) tail->next = (L25Block*)old_head;
    } while (!atomic_compare_exchange_weak_explicit(&g_l25_pool.remote_head[class_idx][shard_idx],
                                                    &old_head, (uintptr_t)head,
                                                    memory_order_release, memory_order_relaxed));
    atomic_fetch_add_explicit(&g_l25_pool.remote_count[class_idx][shard_idx], n,
                              memory_order_relaxed);
    HKM_TIME_END(HKM_CAT_L25_REMOTE_PUSH, t_l25_remote_push_tc);
    set_nonempty_bit(class_idx, shard_idx);
    return n;
}

// Exposed to hak_free_at(): headerless fast free using descriptor
void hak_l25_pool_free_fast(void* user_ptr, uintptr_t site_id) {
    L25PageDesc* d = l25_desc_lookup_ptr(user_ptr);
    if (!d) return;  // unknown → drop
    int class_idx = d->class_idx;
    void* raw = (char*)user_ptr - HEADER_SIZE;

    // Optional: demand-zero for larger classes
    if (g_l25_pool.demand_zero && class_idx >= 3) {
        (void)ss_os_madvise_guarded((char*)raw, HEADER_SIZE + g_class_sizes[class_idx],
                                    MADV_DONTNEED, "l25_pool_dontneed_class");
    }

    // Same-thread hint: prefer per-block owner if header present (HDR_LIGHT>=1), else page owner
    uintptr_t self = (uintptr_t)pthread_self();
    uintptr_t owner_hint = d->owner_tid;
    if (g_hdr_light_enabled >= 1) {
        AllocHeader* hdr = (AllocHeader*)raw;
        owner_hint = hdr->owner_tid;
    }
    if (owner_hint == self) {
        L25TLSRing* ring = &g_l25_tls_bin[class_idx].ring;
        if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) {
            ring->items[ring->top++] = (L25Block*)raw;
        } else {
            L25Block* b = (L25Block*)raw;
            b->next = g_l25_tls_bin[class_idx].lo_head;
            g_l25_tls_bin[class_idx].lo_head = b;
            g_l25_tls_bin[class_idx].lo_count++;
        }
    } else {
        // Remote: push to per-thread TC; spill in batch when threshold reached
        int shard = hak_l25_pool_get_shard_index(site_id);
        L25Block* block = (L25Block*)raw;
        l25_tc_append(class_idx, block);
        if (g_l25_tc_spill > 0 && g_l25_tc[class_idx].count >= g_l25_tc_spill) {
            l25_tc_flush(class_idx, shard);
        }
    }
    g_l25_pool.frees[class_idx]++;
}

// =====================
// BG Drain (remote → freelist)
// =====================
static void* l25_bg_main(void* arg) {
    (void)arg;
    struct timespec ts;
    // Split into seconds + nanoseconds: tv_nsec must stay below 1e9.
    ts.tv_sec = g_l25_bg_interval_ms / 1000;
    ts.tv_nsec = (long)(g_l25_bg_interval_ms % 1000) * 1000000L;
    while (g_l25_pool.initialized) {
        for (int c = 0; c < L25_NUM_CLASSES; c++) {
            for (int s = 0; s < L25_NUM_SHARDS; s++) {
                if (atomic_load_explicit(&g_l25_pool.remote_count[c][s], memory_order_relaxed) != 0) {
                    pthread_mutex_t* l = &g_l25_pool.freelist_locks[c][s].m;
                    pthread_mutex_lock(l);
                    if (atomic_load_explicit(&g_l25_pool.remote_count[c][s], memory_order_relaxed) != 0) {
                        l25_drain_remote_locked(c, s);
                    }
                    pthread_mutex_unlock(l);
                }
            }
        }
        nanosleep(&ts, NULL);
    }
    return NULL;
}

// ===========================================================================
// Page Bundle Management (L2 Pool pattern)
// ===========================================================================

// Refill freelist by allocating a new page bundle
// Args:    class_idx - size class index (0-4)
//          shard_idx - shard index (0-63)
// Returns: 1 on success, 0 on failure
//
// Pattern: Same as L2 Pool - allocate raw memory, write header, return to freelist
static int refill_freelist(int class_idx, int shard_idx) {
    if (class_idx < 0 || class_idx >= L25_NUM_CLASSES) return 0;
    if (shard_idx < 0 || shard_idx >= L25_NUM_SHARDS) return 0;

    size_t user_size = g_class_sizes[class_idx];
    size_t bundle_size = HEADER_SIZE + user_size;  // Header + user data

    // Soft CAP guidance: decide how many bundles to allocate (1..2)
    int bundles = 1;
    const FrozenPolicy* pol = hkm_policy_get();
    static int g_l25_min_bundle = -1;  // lazy init from env
    if (g_l25_min_bundle < 0) {
        const char* e = getenv("HAKMEM_L25_MIN_BUNDLE");
getenv("HAKMEM_L25_MIN_BUNDLE"); int v = (e ? atoi(e) : 1); if (v < 1) v = 1; if (v > 2) v = 2; // L2.5 is large; keep conservative g_l25_min_bundle = v; } if (pol) { uint16_t cap = pol->large_cap[class_idx]; if (cap > 0) { uint64_t have = g_l25_pool.bundles_by_class[class_idx]; if (have >= cap) { bundles = 1; // over cap: allocate minimally } else { uint64_t deficit = cap - have; bundles = (deficit >= (uint64_t)g_l25_min_bundle) ? g_l25_min_bundle : 1; if (bundles > 2) bundles = 2; } } } int ok_any = 0; for (int b = 0; b < bundles; b++) { // Phase 24: Try PageArena first, fallback to mmap void* raw = page_arena_alloc_aligned(&g_page_arena, bundle_size, L25_PAGE_SIZE); if (!raw) { // PageArena cache miss → fallback to mmap raw = mmap(NULL, bundle_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); } if (!raw) { if (g_hakem_config.ace_trace) { fprintf(stderr, "[ACE-FAIL] MapFail: class=%d size=%zu (LargePool Refill)\n", class_idx, bundle_size); } if (ok_any) break; else return 0; } // Write AllocHeader at start AllocHeader* hdr = (AllocHeader*)raw; hdr->magic = HAKMEM_MAGIC; hdr->method = ALLOC_METHOD_L25_POOL; hdr->size = user_size; hdr->alloc_site = 0; // Set by hak_l25_pool_try_alloc hdr->class_bytes = 0; // L2.5 blocks not cacheable // Freelist uses raw pointer (header start) L25Block* block = (L25Block*)raw; block->next = g_l25_pool.freelist[class_idx][shard_idx]; g_l25_pool.freelist[class_idx][shard_idx] = block; ok_any = 1; // Set non-empty bit (freelist now has blocks) set_nonempty_bit(class_idx, shard_idx); // Update statistics g_l25_pool.refills[class_idx]++; g_l25_pool.total_bundles_allocated++; g_l25_pool.total_bytes_allocated += bundle_size; g_l25_pool.bundles_by_class[class_idx]++; } return ok_any ? 1 : 0; } // =========================================================================== // Public API // =========================================================================== void hak_l25_pool_init(void) { if (g_l25_pool.initialized) return; memset(&g_l25_pool, 0, sizeof(g_l25_pool)); for (int c = 0; c < L25_NUM_CLASSES; c++) { atomic_store(&g_l25_pool.nonempty_mask[c], 0); for (int s = 0; s < L25_NUM_SHARDS; s++) { pthread_mutex_init(&g_l25_pool.freelist_locks[c][s].m, NULL); atomic_store(&g_l25_pool.remote_head[c][s], (uintptr_t)0); atomic_store(&g_l25_pool.remote_count[c][s], 0); } g_l25_pool.bundles_by_class[c] = 0; } // Demand-zero toggle char* dz = getenv("HAKMEM_L25_DZ"); g_l25_pool.demand_zero = (dz && atoi(dz) != 0) ? 
    const char* e_wrap = getenv("HAKMEM_WRAP_L25");
    if (e_wrap) { g_wrap_l25_enabled = (atoi(e_wrap) != 0); }
    const char* e_ring = getenv("HAKMEM_POOL_TLS_RING");
    if (e_ring) g_l25_tls_ring_enabled = (atoi(e_ring) != 0);
    const char* e_probe = getenv("HAKMEM_TRYLOCK_PROBES");
    if (e_probe) { int v = atoi(e_probe); if (v >= 1 && v <= 8) g_l25_trylock_probes = v; }
    const char* e_lo = getenv("HAKMEM_TLS_LO_MAX");
    if (e_lo) { int v = atoi(e_lo); if (v >= 32 && v <= 16384) g_l25_tls_lo_max = v; }
    const char* e_div = getenv("HAKMEM_RING_RETURN_DIV");
    if (e_div) { int v = atoi(e_div); if (v >= 2 && v <= 4) g_l25_ring_return_div = v; }
    const char* e_run = getenv("HAKMEM_L25_RUN_BLOCKS");
    if (e_run) { int v = atoi(e_run); if (v >= 1 && v <= 1024) g_l25_run_blocks_override = v; }
    const char* e_rfac = getenv("HAKMEM_L25_RUN_FACTOR");
    if (e_rfac) { int v = atoi(e_rfac); if (v >= 1 && v <= 8) g_l25_run_factor = v; }
    const char* e_mix = getenv("HAKMEM_SHARD_MIX");
    if (e_mix) g_l25_shard_mix = (atoi(e_mix) != 0);
    const char* e_pref = getenv("HAKMEM_L25_PREF");
    if (e_pref) {
        if (strcmp(e_pref, "remote") == 0) g_l25_pref_remote_first = 1;
        else if (strcmp(e_pref, "run") == 0) g_l25_pref_remote_first = 0;
    }
    const char* e_tc = getenv("HAKMEM_L25_TC_SPILL");
    if (e_tc) { int v = atoi(e_tc); if (v >= 0 && v <= 4096) g_l25_tc_spill = v; }
    const char* e_tcc = getenv("HAKMEM_L25_TC_CAP");
    if (e_tcc) { int v = atoi(e_tcc); if (v >= 8 && v <= L25_TC_CAP) g_l25_tc_cap = v; }
    const char* e_bg = getenv("HAKMEM_L25_BG_DRAIN");
    if (e_bg) g_l25_bg_drain_enabled = (atoi(e_bg) != 0);
    const char* e_bgms = getenv("HAKMEM_L25_BG_MS");
    if (e_bgms) { int v = atoi(e_bgms); if (v >= 1 && v <= 1000) g_l25_bg_interval_ms = v; }
    const char* e_rtr = getenv("HAKMEM_L25_RING_TRIGGER");
    if (e_rtr) { int v = atoi(e_rtr); if (v >= 0 && v <= POOL_L25_RING_CAP) g_l25_ring_trigger = v; }
    const char* e_inb = getenv("HAKMEM_L25_OWNER_INBOUND");
    if (e_inb) g_l25_owner_inbound = (atoi(e_inb) != 0);
    const char* e_ins = getenv("HAKMEM_L25_INBOUND_SLOTS");
    if (e_ins) { int v = atoi(e_ins); if (v >= 64 && v <= L25_INBOUND_SLOTS) g_l25_in_slots = v; }

    // Safe-mode: disable aggressive remote/inbound features by default.
    // Set HAKMEM_L25_REMOTE_SAFE=0 to re-enable legacy behaviour.
    const char* e_safe = getenv("HAKMEM_L25_REMOTE_SAFE");
    int safe_mode = 1;
    if (e_safe && atoi(e_safe) == 0) safe_mode = 0;
    if (safe_mode) {
        g_l25_owner_inbound = 0;
        g_l25_pref_remote_first = 0;
        g_l25_trylock_probes = 0;
        g_l25_bg_drain_enabled = 0;
        g_l25_bg_remote_enable = 0;
        g_l25_probe_auto = 0;
    }

    // init inbound table tid=0
    for (int i = 0; i < g_l25_in_slots && i < L25_INBOUND_SLOTS; i++) {
        atomic_store(&g_l25_inbound[i].tid, (uintptr_t)0);
        for (int c = 0; c < L25_NUM_CLASSES; c++)
            atomic_store(&g_l25_inbound[i].head[c], (uintptr_t)0);
    }

    g_l25_pool.initialized = 1;
    HAKMEM_LOG("[L2.5] Initialized (LargePool)\n");
    HAKMEM_LOG("[L2.5] Classes: 64KB, 128KB, 256KB, 512KB, 1MB\n");
    HAKMEM_LOG("[L2.5] Page size: %d KB\n", L25_PAGE_SIZE / 1024);
    HAKMEM_LOG("[L2.5] Shards: %d (site-based)\n", L25_NUM_SHARDS);

    if (g_l25_bg_drain_enabled) {
        pthread_create(&g_l25_bg_thread, NULL, l25_bg_main, NULL);
        HAKMEM_LOG("[L2.5] BG drain enabled (interval=%d ms)\n", g_l25_bg_interval_ms);
    }
}

void hak_l25_pool_shutdown(void) {
    if (!g_l25_pool.initialized) return;

    hak_l25_pool_print_stats();

    // Drop cached blocks. Blocks are carved out of mmap()/PageArena runs, so
    // they must not be handed to libc free(); the backing mappings are
    // reclaimed by the OS at process exit.
    for (int class_idx = 0; class_idx < L25_NUM_CLASSES; class_idx++) {
        for (int shard_idx = 0; shard_idx < L25_NUM_SHARDS; shard_idx++) {
            g_l25_pool.freelist[class_idx][shard_idx] = NULL;
        }
    }
    g_l25_pool.initialized = 0;
}

void* hak_l25_pool_try_alloc(size_t size, uintptr_t site_id) {
    if (!g_l25_pool.initialized) hak_l25_pool_init();

    // P1.7 approach: Avoid using L2.5 during ALL wrapper calls (conservative but safe)
    extern int hak_in_wrapper(void);
    extern __thread int g_hakmem_lock_depth;
    int in_wrapper = hak_in_wrapper();
    if (in_wrapper && g_hakmem_lock_depth > 1) return NULL;
    if (in_wrapper && !g_wrap_l25_enabled) return NULL;

    if (!hak_l25_pool_is_poolable(size)) return NULL;

    // Get class index (inline squeaky-clean!)
    int class_idx = hak_l25_pool_get_class_index(size);
    if (class_idx < 0) return NULL;

    // Inbound drain (owner inbound → TLS) when ring low
    if (g_l25_owner_inbound && g_l25_tls_ring_enabled &&
        g_l25_tls_bin[class_idx].ring.top <= g_l25_ring_trigger) {
        inbound_drain_to_tls((uintptr_t)pthread_self(), class_idx,
                             &g_l25_tls_bin[class_idx].ring);
    }

    // TLS two-tier fast path
    L25TLSRing* ring = &g_l25_tls_bin[class_idx].ring;
    if (g_l25_tls_ring_enabled && ring->top > 0) {
        HKM_TIME_START(t_l25_ring_pop0);
        L25Block* tlsb = ring->items[--ring->top];
        HKM_TIME_END(HKM_CAT_L25_TLS_RING_POP, t_l25_ring_pop0);
        void* raw = (void*)tlsb;
        AllocHeader* hdr = (AllocHeader*)raw;
        l25_write_header(hdr, class_idx, site_id);
        g_l25_pool.hits[class_idx]++;
        return (char*)raw + HEADER_SIZE;
    }

    L25Block* block = g_l25_tls_bin[class_idx].lo_head;
    if (block) {
        g_l25_tls_bin[class_idx].lo_head = block->next;
        if (g_l25_tls_bin[class_idx].lo_count) g_l25_tls_bin[class_idx].lo_count--;
        void* raw = (void*)block;
        AllocHeader* hdr = (AllocHeader*)raw;
        l25_write_header(hdr, class_idx, site_id);
        g_l25_pool.hits[class_idx]++;
        return (char*)raw + HEADER_SIZE;
    }

    if (!block) {
        // TLS cache empty: choose order by preference (remote-first or run-first)
        if (g_l25_pref_remote_first) {
            // Remote-first: only if ring below trigger and remote likely non-empty
            int shard_idx = hak_l25_pool_get_shard_index(site_id);
            if (g_l25_tls_ring_enabled && ring->top <= g_l25_ring_trigger) {
                // prefetch remote head
                __builtin_prefetch((const void*)&g_l25_pool.remote_head[class_idx][shard_idx], 0, 1);
                int s0 = l25_choose_nonempty_shard(class_idx, shard_idx);
                for (int probe = 0; probe < g_l25_trylock_probes; ++probe) {
                    int s = (s0 + probe) & (L25_NUM_SHARDS - 1);
                    pthread_mutex_t* l = &g_l25_pool.freelist_locks[class_idx][s].m;
                    if (pthread_mutex_trylock(l) == 0) {
                        if (atomic_load_explicit(&g_l25_pool.remote_count[class_idx][s],
                                                 memory_order_relaxed) != 0) {
                            l25_drain_remote_locked(class_idx, s);
                        }
                        L25Block* head = g_l25_pool.freelist[class_idx][s];
                        int to_ring = POOL_L25_RING_CAP - ring->top;
                        if (to_ring < 0) to_ring = 0;
                        while (head && to_ring-- > 0) {
                            L25Block* nxt = head->next;
                            ring->items[ring->top++] = head;
                            head = nxt;
                        }
                        while (head) {
                            L25Block* nxt = head->next;
                            head->next = g_l25_tls_bin[class_idx].lo_head;
                            g_l25_tls_bin[class_idx].lo_head = head;
                            g_l25_tls_bin[class_idx].lo_count++;
                            head = nxt;
                        }
                        g_l25_pool.freelist[class_idx][s] = head;  // NULL after full batch-pop
                        if (!head) clear_nonempty_bit(class_idx, s);
                        pthread_mutex_unlock(l);
                        if (ring->top > 0) {
                            L25Block* tlsb = ring->items[--ring->top];
                            void* rawA = (void*)tlsb;
                            AllocHeader* hdrA = (AllocHeader*)rawA;
                            l25_write_header(hdrA, class_idx, site_id);
                            g_l25_pool.hits[class_idx]++;
                            return (char*)rawA + HEADER_SIZE;
                        }
                    }
                }
            }
            // Fall back to bump-run ActiveRun
            if (g_l25_tls_ring_enabled) {
                HKM_TIME_START(t_l25_alloc_page0);
                int need = POOL_L25_RING_CAP - ring->top;
                if (need < 1) need = POOL_L25_RING_CAP;
                int pushed = l25_refill_tls_from_active(class_idx, ring, need);
                if (pushed == 0) {
                    if (l25_alloc_new_run(class_idx)) {
                        pushed = l25_refill_tls_from_active(class_idx, ring, need);
                    }
                }
                HKM_TIME_END(HKM_CAT_L25_ALLOC_TLS_PAGE, t_l25_alloc_page0);
                if (g_l25_tls_ring_enabled && ring->top > 0) {
                    L25Block* tlsb = ring->items[--ring->top];
                    void* raw0 = (void*)tlsb;
                    AllocHeader* hdr0 = (AllocHeader*)raw0;
                    l25_write_header(hdr0, class_idx, site_id);
                    g_l25_pool.hits[class_idx]++;
                    return (char*)raw0 + HEADER_SIZE;
                }
            }
        } else {
            // Run-first (previous behavior)
            if (g_l25_tls_ring_enabled) {
                HKM_TIME_START(t_l25_alloc_page0);
                int need = POOL_L25_RING_CAP - ring->top;
                if (need < 1) need = POOL_L25_RING_CAP;
                int pushed = l25_refill_tls_from_active(class_idx, ring, need);
                if (pushed == 0) {
                    if (l25_alloc_new_run(class_idx)) {
                        pushed = l25_refill_tls_from_active(class_idx, ring, need);
                    }
                }
                HKM_TIME_END(HKM_CAT_L25_ALLOC_TLS_PAGE, t_l25_alloc_page0);
                if (g_l25_tls_ring_enabled && ring->top > 0) {
                    L25Block* tlsb = ring->items[--ring->top];
                    void* raw0 = (void*)tlsb;
                    AllocHeader* hdr0 = (AllocHeader*)raw0;
                    l25_write_header(hdr0, class_idx, site_id);
                    g_l25_pool.hits[class_idx]++;
                    return (char*)raw0 + HEADER_SIZE;
                }
            }
        }

        // TLS cache still empty, refill from global freelist (slow path)
        int shard_idx = hak_l25_pool_get_shard_index(site_id);

        // Try batch-steal via trylock to fill TLS ring; drain remote under lock
        if (g_l25_tls_ring_enabled) {
            int s0 = l25_choose_nonempty_shard(class_idx, shard_idx);
            for (int probe = 0; probe < g_l25_trylock_probes; ++probe) {
                int s = (s0 + probe) & (L25_NUM_SHARDS - 1);
                pthread_mutex_t* l = &g_l25_pool.freelist_locks[class_idx][s].m;
                if (pthread_mutex_trylock(l) == 0) {
                    if (atomic_load_explicit(&g_l25_pool.remote_count[class_idx][s],
                                             memory_order_relaxed) != 0) {
                        l25_drain_remote_locked(class_idx, s);
                    }
                    L25Block* head = g_l25_pool.freelist[class_idx][s];
                    int to_ring = POOL_L25_RING_CAP - ring->top;
                    if (to_ring < 0) to_ring = 0;
                    while (head && to_ring-- > 0) {
                        L25Block* nxt = head->next;
                        ring->items[ring->top++] = head;
                        head = nxt;
                    }
                    while (head) {
                        L25Block* nxt = head->next;
                        head->next = g_l25_tls_bin[class_idx].lo_head;
                        g_l25_tls_bin[class_idx].lo_head = head;
                        g_l25_tls_bin[class_idx].lo_count++;
                        head = nxt;
                    }
                    g_l25_pool.freelist[class_idx][s] = head;  // NULL after full batch-pop
                    if (!head) clear_nonempty_bit(class_idx, s);
                    pthread_mutex_unlock(l);
                    if (ring->top > 0) {
                        L25Block* tlsb = ring->items[--ring->top];
                        void* raw = (void*)tlsb;
                        AllocHeader* hdr = (AllocHeader*)raw;
                        memcpy(hdr, &g_header_templates[class_idx], sizeof(AllocHeader));
                        if (!g_hdr_light_enabled) {
                            hdr->alloc_site = site_id;
                            hdr->owner_tid = (uintptr_t)pthread_self();
                        }
                        g_l25_pool.hits[class_idx]++;
                        return (char*)raw + HEADER_SIZE;
                    }
                }
            }
        }

        // Try to pop from global freelist (lock shard)
        pthread_mutex_t* lock = &g_l25_pool.freelist_locks[class_idx][shard_idx].m;
        struct timespec ts_lk1;
        int lk1 = hkm_prof_begin(&ts_lk1);
        HKM_TIME_START(t_l25_lock);
        pthread_mutex_lock(lock);
        HKM_TIME_END(HKM_CAT_L25_LOCK, t_l25_lock);
        hkm_prof_end(lk1, HKP_L25_LOCK, &ts_lk1);

        if (atomic_load_explicit(&g_l25_pool.remote_count[class_idx][shard_idx],
                                 memory_order_relaxed) != 0) {
            l25_drain_remote_locked(class_idx, shard_idx);
        }
        block = g_l25_pool.freelist[class_idx][shard_idx];
        if (!block) {
            // Try simple shard steal if over Soft CAP (avoid over-refill)
            int stole = 0;
            const FrozenPolicy* pol = hkm_policy_get();
            if (pol) {
                uint16_t cap = pol->large_cap[class_idx];
                if (cap > 0 && g_l25_pool.bundles_by_class[class_idx] >= cap) {
                    // probe ±1..2 neighboring shards
                    for (int d = 1; d <= 2 && !stole; d++) {
                        int s1 = (shard_idx + d) & (L25_NUM_SHARDS - 1);
                        int s2 = (shard_idx - d) & (L25_NUM_SHARDS - 1);
                        if (is_shard_nonempty(class_idx, s1)) {
                            pthread_mutex_t* l2 = &g_l25_pool.freelist_locks[class_idx][s1].m;
                            pthread_mutex_lock(l2);
                            L25Block* b2 = g_l25_pool.freelist[class_idx][s1];
                            if (b2) {
                                g_l25_pool.freelist[class_idx][s1] = b2->next;
                                if (!g_l25_pool.freelist[class_idx][s1]) clear_nonempty_bit(class_idx, s1);
                                block = b2;
                                stole = 1;
                            }
                            pthread_mutex_unlock(l2);
                        }
                        if (!stole && is_shard_nonempty(class_idx, s2)) {
                            pthread_mutex_t* l3 = &g_l25_pool.freelist_locks[class_idx][s2].m;
                            pthread_mutex_lock(l3);
                            L25Block* b3 = g_l25_pool.freelist[class_idx][s2];
                            if (b3) {
                                g_l25_pool.freelist[class_idx][s2] = b3->next;
                                if (!g_l25_pool.freelist[class_idx][s2]) clear_nonempty_bit(class_idx, s2);
                                block = b3;
                                stole = 1;
                            }
                            pthread_mutex_unlock(l3);
                        }
                    }
                }
            }
            if (!stole && !block) {
                // Global freelist empty or no steal, allocate new bundle
                {
                    struct timespec ts_rf;
                    int rf = hkm_prof_begin(&ts_rf);
                    HKM_TIME_START(t_l25_refill);
                    int ok = refill_freelist(class_idx, shard_idx);
                    HKM_TIME_END(HKM_CAT_L25_REFILL, t_l25_refill);
                    hkm_prof_end(rf, HKP_L25_REFILL, &ts_rf);
                    if (!ok) {
                        g_l25_pool.misses[class_idx]++;
                        pthread_mutex_unlock(lock);
                        return NULL;  // Out of memory
                    }
                }
                // Try again after refill
                block = g_l25_pool.freelist[class_idx][shard_idx];
                if (!block) {
                    g_l25_pool.misses[class_idx]++;
                    pthread_mutex_unlock(lock);
                    return NULL;  // Refill failed
                }
            }
        }

        // Batch-pop under lock: move many blocks to TLS (ring first, then LIFO)
        L25Block* head2 = g_l25_pool.freelist[class_idx][shard_idx];
        if (head2) {
            int to_ring2 = POOL_L25_RING_CAP - ring->top;
            if (to_ring2 < 0) to_ring2 = 0;
            L25Block* h = head2;
            // Fill ring
            while (h && to_ring2-- > 0) {
                L25Block* nxt = h->next;
                // update owner for same-thread hint
                l25_desc_update_owner((void*)h, (uintptr_t)pthread_self());
                ring->items[ring->top++] = h;
                h = nxt;
            }
            // Fill local LIFO
            while (h) {
                L25Block* nxt = h->next;
                h->next = g_l25_tls_bin[class_idx].lo_head;
                g_l25_tls_bin[class_idx].lo_head = h;
                g_l25_tls_bin[class_idx].lo_count++;
                h = nxt;
            }
            // Shard freelist becomes empty after batch-pop
            g_l25_pool.freelist[class_idx][shard_idx] = NULL;
            clear_nonempty_bit(class_idx, shard_idx);
        }
        pthread_mutex_unlock(lock);

        // Fast return if ring gained items
        if (g_l25_tls_ring_enabled && ring->top > 0) {
            L25Block* tlsb = ring->items[--ring->top];
            void* raw2 = (void*)tlsb;
            AllocHeader* hdr2 = (AllocHeader*)raw2;
            memcpy(hdr2, &g_header_templates[class_idx], sizeof(AllocHeader));
            if (!g_hdr_light_enabled) {
                hdr2->alloc_site = site_id;
                hdr2->owner_tid = (uintptr_t)pthread_self();
            }
            g_l25_pool.hits[class_idx]++;
            return (char*)raw2 + HEADER_SIZE;
        }

        // Or pop from local LIFO if available
        if (g_l25_tls_bin[class_idx].lo_head) {
            L25Block* b2 = g_l25_tls_bin[class_idx].lo_head;
            g_l25_tls_bin[class_idx].lo_head = b2->next;
            if (g_l25_tls_bin[class_idx].lo_count) g_l25_tls_bin[class_idx].lo_count--;
            void* raw3 = (void*)b2;
            AllocHeader* hdr3 = (AllocHeader*)raw3;
            l25_write_header(hdr3, class_idx, site_id);
            g_l25_pool.hits[class_idx]++;
            return (char*)raw3 + HEADER_SIZE;
        }
    }

    // Push to TLS and return one
    if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) {
        ring->items[ring->top++] = block;
    } else {
        block->next = g_l25_tls_bin[class_idx].lo_head;
        g_l25_tls_bin[class_idx].lo_head = block;
        g_l25_tls_bin[class_idx].lo_count++;
    }

    L25Block* take;
    if (g_l25_tls_ring_enabled && ring->top > 0) {
        HKM_TIME_START(t_l25_ring_pop1);
        take = ring->items[--ring->top];
        HKM_TIME_END(HKM_CAT_L25_TLS_RING_POP, t_l25_ring_pop1);
    } else {
        HKM_TIME_START(t_l25_lifo_pop0);
        take = g_l25_tls_bin[class_idx].lo_head;
        if (take) {
            g_l25_tls_bin[class_idx].lo_head = take->next;
            if (g_l25_tls_bin[class_idx].lo_count) g_l25_tls_bin[class_idx].lo_count--;
        }
        HKM_TIME_END(HKM_CAT_L25_TLS_LIFO_POP, t_l25_lifo_pop0);
    }
    void* raw = (void*)take;
    AllocHeader* hdr = (AllocHeader*)raw;
    l25_write_header(hdr, class_idx, site_id);
    g_l25_pool.hits[class_idx]++;
    return (char*)raw + HEADER_SIZE;
}

void hak_l25_pool_free(void* ptr, size_t size, uintptr_t site_id) {
    if (!ptr) return;
    if (!g_l25_pool.initialized) return;
    if (!hak_l25_pool_is_poolable(size)) return;

    // ptr is user pointer, get raw pointer (header start) - L2 Pool pattern
    void* raw = (char*)ptr - HEADER_SIZE;

    // Validate header
    AllocHeader* hdr = (AllocHeader*)raw;
    if (hdr->magic != HAKMEM_MAGIC) {
        extern int g_invalid_free_log;  // from hakmem.c
        if (g_invalid_free_log) {
            fprintf(stderr, "[L2.5] ERROR: Invalid magic 0x%X in l25_pool_free, expected 0x%X\n",
                    hdr->magic, HAKMEM_MAGIC);
        }
        return;  // Skip free (corruption detected)
    }
    if (hdr->method != ALLOC_METHOD_L25_POOL) {
        extern int g_invalid_free_log;  // from hakmem.c
        if (g_invalid_free_log) {
            fprintf(stderr, "[L2.5] ERROR: Wrong method %d in l25_pool_free, expected L25_POOL\n",
                    hdr->method);
        }
        return;  // Skip free (not an L2.5 allocation)
    }

    // Get class index
    int class_idx = hak_l25_pool_get_class_index(size);
    if (class_idx < 0) return;

    // Optional: demand-zero large classes (512KB/1MB) to reduce future soft-fault cost
    if (g_l25_pool.demand_zero && class_idx >= 3) {
        (void)ss_os_madvise_guarded((char*)raw, HEADER_SIZE + size, MADV_DONTNEED,
                                    "l25_pool_dontneed_size");
    }

    // Same-thread hint via header owner (if light header present)
    uintptr_t self = (uintptr_t)pthread_self();
    if (g_hdr_light_enabled >= 1 && hdr->owner_tid == self) {
        L25TLSRing* ring = &g_l25_tls_bin[class_idx].ring;
        if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) {
            ring->items[ring->top++] = (L25Block*)raw;
        } else {
            L25Block* b = (L25Block*)raw;
            b->next = g_l25_tls_bin[class_idx].lo_head;
            g_l25_tls_bin[class_idx].lo_head = b;
            g_l25_tls_bin[class_idx].lo_count++;
        }
    } else {
        // Cross-thread path: owner inbound or TC
        uintptr_t owner = 0;
        if (g_hdr_light_enabled >= 1) owner = hdr->owner_tid;
        if (g_l25_owner_inbound && owner != 0) {
            int slot = inbound_get_slot(owner);
            if (slot >= 0) {
                inbound_push_block(slot, class_idx, (L25Block*)raw);
            } else {
                int shard = hak_l25_pool_get_shard_index(site_id);
                L25Block* block = (L25Block*)raw;
                l25_tc_append(class_idx, block);
                if (g_l25_tc_spill > 0 && g_l25_tc[class_idx].count >= g_l25_tc_spill) {
                    l25_tc_flush(class_idx, shard);
                }
            }
        } else {
            int shard = hak_l25_pool_get_shard_index(site_id);
            L25Block* block = (L25Block*)raw;
            l25_tc_append(class_idx, block);
            if (g_l25_tc_spill > 0 && g_l25_tc[class_idx].count >= g_l25_tc_spill) {
                l25_tc_flush(class_idx, shard);
            }
        }
    }
    g_l25_pool.frees[class_idx]++;
}

// ---------------------------------------------------------------------------
// Runtime tuning setters (exposed via HAKX tuner)
// ---------------------------------------------------------------------------
void hak_l25_set_run_factor(int v)        { if (v >= 1 && v <= 8)    g_l25_run_factor = v; }
void hak_l25_set_remote_threshold(int v)  { if (v >= 1 && v <= 4096) g_l25_remote_threshold = v; }
void hak_l25_set_bg_remote_batch(int v)   { if (v >= 1 && v <= 4096) g_l25_bg_remote_batch = v; }
void hak_l25_set_bg_remote_enable(int on) { g_l25_bg_remote_enable = (on != 0); }
void hak_l25_set_pref_remote_first(int remote_first) { g_l25_pref_remote_first = (remote_first != 0); }

void hak_l25_pool_print_stats(void) {
    if (!g_l25_pool.initialized) return;

    printf("\n");
    printf("========================================\n");
    printf("L2.5 Pool Statistics (LargePool)\n");
    printf("========================================\n");

    const char* class_names[L25_NUM_CLASSES] = { "64KB", "128KB", "256KB", "512KB", "1MB" };
    for (int i = 0; i < L25_NUM_CLASSES; i++) {
        uint64_t total = g_l25_pool.hits[i] + g_l25_pool.misses[i];
        double hit_rate = (total > 0) ? (100.0 * g_l25_pool.hits[i] / total) : 0.0;
        printf("Class %-6s: hits=%7lu misses=%7lu refills=%7lu frees=%7lu (%.1f%% hit)\n",
               class_names[i],
               (unsigned long)g_l25_pool.hits[i],
               (unsigned long)g_l25_pool.misses[i],
               (unsigned long)g_l25_pool.refills[i],
               (unsigned long)g_l25_pool.frees[i],
               hit_rate);
    }
    printf("----------------------------------------\n");
    printf("Total bytes allocated:   %lu MB\n",
           (unsigned long)(g_l25_pool.total_bytes_allocated / (1024 * 1024)));
    printf("Total bundles allocated: %lu\n",
           (unsigned long)g_l25_pool.total_bundles_allocated);
    printf("========================================\n");
}

void hak_l25_pool_stats_snapshot(uint64_t hits[], uint64_t misses[],
                                 uint64_t refills[], uint64_t frees[]) {
    if (!g_l25_pool.initialized) {
        for (int i = 0; i < L25_NUM_CLASSES; i++) {
            if (hits)    hits[i] = 0;
            if (misses)  misses[i] = 0;
            if (refills) refills[i] = 0;
            if (frees)   frees[i] = 0;
        }
        return;
    }
    for (int i = 0; i < L25_NUM_CLASSES; i++) {
        if (hits)    hits[i] = g_l25_pool.hits[i];
        if (misses)  misses[i] = g_l25_pool.misses[i];
        if (refills) refills[i] = g_l25_pool.refills[i];
        if (frees)   frees[i] = g_l25_pool.frees[i];
    }
}
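// Illustrative usage sketch (guarded out of the build): an allocate/free round
// trip through the public API above. The site_id argument is normally a
// call-site address supplied by the wrapper layer; here the caller's own
// function address stands in for it. This assumes hak_l25_pool_is_poolable()
// accepts 256 KB, as the class table at the top of this file documents.
#if 0
static void l25_usage_example(void)
{
    hak_l25_pool_init();
    uintptr_t site = (uintptr_t)&l25_usage_example;

    void* p = hak_l25_pool_try_alloc(256 * 1024, site);  // class 2 (256 KB)
    if (p) {
        size_t sz = 0;
        if (hak_l25_lookup(p, &sz)) {
            // sz == 256 KB: the page descriptors identify the block without reading its header.
        }
        hak_l25_pool_free(p, 256 * 1024, site);
    }
    hak_l25_pool_print_stats();
}
#endif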