// hakmem/core/pool_tls_arena.c
#include "pool_tls_arena.h"
#include "pool_tls.h" // For POOL_HEADER_SIZE, POOL_USE_HEADERS
#include "page_arena.h" // Phase 24: PageArena integration
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <stdatomic.h>
// TLS storage (automatically zero-initialized)
__thread PoolChunk g_tls_arena[POOL_SIZE_CLASSES];
int g_arena_max_growth_level = 3; // 0:1MB,1:2MB,2:4MB,3:8MB
size_t g_arena_initial_chunk_size = (size_t)1 << 20; // 1MB
static pthread_once_t g_arena_cfg_once = PTHREAD_ONCE_INIT;
static void arena_read_env(void) {
    const char* s_init = getenv("HAKMEM_POOL_TLS_ARENA_MB_INIT");
    const char* s_max  = getenv("HAKMEM_POOL_TLS_ARENA_MB_MAX");
    const char* s_gl   = getenv("HAKMEM_POOL_TLS_ARENA_GROWTH_LEVELS");
    if (s_init) { long v = atol(s_init); if (v >= 1 && v <= 64) g_arena_initial_chunk_size = (size_t)v << 20; }
    if (s_max) {
        long v = atol(s_max);
        if (v >= 1 && v <= 1024) {
            // Derive the growth level needed to reach the requested maximum chunk size.
            size_t max_bytes = (size_t)v << 20;
            size_t sz = g_arena_initial_chunk_size;
            int lvl = 0;
            while (sz < max_bytes && lvl < 30) { sz <<= 1; lvl++; }
            g_arena_max_growth_level = lvl;
            if (g_arena_max_growth_level < 0) g_arena_max_growth_level = 0;
        }
    }
    if (s_gl) { long v = atol(s_gl); if (v >= 0 && v <= 30) g_arena_max_growth_level = (int)v; }
}
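// Example (illustrative, not a recommendation): with
//   HAKMEM_POOL_TLS_ARENA_MB_INIT=2 HAKMEM_POOL_TLS_ARENA_MB_MAX=16
// the initial chunk is 2MB and the growth level is derived as 3
// (2MB -> 4MB -> 8MB -> 16MB). HAKMEM_POOL_TLS_ARENA_GROWTH_LEVELS,
// if set, overrides the derived level.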
// External imports (from pool config)
extern const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES];
// Debug stats
#ifdef POOL_TLS_ARENA_DEBUG
static __thread struct {
    uint64_t mmap_calls;
    uint64_t total_carved;
    uint64_t chunk_exhaustions;
} g_arena_stats;
#endif
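// (Sketch) To inspect these counters, build with POOL_TLS_ARENA_DEBUG defined
// (e.g. -DPOOL_TLS_ARENA_DEBUG) and call arena_print_stats() from the thread
// of interest; the struct is __thread, so each thread sees only its own counts.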
// Ensure chunk has space for at least 'needed' bytes
// Returns 0 on success, -1 on mmap failure
static int chunk_ensure(PoolChunk* chunk, size_t needed) {
    // Check if current chunk has space
    if (chunk->chunk_base && (chunk->offset + needed <= chunk->chunk_size)) {
        return 0; // Space available
    }
    // Phase 24: Ensure PageArena is initialized before first use
    if (page_arena_enabled() && g_page_arena.hot.pages == NULL) {
        page_arena_init(&g_page_arena);
    }
    // Need a new chunk - calculate size with exponential growth
    pthread_once(&g_arena_cfg_once, arena_read_env);
    size_t new_size;
    if (chunk->growth_level >= g_arena_max_growth_level) {
        new_size = g_arena_initial_chunk_size << g_arena_max_growth_level;
    } else {
        new_size = g_arena_initial_chunk_size << chunk->growth_level;
        chunk->growth_level++;
    }
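    // With the defaults (1MB initial, max level 3) a class's chunks grow
    // 1MB -> 2MB -> 4MB -> 8MB and then stay at 8MB for every later refill.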
    // CRITICAL FIX: DO NOT munmap the old chunk!
    // Reason: live allocations may still point into it. An exhausted chunk is
    // intentionally left mapped (leaked); only the current chunk per size class
    // is released in arena_cleanup_thread(). This is standard arena behavior -
    // grow but never shrink.
    //
    // REMOVED BUGGY CODE:
    //   if (chunk->chunk_base) {
    //       munmap(chunk->chunk_base, chunk->chunk_size); // ← SEGV! Live ptrs exist!
    //   }
#ifdef POOL_TLS_ARENA_DEBUG
    if (chunk->chunk_base) {
        g_arena_stats.chunk_exhaustions++;
    }
#endif
    // Phase 24: Try PageArena first, fall back to mmap
    void* new_base = page_arena_alloc_pages(&g_page_arena, new_size);
    if (!new_base) {
        // PageArena cache miss → fall back to mmap
        new_base = mmap(NULL, new_size, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    }
    if (new_base == MAP_FAILED || new_base == NULL) {
        // DEBUG: Log allocation failure details
        static _Atomic int alloc_fail_count = 0;
        int fail_num = atomic_fetch_add(&alloc_fail_count, 1);
        if (fail_num < 10) {
fprintf(stderr, "[POOL_ARENA] alloc FAILED: new_size=%zu MB, growth_level=%d, errno=%d\n",
new_size / (1024*1024), chunk->growth_level, errno);
}
return -1; // OOM
}
#ifdef POOL_TLS_ARENA_DEBUG
g_arena_stats.mmap_calls++;
#endif
// Register range for owner resolution
pid_t tid = (pid_t)syscall(SYS_gettid);
pool_reg_register(new_base, new_size, tid, -1); // class-less at arena level
chunk->chunk_base = new_base;
chunk->chunk_size = new_size;
chunk->offset = 0;
return 0;
}
// Carve blocks from TLS Arena
int arena_batch_carve(int class_idx, void** out_blocks, int count) {
    if (class_idx < 0 || class_idx >= POOL_SIZE_CLASSES) {
        return 0; // Invalid class
    }
    PoolChunk* chunk = &g_tls_arena[class_idx];
    size_t block_size = POOL_CLASS_SIZES[class_idx];
    // Calculate allocation size with header space
#if POOL_USE_HEADERS
    size_t alloc_size = block_size + POOL_HEADER_SIZE;
#else
    size_t alloc_size = block_size;
#endif
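    // Layout per carved block when POOL_USE_HEADERS is enabled (illustrative):
    //   [ POOL_HEADER_SIZE bytes header | block_size bytes payload ]
    // The pointer handed back in out_blocks[] is the payload address, i.e. the
    // byte just past the header.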
    // Ensure chunk has space for all blocks
    size_t needed = alloc_size * count;
    if (chunk_ensure(chunk, needed) != 0) {
        // DEBUG: Log chunk_ensure failure
        static _Atomic int ensure_fail_count = 0;
        int fail_num = atomic_fetch_add(&ensure_fail_count, 1);
        if (fail_num < 10) {
            fprintf(stderr, "[POOL_ARENA] chunk_ensure FAILED: class=%d, block_size=%zu, count=%d, needed=%zu\n",
                    class_idx, block_size, count, needed);
        }
        return 0; // OOM
    }
    // Carve blocks from chunk
    int carved = 0;
    for (int i = 0; i < count; i++) {
        if (chunk->offset + alloc_size > chunk->chunk_size) {
            break; // Chunk exhausted (shouldn't happen after ensure)
        }
        // Return pointer AFTER header space
        out_blocks[i] = (char*)chunk->chunk_base + chunk->offset
#if POOL_USE_HEADERS
            + POOL_HEADER_SIZE
#endif
            ;
        chunk->offset += alloc_size;
        carved++;
#ifdef POOL_TLS_ARENA_DEBUG
        g_arena_stats.total_carved++;
#endif
    }
    return carved;
}
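// Usage sketch (hypothetical caller; class index 3 chosen only for illustration):
//   void* blocks[16];
//   int n = arena_batch_carve(3, blocks, 16);
//   // n may be less than 16 on OOM; blocks[0..n-1] are payload pointers
//   // (any POOL_HEADER_SIZE prefix is already skipped).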
// Thread cleanup. Note: a destructor attribute runs once at process exit, not
// per thread; threads that need their chunks returned earlier must call
// arena_cleanup_thread() explicitly on their own exit path.
static void __attribute__((destructor)) arena_cleanup(void) {
    arena_cleanup_thread();
}
void arena_cleanup_thread(void) {
    for (int i = 0; i < POOL_SIZE_CLASSES; i++) {
        PoolChunk* chunk = &g_tls_arena[i];
        if (chunk->chunk_base) {
            pid_t tid = (pid_t)syscall(SYS_gettid);
            pool_reg_unregister(chunk->chunk_base, chunk->chunk_size, tid);
            // Phase 24: Return to PageArena if enabled
            if (page_arena_enabled()) {
                page_arena_free_pages(&g_page_arena, chunk->chunk_base, chunk->chunk_size);
            } else {
                munmap(chunk->chunk_base, chunk->chunk_size);
            }
            chunk->chunk_base = NULL;
        }
    }
}
#ifdef POOL_TLS_ARENA_DEBUG
#include <inttypes.h>
void arena_print_stats(void) {
    printf("[Pool TLS Arena Stats]\n");
    printf("  mmap calls:        %" PRIu64 "\n", g_arena_stats.mmap_calls);
    printf("  blocks carved:     %" PRIu64 "\n", g_arena_stats.total_carved);
    printf("  chunk exhaustions: %" PRIu64 "\n", g_arena_stats.chunk_exhaustions);
}
#endif