#include "pool_tls.h" #include #include #include #include #include #include #include "pool_tls_registry.h" #ifdef HAKMEM_POOL_TLS_BIND_BOX #include "pool_tls_bind.h" #else // gettid_cached is defined in pool_tls_bind.h when BIND_BOX is enabled static inline pid_t gettid_cached(void){ static __thread pid_t t=0; if (__builtin_expect(t==0,0)) t=(pid_t)syscall(SYS_gettid); return t; } #endif #include // Class sizes: 8KB, 16KB, 24KB, 32KB, 40KB, 48KB, 52KB const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES] = { 8192, 16384, 24576, 32768, 40960, 49152, 53248 }; // TLS state (per-thread) __thread void* g_tls_pool_head[POOL_SIZE_CLASSES]; __thread uint32_t g_tls_pool_count[POOL_SIZE_CLASSES]; // Phase 1.5b: Lazy pre-warm flag (per-thread) #ifdef HAKMEM_POOL_TLS_PREWARM __thread int g_tls_pool_prewarmed = 0; #endif // Fixed refill counts (Phase 1: no learning) static const uint32_t DEFAULT_REFILL_COUNT[POOL_SIZE_CLASSES] = { 64, 48, 32, 32, 24, 16, 16 // Larger classes = smaller refill }; // Pre-warm counts optimized for memory usage (Phase 1.5b) // Total memory: ~1.6MB per thread // Hot classes (8-24KB): 16 blocks - common in real workloads // Warm classes (32-40KB): 8 blocks // Cold classes (48-52KB): 4 blocks - rare static const int PREWARM_COUNTS[POOL_SIZE_CLASSES] = { 16, 16, 12, // Hot: 8KB, 16KB, 24KB 8, 8, // Warm: 32KB, 40KB 4, 4 // Cold: 48KB, 52KB }; // Forward declare refill function (from Box 2) extern void* pool_refill_and_alloc(int class_idx); // Size to class mapping static inline int pool_size_to_class(size_t size) { // Binary search would be overkill for 7 classes // Simple linear search with early exit if (size <= 8192) return 0; if (size <= 16384) return 1; if (size <= 24576) return 2; if (size <= 32768) return 3; if (size <= 40960) return 4; if (size <= 49152) return 5; if (size <= 53248) return 6; return -1; // Too large for Pool } // Ultra-fast allocation (5-6 cycles) void* pool_alloc(size_t size) { // Phase 1.5b: Lazy pre-warm on first allocation per thread #ifdef HAKMEM_POOL_TLS_PREWARM if (__builtin_expect(!g_tls_pool_prewarmed, 0)) { g_tls_pool_prewarmed = 1; // Set flag FIRST to prevent recursion! 
// Ultra-fast allocation (5-6 cycles)
void* pool_alloc(size_t size) {
    // Phase 1.5b: Lazy pre-warm on first allocation per thread
#ifdef HAKMEM_POOL_TLS_PREWARM
    if (__builtin_expect(!g_tls_pool_prewarmed, 0)) {
        g_tls_pool_prewarmed = 1;  // Set flag FIRST to prevent recursion!
        pool_tls_prewarm();        // Pre-populate TLS caches
    }
#endif

    // Quick bounds check
    if (size < 8192 || size > 53248) {
#if !HAKMEM_BUILD_RELEASE
        static _Atomic int debug_reject_count = 0;
        int reject_num = atomic_fetch_add(&debug_reject_count, 1);
        if (reject_num < 20) {
            fprintf(stderr, "[POOL_TLS_REJECT] size=%zu (out of bounds 8192-53248)\n", size);
        }
#endif
        return NULL;
    }

    int class_idx = pool_size_to_class(size);
    if (class_idx < 0) return NULL;

    // Drain a small batch of remote frees for this class
    void* chain = NULL;
    int drained = pool_remote_pop_chain(class_idx, 32, &chain);
    if (drained > 0 && chain) {
        // Splice into TLS freelist
        void* tail = chain;
        int n = 1;
        while (*(void**)tail) { tail = *(void**)tail; n++; }
        *(void**)tail = g_tls_pool_head[class_idx];
        g_tls_pool_head[class_idx] = chain;
        g_tls_pool_count[class_idx] += n;
    }

    void* head = g_tls_pool_head[class_idx];
    if (__builtin_expect(head != NULL, 1)) {  // LIKELY
        // Pop from freelist (3-4 instructions)
        g_tls_pool_head[class_idx] = *(void**)head;
        g_tls_pool_count[class_idx]--;
#if POOL_USE_HEADERS
        // Write header (1 byte before ptr)
        *((uint8_t*)head - POOL_HEADER_SIZE) = POOL_MAGIC | class_idx;
#endif
        // Low-water integration: if TLS count is low, opportunistically drain remotes
        if (g_tls_pool_count[class_idx] < 4) {
            void* chain2 = NULL;
            int got = pool_remote_pop_chain(class_idx, 32, &chain2);
            if (got > 0 && chain2) {
                void* tail = chain2;
                while (*(void**)tail) tail = *(void**)tail;
                *(void**)tail = g_tls_pool_head[class_idx];
                g_tls_pool_head[class_idx] = chain2;
                g_tls_pool_count[class_idx] += got;
            }
        }
        return head;
    }

    // Cold path: refill
    void* refill_ret = pool_refill_and_alloc(class_idx);
    if (!refill_ret) {
        // DEBUG: Log refill failure
        static _Atomic int refill_fail_count = 0;
        int fail_num = atomic_fetch_add(&refill_fail_count, 1);
        if (fail_num < 10) {
            fprintf(stderr, "[POOL_TLS] pool_refill_and_alloc FAILED: class=%d, size=%zu\n",
                    class_idx, POOL_CLASS_SIZES[class_idx]);
        }
    }
    return refill_ret;
}

// Ultra-fast free (5-6 cycles)
void pool_free(void* ptr) {
    if (!ptr) return;

#if POOL_USE_HEADERS
    // Read class from header
    uint8_t header = *((uint8_t*)ptr - POOL_HEADER_SIZE);
    if ((header & 0xF0) != POOL_MAGIC) {
        // Not ours, route elsewhere
        return;
    }
    int class_idx = header & 0x0F;
    if (class_idx >= POOL_SIZE_CLASSES) return;  // Invalid class
#else
    // Need registry lookup (slower fallback) - not implemented in Phase 1
    return;
#endif

    // Owner resolution via page registry
    pid_t owner_tid = 0;
    int reg_cls = -1;
    if (pool_reg_lookup(ptr, &owner_tid, &reg_cls)) {
#ifdef HAKMEM_POOL_TLS_BIND_BOX
        // POOL_TLS_BIND_BOX: Fast TID comparison (no repeated gettid syscalls)
        if (!pool_tls_is_mine_tid(owner_tid)) {
            // Cross-thread free
            (void)pool_remote_push(class_idx, ptr, owner_tid);
            return;
        }
        // Same-thread: Continue to fast free path below
#else
        // Original gettid comparison
        pid_t me = gettid_cached();
        if (owner_tid != me) {
            // Cross-thread free
            (void)pool_remote_push(class_idx, ptr, owner_tid);
            return;
        }
#endif
    }

    // Same-thread: Push to TLS freelist (2-3 instructions)
    *(void**)ptr = g_tls_pool_head[class_idx];
    g_tls_pool_head[class_idx] = ptr;
    g_tls_pool_count[class_idx]++;
    // Phase 1: No drain logic (keep it simple)
}
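// Illustration (added; not in the original source): the 1-byte header that
// pool_alloc() writes and pool_free() reads, assuming POOL_MAGIC occupies the
// high nibble and the class index the low nibble, as the masks above imply:
//
//   bits 7..4 : POOL_MAGIC   (validated with `(header & 0xF0) != POOL_MAGIC`)
//   bits 3..0 : class index  (0..6, extracted with `header & 0x0F`)
//
// Caller-side round trip under those assumptions:
//   void* p = pool_alloc(16384);                   // 16KB -> class 1
//   uint8_t h = *((uint8_t*)p - POOL_HEADER_SIZE); // read back the header
//   // (h & 0xF0) == POOL_MAGIC, (h & 0x0F) == 1
//   pool_free(p);                                  // same-thread: TLS push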
// Install refilled chain (called by Box 2)
// Note: this overwrites the current head, so callers are expected to invoke
// it only when the class's TLS freelist is empty (refill cold path / pre-warm).
void pool_install_chain(int class_idx, void* chain, int count) {
    if (class_idx < 0 || class_idx >= POOL_SIZE_CLASSES) return;
    g_tls_pool_head[class_idx]  = chain;
    g_tls_pool_count[class_idx] = count;
}

// Get refill count for a class
int pool_get_refill_count(int class_idx) {
    if (class_idx < 0 || class_idx >= POOL_SIZE_CLASSES) return 0;
    return DEFAULT_REFILL_COUNT[class_idx];
}

// Thread init/cleanup
void pool_thread_init(void) {
    memset(g_tls_pool_head, 0, sizeof(g_tls_pool_head));
    memset(g_tls_pool_count, 0, sizeof(g_tls_pool_count));
}

void pool_thread_cleanup(void) {
    // Phase 1: No cleanup (keep it simple)
    // TODO: Drain back to global pool
}

// Pre-warm TLS cache (Phase 1.5b optimization)
// Eliminates cold-start penalty by pre-populating TLS freelists
// Expected improvement: +180-740% (based on Phase 7 Task 3 success)
void pool_tls_prewarm(void) {
    // Batch-carve function (from Box 2)
    extern void* backend_batch_carve(int class_idx, int count);

    for (int class_idx = 0; class_idx < POOL_SIZE_CLASSES; class_idx++) {
        int count = PREWARM_COUNTS[class_idx];
        // Directly refill TLS cache (bypass alloc/free during init)
        // This avoids issues with g_initializing=1 affecting routing
        void* chain = backend_batch_carve(class_idx, count);
        if (chain) {
            // Install entire chain directly into TLS
            pool_install_chain(class_idx, chain, count);
        }
        // If OOM, continue with other classes (graceful degradation)
    }
}
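// Usage sketch (added; not in the original source): how a worker thread might
// drive this pool, assuming pool_thread_init()/pool_thread_cleanup() are the
// intended per-thread entry points. Guarded out of the build since it is only
// a sketch; pool_tls_example_worker is a hypothetical name.
#if 0
#include <pthread.h>

static void* pool_tls_example_worker(void* arg) {
    (void)arg;
    pool_thread_init();           // zero this thread's TLS freelists
    void* p = pool_alloc(24576);  // class 2; the first call also triggers the
                                  // lazy pre-warm under HAKMEM_POOL_TLS_PREWARM
    if (p) pool_free(p);          // same-thread free: 2-3 instruction TLS push
    pool_thread_cleanup();        // Phase 1: no-op placeholder
    return NULL;
}
#endif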