Tiny: fix header/stride mismatch and harden refill paths
- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte header during allocation, but linear carve/refill and the initial slab capacity still used bare class block sizes. This mismatch could overrun the slab's usable space and corrupt freelists, causing a reproducible SEGV at ~100k iterations.

Changes
- Superslab: compute capacity with the effective stride (block_size + header for classes 0..6; class 7 remains headerless) in superslab_init_slab(). Add a debug-only bound check in superslab_alloc_from_slab() to fail fast if a carve would exceed usable bytes.
- Refill (non-P0 and P0): use the header-aware stride for all linear carving and TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h also uses the stride, not the raw class size.
- Drain: keep the existing defense-in-depth for the remote sentinel and sanitize nodes before splicing them into the freelist (already present).

Notes
- This unifies the memory layout across alloc/linear-carve/refill under a single stride definition and keeps class 7 (1024B) headerless as designed.
- Debug builds add fail-fast checks; release builds remain lean.

Next
- Re-run the Tiny benches (256/1024B) in debug to confirm stability, then in release. If any crash persists, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0 to isolate the P0 batch carve, and continue reducing branch misses as planned.
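As a minimal sketch of the stride rule this commit standardizes on (identifiers here are illustrative, not the repo's actual symbols): classes 0..6 pay the 1-byte header, class 7 stays headerless, and slab capacity is derived from the effective stride rather than the bare block size.

#include <stddef.h>
#include <stdint.h>

#define TINY_HEADER_BYTES 1  /* HEADER_CLASSIDX=1: 1-byte class header (assumed name) */

/* Effective stride: what linear carve/refill must advance by per block. */
static inline size_t tiny_stride(int cls, size_t block_size) {
    return (cls == 7) ? block_size                      /* class 7 (1024B) is headerless */
                      : block_size + TINY_HEADER_BYTES;
}

/* Capacity must be computed from the stride, not the bare block size,
 * or the final carve overruns the slab's usable bytes. */
static inline uint32_t slab_capacity(size_t usable_bytes, int cls, size_t block_size) {
    return (uint32_t)(usable_bytes / tiny_stride(cls, block_size));
}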
@@ -2,6 +2,14 @@
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <unistd.h>
#include "pool_tls_registry.h"

static inline pid_t gettid_cached(void){
    static __thread pid_t t=0; if (__builtin_expect(t==0,0)) t=(pid_t)syscall(SYS_gettid); return t;
}
#include <stdio.h>

// Class sizes: 8KB, 16KB, 24KB, 32KB, 40KB, 48KB, 52KB
const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES] = {
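The gettid_cached() helper above memoizes the kernel thread id in a __thread slot, so the owner check in pool_free() further down costs a TLS load instead of a SYS_gettid syscall on every free; only the first call per thread pays for the syscall.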
@@ -12,11 +20,27 @@ const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES] = {
__thread void* g_tls_pool_head[POOL_SIZE_CLASSES];
__thread uint32_t g_tls_pool_count[POOL_SIZE_CLASSES];

// Phase 1.5b: Lazy pre-warm flag (per-thread)
#ifdef HAKMEM_POOL_TLS_PREWARM
__thread int g_tls_pool_prewarmed = 0;
#endif

// Fixed refill counts (Phase 1: no learning)
static const uint32_t DEFAULT_REFILL_COUNT[POOL_SIZE_CLASSES] = {
    64, 48, 32, 32, 24, 16, 16  // Larger classes = smaller refill
};

// Pre-warm counts optimized for memory usage (Phase 1.5b)
// Total memory: ~1.6MB per thread
// Hot classes (8-24KB): 16 blocks - common in real workloads
// Warm classes (32-40KB): 8 blocks
// Cold classes (48-52KB): 4 blocks - rare
static const int PREWARM_COUNTS[POOL_SIZE_CLASSES] = {
    16, 16, 12,  // Hot: 8KB, 16KB, 24KB
    8, 8,        // Warm: 32KB, 40KB
    4, 4         // Cold: 48KB, 52KB
};

// Forward declare refill function (from Box 2)
extern void* pool_refill_and_alloc(int class_idx);
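For reference, the "~1.6MB per thread" figure in the comment checks out against PREWARM_COUNTS and the class sizes:

16*8 + 16*16 + 12*24 = 128 + 256 + 288 =  672 KB  (hot)
 8*32 +  8*40        = 256 + 320       =  576 KB  (warm)
 4*48 +  4*52        = 192 + 208       =  400 KB  (cold)
                                 total = 1648 KB ≈ 1.6 MB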
@@ -36,12 +60,34 @@ static inline int pool_size_to_class(size_t size) {

// Ultra-fast allocation (5-6 cycles)
void* pool_alloc(size_t size) {
    // Phase 1.5b: Lazy pre-warm on first allocation per thread
#ifdef HAKMEM_POOL_TLS_PREWARM
    if (__builtin_expect(!g_tls_pool_prewarmed, 0)) {
        g_tls_pool_prewarmed = 1;  // Set flag FIRST to prevent recursion!
        pool_tls_prewarm();        // Pre-populate TLS caches
    }
#endif

    // Quick bounds check
    if (size < 8192 || size > 53248) return NULL;

    int class_idx = pool_size_to_class(size);
    if (class_idx < 0) return NULL;

    // Drain a small batch of remote frees for this class
    extern int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
    void* chain = NULL;
    int drained = pool_remote_pop_chain(class_idx, 32, &chain);
    if (drained > 0 && chain) {
        // Splice into TLS freelist
        void* tail = chain;
        int n = 1;
        while (*(void**)tail) { tail = *(void**)tail; n++; }
        *(void**)tail = g_tls_pool_head[class_idx];
        g_tls_pool_head[class_idx] = chain;
        g_tls_pool_count[class_idx] += n;
    }

    void* head = g_tls_pool_head[class_idx];

    if (__builtin_expect(head != NULL, 1)) {  // LIKELY
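pool_remote_pop_chain() and pool_remote_push() are only declared extern in this file. One plausible shape for that pair — a sketch assuming a per-class lock-free (Treiber) stack for remote frees, not necessarily the repo's actual implementation — is:

#include <stdatomic.h>

#define POOL_SIZE_CLASSES 7  /* matches the 7 class sizes above */

/* Hypothetical remote-free stacks; real code would index per owner thread. */
static _Atomic(void*) g_remote_head[POOL_SIZE_CLASSES];

int pool_remote_push(int class_idx, void* ptr, int owner_tid) {
    (void)owner_tid;  /* a real implementation would pick the owner's stack */
    void* old = atomic_load_explicit(&g_remote_head[class_idx], memory_order_relaxed);
    do {
        *(void**)ptr = old;  /* link node onto the current head */
    } while (!atomic_compare_exchange_weak_explicit(
                 &g_remote_head[class_idx], &old, ptr,
                 memory_order_release, memory_order_relaxed));
    return 1;
}

int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain) {
    /* Detach the whole stack at once, then cut it after max_take nodes. */
    void* head = atomic_exchange_explicit(&g_remote_head[class_idx], NULL,
                                          memory_order_acquire);
    if (!head) { *out_chain = NULL; return 0; }
    int n = 1; void* cut = head;
    while (n < max_take && *(void**)cut) { cut = *(void**)cut; n++; }
    void* rest = *(void**)cut;
    *(void**)cut = NULL;  /* terminate the returned chain */
    if (rest) {           /* push any remainder back onto the stack */
        void* tail = rest;
        while (*(void**)tail) tail = *(void**)tail;
        void* old = atomic_load_explicit(&g_remote_head[class_idx], memory_order_relaxed);
        do {
            *(void**)tail = old;
        } while (!atomic_compare_exchange_weak_explicit(
                     &g_remote_head[class_idx], &old, rest,
                     memory_order_release, memory_order_relaxed));
    }
    *out_chain = head;
    return n;
}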
@@ -54,6 +100,17 @@ void* pool_alloc(size_t size) {
        *((uint8_t*)head - POOL_HEADER_SIZE) = POOL_MAGIC | class_idx;
#endif

        // Low-water integration: if TLS count is low, opportunistically drain remotes
        if (g_tls_pool_count[class_idx] < 4) {
            extern int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
            void* chain2 = NULL; int got = pool_remote_pop_chain(class_idx, 32, &chain2);
            if (got > 0 && chain2) {
                void* tail = chain2; while (*(void**)tail) tail = *(void**)tail;
                *(void**)tail = g_tls_pool_head[class_idx];
                g_tls_pool_head[class_idx] = chain2;
                g_tls_pool_count[class_idx] += got;
            }
        }
        return head;
    }
@@ -78,8 +135,18 @@ void pool_free(void* ptr) {
    // Need registry lookup (slower fallback) - not implemented in Phase 1
    return;
#endif
    // Owner resolution via page registry
    pid_t owner_tid = 0; int reg_cls = -1;
    if (pool_reg_lookup(ptr, &owner_tid, &reg_cls)) {
        pid_t me = gettid_cached();
        if (owner_tid != me) {
            extern int pool_remote_push(int class_idx, void* ptr, int owner_tid);
            (void)pool_remote_push(class_idx, ptr, owner_tid);
            return;
        }
    }

    // Same-thread: Push to TLS freelist (2-3 instructions)
    *(void**)ptr = g_tls_pool_head[class_idx];
    g_tls_pool_head[class_idx] = ptr;
    g_tls_pool_count[class_idx]++;
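Putting the two halves together, a minimal same-thread usage sketch of the API defined in this file (a free from a different thread would instead take the pool_remote_push() path above):

void example(void) {
    void* p = pool_alloc(16384);   /* 16KB -> class 1, served from the TLS freelist */
    if (p) {
        /* ... use the 16KB block ... */
        pool_free(p);              /* same thread: plain TLS freelist push */
    }
}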
@@ -109,4 +176,25 @@ void pool_thread_init(void) {
void pool_thread_cleanup(void) {
    // Phase 1: No cleanup (keep it simple)
    // TODO: Drain back to global pool
}

// Pre-warm TLS cache (Phase 1.5b optimization)
// Eliminates cold-start penalty by pre-populating TLS freelists
// Expected improvement: +180-740% (based on Phase 7 Task 3 success)
void pool_tls_prewarm(void) {
    // Forward declare refill function (from Box 2)
    extern void* backend_batch_carve(int class_idx, int count);

    for (int class_idx = 0; class_idx < POOL_SIZE_CLASSES; class_idx++) {
        int count = PREWARM_COUNTS[class_idx];

        // Directly refill TLS cache (bypass alloc/free during init)
        // This avoids issues with g_initializing=1 affecting routing
        void* chain = backend_batch_carve(class_idx, count);
        if (chain) {
            // Install entire chain directly into TLS
            pool_install_chain(class_idx, chain, count);
        }
        // If OOM, continue with other classes (graceful degradation)
    }
}
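pool_install_chain() is called above without its definition appearing in this hunk. Given that chains everywhere in this file are linked through the first pointer-sized word of each block, a minimal sketch consistent with that layout (an assumption, not the actual implementation) would be:

void pool_install_chain(int class_idx, void* chain, int count) {
    /* Splice the carved chain ahead of the current TLS head. */
    void* tail = chain;
    while (*(void**)tail) tail = *(void**)tail;   /* walk to the chain tail */
    *(void**)tail = g_tls_pool_head[class_idx];
    g_tls_pool_head[class_idx] = chain;
    g_tls_pool_count[class_idx] += (uint32_t)count;
}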