Tiny: fix header/stride mismatch and harden refill paths

- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte
  header during allocation, but linear carve/refill and initial slab capacity
  still used bare class block sizes. This mismatch could overrun slab usable
  space and corrupt freelists, causing reproducible SEGV at ~100k iters.

Changes
- Superslab: compute capacity with effective stride (block_size + header for
  classes 0..6; class7 remains headerless) in superslab_init_slab(). Add a
  debug-only bound check in superslab_alloc_from_slab() to fail fast if carve
  would exceed usable bytes.
- Refill (non-P0 and P0): use header-aware stride for all linear carving and
  TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h
  also uses stride, not raw class size.
- Drain: keep existing defense-in-depth for remote sentinel and sanitize nodes
  before splicing into freelist (already present).

Notes
- This unifies the memory layout across alloc/linear-carve/refill with a single
  stride definition and keeps class7 (1024B) headerless as designed.
- Debug builds add fail-fast checks; release builds remain lean.

Next
- Re-run Tiny benches (256/1024B) in debug to confirm stability, then in
  release. If a crash persists, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0
  to isolate the P0 batch carve, and continue reducing branch misses as planned.
This commit is contained in:
Moe Charm (CI)
2025-11-09 18:55:50 +09:00
parent ab68ee536d
commit 1010a961fb
171 changed files with 10238 additions and 634 deletions

View File

@ -2,6 +2,14 @@
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <unistd.h>
#include "pool_tls_registry.h"
// Return this thread's kernel TID, paying the gettid syscall at most
// once per thread: the result is memoized in thread-local storage.
static inline pid_t gettid_cached(void) {
    static __thread pid_t cached_tid = 0;
    if (__builtin_expect(cached_tid == 0, 0)) {
        cached_tid = (pid_t)syscall(SYS_gettid);
    }
    return cached_tid;
}
#include <stdio.h>
// Class sizes: 8KB, 16KB, 24KB, 32KB, 40KB, 48KB, 52KB
const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES] = {
@ -12,11 +20,27 @@ const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES] = {
__thread void* g_tls_pool_head[POOL_SIZE_CLASSES];
__thread uint32_t g_tls_pool_count[POOL_SIZE_CLASSES];
// Phase 1.5b: Lazy pre-warm flag (per-thread)
#ifdef HAKMEM_POOL_TLS_PREWARM
__thread int g_tls_pool_prewarmed = 0;
#endif
// Fixed refill counts (Phase 1: no learning/adaptation yet).
// Index i pairs with POOL_CLASS_SIZES[i]; larger block classes refill in
// smaller batches so the per-thread TLS cache stays memory-bounded.
static const uint32_t DEFAULT_REFILL_COUNT[POOL_SIZE_CLASSES] = {
64, 48, 32, 32, 24, 16, 16 // Larger classes = smaller refill
};
// Pre-warm counts optimized for memory usage (Phase 1.5b).
// Index i pairs with POOL_CLASS_SIZES[i]. Per the tiering below this
// totals roughly ~1.6MB of pre-carved blocks per thread.
// Hot classes (8-24KB): most common in real workloads.
// Warm classes (32-40KB): moderately common.
// Cold classes (48-52KB): rare, so only a handful each.
static const int PREWARM_COUNTS[POOL_SIZE_CLASSES] = {
16, 16, 12, // Hot: 8KB, 16KB, 24KB
8, 8, // Warm: 32KB, 40KB
4, 4 // Cold: 48KB, 52KB
};
// Forward declare refill function (from Box 2)
extern void* pool_refill_and_alloc(int class_idx);
@ -36,12 +60,34 @@ static inline int pool_size_to_class(size_t size) {
// Ultra-fast allocation (5-6 cycles)
void* pool_alloc(size_t size) {
// Phase 1.5b: Lazy pre-warm on first allocation per thread
#ifdef HAKMEM_POOL_TLS_PREWARM
if (__builtin_expect(!g_tls_pool_prewarmed, 0)) {
g_tls_pool_prewarmed = 1; // Set flag FIRST to prevent recursion!
pool_tls_prewarm(); // Pre-populate TLS caches
}
#endif
// Quick bounds check
if (size < 8192 || size > 53248) return NULL;
int class_idx = pool_size_to_class(size);
if (class_idx < 0) return NULL;
// Drain a small batch of remote frees for this class
extern int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
void* chain = NULL;
int drained = pool_remote_pop_chain(class_idx, 32, &chain);
if (drained > 0 && chain) {
// Splice into TLS freelist
void* tail = chain;
int n = 1;
while (*(void**)tail) { tail = *(void**)tail; n++; }
*(void**)tail = g_tls_pool_head[class_idx];
g_tls_pool_head[class_idx] = chain;
g_tls_pool_count[class_idx] += n;
}
void* head = g_tls_pool_head[class_idx];
if (__builtin_expect(head != NULL, 1)) { // LIKELY
@ -54,6 +100,17 @@ void* pool_alloc(size_t size) {
*((uint8_t*)head - POOL_HEADER_SIZE) = POOL_MAGIC | class_idx;
#endif
// Low-water integration: if TLS count is low, opportunistically drain remotes
if (g_tls_pool_count[class_idx] < 4) {
extern int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
void* chain2 = NULL; int got = pool_remote_pop_chain(class_idx, 32, &chain2);
if (got > 0 && chain2) {
void* tail = chain2; while (*(void**)tail) tail = *(void**)tail;
*(void**)tail = g_tls_pool_head[class_idx];
g_tls_pool_head[class_idx] = chain2;
g_tls_pool_count[class_idx] += got;
}
}
return head;
}
@ -78,8 +135,18 @@ void pool_free(void* ptr) {
// Need registry lookup (slower fallback) - not implemented in Phase 1
return;
#endif
// Owner resolution via page registry
pid_t owner_tid=0; int reg_cls=-1;
if (pool_reg_lookup(ptr, &owner_tid, &reg_cls)){
pid_t me = gettid_cached();
if (owner_tid != me){
extern int pool_remote_push(int class_idx, void* ptr, int owner_tid);
(void)pool_remote_push(class_idx, ptr, owner_tid);
return;
}
}
// Push to freelist (2-3 instructions)
// Same-thread: Push to TLS freelist (2-3 instructions)
*(void**)ptr = g_tls_pool_head[class_idx];
g_tls_pool_head[class_idx] = ptr;
g_tls_pool_count[class_idx]++;
@ -109,4 +176,25 @@ void pool_thread_init(void) {
// Per-thread teardown hook. Intentionally a no-op in Phase 1: blocks still
// sitting in this thread's TLS freelists are simply abandoned on exit.
// TODO: Drain TLS caches back to the global pool so exiting threads do not
// strand their cached blocks.
void pool_thread_cleanup(void) {
// Phase 1: No cleanup (keep it simple)
// TODO: Drain back to global pool
}
}
// Pre-warm TLS cache (Phase 1.5b optimization)
// Eliminates cold-start penalty by pre-populating TLS freelists
// Expected improvement: +180-740% (based on Phase 7 Task 3 success)
void pool_tls_prewarm(void) {
// Forward declare refill function (from Box 2)
extern void* backend_batch_carve(int class_idx, int count);
for (int class_idx = 0; class_idx < POOL_SIZE_CLASSES; class_idx++) {
int count = PREWARM_COUNTS[class_idx];
// Directly refill TLS cache (bypass alloc/free during init)
// This avoids issues with g_initializing=1 affecting routing
void* chain = backend_batch_carve(class_idx, count);
if (chain) {
// Install entire chain directly into TLS
pool_install_chain(class_idx, chain, count);
}
// If OOM, continue with other classes (graceful degradation)
}
}