Tiny: adopt boundary consolidation + class7 simple batch refill + branch hints
- Adopt boundary: keep the drain→bind safety checks and mark the remote-pending case as UNLIKELY in superslab_alloc_from_slab().
- Class7 (1024B): add a simple batch SLL refill path that prioritizes linear carve; reduces branchy steps on the hot 1KB path.
- Branch hints: favor linear allocation and mark freelist paths as unlikely where appropriate.

A/B (1T, cpu2, 500k iters, with HAKMEM_TINY_ASSUME_1T=1):
- 256B: ~81.3ms (down from ~83.2ms after fast_cap), cycles ~60.0M, branch-miss ~11.07%.
- 1024B: ~72.8ms (down from ~73.5ms), cycles ~27.0M, branch-miss ~11.08%.

Note: the branch-miss rate remains around 11%. Next steps: unify adopt calls across all registry paths, trim debug-only checks from the hot path, and consider further fast-path specialization for classes 5-6 to reduce mixed-path divergence.
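The branch hints below use GCC/Clang's __builtin_expect directly. For context, a minimal sketch of how such hints are commonly wrapped; the HAK_LIKELY/HAK_UNLIKELY macro names are illustrative assumptions, not macros defined by this commit:

/* Illustrative branch-hint wrappers (hypothetical names); the diff below
 * calls __builtin_expect() directly instead of using macros like these. */
#if defined(__GNUC__) || defined(__clang__)
#  define HAK_LIKELY(x)   __builtin_expect(!!(x), 1)
#  define HAK_UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#  define HAK_LIKELY(x)   (x)
#  define HAK_UNLIKELY(x) (x)
#endif

/* Example intent, matching the commit: the linear-carve path is the expected
 * hot case in 1T runs, the freelist fallback is not:
 *   if (HAK_LIKELY(meta->freelist == NULL && meta->used < meta->capacity)) { ... }
 */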
@@ -204,6 +204,46 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
     TinySlabMeta* meta = tls->meta;
     if (!meta) return 0;
 
+    // Class7 special-case: simple batch refill (favor linear carve, minimal branching)
+    if (__builtin_expect(class_idx == 7, 0)) {
+        uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
+        int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
+        if (room <= 0) return 0;
+        int take = max_take < room ? max_take : room;
+        int taken = 0;
+        size_t bs = g_tiny_class_sizes[class_idx];
+        for (; taken < take;) {
+            // Linear first (LIKELY for class7)
+            if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
+                uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx);
+                void* p = (void*)(base + ((size_t)meta->used * bs));
+                meta->used++;
+                *(void**)p = g_tls_sll_head[class_idx];
+                g_tls_sll_head[class_idx] = p;
+                g_tls_sll_count[class_idx]++;
+                ss_active_inc(tls->ss);
+                taken++;
+                continue;
+            }
+            // Freelist fallback
+            if (__builtin_expect(meta->freelist != NULL, 0)) {
+                void* p = meta->freelist;
+                meta->freelist = *(void**)p;
+                meta->used++;
+                *(void**)p = g_tls_sll_head[class_idx];
+                g_tls_sll_head[class_idx] = p;
+                g_tls_sll_count[class_idx]++;
+                ss_active_inc(tls->ss);
+                taken++;
+                continue;
+            }
+            // Need another slab with space
+            if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break;
+            meta = tls->meta; // refresh after refill
+        }
+        return taken;
+    }
+
     // Compute how many we can actually push into SLL without overflow
     uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
     int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
@@ -214,11 +254,11 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
     size_t bs = g_tiny_class_sizes[class_idx];
     while (taken < take) {
         void* p = NULL;
-        if (meta->freelist) {
+        if (__builtin_expect(meta->freelist != NULL, 0)) {
             p = meta->freelist; meta->freelist = *(void**)p; meta->used++;
             // Track active blocks reserved into TLS SLL
             ss_active_inc(tls->ss);
-        } else if (meta->used < meta->capacity) {
+        } else if (__builtin_expect(meta->used < meta->capacity, 1)) {
             void* slab_start = tiny_slab_base_for(tls->ss, tls->slab_idx);
             p = (char*)slab_start + ((size_t)meta->used * bs);
             meta->used++;
@@ -16,8 +16,8 @@
 static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
     TinySlabMeta* meta = &ss->slabs[slab_idx];
 
-    // Ensure remote queue is drained before handing blocks back to TLS
-    if (atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0) {
+    // Ensure remote queue is drained before handing blocks back to TLS (UNLIKELY in 1T)
+    if (__builtin_expect(atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0, 0)) {
         uint32_t self_tid = tiny_self_u32();
         SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
         if (slab_is_valid(&h)) {
@@ -68,7 +68,7 @@ static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
 
     // Phase 6.24: Linear allocation mode (freelist == NULL)
     // This avoids the 4000-8000 cycle cost of building freelist on init
-    if (meta->freelist == NULL && meta->used < meta->capacity) {
+    if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
         // Linear allocation: use canonical tiny_slab_base_for() only
         size_t block_size = g_tiny_class_sizes[ss->size_class];
         uint8_t* base = tiny_slab_base_for(ss, slab_idx);
@@ -80,7 +80,7 @@ static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
     }
 
     // Freelist mode (after first free())
-    if (meta->freelist) {
+    if (__builtin_expect(meta->freelist != NULL, 0)) {
         void* block = meta->freelist;
 
         // CORRUPTION DEBUG: Validate freelist head before popping
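For context, a minimal consumer-side sketch (not part of this commit) of how an allocation fast path might pop the blocks that sll_refill_small_from_ss() pushes onto the TLS SLL. The tls_sll_pop() helper name is hypothetical; g_tls_sll_head and g_tls_sll_count are the arrays used in the diff above, with the next pointer stored in the first word of each free block:

/* Hypothetical sketch, assuming the TLS SLL layout shown in the diff. */
static inline void* tls_sll_pop(int class_idx) {
    void* p = g_tls_sll_head[class_idx];
    if (__builtin_expect(p != NULL, 1)) {        /* hot path: SLL non-empty */
        g_tls_sll_head[class_idx] = *(void**)p;  /* unlink head */
        g_tls_sll_count[class_idx]--;
        return p;
    }
    return NULL;  /* caller would fall back to sll_refill_small_from_ss() */
}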