Phase 9-2: Remove Legacy Backend & Unify to Shared Pool (50M ops/s)

- Removed Legacy Backend fallback; Shared Pool is now the sole backend.
- Removed the Soft Cap limit in the Shared Pool; memory pressure is now handled by LRU eviction and EMPTY-slab recycling.
- Implemented EMPTY slab recycling, with a batched meta->used decrement in the remote drain path.
- Changed tiny_free_local_box to return an is_empty flag (1 only on the 1->0 transition) so callers can recycle safely; see the sketch below.
- Fixed a double-allocation race in the release path by removing the SuperSlab from the legacy list early and re-checking its used count.
- Achieved 50.3M ops/s in the WS8192 benchmark (+200% vs. baseline).
Moe Charm (CI)
2025-12-01 13:47:23 +09:00
parent 3a040a545a
commit 0bc33dc4f5
7 changed files with 92 additions and 102 deletions
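
The core of the change is the ownership rule for the used counter: the thread whose atomic_fetch_sub observes the counter going from 1 to 0 (or, in the batched remote-drain case, from drained_count to 0) owns that transition and is the only one allowed to recycle the slab. A minimal standalone sketch of the pattern, using illustrative names rather than the allocator's real types:

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative stand-in for TinySlabMeta; only the used counter matters here. */
typedef struct {
    _Atomic uint16_t used;
} slab_counter_t;

/* Free one block. Returns 1 only for the caller that drove used from 1 to 0,
 * so exactly one thread wins the right to recycle the slab. */
static int slab_free_one(slab_counter_t* m) {
    uint16_t prev = atomic_fetch_sub_explicit(&m->used, 1, memory_order_release);
    return prev == 1;
}

/* Batched variant mirroring the remote drain: subtract the whole batch at
 * once and report EMPTY only if the batch accounted for every live block. */
static int slab_free_batch(slab_counter_t* m, uint16_t drained) {
    if (drained == 0) return 0;
    uint16_t prev = atomic_fetch_sub_explicit(&m->used, drained, memory_order_release);
    return prev == drained;
}

Re-reading used after a plain decrement (the old "if (meta->used == 0)" check) is racy: another thread can allocate or free between the decrement and the load, so two threads can both observe 0 and race to release the same slab.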

View File

@@ -19,11 +19,11 @@ void tiny_failfast_log(const char* stage,
void* ptr,
void* prev);
void tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void* ptr, uint32_t my_tid) {
int tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void* ptr, uint32_t my_tid) {
extern _Atomic uint64_t g_free_local_box_calls;
atomic_fetch_add_explicit(&g_free_local_box_calls, 1, memory_order_relaxed);
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) return;
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return 0;
if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) return 0;
(void)my_tid;
// ✅ Phase E1-CORRECT: ALL classes have headers, calculate BASE pointer once
@@ -177,11 +177,16 @@ void tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void*
// Track local free (debug helpers may be no-op)
tiny_remote_track_on_local_free(ss, slab_idx, ptr, "local_free", my_tid);
meta->used--;
// BUGFIX Phase 9-2: Use atomic_fetch_sub to detect 1->0 transition reliably
// meta->used--; // old
uint16_t prev_used = atomic_fetch_sub_explicit(&meta->used, 1, memory_order_release);
int is_empty = (prev_used == 1); // Transitioned from 1 to 0
ss_active_dec_one(ss);
// Phase 12-1.1: EMPTY slab detection (immediate reuse optimization)
if (meta->used == 0) {
if (is_empty) {
// Slab became EMPTY → mark for highest-priority reuse
ss_mark_slab_empty(ss, slab_idx);
@@ -206,4 +211,6 @@ void tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void*
uint8_t cls0 = (meta && meta->class_idx < TINY_NUM_CLASSES) ? meta->class_idx : 0;
tiny_free_publish_first_free((int)cls0, ss, slab_idx);
}
return is_empty;
}

View File

@@ -4,5 +4,6 @@
#include "hakmem_tiny_superslab.h"
// Perform same-thread freelist push. On first-free (prev==NULL), publishes via Ready/Mailbox.
void tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void* ptr, uint32_t my_tid);
// Returns: 1 if slab transitioned to EMPTY (used=0), 0 otherwise.
int tiny_free_local_box(SuperSlab* ss, int slab_idx, TinySlabMeta* meta, void* ptr, uint32_t my_tid);

View File

@@ -204,7 +204,8 @@ static inline uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size) {
// Call tiny_free_local_box() to:
// 1. Push block to slab freelist
// 2. Decrement meta->used (THIS IS THE KEY!)
tiny_free_local_box(ss, slab_idx, meta, user_ptr, my_tid);
// Phase 9-2 FIX: Capture 'is_empty' return value to detect ownership of 1->0 transition
int is_empty = tiny_free_local_box(ss, slab_idx, meta, user_ptr, my_tid);
#if !HAKMEM_BUILD_RELEASE
// Trace drain operation (debug only)
@@ -220,15 +221,9 @@ static inline uint32_t tiny_tls_sll_drain(int class_idx, uint32_t batch_size) {
drained++;
// Phase 9-2: Track touched slab for later EMPTY check
// We track (ss, slab_idx) pairs to check after loop completes
int already_tracked = 0;
for (int t = 0; t < num_touched; t++) {
if (touched[t].ss == ss && touched[t].slab_idx == slab_idx) {
already_tracked = 1;
break;
}
}
if (!already_tracked && num_touched < MAX_TOUCHED_SLABS) {
// CRITICAL FIX: Only recycle if WE caused the transition to EMPTY (is_empty == 1)
// This prevents multiple threads from racing to release the same slab
if (is_empty && num_touched < MAX_TOUCHED_SLABS) {
touched[num_touched].ss = ss;
touched[num_touched].slab_idx = slab_idx;
num_touched++;
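
This hunk stops before the post-loop pass that the tracking comments refer to. A sketch of how the touched array is presumably consumed once the drain loop finishes, assuming the pairs end up in shared_pool_release_slab (or the SLAB_TRY_RECYCLE macro used in the remote-drain file below) and reusing the field names from the hunk:

/* Sketch only, not the actual post-loop code. Entries are recorded solely
 * for slabs whose 1->0 transition this thread owns, so each recorded slab
 * has exactly one thread responsible for releasing it. */
for (int t = 0; t < num_touched; t++) {
    shared_pool_release_slab(touched[t].ss, touched[t].slab_idx);
}

With entries recorded only on owned transitions, the old dedup scan is no longer needed, and MAX_TOUCHED_SLABS still bounds the array either way.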

View File

@@ -418,19 +418,8 @@ stage2_fallback:
}
// Before creating a new SuperSlab, consult learning-layer soft cap.
// If current active slots for this class already exceed the policy cap,
// fail early so caller can fall back to legacy backend.
uint32_t limit = sp_class_active_limit(class_idx);
if (limit > 0) {
uint32_t cur = g_shared_pool.class_active_slots[class_idx];
if (cur >= limit) {
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
return -1; // Soft cap reached for this class
}
}
// Phase 9-2: Soft Cap removed to allow Shared Pool to fully replace Legacy Backend.
// We now rely on LRU eviction and EMPTY recycling to manage memory pressure.
// Create metadata for this new SuperSlab
SharedSSMeta* new_meta = sp_meta_find_or_create(new_ss);

View File

@@ -26,6 +26,23 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx)
return;
}
// Phase 9-2 FIX: Promote Legacy SuperSlabs to Shared Pool on first recycle
// If we are recycling a slot from a Legacy SS, we must remove it from the
// Legacy list (g_superslab_heads) to prevent Legacy Backend from allocating
// from it simultaneously (Double Allocation Race).
// This effectively transfers ownership to Shared Pool.
extern void remove_superslab_from_legacy_head(SuperSlab* ss);
remove_superslab_from_legacy_head(ss);
// BUGFIX: Re-check used count after removal. Legacy Backend might have
// allocated from this slab while we were waiting for the lock in remove().
TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
if (atomic_load_explicit(&slab_meta->used, memory_order_acquire) != 0) {
// Legacy Backend stole this slab. It's now an orphan (removed from list).
// We abort recycling. It will be recycled when Legacy frees it later.
return;
}
// Debug logging
#if !HAKMEM_BUILD_RELEASE
static int dbg = -1;
@@ -46,9 +63,9 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx)
pthread_mutex_lock(&g_shared_pool.alloc_lock);
TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
// TinySlabMeta* slab_meta = &ss->slabs[slab_idx]; // Already declared above
if (slab_meta->used != 0) {
// Not actually empty; nothing to do
// Not actually empty (double check under lock)
if (g_lock_stats_enabled == 1) {
atomic_fetch_add(&g_lock_release_count, 1);
}
@@ -160,15 +177,28 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx)
pthread_mutex_unlock(&g_shared_pool.alloc_lock);
// Remove from legacy backend list (if present) to prevent dangling pointers
extern void remove_superslab_from_legacy_head(SuperSlab* ss);
remove_superslab_from_legacy_head(ss);
// Remove from legacy backend list (moved to top of function)
// extern void remove_superslab_from_legacy_head(SuperSlab* ss);
// remove_superslab_from_legacy_head(ss);
// Free SuperSlab:
// 1. Try LRU cache (hak_ss_lru_push) - lazy deallocation
// 2. Or munmap if LRU is full - eager deallocation
// BUGFIX: Double check total_active_blocks. Legacy Backend might have
// allocated from ANOTHER slab in this SS just before we removed it.
// If so, we must NOT free the SS.
if (atomic_load(&ss->total_active_blocks) == 0) {
extern void superslab_free(SuperSlab* ss);
superslab_free(ss);
} else {
#if !HAKMEM_BUILD_RELEASE
if (dbg == 1) {
fprintf(stderr, "[SP_SLOT_RELEASE] SKIP free ss=%p: total_active_blocks=%u > 0\n",
(void*)ss, atomic_load(&ss->total_active_blocks));
}
#endif
}
return;
}
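
Pulled together, the new release path follows a remove-then-recheck discipline. A condensed sketch of the resulting control flow (bookkeeping, logging, and lock-stat counters elided; identifiers as they appear in the diff), not the full function:

/* Condensed sketch of the Phase 9-2 release ordering. Each check is repeated
 * after the step that could have invalidated it. */
static void release_slab_sketch(SuperSlab* ss, int slab_idx) {
    /* 1. Take the SuperSlab out of the legacy list FIRST, so the legacy
     *    backend can no longer allocate from it concurrently. */
    remove_superslab_from_legacy_head(ss);

    /* 2. Re-check: the legacy backend may have allocated from this slab
     *    while we waited for the lock inside remove(). If so, abort; the
     *    slab comes back through this path on a later free. */
    TinySlabMeta* slab_meta = &ss->slabs[slab_idx];
    if (atomic_load_explicit(&slab_meta->used, memory_order_acquire) != 0) return;

    pthread_mutex_lock(&g_shared_pool.alloc_lock);
    /* ... double-check used under the lock, mark the slot EMPTY, update
     *     shared-pool bookkeeping ... */
    pthread_mutex_unlock(&g_shared_pool.alloc_lock);

    /* 3. Free the whole SuperSlab only if no other slab in it became active
     *    in the meantime. */
    if (atomic_load(&ss->total_active_blocks) == 0) superslab_free(ss);
}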

View File

@@ -28,6 +28,9 @@ void* hak_tiny_alloc_superslab_backend_legacy(int class_idx)
g_superslab_heads[class_idx] = head;
}
// LOCK expansion_lock to protect list traversal (vs remove_superslab_from_legacy_head)
pthread_mutex_lock(&head->expansion_lock);
SuperSlab* chunk = head->current_chunk ? head->current_chunk : head->first_chunk;
while (chunk) {
@@ -62,12 +65,19 @@ void* hak_tiny_alloc_superslab_backend_legacy(int class_idx)
meta->used++;
atomic_fetch_add_explicit(&chunk->total_active_blocks, 1, memory_order_relaxed);
// UNLOCK before return
pthread_mutex_unlock(&head->expansion_lock);
HAK_RET_ALLOC_BLOCK_TRACED(class_idx, base, ALLOC_PATH_BACKEND);
}
}
chunk = chunk->next_chunk;
}
// UNLOCK before expansion (which takes lock internally)
pthread_mutex_unlock(&head->expansion_lock);
if (expand_superslab_head(head) < 0) {
return NULL;
}
@@ -212,74 +222,23 @@ void* hak_tiny_alloc_superslab_backend_shared(int class_idx)
* Box API entry:
* - Single front-door for tiny-side Superslab allocations.
*
* Phase 9-2 Root Fix: Shared Pool backend unified mode (default ON)
* Phase 9-2 Final: Shared Pool ONLY (Legacy Backend Removed)
* Policy:
* - HAKMEM_TINY_SS_SHARED=2 (default) → Shared Pool backend ONLY (no legacy fallback)
* - HAKMEM_TINY_SS_SHARED=1 → Shared Pool backend with legacy fallback (testing mode)
* - HAKMEM_TINY_SS_SHARED=0 → Legacy backend only (compatibility mode)
*
* Root Cause: Legacy backend (g_superslab_heads) has TLS_SLL_DUP issue
* Solution: Disable legacy backend by default, keep as "reversible box" via env var
* - HAKMEM_TINY_SS_SHARED is now ignored (or used only for logging).
* - Always uses Shared Pool backend.
* - Legacy backend (g_superslab_heads) is no longer used for allocation.
*/
void* hak_tiny_alloc_superslab_box(int class_idx)
{
static int g_ss_shared_mode = -1;
static _Atomic uint32_t g_ss_backend_log = 0;
if (__builtin_expect(g_ss_shared_mode == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_SS_SHARED");
if (!e || !*e) {
g_ss_shared_mode = 2; // Phase 9-2 Root Fix: Shared Pool ONLY (no legacy fallback)
} else {
int v = atoi(e);
g_ss_shared_mode = v; // 0=legacy only, 1=shared+fallback, 2=shared only
}
#if !HAKMEM_BUILD_RELEASE
const char* mode_str = (g_ss_shared_mode == 2) ? "shared_only" :
(g_ss_shared_mode == 1) ? "shared+fallback" : "legacy_only";
fprintf(stderr, "[SS_BACKEND] Mode: %s (HAKMEM_TINY_SS_SHARED=%d)\n", mode_str, g_ss_shared_mode);
#endif
}
// Mode 2: Shared Pool ONLY (default, no legacy fallback)
if (g_ss_shared_mode == 2) {
// Always use Shared Pool (Mode 2 equivalent)
void* p = hak_tiny_alloc_superslab_backend_shared(class_idx);
if (p != NULL) {
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (p == NULL) {
static _Atomic uint32_t g_ss_oom_log = 0;
uint32_t n = atomic_fetch_add_explicit(&g_ss_oom_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared_only cls=%d ptr=%p\n", class_idx, p);
fprintf(stderr, "[SS_BACKEND] shared_fail→NULL (OOM) cls=%d\n", class_idx);
}
}
return p;
}
// Phase 9-2: NO fallback to legacy - return NULL on failure
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared_fail→NULL (no legacy) cls=%d\n", class_idx);
}
return NULL;
}
// Mode 1: Shared Pool with legacy fallback (testing mode)
if (g_ss_shared_mode == 1) {
void* p = hak_tiny_alloc_superslab_backend_shared(class_idx);
if (p != NULL) {
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared cls=%d ptr=%p\n", class_idx, p);
}
return p;
}
// Fallback to legacy
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared_fail→legacy cls=%d\n", class_idx);
}
return hak_tiny_alloc_superslab_backend_legacy(class_idx);
}
// Mode 0: Legacy backend only (compatibility mode)
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] legacy cls=%d\n", class_idx);
}
return hak_tiny_alloc_superslab_backend_legacy(class_idx);
}

View File

@@ -27,6 +27,8 @@ void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMe
// and splice in front of current freelist preserving relative order.
void* prev = meta->freelist;
int cls = (int)meta->class_idx;
uint16_t drained_count = 0; // Phase 9-2: Batched used decrement
HAK_CHECK_CLASS_IDX(cls, "_ss_remote_drain_to_freelist_unsafe");
if (__builtin_expect(cls < 0 || cls >= TINY_NUM_CLASSES, 0)) {
static _Atomic int g_remote_drain_cls_oob = 0;
@@ -104,14 +106,21 @@ void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMe
tiny_next_write(cls, (void*)cur, prev);
prev = (void*)cur;
cur = next;
drained_count++;
}
meta->freelist = prev;
// Reset remote count after full drain
atomic_store_explicit(&ss->remote_counts[slab_idx], 0, memory_order_release);
// Phase 9-2: Try to recycle slab if EMPTY after remote drain
// This fixes the bug where EMPTY slabs accumulate and never get returned to freelist
// Phase 9-2: Batched decrement of used count (Atomic)
// Remote frees don't decrement used until they land in freelist.
if (drained_count > 0) {
uint16_t old_used = atomic_fetch_sub_explicit(&meta->used, drained_count, memory_order_release);
// If used became 0 (old_used == drained_count), try to recycle
if (old_used == drained_count) {
SLAB_TRY_RECYCLE(ss, slab_idx, meta);
}
}
// Update freelist/nonempty visibility bits
uint32_t bit = (1u << slab_idx);