diff --git a/core/hakmem_super_registry.c b/core/hakmem_super_registry.c index f98c1e4e..897fe93b 100644 --- a/core/hakmem_super_registry.c +++ b/core/hakmem_super_registry.c @@ -8,13 +8,21 @@ SuperRegEntry g_super_reg[SUPER_REG_SIZE]; pthread_mutex_t g_super_reg_lock = PTHREAD_MUTEX_INITIALIZER; int g_super_reg_initialized = 0; +// Per-class registry storage (Phase 6: Registry Optimization) +SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; +int g_super_reg_class_size[TINY_NUM_CLASSES]; + // Initialize registry (call once at startup) void hak_super_registry_init(void) { if (g_super_reg_initialized) return; - // Zero-initialize all entries + // Zero-initialize all entries (hash table) memset(g_super_reg, 0, sizeof(g_super_reg)); + // Zero-initialize per-class registry (Phase 6: Registry Optimization) + memset(g_super_reg_by_class, 0, sizeof(g_super_reg_by_class)); + memset(g_super_reg_class_size, 0, sizeof(g_super_reg_class_size)); + // Memory fence to ensure initialization is visible to all threads atomic_thread_fence(memory_order_release); @@ -25,6 +33,7 @@ void hak_super_registry_init(void) { // CRITICAL: Call AFTER SuperSlab is fully initialized // Publish order: ss init → release fence → base write // Phase 8.3: ACE - lg_size aware registration +// Phase 6: Registry Optimization - Also add to per-class registry for fast refill scan int hak_super_register(uintptr_t base, SuperSlab* ss) { if (!g_super_reg_initialized) { hak_super_registry_init(); @@ -35,7 +44,8 @@ int hak_super_register(uintptr_t base, SuperSlab* ss) { int lg = ss->lg_size; // Phase 8.3: Get lg_size from SuperSlab int h = hak_super_hash(base, lg); - // Linear probing to find empty slot + // Step 1: Register in hash table (for address → SuperSlab lookup) + int hash_registered = 0; for (int i = 0; i < SUPER_MAX_PROBE; i++) { SuperRegEntry* e = &g_super_reg[(h + i) & SUPER_REG_MASK]; @@ -52,33 +62,67 @@ int hak_super_register(uintptr_t base, SuperSlab* ss) { 
atomic_store_explicit((_Atomic uintptr_t*)&e->base, base, memory_order_release); - pthread_mutex_unlock(&g_super_reg_lock); - return 1; + hash_registered = 1; + break; } if (e->base == base && e->lg_size == lg) { // Already registered (duplicate registration) - pthread_mutex_unlock(&g_super_reg_lock); - return 1; + hash_registered = 1; + break; + } + } + + if (!hash_registered) { + // Hash table full (probing limit reached) + pthread_mutex_unlock(&g_super_reg_lock); + fprintf(stderr, "HAKMEM: SuperSlab registry full! Increase SUPER_REG_SIZE\n"); + return 0; + } + + // Step 2: Register in per-class registry (Phase 6: Registry Optimization) + // Purpose: Enable O(class_size) refill scan instead of O(262K) + int class_idx = ss->size_class; + if (class_idx >= 0 && class_idx < TINY_NUM_CLASSES) { + int size = g_super_reg_class_size[class_idx]; + if (size < SUPER_REG_PER_CLASS) { + // Check for duplicate registration + int already_in_class = 0; + for (int i = 0; i < size; i++) { + if (g_super_reg_by_class[class_idx][i] == ss) { + already_in_class = 1; + break; + } + } + + if (!already_in_class) { + // Add to per-class registry + g_super_reg_by_class[class_idx][size] = ss; + g_super_reg_class_size[class_idx]++; + } + } else { + // Per-class registry full (should be rare) + fprintf(stderr, "HAKMEM: Per-class registry full for class %d! " + "Increase SUPER_REG_PER_CLASS\n", class_idx); } } - // Registry full (probing limit reached) pthread_mutex_unlock(&g_super_reg_lock); - fprintf(stderr, "HAKMEM: SuperSlab registry full! 
Increase SUPER_REG_SIZE\n"); - return 0; + return 1; } // Unregister SuperSlab (mutex-protected) // CRITICAL: Call BEFORE munmap to prevent reader segfault // Unpublish order: base = 0 (release) → munmap outside this function // Phase 8.3: ACE - Try both lg_sizes (we don't know which one was used) +// Phase 6: Registry Optimization - Also remove from per-class registry void hak_super_unregister(uintptr_t base) { if (!g_super_reg_initialized) return; pthread_mutex_lock(&g_super_reg_lock); - // Try both 1MB (20) and 2MB (21) alignments + // Step 1: Find and remove from hash table + SuperSlab* ss = NULL; // Save SuperSlab pointer for per-class removal for (int lg = 20; lg <= 21; lg++) { int h = hak_super_hash(base, lg); @@ -88,6 +132,9 @@ void hak_super_unregister(uintptr_t base) { if (e->base == base && e->lg_size == lg) { // Found entry to remove + // Save SuperSlab pointer BEFORE clearing (for per-class removal) + ss = atomic_load_explicit(&e->ss, memory_order_acquire); + // Step 1: Clear SuperSlab pointer (atomic, prevents TOCTOU race) atomic_store_explicit(&e->ss, NULL, memory_order_release); @@ -98,8 +145,8 @@ void hak_super_unregister(uintptr_t base) { // Step 3: Clear lg_size (optional cleanup) e->lg_size = 0; - pthread_mutex_unlock(&g_super_reg_lock); - return; + // Found in hash table, continue to per-class removal + goto hash_removed; } if (e->base == 0) { @@ -109,6 +156,32 @@ void hak_super_unregister(uintptr_t base) { } } +hash_removed: + // Step 2: Remove from per-class registry (Phase 6: Registry Optimization) + if (ss && ss->magic == SUPERSLAB_MAGIC) { + int class_idx = ss->size_class; + if (class_idx >= 0 && class_idx < TINY_NUM_CLASSES) { + int size = g_super_reg_class_size[class_idx]; + + // Linear scan to find and remove SuperSlab from per-class array + for (int i = 0; i < size; i++) { + if (g_super_reg_by_class[class_idx][i] == ss) { + // Found: Remove by shifting last element to this position + g_super_reg_class_size[class_idx]--; + int new_size 
= g_super_reg_class_size[class_idx]; + + // Swap with last element (O(1) removal, order doesn't matter) + if (i != new_size) { + g_super_reg_by_class[class_idx][i] = + g_super_reg_by_class[class_idx][new_size]; + } + g_super_reg_by_class[class_idx][new_size] = NULL; + break; + } + } + } + } + pthread_mutex_unlock(&g_super_reg_lock); // Not found is not an error (could be duplicate unregister) } diff --git a/core/hakmem_super_registry.h b/core/hakmem_super_registry.h index d38bee8e..5f50b1c3 100644 --- a/core/hakmem_super_registry.h +++ b/core/hakmem_super_registry.h @@ -27,6 +27,13 @@ #define SUPER_REG_MASK (SUPER_REG_SIZE - 1) #define SUPER_MAX_PROBE 8 // Linear probing limit +// Per-class registry for fast refill scan (Phase 6: Registry Optimization) +// Purpose: Avoid 262K linear scan by indexing SuperSlabs by size class +// - Each class has 16384 slots (total: 8 classes × 16384 = 128K entries) +// - Refill scan: O(class_size) instead of O(262144) +// - Expected speedup: +200-300% for Larson (2.59M → 7.8M ops/s) +#define SUPER_REG_PER_CLASS 16384 // Per-class registry capacity (increased for high-churn workloads) + // Registry entry: base address → SuperSlab pointer mapping typedef struct { uintptr_t base; // Aligned base address (1MB or 2MB, 0 = empty slot) @@ -40,6 +47,17 @@ extern SuperRegEntry g_super_reg[SUPER_REG_SIZE]; extern pthread_mutex_t g_super_reg_lock; extern int g_super_reg_initialized; +// Per-class registry for fast refill scan (Phase 6: Registry Optimization) +// Note: TINY_NUM_CLASSES is defined in hakmem_tiny.h (typically 8 for 16B-1KB) +// - g_super_reg_by_class[class][i] = SuperSlab pointer (NULL = empty slot) +// - g_super_reg_class_size[class] = number of active SuperSlabs for this class +// - Writers (register/unregister) hold g_super_reg_lock (shared with main registry); the refill/adopt scans read both arrays without it, so readers may observe stale entries +#ifndef TINY_NUM_CLASSES +#define TINY_NUM_CLASSES 8 // Fallback if hakmem_tiny.h not included yet +#endif +extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; 
+extern int g_super_reg_class_size[TINY_NUM_CLASSES]; + // Initialize registry (call once at startup) void hak_super_registry_init(void); diff --git a/core/hakmem_tiny_free.inc b/core/hakmem_tiny_free.inc index 6d9fce92..6553601e 100644 --- a/core/hakmem_tiny_free.inc +++ b/core/hakmem_tiny_free.inc @@ -910,37 +910,39 @@ static SuperSlab* superslab_refill(int class_idx) { // Try to adopt a partial SuperSlab from registry (one-shot, cheap scan) // This reduces pressure to allocate new SS when other threads freed blocks. + // Phase 6: Registry Optimization - Use per-class registry for O(class_size) scan if (!tls->ss) { - // Best-effort: scan a small window of registry for our class - extern SuperRegEntry g_super_reg[]; - int scanned = 0; + // Phase 6: Use per-class registry (262K → ~10-100 entries per class!) + extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; + extern int g_super_reg_class_size[TINY_NUM_CLASSES]; + const int scan_max = tiny_reg_scan_max(); - for (int i = 0; i < SUPER_REG_SIZE && scanned < scan_max; i++) { - SuperRegEntry* e = &g_super_reg[i]; - uintptr_t base = atomic_load_explicit((_Atomic uintptr_t*)&e->base, memory_order_acquire); - if (base == 0) continue; - SuperSlab* ss = atomic_load_explicit(&e->ss, memory_order_acquire); + int reg_size = g_super_reg_class_size[class_idx]; + int scan_limit = (scan_max < reg_size) ? scan_max : reg_size; + + for (int i = 0; i < scan_limit; i++) { + SuperSlab* ss = g_super_reg_by_class[class_idx][i]; if (!ss || ss->magic != SUPERSLAB_MAGIC) continue; - if ((int)ss->size_class != class_idx) { scanned++; continue; } + // Note: class_idx check is not needed (per-class registry!) 
+ // Pick first slab with freelist (Box 4: 所有権取得 + remote check) - int reg_cap = ss_slabs_capacity(ss); - uint32_t self_tid = tiny_self_u32(); - for (int s = 0; s < reg_cap; s++) { - if (ss->slabs[s].freelist) { - SlabHandle h = slab_try_acquire(ss, s, self_tid); - if (slab_is_valid(&h)) { - slab_drain_remote_full(&h); - if (slab_is_safe_to_bind(&h)) { - tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); - tiny_tls_bind_slab(tls, ss, s); - return ss; + int reg_cap = ss_slabs_capacity(ss); + uint32_t self_tid = tiny_self_u32(); + for (int s = 0; s < reg_cap; s++) { + if (ss->slabs[s].freelist) { + SlabHandle h = slab_try_acquire(ss, s, self_tid); + if (slab_is_valid(&h)) { + slab_drain_remote_full(&h); + if (slab_is_safe_to_bind(&h)) { + tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); + tiny_tls_bind_slab(tls, ss, s); + return ss; + } + slab_release(&h); } - slab_release(&h); } } } - scanned++; - } } // Must-adopt-before-mmap gate: attempt sticky/hot/bench/mailbox/registry small-window diff --git a/core/tiny_mmap_gate.h b/core/tiny_mmap_gate.h index d5f8dd91..4b89d5d2 100644 --- a/core/tiny_mmap_gate.h +++ b/core/tiny_mmap_gate.h @@ -40,17 +40,22 @@ static inline SuperSlab* tiny_must_adopt_gate(int class_idx, TinyTLSSlab* tls) { if (ss) return ss; // Registry small-window adopt (one pass, limited scan) + // Phase 6: Registry Optimization - Use per-class registry for O(class_size) scan { + // Phase 6: Use per-class registry (262K → ~10-100 entries per class!) 
+ extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; + extern int g_super_reg_class_size[TINY_NUM_CLASSES]; + uint32_t self_tid = tiny_self_u32(); - int scanned = 0; const int scan_max = tiny_reg_scan_max(); - for (int i = 0; i < SUPER_REG_SIZE && scanned < scan_max; i++) { - SuperRegEntry* e = &g_super_reg[i]; - uintptr_t base = atomic_load_explicit((_Atomic uintptr_t*)&e->base, memory_order_acquire); - if (base == 0) continue; - SuperSlab* cand = atomic_load_explicit(&e->ss, memory_order_acquire); + int reg_size = g_super_reg_class_size[class_idx]; + int scan_limit = (scan_max < reg_size) ? scan_max : reg_size; + + for (int i = 0; i < scan_limit; i++) { + SuperSlab* cand = g_super_reg_by_class[class_idx][i]; if (!cand || cand->magic != SUPERSLAB_MAGIC) continue; - if ((int)cand->size_class != class_idx) { scanned++; continue; } + // Note: class_idx check is not needed (per-class registry!) + int cap = ss_slabs_capacity(cand); for (int s = 0; s < cap; s++) { // Box: Try to acquire ownership @@ -67,7 +72,6 @@ static inline SuperSlab* tiny_must_adopt_gate(int class_idx, TinyTLSSlab* tls) { slab_release(&h); } } - scanned++; } } return NULL;