Files
hakmem/core/hakmem_super_registry.h

211 lines
8.6 KiB
C
Raw Normal View History

#pragma once
#include <stdio.h>
#include <stdlib.h>
// Phase 1: SuperSlab Registry - Thread-safe O(1) lookup for SuperSlab ownership
//
// Purpose: Replace mincore() syscall (50-100ns) with userspace hash table lookup
// Performance: ~5-10ns per lookup, 10-20x faster than mincore()
//
// Thread Safety:
// - Readers: Lock-free with acquire semantics
// - Writers: Mutex-protected with release semantics
// - Publish order: ss initialization → release fence → base write
// - Unpublish order: base = 0 (release) → munmap (prevents reader deref)
#include <stdatomic.h>
#include <pthread.h>
#include <stdint.h>
#include "hakmem_tiny_superslab.h" // For SuperSlab and SUPERSLAB_MAGIC
#include "box/ss_addr_map_box.h" // Phase 9-1: O(1) hash table lookup
#include "box/super_reg_box.h" // Phase X: profile-aware logical registry sizing
Phase ALLOC-TINY-FAST-DUALHOT-1 & Optimization Roadmap Update Add comprehensive design docs and research boxes: - docs/analysis/ALLOC_TINY_FAST_DUALHOT_1_DESIGN.md: ALLOC DUALHOT investigation - docs/analysis/FREE_TINY_FAST_DUALHOT_1_DESIGN.md: FREE DUALHOT final specs - docs/analysis/FREE_TINY_FAST_HOTCOLD_OPT_1_DESIGN.md: Hot/Cold split research - docs/analysis/POOL_MID_INUSE_DEFERRED_DN_BATCH_DESIGN.md: Deferred batching design - docs/analysis/POOL_MID_INUSE_DEFERRED_REGRESSION_ANALYSIS.md: Stats overhead findings - docs/analysis/MID_DESC_CACHE_BENCHMARK_2025-12-12.md: Cache measurement results - docs/analysis/LAST_MATCH_CACHE_IMPLEMENTATION.md: TLS cache investigation Research boxes (SS page table): - core/box/ss_pt_env_box.h: HAKMEM_SS_LOOKUP_KIND gate - core/box/ss_pt_types_box.h: 2-level page table structures - core/box/ss_pt_lookup_box.h: ss_pt_lookup() implementation - core/box/ss_pt_register_box.h: Page table registration - core/box/ss_pt_impl.c: Global definitions Updates: - docs/specs/ENV_VARS_COMPLETE.md: HOTCOLD, DEFERRED, SS_LOOKUP env vars - core/box/hak_free_api.inc.h: FREE-DISPATCH-SSOT integration - core/box/pool_mid_inuse_deferred_box.h: Deferred API updates - core/box/pool_mid_inuse_deferred_stats_box.h: Stats collection - core/hakmem_super_registry: SS page table integration Current Status: - FREE-TINY-FAST-DUALHOT-1: +13% improvement, ready for adoption - ALLOC-TINY-FAST-DUALHOT-1: -2% regression, frozen as research box - Next: Optimization roadmap per ROI (mimalloc gap 2.5x) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-13 05:35:46 +09:00
#include "box/ss_pt_lookup_box.h" // Phase 9-2: O(1) page table lookup
#include "box/ss_pt_env_box.h" // Phase 9-2: ENV gate for PT vs hash
// Registry configuration
// Increased from 4096 to 32768 to avoid registry exhaustion under
// high-churn microbenchmarks (e.g., larson with many active SuperSlabs).
// Still a power of two for fast masking.
#define SUPER_REG_SIZE 1048576 // Power of 2 for fast modulo (1M entries)
#define SUPER_REG_MASK (SUPER_REG_SIZE - 1)
#define SUPER_MAX_PROBE 32 // Linear probing limit (increased from 8 for Phase 15 fix)
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Purpose: Avoid 262K linear scan by indexing SuperSlabs by size class
// - Each class has 16384 slots (total: 8 classes × 16384 = 128K entries)
// - Refill scan: O(class_size) instead of O(262144)
// - Expected speedup: +200-300% for Larson (2.59M → 7.8M ops/s)
#define SUPER_REG_PER_CLASS 16384 // Per-class registry capacity (increased for high-churn workloads)
// Registry entry: base address → SuperSlab pointer mapping
typedef struct SuperRegEntry {
_Atomic(uintptr_t) base; // Aligned base address (1MB or 2MB, 0 = empty slot) [atomic for proper sync]
_Atomic(SuperSlab*) ss; // Atomic SuperSlab pointer (MT-safe, prevents TOCTOU race)
uint8_t lg_size; // Phase 8.3: ACE - SuperSlab size (20=1MB, 21=2MB)
uint8_t _pad[7]; // Padding to 24 bytes (cache-friendly)
} SuperRegEntry;
// Global registry (lock-free reads, mutex-protected writes)
extern pthread_mutex_t g_super_reg_lock;
extern int g_super_reg_initialized;
// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
// Note: TINY_NUM_CLASSES is defined in hakmem_tiny.h (typically 8 for 16B-1KB)
// - g_super_reg_by_class[class][i] = SuperSlab pointer (NULL = empty slot)
// - g_super_reg_class_size[class] = number of active SuperSlabs for this class
// - Protected by g_super_reg_lock (shared with main registry)
#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8 // Fallback if hakmem_tiny.h not included yet
#endif
extern int g_super_reg_class_size[TINY_NUM_CLASSES];
Phase 9: SuperSlab Lazy Deallocation + mincore removal Goal: Eliminate syscall overhead (99.2% CPU) to approach System malloc performance Implementation: 1. mincore removal (100% elimination) - Deleted: hakmem_internal.h hak_is_memory_readable() syscall - Deleted: tiny_free_fast_v2.inc.h safety checks - Alternative: Internal metadata (Registry + Header magic validation) - Result: 841 mincore calls → 0 calls ✅ 2. SuperSlab Lazy Deallocation - Added LRU Cache Manager (470 lines in hakmem_super_registry.c) - Extended SuperSlab: last_used_ns, generation, lru_prev/next - Deallocation policy: Count/Memory/TTL based eviction - Environment variables: * HAKMEM_SUPERSLAB_MAX_CACHED=256 (default) * HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 (default) * HAKMEM_SUPERSLAB_TTL_SEC=60 (default) 3. Integration - superslab_allocate: Try LRU cache first before mmap - superslab_free: Push to LRU cache instead of immediate munmap - Lazy deallocation: Defer munmap until cache limits exceeded Performance Results (100K iterations, 256B allocations): Before (Phase 7-8): - Performance: 2.76M ops/s - Syscalls: 3,412 (mmap:1,250, munmap:1,321, mincore:841) After (Phase 9): - Performance: 9.71M ops/s (+251%) 🏆 - Syscalls: 1,729 (mmap:877, munmap:852, mincore:0) (-49%) Key Achievements: - ✅ mincore: 100% elimination (841 → 0) - ✅ mmap: -30% reduction (1,250 → 877) - ✅ munmap: -35% reduction (1,321 → 852) - ✅ Total syscalls: -49% reduction (3,412 → 1,729) - ✅ Performance: +251% improvement (2.76M → 9.71M ops/s) System malloc comparison: - HAKMEM: 9.71M ops/s - System malloc: 90.04M ops/s - Achievement: 10.8% (target: 93%) Next optimization: - Further mmap/munmap reduction (1,729 vs System's 13 = 133x gap) - Pre-warm LRU cache - Adaptive LRU sizing - Per-class LRU cache Production ready with recommended settings: export HAKMEM_SUPERSLAB_MAX_CACHED=256 export HAKMEM_SUPERSLAB_MAX_MEMORY_MB=512 ./bench_random_mixed_hakmem 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 14:05:39 +09:00
// ============================================================================
// Phase 9: Lazy Deallocation - LRU Cache Manager
// ============================================================================
// Global LRU cache for empty SuperSlabs (lazy deallocation)
typedef struct {
SuperSlab* lru_head; // LRU list head (most recently used)
SuperSlab* lru_tail; // LRU list tail (least recently used)
uint32_t total_count; // Total SuperSlabs in cache
uint32_t max_cached; // Maximum cached SuperSlabs (default: 256)
uint64_t total_memory_mb; // Total memory in cache (MB)
uint64_t max_memory_mb; // Maximum memory limit (MB, default: 512)
uint64_t ttl_ns; // Time-to-live (nanoseconds, default: 60s)
uint32_t generation; // Current generation counter
} SuperSlabLRUCache;
extern SuperSlabLRUCache g_ss_lru_cache;
// Initialize LRU cache (called once at startup)
void hak_ss_lru_init(void);
// Try to reuse a cached SuperSlab (returns NULL if cache is empty)
SuperSlab* hak_ss_lru_pop(uint8_t size_class);
// Add SuperSlab to LRU cache (returns 1 if cached, 0 if evicted immediately)
int hak_ss_lru_push(SuperSlab* ss);
// Evict old SuperSlabs based on policy (TTL, max_cached, max_memory_mb)
void hak_ss_lru_evict(void);
// Mark SuperSlab as recently used (update timestamp, move to head)
void hak_ss_lru_touch(SuperSlab* ss);
// ============================================================================
// Phase 11: SuperSlab Prewarm - Eliminate mmap/munmap bottleneck
// ============================================================================
// Prewarm: Allocate SuperSlabs at startup and add to LRU cache
void hak_ss_prewarm_init(void);
// Prewarm specific size class with count SuperSlabs
void hak_ss_prewarm_class(int size_class, uint32_t count);
// Prewarm all classes (counts[i] = number of SuperSlabs for class i)
void hak_ss_prewarm_all(const uint32_t counts[TINY_NUM_CLASSES]);
// Initialize registry (call once at startup)
void hak_super_registry_init(void);
// Hash function for aligned addresses (variable size)
static inline int hak_super_hash(uintptr_t base, int lg_size) {
// Phase 8.3: ACE - Variable size hash (lg_size = 20 for 1MB, 21 for 2MB)
return (int)((base >> lg_size) & super_reg_effective_mask());
}
// Lookup SuperSlab by pointer (lock-free, thread-safe)
// Returns: SuperSlab* if found, NULL otherwise
Phase ALLOC-TINY-FAST-DUALHOT-1 & Optimization Roadmap Update Add comprehensive design docs and research boxes: - docs/analysis/ALLOC_TINY_FAST_DUALHOT_1_DESIGN.md: ALLOC DUALHOT investigation - docs/analysis/FREE_TINY_FAST_DUALHOT_1_DESIGN.md: FREE DUALHOT final specs - docs/analysis/FREE_TINY_FAST_HOTCOLD_OPT_1_DESIGN.md: Hot/Cold split research - docs/analysis/POOL_MID_INUSE_DEFERRED_DN_BATCH_DESIGN.md: Deferred batching design - docs/analysis/POOL_MID_INUSE_DEFERRED_REGRESSION_ANALYSIS.md: Stats overhead findings - docs/analysis/MID_DESC_CACHE_BENCHMARK_2025-12-12.md: Cache measurement results - docs/analysis/LAST_MATCH_CACHE_IMPLEMENTATION.md: TLS cache investigation Research boxes (SS page table): - core/box/ss_pt_env_box.h: HAKMEM_SS_LOOKUP_KIND gate - core/box/ss_pt_types_box.h: 2-level page table structures - core/box/ss_pt_lookup_box.h: ss_pt_lookup() implementation - core/box/ss_pt_register_box.h: Page table registration - core/box/ss_pt_impl.c: Global definitions Updates: - docs/specs/ENV_VARS_COMPLETE.md: HOTCOLD, DEFERRED, SS_LOOKUP env vars - core/box/hak_free_api.inc.h: FREE-DISPATCH-SSOT integration - core/box/pool_mid_inuse_deferred_box.h: Deferred API updates - core/box/pool_mid_inuse_deferred_stats_box.h: Stats collection - core/hakmem_super_registry: SS page table integration Current Status: - FREE-TINY-FAST-DUALHOT-1: +13% improvement, ready for adoption - ALLOC-TINY-FAST-DUALHOT-1: -2% regression, frozen as research box - Next: Optimization roadmap per ROI (mimalloc gap 2.5x) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-13 05:35:46 +09:00
// Phase 9-2: Dispatch between page table (O(1) absolute) vs hash table (O(1) amortized)
static inline SuperSlab* hak_super_lookup(void* ptr) {
if (!g_super_reg_initialized) return NULL;
Phase ALLOC-TINY-FAST-DUALHOT-1 & Optimization Roadmap Update Add comprehensive design docs and research boxes: - docs/analysis/ALLOC_TINY_FAST_DUALHOT_1_DESIGN.md: ALLOC DUALHOT investigation - docs/analysis/FREE_TINY_FAST_DUALHOT_1_DESIGN.md: FREE DUALHOT final specs - docs/analysis/FREE_TINY_FAST_HOTCOLD_OPT_1_DESIGN.md: Hot/Cold split research - docs/analysis/POOL_MID_INUSE_DEFERRED_DN_BATCH_DESIGN.md: Deferred batching design - docs/analysis/POOL_MID_INUSE_DEFERRED_REGRESSION_ANALYSIS.md: Stats overhead findings - docs/analysis/MID_DESC_CACHE_BENCHMARK_2025-12-12.md: Cache measurement results - docs/analysis/LAST_MATCH_CACHE_IMPLEMENTATION.md: TLS cache investigation Research boxes (SS page table): - core/box/ss_pt_env_box.h: HAKMEM_SS_LOOKUP_KIND gate - core/box/ss_pt_types_box.h: 2-level page table structures - core/box/ss_pt_lookup_box.h: ss_pt_lookup() implementation - core/box/ss_pt_register_box.h: Page table registration - core/box/ss_pt_impl.c: Global definitions Updates: - docs/specs/ENV_VARS_COMPLETE.md: HOTCOLD, DEFERRED, SS_LOOKUP env vars - core/box/hak_free_api.inc.h: FREE-DISPATCH-SSOT integration - core/box/pool_mid_inuse_deferred_box.h: Deferred API updates - core/box/pool_mid_inuse_deferred_stats_box.h: Stats collection - core/hakmem_super_registry: SS page table integration Current Status: - FREE-TINY-FAST-DUALHOT-1: +13% improvement, ready for adoption - ALLOC-TINY-FAST-DUALHOT-1: -2% regression, frozen as research box - Next: Optimization roadmap per ROI (mimalloc gap 2.5x) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-13 05:35:46 +09:00
SuperSlab* ss = NULL;
// Phase 9-2: Try page table first if enabled
if (hak_ss_lookup_pt_enabled()) {
ss = ss_pt_lookup(ptr);
if (ss) return ss;
// Fallback to hash on miss (out_of_range or not registered)
}
// Phase 9-1: Use hash table lookup
// Replaces old linear probing (50-80 cycles → 10-20 cycles)
Phase ALLOC-TINY-FAST-DUALHOT-1 & Optimization Roadmap Update Add comprehensive design docs and research boxes: - docs/analysis/ALLOC_TINY_FAST_DUALHOT_1_DESIGN.md: ALLOC DUALHOT investigation - docs/analysis/FREE_TINY_FAST_DUALHOT_1_DESIGN.md: FREE DUALHOT final specs - docs/analysis/FREE_TINY_FAST_HOTCOLD_OPT_1_DESIGN.md: Hot/Cold split research - docs/analysis/POOL_MID_INUSE_DEFERRED_DN_BATCH_DESIGN.md: Deferred batching design - docs/analysis/POOL_MID_INUSE_DEFERRED_REGRESSION_ANALYSIS.md: Stats overhead findings - docs/analysis/MID_DESC_CACHE_BENCHMARK_2025-12-12.md: Cache measurement results - docs/analysis/LAST_MATCH_CACHE_IMPLEMENTATION.md: TLS cache investigation Research boxes (SS page table): - core/box/ss_pt_env_box.h: HAKMEM_SS_LOOKUP_KIND gate - core/box/ss_pt_types_box.h: 2-level page table structures - core/box/ss_pt_lookup_box.h: ss_pt_lookup() implementation - core/box/ss_pt_register_box.h: Page table registration - core/box/ss_pt_impl.c: Global definitions Updates: - docs/specs/ENV_VARS_COMPLETE.md: HOTCOLD, DEFERRED, SS_LOOKUP env vars - core/box/hak_free_api.inc.h: FREE-DISPATCH-SSOT integration - core/box/pool_mid_inuse_deferred_box.h: Deferred API updates - core/box/pool_mid_inuse_deferred_stats_box.h: Stats collection - core/hakmem_super_registry: SS page table integration Current Status: - FREE-TINY-FAST-DUALHOT-1: +13% improvement, ready for adoption - ALLOC-TINY-FAST-DUALHOT-1: -2% regression, frozen as research box - Next: Optimization roadmap per ROI (mimalloc gap 2.5x) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-13 05:35:46 +09:00
ss = ss_map_lookup(&g_ss_addr_map, ptr);
WIP: Add TLS SLL validation and SuperSlab registry fallback ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue. Current status: Partial mitigation, but root cause remains. Changes Applied: 1. SuperSlab Registry Fallback (hakmem_super_registry.h) - Added legacy table probe when hash map lookup misses - Prevents NULL returns for valid SuperSlabs during initialization - Status: ✅ Works but may hide underlying registration issues 2. TLS SLL Push Validation (tls_sll_box.h) - Reject push if SuperSlab lookup returns NULL - Reject push if class_idx mismatch detected - Added [TLS_SLL_PUSH_NO_SS] diagnostic message - Status: ✅ Prevents list corruption (defensive) 3. SuperSlab Allocation Class Fix (superslab_allocate.c) - Pass actual class_idx to sp_internal_allocate_superslab - Prevents dummy class=8 causing OOB access - Status: ✅ Root cause fix for allocation path 4. Debug Output Additions - First 256 push/pop operations traced - First 4 mismatches logged with details - SuperSlab registration state logged - Status: ✅ Diagnostic tool (not a fix) 5. TLS Hint Box Removed - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization) - Simplified to focus on stability first - Status: ⏳ Can be re-added after root cause fixed Current Problem (REMAINS UNSOLVED): - [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench - Pointer is 16 bytes offset from expected (class 1 → class 2 boundary) - hak_super_lookup returns NULL for that pointer - Suggests: Use-After-Free, Double-Free, or pointer arithmetic error Root Cause Analysis: - Pattern: Pointer offset by +16 (one class 1 stride) - Timing: Cumulative problem (appears after 60s, not immediately) - Location: Header corruption detected during TLS SLL pop Remaining Issues: ⚠️ Registry fallback is defensive (may hide registration bugs) ⚠️ Push validation prevents symptoms but not root cause ⚠️ 16-byte pointer offset source unidentified Next Steps for Investigation: 1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths) 2. Enhanced logging at HDR_RESET point: - Expected vs actual pointer value - Pointer provenance (where it came from) - Allocation trace for that block 3. Verify Headerless flag is OFF throughout build 4. Check for double-offset application in conversions Technical Assessment: - 60% root cause fixes (allocation class, validation) - 40% defensive mitigation (registry fallback, push rejection) Performance Impact: - Registry fallback: +10-30 cycles on cold path (negligible) - Push validation: +5-10 cycles per push (acceptable) - Overall: < 2% performance impact estimated Related Issues: - Phase 1 TLS Hint Box removed temporarily - Phase 2 Headerless blocked until stability achieved 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 20:42:28 +09:00
// Fallback: If hash map misses (e.g., map not populated yet), probe the
// legacy registry table to avoid NULL for valid SuperSlabs.
if (__builtin_expect(ss == NULL, 0)) {
SuperRegEntry* reg = super_reg_entries();
if (!reg) return NULL;
WIP: Add TLS SLL validation and SuperSlab registry fallback ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue. Current status: Partial mitigation, but root cause remains. Changes Applied: 1. SuperSlab Registry Fallback (hakmem_super_registry.h) - Added legacy table probe when hash map lookup misses - Prevents NULL returns for valid SuperSlabs during initialization - Status: ✅ Works but may hide underlying registration issues 2. TLS SLL Push Validation (tls_sll_box.h) - Reject push if SuperSlab lookup returns NULL - Reject push if class_idx mismatch detected - Added [TLS_SLL_PUSH_NO_SS] diagnostic message - Status: ✅ Prevents list corruption (defensive) 3. SuperSlab Allocation Class Fix (superslab_allocate.c) - Pass actual class_idx to sp_internal_allocate_superslab - Prevents dummy class=8 causing OOB access - Status: ✅ Root cause fix for allocation path 4. Debug Output Additions - First 256 push/pop operations traced - First 4 mismatches logged with details - SuperSlab registration state logged - Status: ✅ Diagnostic tool (not a fix) 5. TLS Hint Box Removed - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization) - Simplified to focus on stability first - Status: ⏳ Can be re-added after root cause fixed Current Problem (REMAINS UNSOLVED): - [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench - Pointer is 16 bytes offset from expected (class 1 → class 2 boundary) - hak_super_lookup returns NULL for that pointer - Suggests: Use-After-Free, Double-Free, or pointer arithmetic error Root Cause Analysis: - Pattern: Pointer offset by +16 (one class 1 stride) - Timing: Cumulative problem (appears after 60s, not immediately) - Location: Header corruption detected during TLS SLL pop Remaining Issues: ⚠️ Registry fallback is defensive (may hide registration bugs) ⚠️ Push validation prevents symptoms but not root cause ⚠️ 16-byte pointer offset source unidentified Next Steps for Investigation: 1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths) 2. Enhanced logging at HDR_RESET point: - Expected vs actual pointer value - Pointer provenance (where it came from) - Allocation trace for that block 3. Verify Headerless flag is OFF throughout build 4. Check for double-offset application in conversions Technical Assessment: - 60% root cause fixes (allocation class, validation) - 40% defensive mitigation (registry fallback, push rejection) Performance Impact: - Registry fallback: +10-30 cycles on cold path (negligible) - Push validation: +5-10 cycles per push (acceptable) - Overall: < 2% performance impact estimated Related Issues: - Phase 1 TLS Hint Box removed temporarily - Phase 2 Headerless blocked until stability achieved 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 20:42:28 +09:00
uintptr_t p = (uintptr_t)ptr;
for (int lg = SUPERSLAB_LG_MIN; lg <= SUPERSLAB_LG_MAX; lg++) {
uintptr_t base = p & ~(((uintptr_t)1 << lg) - 1);
int h = hak_super_hash(base, lg);
int eff_mask = super_reg_effective_mask();
int probe_limit = super_reg_effective_size() > SUPER_MAX_PROBE
? SUPER_MAX_PROBE
: super_reg_effective_size();
for (int i = 0; i < probe_limit; i++) {
SuperRegEntry* e = &reg[(h + i) & eff_mask];
WIP: Add TLS SLL validation and SuperSlab registry fallback ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue. Current status: Partial mitigation, but root cause remains. Changes Applied: 1. SuperSlab Registry Fallback (hakmem_super_registry.h) - Added legacy table probe when hash map lookup misses - Prevents NULL returns for valid SuperSlabs during initialization - Status: ✅ Works but may hide underlying registration issues 2. TLS SLL Push Validation (tls_sll_box.h) - Reject push if SuperSlab lookup returns NULL - Reject push if class_idx mismatch detected - Added [TLS_SLL_PUSH_NO_SS] diagnostic message - Status: ✅ Prevents list corruption (defensive) 3. SuperSlab Allocation Class Fix (superslab_allocate.c) - Pass actual class_idx to sp_internal_allocate_superslab - Prevents dummy class=8 causing OOB access - Status: ✅ Root cause fix for allocation path 4. Debug Output Additions - First 256 push/pop operations traced - First 4 mismatches logged with details - SuperSlab registration state logged - Status: ✅ Diagnostic tool (not a fix) 5. TLS Hint Box Removed - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization) - Simplified to focus on stability first - Status: ⏳ Can be re-added after root cause fixed Current Problem (REMAINS UNSOLVED): - [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench - Pointer is 16 bytes offset from expected (class 1 → class 2 boundary) - hak_super_lookup returns NULL for that pointer - Suggests: Use-After-Free, Double-Free, or pointer arithmetic error Root Cause Analysis: - Pattern: Pointer offset by +16 (one class 1 stride) - Timing: Cumulative problem (appears after 60s, not immediately) - Location: Header corruption detected during TLS SLL pop Remaining Issues: ⚠️ Registry fallback is defensive (may hide registration bugs) ⚠️ Push validation prevents symptoms but not root cause ⚠️ 16-byte pointer offset source unidentified Next Steps for Investigation: 1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths) 2. Enhanced logging at HDR_RESET point: - Expected vs actual pointer value - Pointer provenance (where it came from) - Allocation trace for that block 3. Verify Headerless flag is OFF throughout build 4. Check for double-offset application in conversions Technical Assessment: - 60% root cause fixes (allocation class, validation) - 40% defensive mitigation (registry fallback, push rejection) Performance Impact: - Registry fallback: +10-30 cycles on cold path (negligible) - Push validation: +5-10 cycles per push (acceptable) - Overall: < 2% performance impact estimated Related Issues: - Phase 1 TLS Hint Box removed temporarily - Phase 2 Headerless blocked until stability achieved 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 20:42:28 +09:00
uintptr_t reg_base = atomic_load_explicit(&e->base, memory_order_acquire);
if (reg_base == 0) {
break; // empty slot
}
if (reg_base == base && e->lg_size == lg) {
ss = atomic_load_explicit(&e->ss, memory_order_acquire);
goto reg_probe_done;
}
}
}
reg_probe_done:
;
}
#if !HAKMEM_BUILD_RELEASE
// Debug logging (ENV-gated)
static __thread int s_dbg = -1;
if (__builtin_expect(s_dbg == -1, 0)) {
const char* e = getenv("HAKMEM_SUPER_LOOKUP_DEBUG");
s_dbg = (e && *e && *e != '0') ? 1 : 0;
}
if (s_dbg == 1) {
if (ss) {
fprintf(stderr, "[SUPER_LOOKUP] ptr=%p -> ss=%p (hash table hit)\n", ptr, (void*)ss);
} else {
fprintf(stderr, "[SUPER_LOOKUP] ptr=%p -> NULL (hash table miss)\n", ptr);
}
}
#endif
// Magic check for safety (same as before)
if (ss && ss->magic != SUPERSLAB_MAGIC) {
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[SUPER_LOOKUP] WARNING: ss=%p has bad magic=%llx (being freed)\n",
(void*)ss, (unsigned long long)ss->magic);
#endif
return NULL; // Being freed
}
return ss;
}
// Register SuperSlab (mutex-protected, called after SuperSlab initialization)
// Returns: 1 on success, 0 if registry is full
int hak_super_register(uintptr_t base, SuperSlab* ss);
// Unregister SuperSlab (mutex-protected, MUST call before munmap)
// Critical: base = 0 happens BEFORE munmap to prevent reader segfault
void hak_super_unregister(uintptr_t base);
// Debug: Get registry statistics
typedef struct {
int total_slots;
int used_slots;
int max_probe_depth;
} SuperRegStats;
void hak_super_registry_stats(SuperRegStats* stats);