Files
hakmem/core/superslab_allocate.c

342 lines
13 KiB
C
Raw Normal View History

// superslab_allocate.c - SuperSlab allocation and deallocation
// Purpose: Main allocation/free entry points for SuperSlabs
// License: MIT
// Date: 2025-11-28
#include "hakmem_tiny_superslab_internal.h"
WIP: Add TLS SLL validation and SuperSlab registry fallback ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue. Current status: Partial mitigation, but root cause remains. Changes Applied: 1. SuperSlab Registry Fallback (hakmem_super_registry.h) - Added legacy table probe when hash map lookup misses - Prevents NULL returns for valid SuperSlabs during initialization - Status: ✅ Works but may hide underlying registration issues 2. TLS SLL Push Validation (tls_sll_box.h) - Reject push if SuperSlab lookup returns NULL - Reject push if class_idx mismatch detected - Added [TLS_SLL_PUSH_NO_SS] diagnostic message - Status: ✅ Prevents list corruption (defensive) 3. SuperSlab Allocation Class Fix (superslab_allocate.c) - Pass actual class_idx to sp_internal_allocate_superslab - Prevents dummy class=8 causing OOB access - Status: ✅ Root cause fix for allocation path 4. Debug Output Additions - First 256 push/pop operations traced - First 4 mismatches logged with details - SuperSlab registration state logged - Status: ✅ Diagnostic tool (not a fix) 5. TLS Hint Box Removed - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization) - Simplified to focus on stability first - Status: ⏳ Can be re-added after root cause fixed Current Problem (REMAINS UNSOLVED): - [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench - Pointer is 16 bytes offset from expected (class 1 → class 2 boundary) - hak_super_lookup returns NULL for that pointer - Suggests: Use-After-Free, Double-Free, or pointer arithmetic error Root Cause Analysis: - Pattern: Pointer offset by +16 (one class 1 stride) - Timing: Cumulative problem (appears after 60s, not immediately) - Location: Header corruption detected during TLS SLL pop Remaining Issues: ⚠️ Registry fallback is defensive (may hide registration bugs) ⚠️ Push validation prevents symptoms but not root cause ⚠️ 16-byte pointer offset source unidentified Next Steps for Investigation: 1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths) 2. Enhanced logging at HDR_RESET point: - Expected vs actual pointer value - Pointer provenance (where it came from) - Allocation trace for that block 3. Verify Headerless flag is OFF throughout build 4. Check for double-offset application in conversions Technical Assessment: - 60% root cause fixes (allocation class, validation) - 40% defensive mitigation (registry fallback, push rejection) Performance Impact: - Registry fallback: +10-30 cycles on cold path (negligible) - Push validation: +5-10 cycles per push (acceptable) - Overall: < 2% performance impact estimated Related Issues: - Phase 1 TLS Hint Box removed temporarily - Phase 2 Headerless blocked until stability achieved 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 20:42:28 +09:00
#include "box/ss_addr_map_box.h"
// ============================================================================
// SuperSlab Allocation (2MB aligned)
// ============================================================================
SuperSlab* superslab_allocate(uint8_t size_class) {
// Optional fault injection for testing: HAKMEM_TINY_SS_FAULT_RATE=N → 1/N で失敗
static int fault_rate = -1; // -1=unparsed, 0=disabled, >0=rate
static __thread unsigned long fault_tick = 0;
if (__builtin_expect(fault_rate == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_SS_FAULT_RATE");
if (e && *e) {
int v = atoi(e); if (v < 0) v = 0; fault_rate = v;
} else {
fault_rate = 0;
}
}
if (fault_rate > 0) {
unsigned long t = ++fault_tick;
if ((t % (unsigned long)fault_rate) == 0ul) {
return NULL; // simulate OOM
}
}
// Optional env clamp for SuperSlab size
static int env_parsed = 0;
static uint8_t g_ss_min_lg_env = SUPERSLAB_LG_DEFAULT; // Start with default (2MB)
static uint8_t g_ss_max_lg_env = SUPERSLAB_LG_MAX;
if (!env_parsed) {
char* maxmb = getenv("HAKMEM_TINY_SS_MAX_MB");
if (maxmb) {
int m = atoi(maxmb); if (m == 1) g_ss_max_lg_env = 20; else if (m == 2) g_ss_max_lg_env = 21;
}
char* minmb = getenv("HAKMEM_TINY_SS_MIN_MB");
if (minmb) {
int m = atoi(minmb); if (m == 1) g_ss_min_lg_env = 20; else if (m == 2) g_ss_min_lg_env = 21;
}
if (g_ss_min_lg_env > g_ss_max_lg_env) g_ss_min_lg_env = g_ss_max_lg_env;
const char* force_lg_env = getenv("HAKMEM_TINY_SS_FORCE_LG");
if (force_lg_env && *force_lg_env) {
int v = atoi(force_lg_env);
if (v >= SUPERSLAB_LG_MIN && v <= SUPERSLAB_LG_MAX) {
g_ss_force_lg = v;
g_ss_min_lg_env = g_ss_max_lg_env = v;
}
}
size_t precharge_default = 0;
const char* precharge_env = getenv("HAKMEM_TINY_SS_PRECHARGE");
if (precharge_env && *precharge_env) {
long v = atol(precharge_env);
if (v < 0) v = 0;
precharge_default = (size_t)v;
if (v > 0) {
atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed);
}
}
size_t cache_default = 0;
const char* cache_env = getenv("HAKMEM_TINY_SS_CACHE");
if (cache_env && *cache_env) {
long v = atol(cache_env);
if (v < 0) v = 0;
cache_default = (size_t)v;
}
for (int i = 0; i < 8; i++) {
g_ss_cache_cap[i] = cache_default;
g_ss_precharge_target[i] = precharge_default;
}
for (int i = 0; i < 8; i++) {
char name[64];
snprintf(name, sizeof(name), "HAKMEM_TINY_SS_CACHE_C%d", i);
char* cap_env = getenv(name);
if (cap_env && *cap_env) {
long v = atol(cap_env);
if (v < 0) v = 0;
g_ss_cache_cap[i] = (size_t)v;
}
snprintf(name, sizeof(name), "HAKMEM_TINY_SS_PRECHARGE_C%d", i);
char* pre_env = getenv(name);
if (pre_env && *pre_env) {
long v = atol(pre_env);
if (v < 0) v = 0;
g_ss_precharge_target[i] = (size_t)v;
if (v > 0) {
atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed);
}
}
if (g_ss_cache_cap[i] > 0 || g_ss_precharge_target[i] > 0) {
g_ss_cache_enabled = 1;
}
}
const char* populate_env = getenv("HAKMEM_TINY_SS_POPULATE_ONCE");
if (populate_env && atoi(populate_env) != 0) {
atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed);
}
env_parsed = 1;
}
uint8_t lg = (g_ss_force_lg >= 0) ? (uint8_t)g_ss_force_lg : hak_tiny_superslab_next_lg(size_class);
if (lg < g_ss_min_lg_env) lg = g_ss_min_lg_env;
if (lg > g_ss_max_lg_env) lg = g_ss_max_lg_env;
size_t ss_size = (size_t)1 << lg; // 2^20 = 1MB, 2^21 = 2MB
uintptr_t ss_mask = ss_size - 1;
int from_cache = 0;
void* ptr = NULL;
// Debug logging flag (lazy init)
static __thread int dbg = -1;
#if HAKMEM_BUILD_RELEASE
dbg = 0;
#else
if (__builtin_expect(dbg == -1, 0)) {
const char* e = getenv("HAKMEM_SS_PREWARM_DEBUG");
dbg = (e && *e && *e != '0') ? 1 : 0;
}
#endif
// Phase 9: Try LRU cache first (lazy deallocation)
SuperSlab* cached_ss = hak_ss_lru_pop(size_class);
if (cached_ss) {
ptr = (void*)cached_ss;
from_cache = 1;
// Debug logging for REFILL from LRU
if (dbg == 1) {
fprintf(stderr, "[REFILL] class=%d from_lru=1 ss=%p\n",
size_class, (void*)cached_ss);
}
// Skip old cache path - LRU cache takes priority
} else if (g_ss_cache_enabled && size_class < 8) {
// Fallback to old cache (will be deprecated)
ss_cache_precharge(size_class, ss_size, ss_mask);
SuperslabCacheEntry* old_cached = ss_cache_pop(size_class);
if (old_cached) {
ptr = (void*)old_cached;
from_cache = 1;
// Debug logging for REFILL from prewarm (old cache is essentially prewarm)
if (dbg == 1) {
fprintf(stderr, "[REFILL] class=%d from_prewarm=1 ss=%p\n",
size_class, (void*)old_cached);
}
}
}
if (!ptr) {
int populate = atomic_exchange_explicit(&g_ss_populate_once, 0, memory_order_acq_rel);
ptr = ss_os_acquire(size_class, ss_size, ss_mask, populate);
if (!ptr) {
return NULL;
}
// Debug logging for REFILL with new allocation
if (dbg == 1) {
fprintf(stderr, "[REFILL] class=%d new_alloc=1 ss=%p\n",
size_class, (void*)ptr);
}
}
// Initialize SuperSlab header (Phase 12: no global size_class field)
SuperSlab* ss = (SuperSlab*)ptr;
ss->magic = SUPERSLAB_MAGIC;
ss->active_slabs = 0;
ss->lg_size = lg; // Phase 8.3: Use ACE-determined lg_size (20=1MB, 21=2MB)
ss->slab_bitmap = 0;
ss->nonempty_mask = 0; // Phase 6-2.1: ChatGPT Pro P0 - init nonempty mask
ss->partial_epoch = 0;
ss->publish_hint = 0xFF;
// Initialize atomics explicitly
atomic_store_explicit(&ss->total_active_blocks, 0, memory_order_relaxed);
atomic_store_explicit(&ss->refcount, 0, memory_order_relaxed);
atomic_store_explicit(&ss->listed, 0, memory_order_relaxed);
ss->partial_next = NULL;
// Phase 9: Initialize LRU fields
ss->last_used_ns = 0;
ss->generation = 0;
ss->lru_prev = NULL;
ss->lru_next = NULL;
// Phase 3d-C: Initialize Hot/Cold Split fields
ss->hot_count = 0;
ss->cold_count = 0;
for (int i = 0; i < 16; i++) {
ss->hot_indices[i] = 0;
ss->cold_indices[i] = 0;
}
// Initialize all slab metadata (only up to max slabs for this size)
int max_slabs = (int)(ss_size / SLAB_SIZE);
// PERF_OPT: memset removed - mmap() already returns zero-initialized pages
// Previous memset calls consumed 23.83% CPU time (perf analysis 2025-11-28)
// Measured improvement: +1.3% throughput (71.86M → 72.78M ops/s)
// Note: ASan/debug builds may need these, but production mmap guarantees zero pages
// memset(ss->slabs, 0, max_slabs * sizeof(TinySlabMeta));
// memset(ss->remote_heads, 0, max_slabs * sizeof(uintptr_t));
// memset(ss->remote_counts, 0, max_slabs * sizeof(uint32_t));
// memset(ss->slab_listed, 0, max_slabs * sizeof(uint32_t));
for (int i = 0; i < max_slabs; i++) {
// Phase 1: Atomic initialization (freelist + used are now _Atomic)
slab_freelist_store_relaxed(&ss->slabs[i], NULL); // Explicit NULL (redundant after memset, but clear intent)
atomic_store_explicit(&ss->slabs[i].used, 0, memory_order_relaxed);
ss->slabs[i].capacity = 0;
ss->slabs[i].owner_tid_low = 0;
// Initialize remote queue atomics (memset already zeroed, but use proper atomic init)
atomic_store_explicit(&ss->remote_heads[i], 0, memory_order_relaxed);
atomic_store_explicit(&ss->remote_counts[i], 0, memory_order_relaxed);
atomic_store_explicit(&ss->slab_listed[i], 0, memory_order_relaxed);
}
if (from_cache) {
ss_stats_cache_reuse();
}
// Phase 8.3: Update ACE current_lg to match allocated size
g_ss_ace[size_class].current_lg = lg;
// Phase 1: Register SuperSlab in global registry for fast lookup
// CRITICAL: Register AFTER full initialization (ss structure is ready)
uintptr_t base = (uintptr_t)ss;
WIP: Add TLS SLL validation and SuperSlab registry fallback ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue. Current status: Partial mitigation, but root cause remains. Changes Applied: 1. SuperSlab Registry Fallback (hakmem_super_registry.h) - Added legacy table probe when hash map lookup misses - Prevents NULL returns for valid SuperSlabs during initialization - Status: ✅ Works but may hide underlying registration issues 2. TLS SLL Push Validation (tls_sll_box.h) - Reject push if SuperSlab lookup returns NULL - Reject push if class_idx mismatch detected - Added [TLS_SLL_PUSH_NO_SS] diagnostic message - Status: ✅ Prevents list corruption (defensive) 3. SuperSlab Allocation Class Fix (superslab_allocate.c) - Pass actual class_idx to sp_internal_allocate_superslab - Prevents dummy class=8 causing OOB access - Status: ✅ Root cause fix for allocation path 4. Debug Output Additions - First 256 push/pop operations traced - First 4 mismatches logged with details - SuperSlab registration state logged - Status: ✅ Diagnostic tool (not a fix) 5. TLS Hint Box Removed - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization) - Simplified to focus on stability first - Status: ⏳ Can be re-added after root cause fixed Current Problem (REMAINS UNSOLVED): - [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench - Pointer is 16 bytes offset from expected (class 1 → class 2 boundary) - hak_super_lookup returns NULL for that pointer - Suggests: Use-After-Free, Double-Free, or pointer arithmetic error Root Cause Analysis: - Pattern: Pointer offset by +16 (one class 1 stride) - Timing: Cumulative problem (appears after 60s, not immediately) - Location: Header corruption detected during TLS SLL pop Remaining Issues: ⚠️ Registry fallback is defensive (may hide registration bugs) ⚠️ Push validation prevents symptoms but not root cause ⚠️ 16-byte pointer offset source unidentified Next Steps for Investigation: 1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths) 2. Enhanced logging at HDR_RESET point: - Expected vs actual pointer value - Pointer provenance (where it came from) - Allocation trace for that block 3. Verify Headerless flag is OFF throughout build 4. Check for double-offset application in conversions Technical Assessment: - 60% root cause fixes (allocation class, validation) - 40% defensive mitigation (registry fallback, push rejection) Performance Impact: - Registry fallback: +10-30 cycles on cold path (negligible) - Push validation: +5-10 cycles per push (acceptable) - Overall: < 2% performance impact estimated Related Issues: - Phase 1 TLS Hint Box removed temporarily - Phase 2 Headerless blocked until stability achieved 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 20:42:28 +09:00
int reg_ok = hak_super_register(base, ss);
if (!reg_ok) {
// Registry full - this is a fatal error
fprintf(stderr, "HAKMEM FATAL: SuperSlab registry full, cannot register %p\n", ss);
// Still return ss to avoid memory leak, but lookups may fail
}
WIP: Add TLS SLL validation and SuperSlab registry fallback ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue. Current status: Partial mitigation, but root cause remains. Changes Applied: 1. SuperSlab Registry Fallback (hakmem_super_registry.h) - Added legacy table probe when hash map lookup misses - Prevents NULL returns for valid SuperSlabs during initialization - Status: ✅ Works but may hide underlying registration issues 2. TLS SLL Push Validation (tls_sll_box.h) - Reject push if SuperSlab lookup returns NULL - Reject push if class_idx mismatch detected - Added [TLS_SLL_PUSH_NO_SS] diagnostic message - Status: ✅ Prevents list corruption (defensive) 3. SuperSlab Allocation Class Fix (superslab_allocate.c) - Pass actual class_idx to sp_internal_allocate_superslab - Prevents dummy class=8 causing OOB access - Status: ✅ Root cause fix for allocation path 4. Debug Output Additions - First 256 push/pop operations traced - First 4 mismatches logged with details - SuperSlab registration state logged - Status: ✅ Diagnostic tool (not a fix) 5. TLS Hint Box Removed - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization) - Simplified to focus on stability first - Status: ⏳ Can be re-added after root cause fixed Current Problem (REMAINS UNSOLVED): - [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench - Pointer is 16 bytes offset from expected (class 1 → class 2 boundary) - hak_super_lookup returns NULL for that pointer - Suggests: Use-After-Free, Double-Free, or pointer arithmetic error Root Cause Analysis: - Pattern: Pointer offset by +16 (one class 1 stride) - Timing: Cumulative problem (appears after 60s, not immediately) - Location: Header corruption detected during TLS SLL pop Remaining Issues: ⚠️ Registry fallback is defensive (may hide registration bugs) ⚠️ Push validation prevents symptoms but not root cause ⚠️ 16-byte pointer offset source unidentified Next Steps for Investigation: 1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths) 2. Enhanced logging at HDR_RESET point: - Expected vs actual pointer value - Pointer provenance (where it came from) - Allocation trace for that block 3. Verify Headerless flag is OFF throughout build 4. Check for double-offset application in conversions Technical Assessment: - 60% root cause fixes (allocation class, validation) - 40% defensive mitigation (registry fallback, push rejection) Performance Impact: - Registry fallback: +10-30 cycles on cold path (negligible) - Push validation: +5-10 cycles per push (acceptable) - Overall: < 2% performance impact estimated Related Issues: - Phase 1 TLS Hint Box removed temporarily - Phase 2 Headerless blocked until stability achieved 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 20:42:28 +09:00
do {
static _Atomic uint32_t g_ss_reg_log_shot = 0;
uint32_t shot = atomic_fetch_add_explicit(&g_ss_reg_log_shot, 1, memory_order_relaxed);
if (shot < 4) {
fprintf(stderr,
"[SS_REG_DEBUG] class=%u ss=%p reg_ok=%d map_count=%zu\n",
(unsigned)size_class,
(void*)ss,
reg_ok,
g_ss_addr_map.count);
fflush(stderr);
}
} while (0);
return ss;
}
// ============================================================================
// SuperSlab Deallocation
// ============================================================================
void superslab_free(SuperSlab* ss) {
if (!ss || ss->magic != SUPERSLAB_MAGIC) {
return; // Invalid SuperSlab
}
WIP: Add TLS SLL validation and SuperSlab registry fallback ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue. Current status: Partial mitigation, but root cause remains. Changes Applied: 1. SuperSlab Registry Fallback (hakmem_super_registry.h) - Added legacy table probe when hash map lookup misses - Prevents NULL returns for valid SuperSlabs during initialization - Status: ✅ Works but may hide underlying registration issues 2. TLS SLL Push Validation (tls_sll_box.h) - Reject push if SuperSlab lookup returns NULL - Reject push if class_idx mismatch detected - Added [TLS_SLL_PUSH_NO_SS] diagnostic message - Status: ✅ Prevents list corruption (defensive) 3. SuperSlab Allocation Class Fix (superslab_allocate.c) - Pass actual class_idx to sp_internal_allocate_superslab - Prevents dummy class=8 causing OOB access - Status: ✅ Root cause fix for allocation path 4. Debug Output Additions - First 256 push/pop operations traced - First 4 mismatches logged with details - SuperSlab registration state logged - Status: ✅ Diagnostic tool (not a fix) 5. TLS Hint Box Removed - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization) - Simplified to focus on stability first - Status: ⏳ Can be re-added after root cause fixed Current Problem (REMAINS UNSOLVED): - [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench - Pointer is 16 bytes offset from expected (class 1 → class 2 boundary) - hak_super_lookup returns NULL for that pointer - Suggests: Use-After-Free, Double-Free, or pointer arithmetic error Root Cause Analysis: - Pattern: Pointer offset by +16 (one class 1 stride) - Timing: Cumulative problem (appears after 60s, not immediately) - Location: Header corruption detected during TLS SLL pop Remaining Issues: ⚠️ Registry fallback is defensive (may hide registration bugs) ⚠️ Push validation prevents symptoms but not root cause ⚠️ 16-byte pointer offset source unidentified Next Steps for Investigation: 1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths) 2. Enhanced logging at HDR_RESET point: - Expected vs actual pointer value - Pointer provenance (where it came from) - Allocation trace for that block 3. Verify Headerless flag is OFF throughout build 4. Check for double-offset application in conversions Technical Assessment: - 60% root cause fixes (allocation class, validation) - 40% defensive mitigation (registry fallback, push rejection) Performance Impact: - Registry fallback: +10-30 cycles on cold path (negligible) - Push validation: +5-10 cycles per push (acceptable) - Overall: < 2% performance impact estimated Related Issues: - Phase 1 TLS Hint Box removed temporarily - Phase 2 Headerless blocked until stability achieved 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 20:42:28 +09:00
do {
static _Atomic uint32_t g_ss_free_log = 0;
uint32_t shot = atomic_fetch_add_explicit(&g_ss_free_log, 1, memory_order_relaxed);
if (shot < 6) {
fprintf(stderr, "[SS_FREE_EARLY] ss=%p lg_size=%d active_slabs=%u map_count=%zu\n",
(void*)ss,
ss->lg_size,
ss->active_slabs,
g_ss_addr_map.count);
fflush(stderr);
}
} while (0);
// ADD DEBUG LOGGING
static __thread int dbg = -1;
#if HAKMEM_BUILD_RELEASE
dbg = 0;
#else
if (__builtin_expect(dbg == -1, 0)) {
const char* e = getenv("HAKMEM_SS_FREE_DEBUG");
dbg = (e && *e && *e != '0') ? 1 : 0;
}
#endif
if (dbg == 1) {
fprintf(stderr, "[SS_FREE] CALLED: ss=%p lg_size=%d active_slabs=%u\n",
(void*)ss, ss->lg_size, ss->active_slabs);
}
// Phase 9: Lazy Deallocation - try to cache in LRU instead of munmap
size_t ss_size = (size_t)1 << ss->lg_size;
// Phase 1: Unregister SuperSlab from registry FIRST
// CRITICAL: Must unregister BEFORE adding to LRU cache
// Reason: Cached SuperSlabs should NOT be found by lookups
uintptr_t base = (uintptr_t)ss;
hak_super_unregister(base);
// Memory fence to ensure unregister is visible
atomic_thread_fence(memory_order_release);
// Phase 9: Try LRU cache first (lazy deallocation)
// NOTE: LRU cache keeps magic=SUPERSLAB_MAGIC for validation
// Magic will be cleared on eviction or reuse
int lru_cached = hak_ss_lru_push(ss);
if (dbg == 1) {
fprintf(stderr, "[SS_FREE] hak_ss_lru_push() returned %d\n", lru_cached);
}
if (lru_cached) {
// Successfully cached in LRU - defer munmap
return;
}
// LRU cache full or disabled - try old cache using head class_idx (if known)
int old_cached = ss_cache_push(0, ss);
if (old_cached) {
ss_stats_cache_store();
return;
}
// Both caches full - immediately free to OS (eager deallocation)
// Clear magic to prevent use-after-free
ss->magic = 0;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[DEBUG ss_os_release] Freeing SuperSlab ss=%p size=%zu active=%u (LRU full)\n",
(void*)ss, ss_size,
atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed));
#endif
munmap(ss, ss_size);
// Update statistics for actual release to OS
pthread_mutex_lock(&g_superslab_lock);
g_superslabs_freed++;
// Phase 12: we no longer track per-SS size_class on header; skip g_ss_freed_by_class here
g_bytes_allocated -= ss_size;
pthread_mutex_unlock(&g_superslab_lock);
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[DEBUG ss_os_release] g_superslabs_freed now = %llu\n",
(unsigned long long)g_superslabs_freed);
#endif
}