// hakmem_tiny_superslab.c - SuperSlab allocator implementation (Phase 6.22)
// Purpose: 2MB aligned slab allocation with fast pointer→slab lookup
// License: MIT
// Date: 2025-10-24

#include "hakmem_tiny_superslab.h"
#include "box/ss_hot_cold_box.h"      // Phase 3d-C: Hot/Cold Split
#include "hakmem_super_registry.h"    // Phase 1: Registry integration
#include "hakmem_tiny.h"              // For tiny_self_u32
#include "hakmem_tiny_config.h"       // For extern g_tiny_class_sizes
#include "hakmem_shared_pool.h"       // Phase 12: Shared SuperSlab pool backend (skeleton)

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>        // getenv, atoi
#include <errno.h>
#include <sys/mman.h>
#include <sys/resource.h>  // getrlimit for OOM diagnostics
#include <unistd.h>

#include "hakmem_internal.h"           // HAKMEM_LOG for release-silent logging
#include "tiny_region_id.h"            // For HEADER_MAGIC / HEADER_CLASS_MASK (restore header on remote-drain)
#include "hakmem_tiny_integrity.h"     // HAK_CHECK_CLASS_IDX
#include "box/tiny_next_ptr_box.h"     // For tiny_next_write
#include "box/slab_freelist_atomic.h"  // Phase 1: Atomic freelist accessor

static int g_ss_force_lg = -1;
static _Atomic int g_ss_populate_once = 0;

// Forward: decide next SuperSlab lg for a class (ACE-aware, clamped)
static inline uint8_t hak_tiny_superslab_next_lg(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return SUPERSLAB_LG_DEFAULT;
    }
    // Prefer ACE target if within allowed range
    uint8_t t = atomic_load_explicit((_Atomic uint8_t*)&g_ss_ace[class_idx].target_lg,
                                     memory_order_relaxed);
    if (t < SUPERSLAB_LG_MIN || t > SUPERSLAB_LG_MAX) {
        return SUPERSLAB_LG_DEFAULT;
    }
    return t;
}

// ============================================================================
// Global Statistics
// ============================================================================

static pthread_mutex_t g_superslab_lock = PTHREAD_MUTEX_INITIALIZER;
uint64_t g_superslabs_allocated = 0;  // Non-static for debugging
uint64_t g_superslabs_freed = 0;      // Phase 7.6: Non-static for test access
uint64_t g_bytes_allocated = 0;       // Non-static for debugging

// ============================================================================
// Phase 2a: Dynamic Expansion - Global per-class SuperSlabHeads
// ============================================================================

SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS] = {NULL};

// Debug counters
_Atomic uint64_t g_ss_active_dec_calls = 0;
_Atomic uint64_t g_hak_tiny_free_calls = 0;
_Atomic uint64_t g_ss_remote_push_calls = 0;

// Free path instrumentation (lightweight, for OOM/route diagnosis)
_Atomic uint64_t g_free_ss_enter = 0;          // hak_tiny_free_superslab() entries
_Atomic uint64_t g_free_local_box_calls = 0;   // same-thread freelist pushes
_Atomic uint64_t g_free_remote_box_calls = 0;  // cross-thread remote pushes

// Per-class counters for gating/metrics (Tiny classes = 8)
uint64_t g_ss_alloc_by_class[8] = {0};
uint64_t g_ss_freed_by_class[8] = {0};

typedef struct SuperslabCacheEntry {
    struct SuperslabCacheEntry* next;
} SuperslabCacheEntry;

static SuperslabCacheEntry* g_ss_cache_head[8] = {0};
static size_t g_ss_cache_count[8] = {0};
static size_t g_ss_cache_cap[8] = {0};
static size_t g_ss_precharge_target[8] = {0};
static _Atomic int g_ss_precharge_done[8] = {0};
static int g_ss_cache_enabled = 0;
static pthread_once_t g_ss_cache_once = PTHREAD_ONCE_INIT;
static pthread_mutex_t g_ss_cache_lock[8];
uint64_t g_ss_cache_hits[8] = {0};
uint64_t g_ss_cache_misses[8] = {0};
uint64_t g_ss_cache_puts[8] = {0};
uint64_t g_ss_cache_drops[8] = {0};
uint64_t g_ss_cache_precharged[8] =
{0}; uint64_t g_superslabs_reused = 0; uint64_t g_superslabs_cached = 0; static void ss_cache_global_init(void) { for (int i = 0; i < 8; i++) { pthread_mutex_init(&g_ss_cache_lock[i], NULL); } } static inline void ss_cache_ensure_init(void) { pthread_once(&g_ss_cache_once, ss_cache_global_init); } static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate); static void ss_cache_precharge(uint8_t size_class, size_t ss_size, uintptr_t ss_mask); static SuperslabCacheEntry* ss_cache_pop(uint8_t size_class); static int ss_cache_push(uint8_t size_class, SuperSlab* ss); // Drain remote MPSC stack into freelist (ownership already verified by caller) void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMeta* meta) { if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss) || !meta) return; static _Atomic uint32_t g_remote_drain_diag_once = 0; static int g_remote_drain_diag_en = -1; // Atomically take the whole remote list uintptr_t head = atomic_exchange_explicit(&ss->remote_heads[slab_idx], 0, memory_order_acq_rel); if (head == 0) return; // Convert remote stack (offset 0 next) into freelist encoding via Box API // and splice in front of current freelist preserving relative order. void* prev = meta->freelist; int cls = (int)meta->class_idx; HAK_CHECK_CLASS_IDX(cls, "_ss_remote_drain_to_freelist_unsafe"); if (__builtin_expect(cls < 0 || cls >= TINY_NUM_CLASSES, 0)) { static _Atomic int g_remote_drain_cls_oob = 0; if (atomic_fetch_add_explicit(&g_remote_drain_cls_oob, 1, memory_order_relaxed) == 0) { fprintf(stderr, "[REMOTE_DRAIN_CLASS_OOB] ss=%p slab_idx=%d meta=%p cls=%d head=%#lx\n", (void*)ss, slab_idx, (void*)meta, cls, (unsigned long)head); } return; } uintptr_t cur = head; while (cur != 0) { uintptr_t next = *(uintptr_t*)cur; // remote-next stored at offset 0 if (__builtin_expect(g_remote_drain_diag_en == -1, 0)) { const char* e = getenv("HAKMEM_TINY_SLL_DIAG"); g_remote_drain_diag_en = (e && *e && *e != '0') ? 
1 : 0; } if (__builtin_expect(g_remote_drain_diag_en, 0)) { uintptr_t addr = (uintptr_t)next; if (addr != 0 && (addr < 4096 || addr > 0x00007fffffffffffULL)) { uint32_t shot = atomic_fetch_add_explicit(&g_remote_drain_diag_once, 1, memory_order_relaxed); if (shot < 8) { fprintf(stderr, "[REMOTE_DRAIN_NEXT_INVALID] cls=%d slab=%d cur=%p next=%p head=%#lx prev=%p count=%u\n", cls, slab_idx, (void*)cur, (void*)next, (unsigned long)head, prev, (unsigned)meta->used); } } #if HAKMEM_TINY_HEADER_CLASSIDX int hdr_cls = tiny_region_id_read_header((uint8_t*)cur + 1); if (hdr_cls >= 0 && hdr_cls != cls) { uint32_t shot = atomic_fetch_add_explicit(&g_remote_drain_diag_once, 1, memory_order_relaxed); if (shot < 8) { fprintf(stderr, "[REMOTE_DRAIN_HDR_MISMATCH] cls=%d slab=%d cur=%p hdr_cls=%d meta_cls=%d head=%#lx\n", cls, slab_idx, (void*)cur, hdr_cls, (int)meta->class_idx, (unsigned long)head); } } #endif } #if HAKMEM_TINY_HEADER_CLASSIDX // Cross-check header vs meta before writing next (even if diag is off) { int hdr_cls_pre = tiny_region_id_read_header((uint8_t*)cur + 1); if (hdr_cls_pre >= 0 && hdr_cls_pre != cls) { static _Atomic uint32_t g_hdr_meta_mismatch_rd = 0; uint32_t n = atomic_fetch_add_explicit(&g_hdr_meta_mismatch_rd, 1, memory_order_relaxed); if (n < 16) { fprintf(stderr, "[REMOTE_DRAIN_HDR_META_MISMATCH] cls=%d slab=%d cur=%p hdr_cls=%d meta_cls=%d\n", cls, slab_idx, (void*)cur, hdr_cls_pre, (int)meta->class_idx); } } } #endif // Restore header for header-classes (class 1-6) which were clobbered by remote push #if HAKMEM_TINY_HEADER_CLASSIDX if (cls != 0) { uint8_t expected = (uint8_t)(HEADER_MAGIC | (cls & HEADER_CLASS_MASK)); *(uint8_t*)(uintptr_t)cur = expected; } #endif // Rewrite next pointer to Box representation for this class tiny_next_write(cls, (void*)cur, prev); prev = (void*)cur; cur = next; } meta->freelist = prev; // Reset remote count after full drain atomic_store_explicit(&ss->remote_counts[slab_idx], 0, memory_order_release); // Update freelist/nonempty visibility bits uint32_t bit = (1u << slab_idx); atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); atomic_fetch_or_explicit(&ss->nonempty_mask, bit, memory_order_release); } static inline void ss_stats_os_alloc(uint8_t size_class, size_t ss_size) { pthread_mutex_lock(&g_superslab_lock); g_superslabs_allocated++; if (size_class < 8) { g_ss_alloc_by_class[size_class]++; } g_bytes_allocated += ss_size; pthread_mutex_unlock(&g_superslab_lock); } static inline void ss_stats_cache_reuse(void) { pthread_mutex_lock(&g_superslab_lock); g_superslabs_reused++; pthread_mutex_unlock(&g_superslab_lock); } static inline void ss_stats_cache_store(void) { pthread_mutex_lock(&g_superslab_lock); g_superslabs_cached++; pthread_mutex_unlock(&g_superslab_lock); } // ============================================================================ // Phase 8.3: ACE (Adaptive Cache Engine) State // ============================================================================ SuperSlabACEState g_ss_ace[TINY_NUM_CLASSES_SS] = {{0}}; // Phase 8.3: hak_now_ns() is now defined in hakmem_tiny_superslab.h as static inline // ============================================================================ // Diagnostics // ============================================================================ static void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err) { static int logged = 0; if (logged) return; logged = 1; // CRITICAL FIX: Increment lock depth FIRST before any LIBC calls // fopen/fclose/getrlimit/fprintf all 
    // may call malloc internally
    // Must bypass HAKMEM wrapper to avoid header mismatch crash
    extern __thread int g_hakmem_lock_depth;
    g_hakmem_lock_depth++;  // Force wrapper to use __libc_malloc

    struct rlimit rl = {0};
    if (getrlimit(RLIMIT_AS, &rl) != 0) {
        rl.rlim_cur = RLIM_INFINITY;
        rl.rlim_max = RLIM_INFINITY;
    }

    unsigned long vm_size_kb = 0;
    unsigned long vm_rss_kb = 0;
    FILE* status = fopen("/proc/self/status", "r");
    if (status) {
        char line[256];
        while (fgets(line, sizeof(line), status)) {
            if (strncmp(line, "VmSize:", 7) == 0) {
                (void)sscanf(line + 7, "%lu", &vm_size_kb);
            } else if (strncmp(line, "VmRSS:", 6) == 0) {
                (void)sscanf(line + 6, "%lu", &vm_rss_kb);
            }
        }
        fclose(status);
    }

    // CRITICAL FIX: Do NOT decrement lock_depth yet!
    // fprintf() below may call malloc for buffering
    char rl_cur_buf[32];
    char rl_max_buf[32];
    if (rl.rlim_cur == RLIM_INFINITY) {
        strcpy(rl_cur_buf, "inf");
    } else {
        snprintf(rl_cur_buf, sizeof(rl_cur_buf), "%llu", (unsigned long long)rl.rlim_cur);
    }
    if (rl.rlim_max == RLIM_INFINITY) {
        strcpy(rl_max_buf, "inf");
    } else {
        snprintf(rl_max_buf, sizeof(rl_max_buf), "%llu", (unsigned long long)rl.rlim_max);
    }

#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr,
            "[SS OOM] mmap failed: err=%d ss_size=%zu alloc_size=%zu "
            "alloc=%llu freed=%llu bytes=%llu "
            "RLIMIT_AS(cur=%s max=%s) VmSize=%lu kB VmRSS=%lu kB\n",
            err, ss_size, alloc_size,
            (unsigned long long)g_superslabs_allocated,
            (unsigned long long)g_superslabs_freed,
            (unsigned long long)g_bytes_allocated,
            rl_cur_buf, rl_max_buf, vm_size_kb, vm_rss_kb);
#endif

    g_hakmem_lock_depth--;  // Now safe to restore (all libc calls complete)
}

// Global counters for debugging (non-static for external access)
_Atomic uint64_t g_ss_mmap_count = 0;
_Atomic uint64_t g_final_fallback_mmap_count = 0;

static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate) {
    void* ptr = NULL;
    static int log_count = 0;

#ifdef MAP_ALIGNED_SUPER
    int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER;
#ifdef MAP_POPULATE
    if (populate) {
        map_flags |= MAP_POPULATE;
    }
#endif
    ptr = mmap(NULL, ss_size, PROT_READ | PROT_WRITE, map_flags, -1, 0);
    if (ptr != MAP_FAILED) {
        atomic_fetch_add(&g_ss_mmap_count, 1);
        if (((uintptr_t)ptr & ss_mask) == 0) {
            ss_stats_os_alloc(size_class, ss_size);
            return ptr;
        }
        munmap(ptr, ss_size);
        ptr = NULL;
    } else {
        log_superslab_oom_once(ss_size, ss_size, errno);
    }
#endif

    size_t alloc_size = ss_size * 2;
    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
#ifdef MAP_POPULATE
    if (populate) {
        flags |= MAP_POPULATE;
    }
#endif
    void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, flags, -1, 0);
    if (raw != MAP_FAILED) {
        uint64_t count = atomic_fetch_add(&g_ss_mmap_count, 1) + 1;
#if !HAKMEM_BUILD_RELEASE
        if (log_count < 10) {
            fprintf(stderr, "[SUPERSLAB_MMAP] #%lu: class=%d size=%zu (total SuperSlab mmaps so far)\n",
                    (unsigned long)count, size_class, ss_size);
            log_count++;
        }
#endif
    }
    if (raw == MAP_FAILED) {
        log_superslab_oom_once(ss_size, alloc_size, errno);
        return NULL;
    }

    uintptr_t raw_addr = (uintptr_t)raw;
    uintptr_t aligned_addr = (raw_addr + ss_mask) & ~ss_mask;
    ptr = (void*)aligned_addr;

    size_t prefix_size = aligned_addr - raw_addr;
    if (prefix_size > 0) {
        munmap(raw, prefix_size);
    }
    size_t suffix_size = alloc_size - prefix_size - ss_size;
    if (suffix_size > 0) {
        if (populate) {
#ifdef MADV_DONTNEED
            madvise((char*)ptr + ss_size, suffix_size, MADV_DONTNEED);
#endif
        } else {
            munmap((char*)ptr + ss_size, suffix_size);
        }
    }

    ss_stats_os_alloc(size_class, ss_size);
    return ptr;
}

static void ss_cache_precharge(uint8_t
size_class, size_t ss_size, uintptr_t ss_mask) { if (!g_ss_cache_enabled) return; if (size_class >= 8) return; if (g_ss_precharge_target[size_class] == 0) return; if (atomic_load_explicit(&g_ss_precharge_done[size_class], memory_order_acquire)) return; ss_cache_ensure_init(); pthread_mutex_lock(&g_ss_cache_lock[size_class]); size_t target = g_ss_precharge_target[size_class]; size_t cap = g_ss_cache_cap[size_class]; size_t desired = target; if (cap != 0 && desired > cap) { desired = cap; } while (g_ss_cache_count[size_class] < desired) { void* raw = ss_os_acquire(size_class, ss_size, ss_mask, 1); if (!raw) { break; } SuperslabCacheEntry* entry = (SuperslabCacheEntry*)raw; entry->next = g_ss_cache_head[size_class]; g_ss_cache_head[size_class] = entry; g_ss_cache_count[size_class]++; g_ss_cache_precharged[size_class]++; } atomic_store_explicit(&g_ss_precharge_done[size_class], 1, memory_order_release); pthread_mutex_unlock(&g_ss_cache_lock[size_class]); } static SuperslabCacheEntry* ss_cache_pop(uint8_t size_class) { if (!g_ss_cache_enabled) return NULL; if (size_class >= 8) return NULL; ss_cache_ensure_init(); pthread_mutex_lock(&g_ss_cache_lock[size_class]); SuperslabCacheEntry* entry = g_ss_cache_head[size_class]; if (entry) { g_ss_cache_head[size_class] = entry->next; if (g_ss_cache_count[size_class] > 0) { g_ss_cache_count[size_class]--; } entry->next = NULL; g_ss_cache_hits[size_class]++; } else { g_ss_cache_misses[size_class]++; } pthread_mutex_unlock(&g_ss_cache_lock[size_class]); return entry; } static int ss_cache_push(uint8_t size_class, SuperSlab* ss) { if (!g_ss_cache_enabled) return 0; if (size_class >= 8) return 0; ss_cache_ensure_init(); pthread_mutex_lock(&g_ss_cache_lock[size_class]); size_t cap = g_ss_cache_cap[size_class]; if (cap != 0 && g_ss_cache_count[size_class] >= cap) { g_ss_cache_drops[size_class]++; pthread_mutex_unlock(&g_ss_cache_lock[size_class]); return 0; } SuperslabCacheEntry* entry = (SuperslabCacheEntry*)ss; entry->next = g_ss_cache_head[size_class]; g_ss_cache_head[size_class] = entry; g_ss_cache_count[size_class]++; g_ss_cache_puts[size_class]++; pthread_mutex_unlock(&g_ss_cache_lock[size_class]); return 1; } /* * Legacy backend for hak_tiny_alloc_superslab_box(). * * Phase 12 Stage A/B: * - Uses per-class SuperSlabHead (g_superslab_heads) as the implementation. * - Callers MUST use hak_tiny_alloc_superslab_box() and never touch this directly. * - Later Stage C: this function will be replaced by a shared_pool backend. */ static SuperSlabHead* init_superslab_head(int class_idx); static int expand_superslab_head(SuperSlabHead* head); static void* hak_tiny_alloc_superslab_backend_legacy(int class_idx) { if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { return NULL; } SuperSlabHead* head = g_superslab_heads[class_idx]; if (!head) { head = init_superslab_head(class_idx); if (!head) { return NULL; } g_superslab_heads[class_idx] = head; } SuperSlab* chunk = head->current_chunk ? head->current_chunk : head->first_chunk; while (chunk) { int cap = ss_slabs_capacity(chunk); for (int slab_idx = 0; slab_idx < cap; slab_idx++) { TinySlabMeta* meta = &chunk->slabs[slab_idx]; // Skip slabs that belong to a different class (or are uninitialized). 
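            // Uninitialized slab metadata is zero-filled (class_idx 0, capacity 0),
            // and the shared-pool path additionally uses class_idx 255 for
            // UNASSIGNED slabs, so the capacity check below still rejects a
            // zero-filled slab even when the class index happens to match.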
if (meta->class_idx != (uint8_t)class_idx) { continue; } if (meta->capacity == 0) { continue; } if (meta->used < meta->capacity) { size_t stride = tiny_block_stride_for_class(class_idx); size_t offset = (size_t)meta->used * stride; uint8_t* base = (uint8_t*)chunk + SUPERSLAB_SLAB0_DATA_OFFSET + (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE + offset; meta->used++; atomic_fetch_add_explicit(&chunk->total_active_blocks, 1, memory_order_relaxed); return (void*)base; } } chunk = chunk->next_chunk; } if (expand_superslab_head(head) < 0) { return NULL; } SuperSlab* new_chunk = head->current_chunk; if (!new_chunk) { return NULL; } int cap2 = ss_slabs_capacity(new_chunk); for (int slab_idx = 0; slab_idx < cap2; slab_idx++) { TinySlabMeta* meta = &new_chunk->slabs[slab_idx]; if (meta->capacity == 0) continue; if (meta->used < meta->capacity) { size_t stride = tiny_block_stride_for_class(class_idx); size_t offset = (size_t)meta->used * stride; uint8_t* base = (uint8_t*)new_chunk + SUPERSLAB_SLAB0_DATA_OFFSET + (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE + offset; meta->used++; atomic_fetch_add_explicit(&new_chunk->total_active_blocks, 1, memory_order_relaxed); return (void*)base; } } return NULL; } /* * Shared pool backend for hak_tiny_alloc_superslab_box(). * * Phase 12-2: * - Uses SharedSuperSlabPool (g_shared_pool) to obtain a SuperSlab/slab * for the requested class_idx. * - This backend EXPRESSLY owns only: * - choosing (ss, slab_idx) via shared_pool_acquire_slab() * - initializing that slab's TinySlabMeta via superslab_init_slab() * and nothing else; all callers must go through hak_tiny_alloc_superslab_box(). * * - For now this is a minimal, conservative implementation: * - One linear bump-run is carved from the acquired slab using tiny_block_stride_for_class(). * - No complex per-slab freelist or refill policy yet (Phase 12-3+). * - If shared_pool_acquire_slab() fails, we fall back to legacy backend. */ static void* hak_tiny_alloc_superslab_backend_shared(int class_idx) { if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { return NULL; } SuperSlab* ss = NULL; int slab_idx = -1; if (shared_pool_acquire_slab(class_idx, &ss, &slab_idx) != 0 || !ss) { // Shared pool could not provide a slab; caller may choose to fall back. return NULL; } TinySlabMeta* meta = &ss->slabs[slab_idx]; // Defensive: shared_pool must either hand us an UNASSIGNED slab or one // already bound to this class. Anything else is a hard bug. if (meta->class_idx != 255 && meta->class_idx != (uint8_t)class_idx) { #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[HAKMEM][SS_SHARED] BUG: acquire_slab mismatch: cls=%d meta->class_idx=%u slab_idx=%d ss=%p\n", class_idx, (unsigned)meta->class_idx, slab_idx, (void*)ss); #endif return NULL; } // Initialize slab geometry once for this class. if (meta->capacity == 0) { size_t block_size = g_tiny_class_sizes[class_idx]; // owner_tid_low is advisory; we can use 0 in this backend. superslab_init_slab(ss, slab_idx, block_size, 0); meta = &ss->slabs[slab_idx]; // CRITICAL FIX: Always set class_idx after init to avoid C0/C7 confusion. // New SuperSlabs start with meta->class_idx=0 (mmap zero-init). // Must explicitly set to requested class, not just when class_idx==255. meta->class_idx = (uint8_t)class_idx; } // Final contract check before computing addresses. 
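    // The check below rejects any slab that violates the contract established
    // above: it must be bound to the requested class, its geometry must be
    // initialized (capacity != 0), and the bump cursor must not exceed capacity.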
    if (meta->class_idx != (uint8_t)class_idx ||
        meta->capacity == 0 ||
        meta->used > meta->capacity) {
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr,
                "[HAKMEM][SS_SHARED] BUG: invalid slab meta before alloc: "
                "cls=%d slab_idx=%d meta_cls=%u used=%u cap=%u ss=%p\n",
                class_idx, slab_idx, (unsigned)meta->class_idx,
                (unsigned)meta->used, (unsigned)meta->capacity, (void*)ss);
#endif
        return NULL;
    }

    // Simple bump allocation within this slab.
    if (meta->used >= meta->capacity) {
        // Slab exhausted: in minimal Phase12-2 backend we do not loop;
        // caller or future logic must acquire another slab.
        return NULL;
    }

    size_t stride = tiny_block_stride_for_class(class_idx);
    size_t offset = (size_t)meta->used * stride;
    // Phase 12-2 minimal geometry:
    //   - slab 0 data offset via SUPERSLAB_SLAB0_DATA_OFFSET
    //   - subsequent slabs at fixed SUPERSLAB_SLAB_USABLE_SIZE strides.
    size_t slab_base_off = SUPERSLAB_SLAB0_DATA_OFFSET +
                           (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE;
    uint8_t* base = (uint8_t*)ss + slab_base_off + offset;

    meta->used++;
    atomic_fetch_add_explicit(&ss->total_active_blocks, 1, memory_order_relaxed);
    return (void*)base;
}

/*
 * Box API entry:
 * - Single front-door for tiny-side Superslab allocations.
 *
 * Phase 12 policy:
 * - HAKMEM_TINY_SS_SHARED=0 → legacy backend only (for regression checks)
 * - HAKMEM_TINY_SS_SHARED=1 → prefer the shared backend; fall back to legacy only on failure
 */
void* hak_tiny_alloc_superslab_box(int class_idx) {
    static int g_ss_shared_mode = -1;
    static _Atomic uint32_t g_ss_backend_log = 0;
    if (__builtin_expect(g_ss_shared_mode == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_SS_SHARED");
        if (!e || !*e) {
            g_ss_shared_mode = 1;  // default: shared enabled
        } else {
            int v = atoi(e);
            g_ss_shared_mode = (v != 0) ? 1 : 0;
        }
    }

    if (g_ss_shared_mode == 1) {
        void* p = hak_tiny_alloc_superslab_backend_shared(class_idx);
        if (p != NULL) {
            uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
            if (n < 4) {
                fprintf(stderr, "[SS_BACKEND] shared cls=%d ptr=%p\n", class_idx, p);
            }
            return p;
        }
        // If the shared backend fails, fall back to legacy on the safe side.
        uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
        if (n < 4) {
            fprintf(stderr, "[SS_BACKEND] shared_fail→legacy cls=%d\n", class_idx);
        }
        return hak_tiny_alloc_superslab_backend_legacy(class_idx);
    }

    // With shared OFF, use the legacy backend only.
    uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
    if (n < 4) {
        fprintf(stderr, "[SS_BACKEND] legacy cls=%d\n", class_idx);
    }
    return hak_tiny_alloc_superslab_backend_legacy(class_idx);
}

// ============================================================================
// SuperSlab Allocation (2MB aligned)
// ============================================================================

SuperSlab* superslab_allocate(uint8_t size_class) {
    // Optional fault injection for testing: HAKMEM_TINY_SS_FAULT_RATE=N → fail one in every N allocations
    static int fault_rate = -1;  // -1=unparsed, 0=disabled, >0=rate
    static __thread unsigned long fault_tick = 0;
    if (__builtin_expect(fault_rate == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_SS_FAULT_RATE");
        if (e && *e) {
            int v = atoi(e);
            if (v < 0) v = 0;
            fault_rate = v;
        } else {
            fault_rate = 0;
        }
    }
    if (fault_rate > 0) {
        unsigned long t = ++fault_tick;
        if ((t % (unsigned long)fault_rate) == 0ul) {
            return NULL;  // simulate OOM
        }
    }

    // Optional env clamp for SuperSlab size
    static int env_parsed = 0;
    static uint8_t g_ss_min_lg_env = SUPERSLAB_LG_DEFAULT;  // Start with default (2MB)
    static uint8_t g_ss_max_lg_env = SUPERSLAB_LG_MAX;
    if (!env_parsed) {
        char* maxmb =
getenv("HAKMEM_TINY_SS_MAX_MB"); if (maxmb) { int m = atoi(maxmb); if (m == 1) g_ss_max_lg_env = 20; else if (m == 2) g_ss_max_lg_env = 21; } char* minmb = getenv("HAKMEM_TINY_SS_MIN_MB"); if (minmb) { int m = atoi(minmb); if (m == 1) g_ss_min_lg_env = 20; else if (m == 2) g_ss_min_lg_env = 21; } if (g_ss_min_lg_env > g_ss_max_lg_env) g_ss_min_lg_env = g_ss_max_lg_env; const char* force_lg_env = getenv("HAKMEM_TINY_SS_FORCE_LG"); if (force_lg_env && *force_lg_env) { int v = atoi(force_lg_env); if (v >= SUPERSLAB_LG_MIN && v <= SUPERSLAB_LG_MAX) { g_ss_force_lg = v; g_ss_min_lg_env = g_ss_max_lg_env = v; } } size_t precharge_default = 0; const char* precharge_env = getenv("HAKMEM_TINY_SS_PRECHARGE"); if (precharge_env && *precharge_env) { long v = atol(precharge_env); if (v < 0) v = 0; precharge_default = (size_t)v; if (v > 0) { atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); } } size_t cache_default = 0; const char* cache_env = getenv("HAKMEM_TINY_SS_CACHE"); if (cache_env && *cache_env) { long v = atol(cache_env); if (v < 0) v = 0; cache_default = (size_t)v; } for (int i = 0; i < 8; i++) { g_ss_cache_cap[i] = cache_default; g_ss_precharge_target[i] = precharge_default; } for (int i = 0; i < 8; i++) { char name[64]; snprintf(name, sizeof(name), "HAKMEM_TINY_SS_CACHE_C%d", i); char* cap_env = getenv(name); if (cap_env && *cap_env) { long v = atol(cap_env); if (v < 0) v = 0; g_ss_cache_cap[i] = (size_t)v; } snprintf(name, sizeof(name), "HAKMEM_TINY_SS_PRECHARGE_C%d", i); char* pre_env = getenv(name); if (pre_env && *pre_env) { long v = atol(pre_env); if (v < 0) v = 0; g_ss_precharge_target[i] = (size_t)v; if (v > 0) { atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); } } if (g_ss_cache_cap[i] > 0 || g_ss_precharge_target[i] > 0) { g_ss_cache_enabled = 1; } } const char* populate_env = getenv("HAKMEM_TINY_SS_POPULATE_ONCE"); if (populate_env && atoi(populate_env) != 0) { atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed); } env_parsed = 1; } uint8_t lg = (g_ss_force_lg >= 0) ? (uint8_t)g_ss_force_lg : hak_tiny_superslab_next_lg(size_class); if (lg < g_ss_min_lg_env) lg = g_ss_min_lg_env; if (lg > g_ss_max_lg_env) lg = g_ss_max_lg_env; size_t ss_size = (size_t)1 << lg; // 2^20 = 1MB, 2^21 = 2MB uintptr_t ss_mask = ss_size - 1; int from_cache = 0; void* ptr = NULL; // Debug logging flag (lazy init) static __thread int dbg = -1; if (__builtin_expect(dbg == -1, 0)) { const char* e = getenv("HAKMEM_SS_PREWARM_DEBUG"); dbg = (e && *e && *e != '0') ? 
1 : 0; } // Phase 9: Try LRU cache first (lazy deallocation) SuperSlab* cached_ss = hak_ss_lru_pop(size_class); if (cached_ss) { ptr = (void*)cached_ss; from_cache = 1; // Debug logging for REFILL from LRU if (dbg == 1) { fprintf(stderr, "[REFILL] class=%d from_lru=1 ss=%p\n", size_class, (void*)cached_ss); } // Skip old cache path - LRU cache takes priority } else if (g_ss_cache_enabled && size_class < 8) { // Fallback to old cache (will be deprecated) ss_cache_precharge(size_class, ss_size, ss_mask); SuperslabCacheEntry* old_cached = ss_cache_pop(size_class); if (old_cached) { ptr = (void*)old_cached; from_cache = 1; // Debug logging for REFILL from prewarm (old cache is essentially prewarm) if (dbg == 1) { fprintf(stderr, "[REFILL] class=%d from_prewarm=1 ss=%p\n", size_class, (void*)old_cached); } } } if (!ptr) { int populate = atomic_exchange_explicit(&g_ss_populate_once, 0, memory_order_acq_rel); ptr = ss_os_acquire(size_class, ss_size, ss_mask, populate); if (!ptr) { return NULL; } // Debug logging for REFILL with new allocation if (dbg == 1) { fprintf(stderr, "[REFILL] class=%d new_alloc=1 ss=%p\n", size_class, (void*)ptr); } } // Initialize SuperSlab header (Phase 12: no global size_class field) SuperSlab* ss = (SuperSlab*)ptr; ss->magic = SUPERSLAB_MAGIC; ss->active_slabs = 0; ss->lg_size = lg; // Phase 8.3: Use ACE-determined lg_size (20=1MB, 21=2MB) ss->slab_bitmap = 0; ss->nonempty_mask = 0; // Phase 6-2.1: ChatGPT Pro P0 - init nonempty mask ss->partial_epoch = 0; ss->publish_hint = 0xFF; // Initialize atomics explicitly atomic_store_explicit(&ss->total_active_blocks, 0, memory_order_relaxed); atomic_store_explicit(&ss->refcount, 0, memory_order_relaxed); atomic_store_explicit(&ss->listed, 0, memory_order_relaxed); ss->partial_next = NULL; // Phase 9: Initialize LRU fields ss->last_used_ns = 0; ss->generation = 0; ss->lru_prev = NULL; ss->lru_next = NULL; // Phase 3d-C: Initialize Hot/Cold Split fields ss->hot_count = 0; ss->cold_count = 0; for (int i = 0; i < 16; i++) { ss->hot_indices[i] = 0; ss->cold_indices[i] = 0; } // Initialize all slab metadata (only up to max slabs for this size) int max_slabs = (int)(ss_size / SLAB_SIZE); // DEFENSIVE FIX: Zero all slab metadata arrays to prevent ANY uninitialized pointers // This catches the 0xa2a2a2a2a2a2a2a2 pattern bug (ASan/debug fill pattern) // Even though mmap should return zeroed pages, sanitizers may fill with debug patterns memset(ss->slabs, 0, max_slabs * sizeof(TinySlabMeta)); memset(ss->remote_heads, 0, max_slabs * sizeof(uintptr_t)); memset(ss->remote_counts, 0, max_slabs * sizeof(uint32_t)); memset(ss->slab_listed, 0, max_slabs * sizeof(uint32_t)); for (int i = 0; i < max_slabs; i++) { // Phase 1: Atomic initialization (freelist + used are now _Atomic) slab_freelist_store_relaxed(&ss->slabs[i], NULL); // Explicit NULL (redundant after memset, but clear intent) atomic_store_explicit(&ss->slabs[i].used, 0, memory_order_relaxed); ss->slabs[i].capacity = 0; ss->slabs[i].owner_tid_low = 0; // Initialize remote queue atomics (memset already zeroed, but use proper atomic init) atomic_store_explicit(&ss->remote_heads[i], 0, memory_order_relaxed); atomic_store_explicit(&ss->remote_counts[i], 0, memory_order_relaxed); atomic_store_explicit(&ss->slab_listed[i], 0, memory_order_relaxed); } if (from_cache) { ss_stats_cache_reuse(); } // Phase 8.3: Update ACE current_lg to match allocated size g_ss_ace[size_class].current_lg = lg; // Phase 1: Register SuperSlab in global registry for fast lookup // CRITICAL: Register AFTER full 
    // initialization (ss structure is ready)
    uintptr_t base = (uintptr_t)ss;
    if (!hak_super_register(base, ss)) {
        // Registry full - this is a fatal error
        fprintf(stderr, "HAKMEM FATAL: SuperSlab registry full, cannot register %p\n", ss);
        // Still return ss to avoid memory leak, but lookups may fail
    }

    return ss;
}

// ============================================================================
// Phase 2a: Dynamic Expansion - Chunk Management Functions
// ============================================================================

// Initialize SuperSlabHead for a class
SuperSlabHead* init_superslab_head(int class_idx) {
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
        return NULL;
    }

    // Allocate SuperSlabHead structure
    SuperSlabHead* head = (SuperSlabHead*)calloc(1, sizeof(SuperSlabHead));
    if (!head) {
        extern __thread int g_hakmem_lock_depth;
        g_hakmem_lock_depth++;
        fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate SuperSlabHead for class %d\n", class_idx);
        g_hakmem_lock_depth--;
        return NULL;
    }

    head->class_idx = (uint8_t)class_idx;
    atomic_store_explicit(&head->total_chunks, 0, memory_order_relaxed);
    head->first_chunk = NULL;
    head->current_chunk = NULL;
    pthread_mutex_init(&head->expansion_lock, NULL);

    // Allocate initial chunk(s)
    // Hot classes (1, 4, 6) get 2 initial chunks to reduce contention
    int initial_chunks = 1;
    // Phase 2a: Start with 1 chunk for all classes (expansion will handle growth)
    // This reduces startup memory overhead while still allowing unlimited growth
    initial_chunks = 1;

    for (int i = 0; i < initial_chunks; i++) {
        if (expand_superslab_head(head) < 0) {
            extern __thread int g_hakmem_lock_depth;
            g_hakmem_lock_depth++;
            fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate initial chunk %d for class %d\n",
                    i, class_idx);
            g_hakmem_lock_depth--;
            // Cleanup on failure
            SuperSlab* chunk = head->first_chunk;
            while (chunk) {
                SuperSlab* next = chunk->next_chunk;
                superslab_free(chunk);
                chunk = next;
            }
            pthread_mutex_destroy(&head->expansion_lock);
            free(head);
            return NULL;
        }
    }

    extern __thread int g_hakmem_lock_depth;
    g_hakmem_lock_depth++;
#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "[HAKMEM] Initialized SuperSlabHead for class %d: %zu initial chunks\n",
            class_idx, atomic_load_explicit(&head->total_chunks, memory_order_relaxed));
#endif
    g_hakmem_lock_depth--;

    return head;
}

// Expand SuperSlabHead by allocating and linking a new chunk
int expand_superslab_head(SuperSlabHead* head) {
    if (!head) {
        return -1;
    }

    // Allocate new chunk via existing superslab_allocate
    SuperSlab* new_chunk = superslab_allocate(head->class_idx);
    if (!new_chunk) {
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
        extern __thread int g_hakmem_lock_depth;
        g_hakmem_lock_depth++;
        fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate new chunk for class %d (system OOM)\n",
                head->class_idx);
        g_hakmem_lock_depth--;
#endif
        return -1;  // True OOM (system out of memory)
    }

    // CRITICAL FIX: Initialize slab 0 so bitmap != 0x00000000
    // Phase 2a chunks must have at least one usable slab after allocation
    size_t block_size = g_tiny_class_sizes[head->class_idx];
    // Use pthread_self() directly since tiny_self_u32() is static inline in hakmem_tiny.c
    uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self();
    superslab_init_slab(new_chunk, 0, block_size, owner_tid);

    // Initialize the next_chunk link to NULL
    new_chunk->next_chunk = NULL;

    // Thread-safe linking
    pthread_mutex_lock(&head->expansion_lock);
    if (head->current_chunk) {
        // Find the tail of the list (optimization: could cache tail pointer)
        SuperSlab* tail =
head->current_chunk; while (tail->next_chunk) { tail = tail->next_chunk; } tail->next_chunk = new_chunk; } else { // First chunk head->first_chunk = new_chunk; } // Update current chunk to new chunk (for fast allocation) head->current_chunk = new_chunk; // Increment total chunks atomically size_t old_count = atomic_fetch_add_explicit(&head->total_chunks, 1, memory_order_relaxed); size_t new_count = old_count + 1; pthread_mutex_unlock(&head->expansion_lock); #if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) extern __thread int g_hakmem_lock_depth; g_hakmem_lock_depth++; fprintf(stderr, "[HAKMEM] Expanded SuperSlabHead for class %d: %zu chunks now (bitmap=0x%08x)\n", head->class_idx, new_count, new_chunk->slab_bitmap); g_hakmem_lock_depth--; #endif return 0; } // Find which chunk a pointer belongs to SuperSlab* find_chunk_for_ptr(void* ptr, int class_idx) { if (!ptr || class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) { return NULL; } SuperSlabHead* head = g_superslab_heads[class_idx]; if (!head) { return NULL; } uintptr_t ptr_addr = (uintptr_t)ptr; // Walk the chunk list SuperSlab* chunk = head->first_chunk; while (chunk) { // Check if ptr is within this chunk's memory range // Each chunk is aligned to SUPERSLAB_SIZE (1MB or 2MB) uintptr_t chunk_start = (uintptr_t)chunk; size_t chunk_size = (size_t)1 << chunk->lg_size; // Use actual chunk size uintptr_t chunk_end = chunk_start + chunk_size; if (ptr_addr >= chunk_start && ptr_addr < chunk_end) { // Found the chunk return chunk; } chunk = chunk->next_chunk; } return NULL; // Not found in any chunk } // ============================================================================ // SuperSlab Deallocation // ============================================================================ void superslab_free(SuperSlab* ss) { if (!ss || ss->magic != SUPERSLAB_MAGIC) { return; // Invalid SuperSlab } // ADD DEBUG LOGGING static __thread int dbg = -1; if (__builtin_expect(dbg == -1, 0)) { const char* e = getenv("HAKMEM_SS_FREE_DEBUG"); dbg = (e && *e && *e != '0') ? 
1 : 0; } if (dbg == 1) { fprintf(stderr, "[SS_FREE] CALLED: ss=%p lg_size=%d active_slabs=%u\n", (void*)ss, ss->lg_size, ss->active_slabs); } // Phase 9: Lazy Deallocation - try to cache in LRU instead of munmap size_t ss_size = (size_t)1 << ss->lg_size; // Phase 1: Unregister SuperSlab from registry FIRST // CRITICAL: Must unregister BEFORE adding to LRU cache // Reason: Cached SuperSlabs should NOT be found by lookups uintptr_t base = (uintptr_t)ss; hak_super_unregister(base); // Memory fence to ensure unregister is visible atomic_thread_fence(memory_order_release); // Phase 9: Try LRU cache first (lazy deallocation) // NOTE: LRU cache keeps magic=SUPERSLAB_MAGIC for validation // Magic will be cleared on eviction or reuse int lru_cached = hak_ss_lru_push(ss); if (dbg == 1) { fprintf(stderr, "[SS_FREE] hak_ss_lru_push() returned %d\n", lru_cached); } if (lru_cached) { // Successfully cached in LRU - defer munmap return; } // LRU cache full or disabled - try old cache using head class_idx (if known) int old_cached = ss_cache_push(0, ss); if (old_cached) { ss_stats_cache_store(); return; } // Both caches full - immediately free to OS (eager deallocation) // Clear magic to prevent use-after-free ss->magic = 0; #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[DEBUG ss_os_release] Freeing SuperSlab ss=%p size=%zu active=%u (LRU full)\n", (void*)ss, ss_size, atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed)); #endif munmap(ss, ss_size); // Update statistics for actual release to OS pthread_mutex_lock(&g_superslab_lock); g_superslabs_freed++; // Phase 12: we no longer track per-SS size_class on header; skip g_ss_freed_by_class here g_bytes_allocated -= ss_size; pthread_mutex_unlock(&g_superslab_lock); #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[DEBUG ss_os_release] g_superslabs_freed now = %llu\n", (unsigned long long)g_superslabs_freed); #endif } // ============================================================================ // Slab Initialization within SuperSlab // ============================================================================ void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_t owner_tid) { if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { return; } // Phase E1-CORRECT unified geometry: // - block_size is the TOTAL stride for this class (g_tiny_class_sizes[cls]) // - usable bytes are determined by slab index (slab0 vs others) // - capacity = usable / stride for ALL classes (including former C7) size_t usable_size = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE; size_t stride = block_size; uint16_t capacity = (uint16_t)(usable_size / stride); #if !HAKMEM_BUILD_RELEASE if (slab_idx == 0) { fprintf(stderr, "[SUPERSLAB_INIT] slab 0: usable_size=%zu stride=%zu capacity=%u\n", usable_size, stride, (unsigned)capacity); } #endif TinySlabMeta* meta = &ss->slabs[slab_idx]; meta->freelist = NULL; // NULL = linear allocation mode meta->used = 0; meta->capacity = capacity; meta->carved = 0; meta->owner_tid_low = (uint8_t)(owner_tid & 0xFFu); // Fail-safe: stamp class_idx from geometry (stride → class). // This ensures legacy/shared/legacy-refill paths all end with a correct class. 
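    // Example (assuming the same class table as g_tiny_obj_sizes below,
    // {8, 16, 24, 32, 40, 48, 56, 64}): a stride of 32 bytes stamps
    // meta->class_idx = 3 and a stride of 64 stamps class 7; if no entry
    // matches, class_idx is left unchanged.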
for (int i = 0; i < TINY_NUM_CLASSES; i++) { if (g_tiny_class_sizes[i] == stride) { meta->class_idx = (uint8_t)i; break; } } superslab_activate_slab(ss, slab_idx); } // ============================================================================ // Slab Bitmap Management // ============================================================================ void superslab_activate_slab(SuperSlab* ss, int slab_idx) { if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { return; } uint32_t mask = 1u << slab_idx; if ((ss->slab_bitmap & mask) == 0) { ss->slab_bitmap |= mask; ss->active_slabs++; // Phase 3d-C: Update hot/cold indices after activating new slab ss_update_hot_cold_indices(ss); } } void superslab_deactivate_slab(SuperSlab* ss, int slab_idx) { if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) { return; } uint32_t mask = 1u << slab_idx; if (ss->slab_bitmap & mask) { ss->slab_bitmap &= ~mask; ss->active_slabs--; } } int superslab_find_free_slab(SuperSlab* ss) { if (!ss) return -1; if ((int)ss->active_slabs >= ss_slabs_capacity(ss)) { return -1; // No free slabs } // Find first 0 bit in bitmap int cap = ss_slabs_capacity(ss); for (int i = 0; i < cap; i++) { if ((ss->slab_bitmap & (1u << i)) == 0) { return i; } } return -1; } // ============================================================================ // Statistics / Debugging // ============================================================================ void superslab_print_stats(SuperSlab* ss) { if (!ss || ss->magic != SUPERSLAB_MAGIC) { printf("Invalid SuperSlab\n"); return; } printf("=== SuperSlab Stats ===\n"); printf("Address: %p\n", (void*)ss); // Phase 12: per-SS size_class removed; classes are per-slab via meta->class_idx. printf("Active slabs: %u / %d\n", ss->active_slabs, ss_slabs_capacity(ss)); printf("Bitmap: 0x%08X\n", ss->slab_bitmap); printf("\nPer-slab details:\n"); for (int i = 0; i < ss_slabs_capacity(ss); i++) { if (ss->slab_bitmap & (1u << i)) { TinySlabMeta* meta = &ss->slabs[i]; printf(" Slab %2d: used=%u/%u freelist=%p class=%u owner_tid_low=%u\n", i, meta->used, meta->capacity, meta->freelist, (unsigned)meta->class_idx, (unsigned)meta->owner_tid_low); } } printf("\n"); } // Global statistics void superslab_print_global_stats(void) { pthread_mutex_lock(&g_superslab_lock); printf("=== Global SuperSlab Stats ===\n"); printf("SuperSlabs allocated: %lu\n", g_superslabs_allocated); printf("SuperSlabs freed: %lu\n", g_superslabs_freed); printf("SuperSlabs active: %lu\n", g_superslabs_allocated - g_superslabs_freed); printf("Total bytes allocated: %lu MB\n", g_bytes_allocated / (1024 * 1024)); pthread_mutex_unlock(&g_superslab_lock); } // ============================================================================ // Phase 8.3: ACE Statistics / Debugging // ============================================================================ void superslab_ace_print_stats(void) { printf("=== ACE (Adaptive Cache Engine) Stats ===\n"); const char* class_names[8] = {"8B", "16B", "24B", "32B", "40B", "48B", "56B", "64B"}; printf("Class Curr Targ Hot Allocs Refills Spills LiveBlks\n"); printf("--------------------------------------------------------------\n"); for (int i = 0; i < TINY_NUM_CLASSES_SS; i++) { SuperSlabACEState* c = &g_ss_ace[i]; printf("%-6s %2uMB %2uMB %4u %7u %8u %7u %9u\n", class_names[i], (1u << c->current_lg) / (1024 * 1024), (1u << c->target_lg) / (1024 * 1024), c->hot_score, c->alloc_count, c->refill_count, c->spill_count, c->live_blocks); } printf("\n"); } // 
// ============================================================================
// Phase 8.3: ACE Tick Function (Promotion/Demotion Logic)
// ============================================================================

#define ACE_TICK_NS     (150ULL * 1000 * 1000)  // 150ms tick interval
#define ACE_COOLDOWN_NS (800ULL * 1000 * 1000)  // 0.8s cooldown (anti-oscillation)

// Simplified thresholds for refill activity
#define HI_REFILL(k)  (g_ss_ace[k].refill_count > 64)  // High refill rate
#define MID_REFILL(k) (g_ss_ace[k].refill_count > 16)  // Medium refill rate

// Object sizes per class (for capacity calculation)
// Must match TINY size classes: 8, 16, 24, 32, 40, 48, 56, 64 bytes
static const int g_tiny_obj_sizes[TINY_NUM_CLASSES_SS] = {8, 16, 24, 32, 40, 48, 56, 64};

void hak_tiny_superslab_ace_tick(int k, uint64_t now) {
    if (k < 0 || k >= TINY_NUM_CLASSES_SS) return;
    SuperSlabACEState* c = &g_ss_ace[k];

    // Rate limiting: only tick every ACE_TICK_NS (~150ms)
    if (now - c->last_tick_ns < ACE_TICK_NS) return;

    // Calculate capacity for 1MB and 2MB SuperSlabs
    int obj_size = g_tiny_obj_sizes[k];
    double cap1MB = (double)((1U << 20) / obj_size);  // 1MB capacity
    double cap2MB = (double)((1U << 21) / obj_size);  // 2MB capacity

    // Calculate hotness score (weighted: 60% live blocks, 40% refill rate)
    double hot = 0.6 * (double)c->live_blocks + 0.4 * (double)c->refill_count;
    if (hot < 0) hot = 0;
    if (hot > 1000) hot = 1000;
    c->hot_score = (uint16_t)hot;

    // Cooldown mechanism: prevent size changes within 0.8s of last change
    static uint64_t last_switch_ns[TINY_NUM_CLASSES_SS] = {0};
    if (now - last_switch_ns[k] >= ACE_COOLDOWN_NS) {
        if (c->current_lg <= 20) {
            // Promotion condition: 1MB → 2MB
            // High demand (live > 75% capacity) AND high refill rate
            if (c->live_blocks > 0.75 * cap1MB && HI_REFILL(k)) {
                c->target_lg = 21;  // Promote to 2MB
                last_switch_ns[k] = now;
            }
        } else {
            // Demotion condition: 2MB → 1MB
            // Low demand (live < 35% capacity) AND low refill rate
            if (c->live_blocks < 0.35 * cap2MB && !MID_REFILL(k)) {
                c->target_lg = 20;  // Demote to 1MB
                last_switch_ns[k] = now;
            }
        }
    }

    // EMA-style decay for counters (reduce by 75% each tick)
    c->alloc_count = c->alloc_count / 4;
    c->refill_count = c->refill_count / 4;
    c->spill_count = c->spill_count / 4;
    // live_blocks is updated incrementally by alloc/free, not decayed here

    c->last_tick_ns = now;
}

// ============================================================================
// Phase 8.4: ACE Observer (Registry-based, zero hot-path overhead)
// ============================================================================

// Global debug flag (set once at initialization)
static int g_ace_debug = 0;

// Registry-based observation: scan all SuperSlabs for usage stats
static void ace_observe_and_decide(int k) {
    if (k < 0 || k >= TINY_NUM_CLASSES_SS) return;
    SuperSlabACEState* c = &g_ss_ace[k];

    // Scan Registry to count SuperSlabs and total live blocks
    int ss_count = 0;
    uint32_t total_live = 0;

    for (int i = 0; i < SUPER_REG_SIZE; i++) {
        SuperRegEntry* e = &g_super_reg[i];
        // Atomic read (thread-safe)
        uintptr_t base = atomic_load_explicit(
            (_Atomic uintptr_t*)&e->base, memory_order_acquire);
        if (base == 0) continue;  // Empty slot

        // Phase 8.4: Safety check - skip if ss pointer is invalid
        if (!e->ss) continue;

        // Phase 12: per-SS size_class removed; registry entries are per-class by construction.
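        // Rough capacity model applied after this scan:
        //   capacity ≈ ss_count * ((1 << current_lg) / obj_size)
        // e.g. one 2MB SuperSlab (current_lg = 21) holding 64B objects counts as
        // 2097152 / 64 = 32768 blocks; header/metadata overhead is ignored, so
        // util slightly underestimates true occupancy.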
ss_count++; // Phase 8.4: Scan all slabs to count used blocks (zero hot-path overhead) uint32_t ss_live = 0; int cap_scan = ss_slabs_capacity(e->ss); for (int slab_idx = 0; slab_idx < cap_scan; slab_idx++) { TinySlabMeta* meta = &e->ss->slabs[slab_idx]; // Relaxed read is OK (stats only, no hot-path impact) ss_live += meta->used; } total_live += ss_live; } // Calculate utilization int obj_size = g_tiny_obj_sizes[k]; uint8_t current_lg = atomic_load_explicit( (_Atomic uint8_t*)&c->current_lg, memory_order_relaxed); uint32_t capacity = (ss_count > 0) ? ss_count * ((1U << current_lg) / obj_size) : 1; double util = (double)total_live / capacity; // Update hot_score (for debugging/visualization) c->hot_score = (uint16_t)(util * 1000); if (c->hot_score > 1000) c->hot_score = 1000; // Promotion/Demotion decision uint8_t new_target = current_lg; if (current_lg <= 20) { // Promotion: 1MB → 2MB if (util > 0.75) { new_target = 21; } } else { // Demotion: 2MB → 1MB if (util < 0.35) { new_target = 20; } } // Debug output (if enabled) if (g_ace_debug && ss_count > 0) { fprintf(stderr, "[ACE] Class %d (%dB): ss=%d live=%u cap=%u util=%.2f%% lg=%d->%d hot=%d\n", k, obj_size, ss_count, total_live, capacity, util * 100.0, current_lg, new_target, c->hot_score); } // Atomic write (thread-safe) if (new_target != current_lg) { atomic_store_explicit( (_Atomic uint8_t*)&c->target_lg, new_target, memory_order_release); if (g_ace_debug) { fprintf(stderr, "[ACE] *** Class %d: SIZE CHANGE %dMB -> %dMB (util=%.2f%%)\n", k, 1 << (current_lg - 20), 1 << (new_target - 20), util * 100.0); } } } // Called from Learner thread (background observation) void hak_tiny_superslab_ace_observe_all(void) { // Initialize debug flag once static int initialized = 0; if (!initialized) { const char* ace_debug = getenv("HAKMEM_ACE_DEBUG"); g_ace_debug = (ace_debug && atoi(ace_debug) != 0) ? 1 : 0; initialized = 1; } for (int k = 0; k < TINY_NUM_CLASSES_SS; k++) { ace_observe_and_decide(k); } }
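// ----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the allocator).
// A minimal sketch of how a caller might drive this module, assuming the public
// header hakmem_tiny_superslab.h declares hak_tiny_alloc_superslab_box(),
// superslab_print_global_stats() and superslab_ace_print_stats(), and that
// class index 1 is a valid tiny class. The HAKMEM_SUPERSLAB_USAGE_EXAMPLE guard
// is hypothetical; it exists only so this sketch never builds by default.
// Environment knobs are parsed once on first use, so they must be set before
// the first allocation.
// ----------------------------------------------------------------------------
#ifdef HAKMEM_SUPERSLAB_USAGE_EXAMPLE
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    // Backend selection and cache sizing (read lazily on first allocation).
    setenv("HAKMEM_TINY_SS_SHARED", "1", 1);  // prefer shared backend (the default)
    setenv("HAKMEM_TINY_SS_CACHE", "2", 1);   // keep up to 2 retired SuperSlabs per class

    // Front-door allocation for tiny class 1; NULL means both backends failed (OOM).
    void* p = hak_tiny_alloc_superslab_box(1);
    if (!p) {
        fprintf(stderr, "superslab allocation failed\n");
        return 1;
    }

    // Freeing goes through the tiny free path (hak_tiny_free_superslab), which is
    // defined elsewhere and therefore not called here.

    superslab_print_global_stats();  // allocated/freed/active counters
    superslab_ace_print_stats();     // per-class ACE promotion/demotion state
    return 0;
}
#endif /* HAKMEM_SUPERSLAB_USAGE_EXAMPLE */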