// ss_os_acquire_box.c - SuperSlab OS Memory Acquisition Box Implementation

#include "ss_os_acquire_box.h"
#include "../hakmem_build_flags.h"
#include "../hakmem_env_cache.h"

// System headers required by this translation unit (mmap/munmap, getrlimit,
// /proc parsing, atomics, errno). Fixed-width and bool types are assumed to
// come in via ss_os_acquire_box.h.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>
#include <sys/mman.h>
#include <sys/resource.h>

// Global counters for debugging (non-static for external access)
extern _Atomic uint64_t g_ss_mmap_count;
extern _Atomic uint64_t g_final_fallback_mmap_count;
extern _Atomic uint64_t g_ss_os_alloc_calls;
extern _Atomic uint64_t g_ss_os_free_calls;
extern _Atomic uint64_t g_ss_os_madvise_calls;
extern _Atomic uint64_t g_ss_os_madvise_fail_enomem;
extern _Atomic uint64_t g_ss_os_madvise_fail_other;
extern _Atomic uint64_t g_ss_os_huge_alloc_calls;
extern _Atomic uint64_t g_ss_os_huge_fail_calls;
extern _Atomic bool g_ss_madvise_disabled;

// ============================================================================
// OOM Diagnostics
// ============================================================================

static void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err) {
    static int logged = 0;
    if (logged) return;
    logged = 1;

    // CRITICAL FIX: Increment lock depth FIRST, before any libc calls.
    // fopen/fclose/getrlimit/fprintf may all call malloc internally;
    // bypass the HAKMEM wrapper to avoid a header-mismatch crash.
    extern __thread int g_hakmem_lock_depth;
    g_hakmem_lock_depth++;  // Force wrapper to use __libc_malloc

    struct rlimit rl = {0};
    if (getrlimit(RLIMIT_AS, &rl) != 0) {
        rl.rlim_cur = RLIM_INFINITY;
        rl.rlim_max = RLIM_INFINITY;
    }

    unsigned long vm_size_kb = 0;
    unsigned long vm_rss_kb = 0;
    FILE* status = fopen("/proc/self/status", "r");
    if (status) {
        char line[256];
        while (fgets(line, sizeof(line), status)) {
            if (strncmp(line, "VmSize:", 7) == 0) {
                (void)sscanf(line + 7, "%lu", &vm_size_kb);
            } else if (strncmp(line, "VmRSS:", 6) == 0) {
                (void)sscanf(line + 6, "%lu", &vm_rss_kb);
            }
        }
        fclose(status);
    }

    // CRITICAL FIX: Do NOT decrement lock_depth yet!
    // fprintf() below may call malloc for buffering.
    char rl_cur_buf[32];
    char rl_max_buf[32];
    if (rl.rlim_cur == RLIM_INFINITY) {
        strcpy(rl_cur_buf, "inf");
    } else {
        snprintf(rl_cur_buf, sizeof(rl_cur_buf), "%llu", (unsigned long long)rl.rlim_cur);
    }
    if (rl.rlim_max == RLIM_INFINITY) {
        strcpy(rl_max_buf, "inf");
    } else {
        snprintf(rl_max_buf, sizeof(rl_max_buf), "%llu", (unsigned long long)rl.rlim_max);
    }

#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr,
            "[SS OOM] mmap failed: err=%d ss_size=%zu alloc_size=%zu "
            "RLIMIT_AS(cur=%s max=%s) VmSize=%lu kB VmRSS=%lu kB\n",
            err, ss_size, alloc_size, rl_cur_buf, rl_max_buf, vm_size_kb, vm_rss_kb);
#else
    (void)err; (void)ss_size; (void)alloc_size;
    (void)rl_cur_buf; (void)rl_max_buf;
    (void)vm_size_kb; (void)vm_rss_kb;
#endif

    g_hakmem_lock_depth--;  // Now safe to restore (all libc calls complete)
}

// ============================================================================
// HugePage (experimental) helper
// ============================================================================

static void* ss_os_acquire_hugepage_try(size_t ss_size, uintptr_t ss_mask, int populate) {
#ifdef MAP_HUGETLB
    size_t huge_sz = ss_os_huge_size_bytes();
    if (ss_size != huge_sz) {
        // For now, only attempt hugepage when the requested SuperSlab size matches the HugePage size.
        return NULL;
    }

    int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
#ifdef MAP_POPULATE
    if (populate) {
        flags |= MAP_POPULATE;
    }
#endif
#ifdef MAP_HUGE_2MB
    // Best-effort: allow the kernel to pick 2MB huge pages explicitly when available.
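    // Note: MAP_HUGE_2MB encodes the 2 MiB page size in the upper mmap() flag
    // bits and is only defined by newer kernel/libc headers, hence the guard.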
    if (huge_sz == (2ULL << 20)) {
        flags |= MAP_HUGE_2MB;
    }
#endif

    void* ptr = mmap(NULL, huge_sz, PROT_READ | PROT_WRITE, flags, -1, 0);
    if (ptr == MAP_FAILED) {
        ss_os_stats_record_huge_fail();
        return NULL;
    }
    if (((uintptr_t)ptr & ss_mask) != 0) {
        munmap(ptr, huge_sz);
        ss_os_stats_record_huge_fail();
        return NULL;
    }

    ss_os_stats_record_huge_alloc();
    ss_os_stats_record_alloc();
    atomic_fetch_add(&g_ss_mmap_count, 1);
    return ptr;
#else
    (void)ss_size; (void)ss_mask; (void)populate;
    return NULL;
#endif
}

// ============================================================================
// OS Acquisition Implementation
// ============================================================================

void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate) {
    void* ptr = NULL;
    static int log_count = 0;
    (void)size_class;  // Used only for logging in debug builds

    // Experimental HugePage path (research-only, default OFF)
    if (ss_os_huge_enabled()) {
        void* huge = ss_os_acquire_hugepage_try(ss_size, ss_mask, populate);
        if (huge != NULL) {
            return huge;
        }
    }

#ifdef MAP_ALIGNED_SUPER
    int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER;
#ifdef MAP_POPULATE
    if (populate) {
        map_flags |= MAP_POPULATE;
    }
#endif
    ptr = mmap(NULL, ss_size, PROT_READ | PROT_WRITE, map_flags, -1, 0);
    if (ptr != MAP_FAILED) {
        atomic_fetch_add(&g_ss_mmap_count, 1);
        ss_os_stats_record_alloc();
        if (((uintptr_t)ptr & ss_mask) == 0) {
            // Successfully got aligned pointer from OS
            return ptr;
        }
        munmap(ptr, ss_size);
        ptr = NULL;
    } else {
        log_superslab_oom_once(ss_size, ss_size, errno);
    }
#endif

    // Fallback: allocate 2x size and align manually.
    // CRITICAL FIX (2025-12-05): Use MAP_POPULATE in the fallback path.
    // BUG: Previous code marked populate as unused, ignoring the prefault request.
    size_t alloc_size = ss_size * 2;
    int flags = MAP_PRIVATE | MAP_ANONYMOUS;
#ifdef MAP_POPULATE
    if (populate) {
        flags |= MAP_POPULATE;
        static int dbg = 0;
        if (dbg < 3) {
            fprintf(stderr, "[SS_MMAP_DEBUG] populate=%d flags=0x%x (MAP_POPULATE=0x%x)\n",
                    populate, flags, MAP_POPULATE);
            dbg++;
        }
    }
#endif

    void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, flags, -1, 0);
    if (raw != MAP_FAILED) {
        uint64_t count = atomic_fetch_add(&g_ss_mmap_count, 1) + 1;
        ss_os_stats_record_alloc();
#if !HAKMEM_BUILD_RELEASE
        if (log_count < 10) {
            fprintf(stderr, "[SUPERSLAB_MMAP] #%lu: class=%d size=%zu (total SuperSlab mmaps so far)\n",
                    (unsigned long)count, size_class, ss_size);
            log_count++;
        }
#else
        (void)log_count;
#endif
    }
    if (raw == MAP_FAILED) {
        log_superslab_oom_once(ss_size, alloc_size, errno);
        return NULL;
    }

    uintptr_t raw_addr = (uintptr_t)raw;
    uintptr_t aligned_addr = (raw_addr + ss_mask) & ~ss_mask;
    ptr = (void*)aligned_addr;

    size_t prefix_size = aligned_addr - raw_addr;
    if (prefix_size > 0) {
        munmap(raw, prefix_size);
    }
    size_t suffix_size = alloc_size - prefix_size - ss_size;
    if (suffix_size > 0) {
        munmap((char*)ptr + ss_size, suffix_size);
    }
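    // ------------------------------------------------------------------
    // Worked example of the align-and-trim arithmetic above (illustrative
    // numbers only, assuming a 2 MiB SuperSlab with ss_mask = ss_size - 1):
    //   ss_size      = 0x200000 (2 MiB), alloc_size = 0x400000 (4 MiB)
    //   raw          = 0x7f0000123000   (page-aligned, not 2 MiB-aligned)
    //   aligned_addr = (raw + 0x1FFFFF) & ~0x1FFFFF       = 0x7f0000200000
    //   prefix_size  = 0x7f0000200000 - 0x7f0000123000    = 0xDD000  (884 KiB)
    //   suffix_size  = 0x400000 - 0xDD000 - 0x200000      = 0x123000 (1164 KiB)
    // The two munmap() calls return the unused edges to the OS, leaving only
    // the aligned ss_size-byte middle region mapped.
    // ------------------------------------------------------------------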
    // CRITICAL FIX (2025-12-05): Apply MADV_POPULATE_WRITE AFTER the munmap trim.
    // Issue: munmap() appears to undo MAP_POPULATE state on Linux 6.8.0-87.
    // When mmap(4MB, MAP_POPULATE) is trimmed via munmap(prefix) + munmap(suffix),
    // the remaining 2MB middle region loses its "pages populated" state.
    // Solution: Force re-population after trim using MADV_POPULATE_WRITE (Linux 5.14+).
    // See: EXPLICIT_PREFAULT_IMPLEMENTATION_REPORT_20251205.md
#ifdef MADV_POPULATE_WRITE
    if (populate) {
        int ret = ss_os_madvise_guarded(ptr, ss_size, MADV_POPULATE_WRITE, "ss_os_acquire_populate");
        if (ret != 0) {
            if (HAK_ENV_SS_MADVISE_STRICT() && errno == EINVAL) {
                fprintf(stderr, "[SS_OS] madvise(MADV_POPULATE_WRITE) EINVAL (strict mode). Aborting.\n");
                abort();
            }
            // Fallback when MADV_POPULATE_WRITE is compiled in but the call fails
            // at runtime: touch every page explicitly with a write.
            volatile char* p = (volatile char*)ptr;
            for (size_t i = 0; i < ss_size; i += 4096) {
                p[i] = 0;
            }
            p[ss_size - 1] = 0;
        }
    }
#else
    if (populate) {
        // Fallback for kernels < 5.14: explicit page touch
        volatile char* p = (volatile char*)ptr;
        for (size_t i = 0; i < ss_size; i += 4096) {
            p[i] = 0;
        }
        p[ss_size - 1] = 0;
        ss_os_stats_record_madvise();
    }
#endif

    return ptr;
}

static void ss_os_stats_destructor(void) __attribute__((destructor));
static void ss_os_stats_destructor(void) {
    if (!ss_os_stats_enabled()) {
        return;
    }
    fprintf(stderr,
            "[SS_OS_STATS] alloc=%llu free=%llu madvise=%llu madvise_enomem=%llu madvise_other=%llu madvise_disabled=%d "
            "mmap_total=%llu fallback_mmap=%llu huge_alloc=%llu huge_fail=%llu\n",
            (unsigned long long)atomic_load_explicit(&g_ss_os_alloc_calls, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_ss_os_free_calls, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_ss_os_madvise_calls, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_ss_os_madvise_fail_enomem, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_ss_os_madvise_fail_other, memory_order_relaxed),
            atomic_load_explicit(&g_ss_madvise_disabled, memory_order_relaxed) ? 1 : 0,
            (unsigned long long)atomic_load_explicit(&g_ss_mmap_count, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_final_fallback_mmap_count, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_ss_os_huge_alloc_calls, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_ss_os_huge_fail_calls, memory_order_relaxed));
}
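
// ----------------------------------------------------------------------------
// Usage sketch (illustration only, compiled out by default). A minimal example
// of how a caller might drive ss_os_acquire(); the 2 MiB size, the mask
// convention (ss_size - 1), and the SS_OS_ACQUIRE_EXAMPLE guard are assumptions
// made for this sketch, not part of the HAKMEM build.
// ----------------------------------------------------------------------------
#ifdef SS_OS_ACQUIRE_EXAMPLE
static void ss_os_acquire_example(void) {
    const size_t ss_size = 2u << 20;             // hypothetical 2 MiB SuperSlab
    const uintptr_t ss_mask = (uintptr_t)ss_size - 1;

    // Request a prefaulted, ss_size-aligned region for size class 0.
    void* ss = ss_os_acquire(/*size_class=*/0, ss_size, ss_mask, /*populate=*/1);
    if (ss == NULL) {
        return;                                  // mmap failed; OOM diagnostics logged once
    }

    // ... carve slabs out of [ss, ss + ss_size) ...

    munmap(ss, ss_size);                         // release when the SuperSlab is retired
}
#endif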