// ss_os_acquire_box.c - SuperSlab OS Memory Acquisition Box Implementation #include "ss_os_acquire_box.h" #include "../hakmem_build_flags.h" #include #include #include #include #include #include #include // Global counters for debugging (non-static for external access) _Atomic uint64_t g_ss_mmap_count = 0; _Atomic uint64_t g_final_fallback_mmap_count = 0; // ============================================================================ // OOM Diagnostics // ============================================================================ static void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err) { static int logged = 0; if (logged) return; logged = 1; // CRITICAL FIX: Increment lock depth FIRST before any LIBC calls // fopen/fclose/getrlimit/fprintf all may call malloc internally // Must bypass HAKMEM wrapper to avoid header mismatch crash extern __thread int g_hakmem_lock_depth; g_hakmem_lock_depth++; // Force wrapper to use __libc_malloc struct rlimit rl = {0}; if (getrlimit(RLIMIT_AS, &rl) != 0) { rl.rlim_cur = RLIM_INFINITY; rl.rlim_max = RLIM_INFINITY; } unsigned long vm_size_kb = 0; unsigned long vm_rss_kb = 0; FILE* status = fopen("/proc/self/status", "r"); if (status) { char line[256]; while (fgets(line, sizeof(line), status)) { if (strncmp(line, "VmSize:", 7) == 0) { (void)sscanf(line + 7, "%lu", &vm_size_kb); } else if (strncmp(line, "VmRSS:", 6) == 0) { (void)sscanf(line + 6, "%lu", &vm_rss_kb); } } fclose(status); } // CRITICAL FIX: Do NOT decrement lock_depth yet! // fprintf() below may call malloc for buffering char rl_cur_buf[32]; char rl_max_buf[32]; if (rl.rlim_cur == RLIM_INFINITY) { strcpy(rl_cur_buf, "inf"); } else { snprintf(rl_cur_buf, sizeof(rl_cur_buf), "%llu", (unsigned long long)rl.rlim_cur); } if (rl.rlim_max == RLIM_INFINITY) { strcpy(rl_max_buf, "inf"); } else { snprintf(rl_max_buf, sizeof(rl_max_buf), "%llu", (unsigned long long)rl.rlim_max); } #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[SS OOM] mmap failed: err=%d ss_size=%zu alloc_size=%zu " "RLIMIT_AS(cur=%s max=%s) VmSize=%lu kB VmRSS=%lu kB\n", err, ss_size, alloc_size, rl_cur_buf, rl_max_buf, vm_size_kb, vm_rss_kb); #else (void)err; (void)ss_size; (void)alloc_size; (void)rl_cur_buf; (void)rl_max_buf; (void)vm_size_kb; (void)vm_rss_kb; #endif g_hakmem_lock_depth--; // Now safe to restore (all libc calls complete) } // ============================================================================ // OS Acquisition Implementation // ============================================================================ void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate) { void* ptr = NULL; static int log_count = 0; (void)size_class; // Used only for logging in debug builds #ifdef MAP_ALIGNED_SUPER int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER; #ifdef MAP_POPULATE if (populate) { map_flags |= MAP_POPULATE; } #endif ptr = mmap(NULL, ss_size, PROT_READ | PROT_WRITE, map_flags, -1, 0); if (ptr != MAP_FAILED) { atomic_fetch_add(&g_ss_mmap_count, 1); if (((uintptr_t)ptr & ss_mask) == 0) { // Successfully got aligned pointer from OS return ptr; } munmap(ptr, ss_size); ptr = NULL; } else { log_superslab_oom_once(ss_size, ss_size, errno); } #endif // Fallback: allocate 2x size and align manually // CRITICAL FIX (2025-12-05): Use MAP_POPULATE in fallback path // BUG: Previous code marked populate as unused, ignoring prefault request size_t alloc_size = ss_size * 2; int flags = MAP_PRIVATE | MAP_ANONYMOUS; #ifdef MAP_POPULATE if (populate) { flags |= MAP_POPULATE; static int dbg = 0; if (dbg < 3) { fprintf(stderr, "[SS_MMAP_DEBUG] populate=%d flags=0x%x (MAP_POPULATE=0x%x)\n", populate, flags, MAP_POPULATE); dbg++; } } #endif void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, flags, -1, 0); if (raw != MAP_FAILED) { uint64_t count = atomic_fetch_add(&g_ss_mmap_count, 1) + 1; #if !HAKMEM_BUILD_RELEASE if (log_count < 10) { fprintf(stderr, "[SUPERSLAB_MMAP] #%lu: class=%d size=%zu (total SuperSlab mmaps so far)\n", (unsigned long)count, size_class, ss_size); log_count++; } #else (void)log_count; #endif } if (raw == MAP_FAILED) { log_superslab_oom_once(ss_size, alloc_size, errno); return NULL; } uintptr_t raw_addr = (uintptr_t)raw; uintptr_t aligned_addr = (raw_addr + ss_mask) & ~ss_mask; ptr = (void*)aligned_addr; size_t prefix_size = aligned_addr - raw_addr; if (prefix_size > 0) { munmap(raw, prefix_size); } size_t suffix_size = alloc_size - prefix_size - ss_size; if (suffix_size > 0) { munmap((char*)ptr + ss_size, suffix_size); } // CRITICAL FIX (2025-12-05): Apply MADV_POPULATE_WRITE AFTER munmap trim // Issue: munmap() appears to undo MAP_POPULATE state on Linux 6.8.0-87 // When mmap(4MB, MAP_POPULATE) is trimmed via munmap(prefix) + munmap(suffix), // the remaining 2MB middle region loses its "pages populated" flag. // Solution: Force re-population after trim using MADV_POPULATE_WRITE (Linux 5.14+) // See: EXPLICIT_PREFAULT_IMPLEMENTATION_REPORT_20251205.md #ifdef MADV_POPULATE_WRITE if (populate) { int ret = madvise(ptr, ss_size, MADV_POPULATE_WRITE); if (ret != 0) { // Fallback for kernels that support MADV_POPULATE_WRITE but it fails // Use explicit page-by-page touching with writes volatile char* p = (volatile char*)ptr; for (size_t i = 0; i < ss_size; i += 4096) { p[i] = 0; } p[ss_size - 1] = 0; } } #else if (populate) { // Fallback for kernels < 5.14: explicit page touch volatile char* p = (volatile char*)ptr; for (size_t i = 0; i < ss_size; i += 4096) { p[i] = 0; } p[ss_size - 1] = 0; } #endif return ptr; }