Files
hakmem/core/hakmem.c
Moe Charm (CI) 0546454168 WIP: Add TLS SLL validation and SuperSlab registry fallback
ChatGPT's diagnostic changes to address the TLS_SLL_HDR_RESET issue.
Current status: Partial mitigation, but root cause remains.

Changes Applied:
1. SuperSlab Registry Fallback (hakmem_super_registry.h)
   - Added legacy table probe when hash map lookup misses
   - Prevents NULL returns for valid SuperSlabs during initialization
   - Status: Works but may hide underlying registration issues

2. TLS SLL Push Validation (tls_sll_box.h)
   - Reject push if the SuperSlab lookup returns NULL
   - Reject push if a class_idx mismatch is detected
   - Added [TLS_SLL_PUSH_NO_SS] diagnostic message
   - Status: Prevents list corruption (defensive); see the sketch after this list

3. SuperSlab Allocation Class Fix (superslab_allocate.c)
   - Pass the actual class_idx to sp_internal_allocate_superslab
   - Prevents the dummy class=8 from causing OOB access
   - Status: Root cause fix for the allocation path

4. Debug Output Additions
   - First 256 push/pop operations traced
   - First 4 mismatches logged with details
   - SuperSlab registration state logged
   - Status: Diagnostic tool (not a fix)

5. TLS Hint Box Removed
   - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization)
   - Simplified to focus on stability first
   - Status: Can be re-added after the root cause is fixed
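
As a rough illustration of the push-side guard in change 2, here is a
minimal sketch in C. The type and helper names (SuperSlab, TlsSll,
hak_super_lookup) are assumptions based on the names mentioned in this
message, not the actual tls_sll_box.h API:

    #include <stdio.h>

    /* Stand-in types; the real ones live in hakmem's internal headers. */
    typedef struct SuperSlab { int class_idx; } SuperSlab;
    typedef struct TlsSll   { void* head[16]; } TlsSll;

    /* Assumed registry lookup: returns NULL when ptr is not inside any
       registered SuperSlab (the change 1 fallback feeds this). */
    extern SuperSlab* hak_super_lookup(void* ptr);

    /* Defensive push: reject the block rather than corrupt the per-class list. */
    static int tls_sll_push_validated(TlsSll* sll, int class_idx, void* ptr) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (!ss) {
            fprintf(stderr, "[TLS_SLL_PUSH_NO_SS] ptr=%p class=%d\n", ptr, class_idx);
            return 0;                        /* no owning SuperSlab: reject */
        }
        if (ss->class_idx != class_idx) {
            return 0;                        /* class mismatch: reject */
        }
        *(void**)ptr = sll->head[class_idx]; /* SLL push via in-block link word */
        sll->head[class_idx] = ptr;
        return 1;
    }

A rejected push simply leaves the block to the slower free path, trading a
few cycles for list integrity.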

Current Problem (REMAINS UNSOLVED):
- [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench
- Pointer is offset 16 bytes from the expected value (class 1 → class 2 boundary)
- hak_super_lookup returns NULL for that pointer
- Suggests: Use-After-Free, Double-Free, or pointer arithmetic error

Root Cause Analysis:
- Pattern: Pointer offset by +16 (one class 1 stride); see the worked example below
- Timing: Cumulative problem (appears after ~60s, not immediately)
- Location: Header corruption detected during TLS SLL pop
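
One way a constant +16 skew can arise is applying the 16-byte header
offset once too many (or too few) times when converting between base and
user pointers. This is only a hypothesis consistent with the pattern
above; the names below are illustrative:

    #include <stdio.h>
    #include <stdint.h>

    #define HDR 16  /* assumed header size == class 1 stride */

    int main(void) {
        uintptr_t base = 0x100000;   /* block base (header lives here) */
        uintptr_t user = base + HDR; /* pointer handed to the application */
        /* Correct free path: header probe at user - HDR == base. */
        uintptr_t hdr_ok  = user - HDR;
        /* Buggy path: forgets (or double-applies) the conversion. */
        uintptr_t hdr_bad = user;
        printf("probe skew: %+ld bytes\n", (long)(hdr_bad - hdr_ok)); /* +16 */
        return 0;
    }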

Remaining Issues:
⚠️ Registry fallback is defensive (may hide registration bugs)
⚠️ Push validation prevents symptoms but not root cause
⚠️ 16-byte pointer offset source unidentified

Next Steps for Investigation:
1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths)
2. Enhanced logging at the HDR_RESET point (see the sketch after this list):
   - Expected vs actual pointer value
   - Pointer provenance (where it came from)
   - Allocation trace for that block
3. Verify the Headerless flag is OFF throughout the build
4. Check for double-offset application in pointer conversions
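
A minimal sketch of the extra HDR_RESET-side logging proposed in step 2.
The struct fields and helper name are assumptions; the real pop path
lives in tls_sll_box.h:

    #include <stdio.h>

    typedef struct {
        void*       expected;  /* pointer the list head said we should pop */
        void*       actual;    /* pointer whose header failed validation */
        int         class_idx; /* size class being popped */
        const char* origin;    /* provenance: "magazine", "tls_sll", ... */
    } HdrResetEvt;

    static void log_hdr_reset(const HdrResetEvt* e) {
        fprintf(stderr,
                "[TLS_SLL_HDR_RESET] class=%d expected=%p actual=%p skew=%+td origin=%s\n",
                e->class_idx, e->expected, e->actual,
                (char*)e->actual - (char*)e->expected, e->origin);
    }

Logging expected vs actual in one line makes the +16 pattern (or any other
stride multiple) immediately visible in the trace.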

Technical Assessment:
- 60% root cause fixes (allocation class, validation)
- 40% defensive mitigation (registry fallback, push rejection)

Performance Impact:
- Registry fallback: +10-30 cycles on cold path (negligible)
- Push validation: +5-10 cycles per push (acceptable)
- Overall: estimated < 2% performance impact

Related Issues:
- Phase 1 TLS Hint Box removed temporarily
- Phase 2 Headerless blocked until stability achieved

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 20:42:28 +09:00

// hakmem.c - Minimal PoC Implementation
// Purpose: Verify call-site profiling concept
#include <stdatomic.h>
#include "hakmem.h"
#include "hakmem_config.h" // NEW Phase 6.8: Mode-based configuration
#include "hakmem_internal.h" // NEW Phase 6.8: Static inline helpers
#include "hakmem_bigcache.h" // NEW: BigCache Box
#include "hakmem_pool.h" // NEW Phase 6.9: L2 Hybrid Pool (2-32KiB)
#include "hakmem_l25_pool.h" // NEW Phase 6.13: L2.5 LargePool (64KB-1MB)
#include "hakmem_policy.h" // NEW Phase 6.16: FrozenPolicy (SACS-3)
#include "hakmem_learner.h" // NEW: CAP auto-tuner (background)
#include "hakmem_size_hist.h" // NEW: size histogram sampling (off hot path)
#include "hakmem_ace.h" // NEW Phase 6.16: ACE layer (L1)
#include "hakmem_site_rules.h" // NEW Phase 6.10: Site-Aware Cache Routing
#include "hakmem_tiny.h" // NEW Phase 6.12: Tiny Pool (≤1KB)
#include "hakmem_tiny_superslab.h" // NEW Phase 7.6: SuperSlab for Tiny Pool
#include "tiny_fastcache.h" // NEW Phase 6-3: Tiny Fast Path (System tcache style)
#include "hakmem_super_registry.h" // NEW Phase 1: SuperSlab Registry (mincore elimination)
#include "hakmem_elo.h" // NEW: ELO Strategy Selection (Phase 6.2)
#include "hakmem_ace_stats.h" // NEW: ACE lightweight stats (avoid implicit decl warnings)
#include "hakmem_batch.h" // NEW: madvise Batching (Phase 6.3)
#include "hakmem_evo.h" // NEW: Learning Lifecycle (Phase 6.5)
#include "hakmem_debug.h" // NEW Phase 6.11.1: Debug Timing
#include "hakmem_sys.h" // NEW Phase 6.11.1: Syscall Wrappers
#include "hakmem_whale.h" // NEW Phase 6.11.1: Whale Fast-Path (≥2MB)
#include "hakmem_prof.h" // NEW Phase 6.16: Sampling profiler
#include "hakmem_syscall.h" // NEW Phase 6.X P0 FIX: Box 3 (dlsym direct libc)
#include "hakmem_ace_controller.h" // NEW Phase ACE: Adaptive Control Engine
#include "hakmem_ace_metrics.h" // NEW Phase ACE: Metrics tracking (inline helpers)
#include "box/bench_fast_box.h" // NEW Phase 20-2: BenchFast Mode (structural ceiling measurement)
#include "hakmem_env_cache.h" // NEW Priority-2: ENV Variable Cache (eliminate hot-path getenv)
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <dlfcn.h>
#include <stdatomic.h> // NEW Phase 6.5: For atomic tick counter
#include <pthread.h> // Phase 6.15: Threading primitives (recursion guard only)
#include <sched.h> // Yield during init wait
#include <errno.h> // calloc overflow handling
#include <signal.h>
#ifdef __GLIBC__
#include <execinfo.h>
#endif
#include "ptr_trace.h"
// For mmap (Linux)
#ifdef __linux__
#include <sys/mman.h>
#include <unistd.h>
// MADV_FREE support (Linux kernel 4.5+)
#ifndef MADV_FREE
#define MADV_FREE 8 // Linux MADV_FREE
#endif
// Optional early SIGSEGV handler (runs at load if env toggled)
static void hakmem_sigsegv_handler_early(int sig) {
    (void)sig;
    static const char msg[] = "\n[HAKMEM] Segmentation Fault (Early Init)\n";
    (void)write(2, msg, sizeof(msg) - 1); // length derived from the literal, not hardcoded
    abort();
}
// Extern debug helper
#if !HAKMEM_BUILD_RELEASE
extern void tiny_debug_dump_last_push(int cls);
#endif
// Global variables moved out of static scope to resolve dependency issues
int g_initialized = 0;
int g_strict_free = 0; // runtime: HAKMEM_SAFE_FREE=1 enables extra safety checks
int g_invalid_free_log = 0; // runtime: HAKMEM_INVALID_FREE_LOG=1 to log invalid-free messages (extern visible)
int g_invalid_free_mode = 1; // 1 = skip invalid-free check (default), 0 = fallback to libc
_Atomic int g_cached_strategy_id = 0; // Cached strategy ID (updated every window closure)
uint64_t g_evo_sample_mask = 0; // 0 = disabled (default), (1<<N)-1 = sample every 2^N calls
int g_site_rules_enabled = 0; // default off to avoid contention in MT
int g_bench_tiny_only = 0; // bench preset: Tiny-only fast path
int g_flush_tiny_on_exit = 0; // HAKMEM_TINY_FLUSH_ON_EXIT=1
int g_ultra_debug_on_exit = 0; // HAKMEM_TINY_ULTRA_DEBUG=1
struct hkm_ace_controller g_ace_controller;
_Atomic int g_initializing = 0;
pthread_t g_init_thread;
int g_jemalloc_loaded = -1; // -1 unknown, 0/1 cached
// Forward declarations for internal functions used in init/callback
static void bigcache_free_callback(void* ptr, size_t size);
static void hak_flush_tiny_exit(void);
// Phase 6-1.7: Box Theory Refactoring - Wrapper function declarations
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
extern void* hak_tiny_alloc_fast_wrapper(size_t size);
extern void hak_tiny_free_fast_wrapper(void* ptr);
#endif
// KPI utils forward declarations
static void get_page_faults(uint64_t* soft_pf, uint64_t* hard_pf);
static uint64_t get_rss_kb(void);
// KPI measurement helpers - MUST be included before hak_core_init.inc.h
#include "box/hak_kpi_util.inc.h"
#include "box/hak_core_init.inc.h"
#include "box/hak_alloc_api.inc.h"
#include "box/hak_free_api.inc.h"
__attribute__((constructor)) static void hakmem_ctor_install_segv(void) {
    HAK_TRACE("[ctor1_hakmem_ctor_install_segv]\n");
    const char* dbg = getenv("HAKMEM_DEBUG_SEGV");
    if (dbg && atoi(dbg) != 0) {
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[HAKMEM][EARLY] installing SIGSEGV handler\n");
#endif
        struct sigaction sa; memset(&sa, 0, sizeof(sa));
        sa.sa_flags = SA_RESETHAND;
        sa.sa_handler = hakmem_sigsegv_handler_early;
        sigaction(SIGSEGV, &sa, NULL);
        // Also handle SIGBUS (common for alignment/unmapped) and SIGABRT (glibc free invalid)
        sigaction(SIGBUS, &sa, NULL);
        sigaction(SIGABRT, &sa, NULL);
    }
}
#endif
// ============================================================================
// Configuration
// ============================================================================
#define MAX_SITES 256 // Hash table size (power of 2)
#define SAMPLING_RATE 1 // Sample ALL (PoC demo: no sampling)
#define HASH_MASK (MAX_SITES - 1)
// Phase 6.8: FREE_POLICY/FreePolicy moved to hakmem_config.h
// Phase 6.8: FreeThermal/THERMAL_* constants moved to hakmem_internal.h
// Phase 6.8: THP_POLICY/THPPolicy moved to hakmem_config.h
// ============================================================================
// Global State
// ============================================================================
// Priority-2 Refactoring: ENV cache (eliminate ~2000 getenv syscalls/sec from hot paths)
HakEnvCache g_hak_env_cache;
// Statistics
static uint64_t g_malloc_count = 0; // Used for optimization stats display
int g_ldpreload_mode = 0; // 1 when running via LD_PRELOAD=libhakmem.so
// Debug: count free() wrapper entries to confirm free routing (optional)
_Atomic uint64_t g_free_wrapper_calls = 0;
// Cached LD_PRELOAD detection for wrapper hot paths (avoid getenv per call)
static int g_ldpre_env_cached = -1; // -1 = unknown, 0/1 cached
// Cached libc force flags
static int g_force_libc_alloc_init = -1; // HAKMEM_FORCE_LIBC_ALLOC_INIT
static inline void hak_ld_env_init(void) {
    if (g_ldpre_env_cached < 0) {
        const char* ldpre = getenv("LD_PRELOAD");
        g_ldpre_env_cached = (ldpre && strstr(ldpre, "libhakmem.so")) ? 1 : 0;
    }
}
__attribute__((constructor))
static void hak_ld_env_ctor(void) {
    HAK_TRACE("[ctor2_hak_ld_env_ctor]\n");
    hak_ld_env_init();
}
// Priority-2 Refactoring: Initialize ENV cache at library load time (eliminate ~2000 syscalls/sec)
__attribute__((constructor))
static void hak_env_cache_ctor(void) {
    HAK_TRACE("[ctor3_hak_env_cache_ctor]\n");
    hakmem_env_cache_init();
}
static inline int hak_ld_env_mode(void) {
    return g_ldpre_env_cached;
}
// Sanitizer / guard rails: allow forcing libc allocator even when wrappers are linked
#ifdef HAKMEM_FORCE_LIBC_ALLOC_BUILD
static int g_force_libc_alloc = 1;
#else
static int g_force_libc_alloc = -1; // 1=force libc, 0=use hakmem, -1=uninitialized
#endif
__attribute__((constructor))
static void hak_force_libc_ctor(void) {
    HAK_TRACE("[ctor4_hak_force_libc_ctor]\n");
    // Cache FORCE_LIBC and WRAP_TINY at load time to avoid hot-path getenv
#ifndef HAKMEM_FORCE_LIBC_ALLOC_BUILD
    if (g_force_libc_alloc < 0) {
        const char* force = getenv("HAKMEM_FORCE_LIBC_ALLOC");
        if (force && *force) {
            g_force_libc_alloc = (atoi(force) != 0);
        } else {
            const char* wrap = getenv("HAKMEM_WRAP_TINY");
            if (wrap && *wrap && atoi(wrap) == 0) {
                g_force_libc_alloc = 1;
            } else {
                g_force_libc_alloc = 0;
            }
        }
    }
    if (g_force_libc_alloc_init < 0) {
        const char* init_only = getenv("HAKMEM_FORCE_LIBC_ALLOC_INIT");
        g_force_libc_alloc_init = (init_only && atoi(init_only) != 0) ? 1 : 0;
    }
#else
    g_force_libc_alloc_init = 0;
#endif
}
static inline int hak_force_libc_alloc(void) {
    // During early process start or allocator init, optionally force libc until init completes.
    // This avoids sanitizer -> dlsym -> malloc recursion before TLS is ready.
    if (!g_initialized) {
        if (g_force_libc_alloc_init < 0) {
            const char* init_only = getenv("HAKMEM_FORCE_LIBC_ALLOC_INIT");
            g_force_libc_alloc_init = (init_only && atoi(init_only) != 0) ? 1 : 0;
        }
        if (g_force_libc_alloc_init) {
            return 1;
        }
    }
    if (g_force_libc_alloc < 0) {
        const char* force = getenv("HAKMEM_FORCE_LIBC_ALLOC");
        if (force && *force) {
            g_force_libc_alloc = (atoi(force) != 0);
        } else {
            const char* wrap = getenv("HAKMEM_WRAP_TINY");
            if (wrap && *wrap && atoi(wrap) == 0) {
                g_force_libc_alloc = 1;
            } else {
                g_force_libc_alloc = 0;
            }
        }
    }
    return g_force_libc_alloc;
}
// LD_PRELOAD safety: avoid interposing when jemalloc is present
static int g_ld_block_jemalloc = -1; // env: HAKMEM_LD_BLOCK_JEMALLOC (default 1)
static inline int hak_jemalloc_loaded(void) {
    if (g_jemalloc_loaded < 0) {
        void* h = dlopen("libjemalloc.so.2", RTLD_NOLOAD | RTLD_NOW);
        if (!h) h = dlopen("libjemalloc.so.1", RTLD_NOLOAD | RTLD_NOW);
        g_jemalloc_loaded = (h != NULL) ? 1 : 0;
        if (h) dlclose(h);
    }
    return g_jemalloc_loaded;
}
static inline int hak_ld_block_jemalloc(void) {
    if (g_ld_block_jemalloc < 0) {
        const char* e = getenv("HAKMEM_LD_BLOCK_JEMALLOC");
        g_ld_block_jemalloc = (e == NULL) ? 1 : (atoi(e) != 0);
    }
    return g_ld_block_jemalloc;
}
// ============================================================================
// Phase 6.15 P1: Remove global lock; keep recursion guard only
// ---------------------------------------------------------------------------
// We no longer serialize all allocations with a single global mutex.
// Instead, each submodule is responsible for its own fine-grained locking.
// We keep a per-thread recursion guard so that internal use of malloc/free
// within the allocator routes to libc (avoids infinite recursion).
//
// Phase 6.X P0 FIX (2025-10-24): Reverted to simple g_hakmem_lock_depth check
// Box Theory - Layer 1 (API Layer):
// This guard protects against LD_PRELOAD recursion (Box 1 → Box 1)
// Box 2 (Core) → Box 3 (Syscall) uses hkm_libc_malloc() (dlsym, no guard needed!)
// NOTE: Removed 'static' to allow access from hakmem_tiny_superslab.c (fopen fix)
__thread int g_hakmem_lock_depth = 0; // 0 = outermost call
int hak_in_wrapper(void) {
    return g_hakmem_lock_depth > 0; // Simple and correct!
}
// Initialization guard
int hak_is_initializing(void) { return atomic_load_explicit(&g_initializing, memory_order_acquire); }
// Wait helper for non-init threads to avoid libc fallback during init window
static inline int hak_init_wait_for_ready(void) {
    if (__builtin_expect(!atomic_load_explicit(&g_initializing, memory_order_acquire), 1)) {
        return 1; // Ready
    }
    pthread_t self = pthread_self();
    if (pthread_equal(self, g_init_thread)) {
        return 0; // We are the init thread; caller should take the existing fallback path
    }
    // No timeout: block until init completes to avoid libc fallback on other threads.
    for (int i = 0; atomic_load_explicit(&g_initializing, memory_order_acquire); ++i) {
#if defined(__x86_64__) || defined(__i386__)
        if (i < 1024) {
            __asm__ __volatile__("pause" ::: "memory");
        } else
#endif
        {
            sched_yield();
        }
    }
    return 1; // Init completed
}
// ============================================================================
// Phase 6-1.5: Ultra-Simple Fast Path Forward Declarations
// ============================================================================
// Forward declarations for Phase 6 fast path variants
// Phase 6-1.5: Alignment guessing (hakmem_tiny_ultra_simple.inc)
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
extern void* hak_tiny_alloc_ultra_simple(size_t size);
extern void hak_tiny_free_ultra_simple(void* ptr);
#endif
// Phase 6-1.6: Metadata header (hakmem_tiny_metadata.inc)
#ifdef HAKMEM_TINY_PHASE6_METADATA
extern void* hak_tiny_alloc_metadata(size_t size);
extern void hak_tiny_free_metadata(void* ptr);
#endif
#include "box/hak_exit_debug.inc.h"
// ============================================================================
// KPI Measurement (for UCB1) - NEW!
// ============================================================================
// NOTE: hak_kpi_util.inc.h is now included earlier (before hak_core_init.inc.h)
// to resolve dependency on g_latency_histogram and related variables
// ============================================================================
// Internal Helpers
// ============================================================================
// Phase 6.8: All legacy profiling functions removed
// - hash_site(), get_site_profile(), infer_policy(), record_alloc(), allocate_with_policy()
// Replaced by ELO-based allocation (hakmem_elo.c)
// ============================================================================
// BigCache eviction callback
// ============================================================================
// BigCache eviction callback (called when cache is full and needs to evict)
static void bigcache_free_callback(void* ptr, size_t size) {
    (void)size; // Not used
    if (!ptr) return;
    // Get raw pointer and header
    void* raw = (char*)ptr - HEADER_SIZE;
    AllocHeader* hdr = (AllocHeader*)raw;
    extern void __libc_free(void*);
    // Verify magic before accessing method field
    if (hdr->magic != HAKMEM_MAGIC) {
        HAKMEM_LOG("BigCache eviction: invalid magic, fallback to free()\n");
        // CRITICAL FIX: When magic is invalid, the allocation came from libc (no header).
        // Therefore ptr IS the allocated address, not raw (ptr - HEADER_SIZE).
        // MUST use __libc_free to avoid infinite recursion through the free() wrapper.
        ptr_trace_dump_now("bigcache_libc_free_invalid_magic");
        __libc_free(ptr);
        return;
    }
    // Dispatch based on allocation method
    switch (hdr->method) {
    case ALLOC_METHOD_MALLOC:
        __libc_free(raw);
        break;
    case ALLOC_METHOD_MMAP:
        // Cold eviction: route through batch for large blocks
        // This completes the Phase 6.3 architecture
#ifdef __linux__
        if (hdr->size >= BATCH_MIN_SIZE) {
            // Large blocks: use batch (deferred munmap + TLB optimization)
            hak_batch_add(raw, hdr->size);
        } else {
            // Small blocks: direct munmap (not worth batching)
            // Phase 6.11.1: Try whale cache first
            if (hkm_whale_put(raw, hdr->size) != 0) {
                // Whale cache full or not a whale: munmap
                madvise(raw, hdr->size, MADV_FREE); // Best-effort
                hkm_sys_munmap(raw, hdr->size);
            }
            // else: Successfully cached in whale cache (no munmap!)
        }
#else
        __libc_free(raw); // Fallback (should not happen)
#endif
        break;
    default:
        HAKMEM_LOG("BigCache eviction: unknown method %d\n", hdr->method);
        __libc_free(raw); // Fallback
        break;
    }
}
// ============================================================================
// Public API
// ============================================================================
// Thread-safe one-time initialization
// (Now included earlier)
// Phase 9.1: Force inline for performance (reduces call overhead by ~30-40%)
// Phase 6-1.7: Disable inline for box refactor to avoid recursive inlining
#ifndef HAKMEM_TINY_PHASE6_BOX_REFACTOR
__attribute__((always_inline))
inline
#endif
// hak_free_at() body moved into its box
// (Now included earlier)
void hak_print_stats(void) {
    printf("\n========================================\n");
    printf("hakmem ELO-based Profiling Statistics\n");
    printf("========================================\n");
    printf("\nOptimization Stats:\n");
    printf("  malloc() calls: %llu\n", (unsigned long long)g_malloc_count);
    hak_elo_print_leaderboard();
    printf("========================================\n\n");
}
// ============================================================================
// Standard C Library Wrappers (LD_PRELOAD) — boxed include
// ============================================================================
#include "box/hak_wrappers.inc.h"
// (wrappers moved to box/hak_wrappers.inc.h)