hakmem/core/hakmem.c
Claude 09e1d89e8d Phase 6-4: Larson benchmark optimizations - LUT size-to-class
Two optimizations to improve Larson benchmark performance:

1. **Option A: Fast Path Priority** (core/hakmem.c)
   - Move HAKMEM_TINY_FAST_PATH check before all guard checks
   - Reduce malloc() fast path from 8+ branches to 3 branches
   - Results: +42% ST, -20% MT (mixed results)

2. **LUT Optimization** (core/tiny_fastcache.h)
   - Replace 11-branch linear search with O(1) lookup table
   - Use size_to_class_lut[size >> 3] for fast mapping (see the sketch after this list)
   - Results: +24% MT, -24% ST (MT-optimized tradeoff)
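
A minimal compilable sketch of the LUT idea, for context. The class layout, table contents, and round-up indexing below are assumptions for illustration, not the actual tiny_fastcache.h code (the commit indexes by size >> 3 directly):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical layout: 16 classes in 8-byte steps covering 1..128-byte requests.
 * Entry j is the class for sizes in (8*(j-1), 8*j]; class i serves (i+1)*8-byte blocks. */
static const uint8_t k_size_to_class[17] = {
    0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};

/* One table load, no branches, replacing an 11-branch linear search. */
static inline unsigned tiny_size_to_class(size_t size) {
    return k_size_to_class[(size + 7) >> 3]; /* round up to the next 8-byte step */
}

int main(void) {
    assert(tiny_size_to_class(8)   == 0);  /* 8B  request -> 8B  class   */
    assert(tiny_size_to_class(9)   == 1);  /* 9B  request -> 16B class   */
    assert(tiny_size_to_class(128) == 15); /* 128B request -> 128B class */
    printf("class(24) = %u\n", tiny_size_to_class(24)); /* prints 2 */
    return 0;
}
```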

Benchmark results (Larson 2s 8-128B 1024 chunks):
- Original:     ST 0.498M ops/s, MT 1.502M ops/s
- LUT version:  ST 0.377M ops/s, MT 1.856M ops/s

Analysis:
- ST regression: with a single thread, the branch predictor quickly learns the old linear-search pattern, so the LUT offers no win there
- MT improvement: the LUT avoids the branch mispredictions incurred after context switches
- Recommendation: Keep LUT for multi-threaded workloads

Related: LARSON_PERFORMANCE_ANALYSIS_2025_11_05.md
2025-11-05 04:58:03 +00:00


// hakmem.c - Minimal PoC Implementation
// Purpose: Verify call-site profiling concept
#define _GNU_SOURCE // For mincore, madvise on Linux
#include "hakmem.h"
#include "hakmem_config.h" // NEW Phase 6.8: Mode-based configuration
#include "hakmem_internal.h" // NEW Phase 6.8: Static inline helpers
#include "hakmem_bigcache.h" // NEW: BigCache Box
#include "hakmem_pool.h" // NEW Phase 6.9: L2 Hybrid Pool (2-32KiB)
#include "hakmem_l25_pool.h" // NEW Phase 6.13: L2.5 LargePool (64KB-1MB)
#include "hakmem_policy.h" // NEW Phase 6.16: FrozenPolicy (SACS-3)
#include "hakmem_learner.h" // NEW: CAP auto-tuner (background)
#include "hakmem_size_hist.h" // NEW: size histogram sampling (off hot path)
#include "hakmem_ace.h" // NEW Phase 6.16: ACE layer (L1)
#include "hakmem_site_rules.h" // NEW Phase 6.10: Site-Aware Cache Routing
#include "hakmem_tiny.h" // NEW Phase 6.12: Tiny Pool (≤1KB)
#include "hakmem_tiny_superslab.h" // NEW Phase 7.6: SuperSlab for Tiny Pool
#include "tiny_fastcache.h" // NEW Phase 6-3: Tiny Fast Path (System tcache style)
#include "hakmem_mid_mt.h" // NEW Phase Hybrid: Mid Range MT (8-32KB, mimalloc-style)
#include "hakmem_super_registry.h" // NEW Phase 1: SuperSlab Registry (mincore elimination)
#include "hakmem_elo.h" // NEW: ELO Strategy Selection (Phase 6.2)
#include "hakmem_ace_stats.h" // NEW: ACE lightweight stats (avoid implicit decl warnings)
#include "hakmem_batch.h" // NEW: madvise Batching (Phase 6.3)
#include "hakmem_evo.h" // NEW: Learning Lifecycle (Phase 6.5)
#include "hakmem_debug.h" // NEW Phase 6.11.1: Debug Timing
#include "hakmem_sys.h" // NEW Phase 6.11.1: Syscall Wrappers
#include "hakmem_whale.h" // NEW Phase 6.11.1: Whale Fast-Path (≥2MB)
#include "hakmem_prof.h" // NEW Phase 6.16: Sampling profiler
#include "hakmem_syscall.h" // NEW Phase 6.X P0 FIX: Box 3 (dlsym direct libc)
#include "hakmem_ace_controller.h" // NEW Phase ACE: Adaptive Control Engine
#include "hakmem_ace_metrics.h" // NEW Phase ACE: Metrics tracking (inline helpers)
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <dlfcn.h>
#include <stdatomic.h> // NEW Phase 6.5: For atomic tick counter
#include <pthread.h> // Phase 6.15: Threading primitives (recursion guard only)
#include <errno.h> // calloc overflow handling
// For mmap (Linux)
#ifdef __linux__
#include <sys/mman.h>
#include <unistd.h>
// MADV_FREE support (Linux kernel 4.5+)
#ifndef MADV_FREE
#define MADV_FREE 8 // Linux MADV_FREE
#endif
#endif
// ============================================================================
// Configuration
// ============================================================================
#define MAX_SITES 256 // Hash table size (power of 2)
#define SAMPLING_RATE 1 // Sample ALL (PoC demo: no sampling)
#define HASH_MASK (MAX_SITES - 1)
// Phase 6.8: FREE_POLICY/FreePolicy moved to hakmem_config.h
// Phase 6.8: FreeThermal/THERMAL_* constants moved to hakmem_internal.h
// Phase 6.8: THP_POLICY/THPPolicy moved to hakmem_config.h
// ============================================================================
// Global State
// ============================================================================
// NEW Phase ACE: Adaptive Control Engine
static struct hkm_ace_controller g_ace_controller;
static int g_initialized = 0;
static int g_strict_free = 0; // runtime: HAKMEM_SAFE_FREE=1 enables extra safety checks
int g_invalid_free_log = 0; // runtime: HAKMEM_INVALID_FREE_LOG=1 to log invalid-free messages (extern visible)
// Phase 7.4: Cache HAKMEM_INVALID_FREE to eliminate 44% CPU overhead (getenv on hot path)
// Perf analysis showed getenv("HAKMEM_INVALID_FREE") consumed 43.96% of CPU time!
static int g_invalid_free_mode = 1; // 1 = skip invalid-free check (default), 0 = fallback to libc
// Statistics
static uint64_t g_malloc_count = 0; // Used for optimization stats display
// Phase 6.11.4 P0-2: Cached Strategy (atomic, updated by hak_evo_tick)
static _Atomic int g_cached_strategy_id = 0; // Cached strategy ID (updated every window closure)
// Phase 6.15 P0.3: EVO Sampling Control (environment variable)
static uint64_t g_evo_sample_mask = 0; // 0 = disabled (default), (1<<N)-1 = sample every 2^N calls
// Phase 6.15 P1: Site Rules enable (env: HAKMEM_SITE_RULES=1 to enable)
static int g_site_rules_enabled = 0; // default off to avoid contention in MT
static int g_bench_tiny_only = 0; // bench preset: Tiny-only fast path
int g_ldpreload_mode = 0; // 1 when running via LD_PRELOAD=libhakmem.so
static int g_flush_tiny_on_exit = 0; // HAKMEM_TINY_FLUSH_ON_EXIT=1
static int g_ultra_debug_on_exit = 0; // HAKMEM_TINY_ULTRA_DEBUG=1
// Cached LD_PRELOAD detection for wrapper hot paths (avoid getenv per call)
static int g_ldpre_env_cached = -1; // -1 = unknown, 0/1 cached
static inline int hak_ld_env_mode(void) {
if (g_ldpre_env_cached < 0) {
const char* ldpre = getenv("LD_PRELOAD");
g_ldpre_env_cached = (ldpre && strstr(ldpre, "libhakmem.so")) ? 1 : 0;
}
return g_ldpre_env_cached;
}
// Sanitizer / guard rails: allow forcing libc allocator even when wrappers are linked
#ifdef HAKMEM_FORCE_LIBC_ALLOC_BUILD
static int g_force_libc_alloc = 1;
#else
static int g_force_libc_alloc = -1; // 1=force libc, 0=use hakmem, -1=uninitialized
#endif
static inline int hak_force_libc_alloc(void) {
if (g_force_libc_alloc < 0) {
const char* force = getenv("HAKMEM_FORCE_LIBC_ALLOC");
if (force && *force) {
g_force_libc_alloc = (atoi(force) != 0);
} else {
const char* wrap = getenv("HAKMEM_WRAP_TINY");
if (wrap && *wrap && atoi(wrap) == 0) {
g_force_libc_alloc = 1;
} else {
g_force_libc_alloc = 0;
}
}
}
return g_force_libc_alloc;
}
// LD_PRELOAD safety: avoid interposing when jemalloc is present
static int g_ld_block_jemalloc = -1; // env: HAKMEM_LD_BLOCK_JEMALLOC (default 1)
static int g_jemalloc_loaded = -1; // -1 unknown, 0/1 cached
static inline int hak_jemalloc_loaded(void) {
if (g_jemalloc_loaded < 0) {
void* h = dlopen("libjemalloc.so.2", RTLD_NOLOAD | RTLD_NOW);
if (!h) h = dlopen("libjemalloc.so.1", RTLD_NOLOAD | RTLD_NOW);
g_jemalloc_loaded = (h != NULL) ? 1 : 0;
if (h) dlclose(h);
}
return g_jemalloc_loaded;
}
static inline int hak_ld_block_jemalloc(void) {
if (g_ld_block_jemalloc < 0) {
const char* e = getenv("HAKMEM_LD_BLOCK_JEMALLOC");
g_ld_block_jemalloc = (e == NULL) ? 1 : (atoi(e) != 0);
}
return g_ld_block_jemalloc;
}
// ============================================================================
// Phase 6.15 P1: Remove global lock; keep recursion guard only
// ---------------------------------------------------------------------------
// We no longer serialize all allocations with a single global mutex.
// Instead, each submodule is responsible for its own fine-grained locking.
// We keep a per-thread recursion guard so that internal use of malloc/free
// within the allocator routes to libc (avoids infinite recursion).
//
// Phase 6.X P0 FIX (2025-10-24): Reverted to simple g_hakmem_lock_depth check
// Box Theory - Layer 1 (API Layer):
// This guard protects against LD_PRELOAD recursion (Box 1 → Box 1)
// Box 2 (Core) → Box 3 (Syscall) uses hkm_libc_malloc() (dlsym, no guard needed!)
static __thread int g_hakmem_lock_depth = 0; // 0 = outermost call
int hak_in_wrapper(void) {
return g_hakmem_lock_depth > 0; // Simple and correct!
}
// Initialization guard
static int g_initializing = 0;
int hak_is_initializing(void) { return g_initializing; }
// ============================================================================
// Phase 6-1.5: Ultra-Simple Fast Path Forward Declarations
// ============================================================================
// Forward declarations for Phase 6 fast path variants
// Phase 6-1.5: Alignment guessing (hakmem_tiny_ultra_simple.inc)
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
extern void* hak_tiny_alloc_ultra_simple(size_t size);
extern void hak_tiny_free_ultra_simple(void* ptr);
#endif
// Phase 6-1.6: Metadata header (hakmem_tiny_metadata.inc)
#ifdef HAKMEM_TINY_PHASE6_METADATA
extern void* hak_tiny_alloc_metadata(size_t size);
extern void hak_tiny_free_metadata(void* ptr);
#endif
// Phase 6-1.7: Box Theory Refactoring - Wrapper function declarations
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
extern void* hak_tiny_alloc_fast_wrapper(size_t size);
extern void hak_tiny_free_fast_wrapper(void* ptr);
#endif
static void hak_flush_tiny_exit(void) {
// Best-effort: flush Tiny magazines at process exit
if (g_flush_tiny_on_exit) {
hak_tiny_magazine_flush_all();
hak_tiny_trim();
}
if (g_ultra_debug_on_exit) {
hak_tiny_ultra_debug_dump();
}
// Path debug dump (optional): HAKMEM_TINY_PATH_DEBUG=1
hak_tiny_path_debug_dump();
// Extended counters (optional): HAKMEM_TINY_COUNTERS_DUMP=1
extern void hak_tiny_debug_counters_dump(void);
hak_tiny_debug_counters_dump();
}
// ============================================================================
// KPI Measurement (for UCB1) - NEW!
// ============================================================================
#ifdef __linux__
// Latency histogram (simple buckets for P50/P95/P99)
#define LATENCY_BUCKETS 100
static uint64_t g_latency_histogram[LATENCY_BUCKETS];
static uint64_t g_latency_samples = 0;
// Baseline page faults (at init)
static uint64_t g_baseline_soft_pf = 0;
static uint64_t g_baseline_hard_pf = 0;
static uint64_t g_baseline_rss_kb = 0;
// Get page faults from /proc/self/stat
static void get_page_faults(uint64_t* soft_pf, uint64_t* hard_pf) {
FILE* f = fopen("/proc/self/stat", "r");
if (!f) {
*soft_pf = 0;
*hard_pf = 0;
return;
}
// Format: pid (comm) state ... minflt cminflt majflt cmajflt ...
// Fields: 1 2 3 ... 10(minflt) 11(cminflt) 12(majflt) 13(cmajflt)
unsigned long minflt = 0, majflt = 0;
unsigned long dummy;
char comm[256], state;
(void)fscanf(f, "%lu %255s %c %lu %lu %lu %lu %lu %lu %lu %lu %lu",
&dummy, comm, &state, &dummy, &dummy, &dummy, &dummy, &dummy,
&dummy, &minflt, &dummy, &majflt);
fclose(f);
*soft_pf = minflt;
*hard_pf = majflt;
}
// Get RSS from /proc/self/statm (in KB)
static uint64_t get_rss_kb(void) {
FILE* f = fopen("/proc/self/statm", "r");
if (!f) return 0;
// Format: size resident shared text lib data dt
// We want 'resident' (field 2) in pages
unsigned long size, resident;
(void)fscanf(f, "%lu %lu", &size, &resident);
fclose(f);
long page_size = sysconf(_SC_PAGESIZE);
return (resident * page_size) / 1024; // Convert to KB
}
// NOTE: Latency measurement functions (currently unused, for future use)
/*
static inline uint64_t measure_latency_ns(void (*func)(void*), void* arg) {
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
func(arg); // Execute function
clock_gettime(CLOCK_MONOTONIC, &end);
uint64_t ns = (end.tv_sec - start.tv_sec) * 1000000000ULL +
(end.tv_nsec - start.tv_nsec);
return ns;
}
static void record_latency(uint64_t ns) {
// Bucket: 0-10ns, 10-20ns, ..., 990-1000ns, 1000+ns
size_t bucket = ns / 10;
if (bucket >= LATENCY_BUCKETS) bucket = LATENCY_BUCKETS - 1;
g_latency_histogram[bucket]++;
g_latency_samples++;
}
*/
// Calculate percentile from histogram
static uint64_t calculate_percentile(double percentile) {
if (g_latency_samples == 0) return 0;
uint64_t target = (uint64_t)(g_latency_samples * percentile);
uint64_t cumulative = 0;
for (size_t i = 0; i < LATENCY_BUCKETS; i++) {
cumulative += g_latency_histogram[i];
if (cumulative >= target) {
return i * 10; // Return bucket lower bound (ns)
}
}
return (LATENCY_BUCKETS - 1) * 10;
}
// Implement hak_get_kpi()
void hak_get_kpi(hak_kpi_t* out) {
memset(out, 0, sizeof(hak_kpi_t));
// Latency (from histogram)
out->p50_alloc_ns = calculate_percentile(0.50);
out->p95_alloc_ns = calculate_percentile(0.95);
out->p99_alloc_ns = calculate_percentile(0.99);
// Page Faults (delta from baseline)
uint64_t soft_pf, hard_pf;
get_page_faults(&soft_pf, &hard_pf);
out->soft_page_faults = soft_pf - g_baseline_soft_pf;
out->hard_page_faults = hard_pf - g_baseline_hard_pf;
// RSS (delta from baseline, in MB)
uint64_t rss_kb = get_rss_kb();
int64_t rss_delta_kb = (int64_t)rss_kb - (int64_t)g_baseline_rss_kb;
out->rss_delta_mb = rss_delta_kb / 1024;
}
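/* Usage sketch (illustrative, not part of the implementation): how a caller
 * might read a KPI snapshot. Field names follow the assignments above and the
 * hak_kpi_t declaration in hakmem.h (p50/p95/p99_alloc_ns, soft/hard_page_faults,
 * rss_delta_mb).
 *
 *   hak_kpi_t kpi;
 *   hak_get_kpi(&kpi);
 *   fprintf(stderr, "p95=%llu ns soft_pf=%llu rss_delta=%lld MB\n",
 *           (unsigned long long)kpi.p95_alloc_ns,
 *           (unsigned long long)kpi.soft_page_faults,
 *           (long long)kpi.rss_delta_mb);
 */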
#else
// Non-Linux: stub implementation
void hak_get_kpi(hak_kpi_t* out) {
memset(out, 0, sizeof(hak_kpi_t));
}
#endif
// ============================================================================
// Internal Helpers
// ============================================================================
// Phase 6.8: All legacy profiling functions removed
// - hash_site(), get_site_profile(), infer_policy(), record_alloc(), allocate_with_policy()
// Replaced by ELO-based allocation (hakmem_elo.c)
// ============================================================================
// BigCache eviction callback
// ============================================================================
// BigCache eviction callback (called when cache is full and needs to evict)
static void bigcache_free_callback(void* ptr, size_t size) {
(void)size; // Not used
if (!ptr) return;
// Get raw pointer and header
void* raw = (char*)ptr - HEADER_SIZE;
AllocHeader* hdr = (AllocHeader*)raw;
// Verify magic before accessing method field
if (hdr->magic != HAKMEM_MAGIC) {
fprintf(stderr, "[hakmem] BigCache eviction: invalid magic, fallback to free()\n");
free(raw);
return;
}
// Dispatch based on allocation method
switch (hdr->method) {
case ALLOC_METHOD_MALLOC:
free(raw);
break;
case ALLOC_METHOD_MMAP:
// Cold eviction: route through batch for large blocks
// This completes Phase 6.3 architecture
#ifdef __linux__
if (hdr->size >= BATCH_MIN_SIZE) {
// Large blocks: use batch (deferred munmap + TLB optimization)
hak_batch_add(raw, hdr->size);
} else {
// Small blocks: direct munmap (not worth batching)
// Phase 6.11.1: Try whale cache first
if (hkm_whale_put(raw, hdr->size) != 0) {
// Whale cache full or not a whale: munmap
madvise(raw, hdr->size, MADV_FREE); // Best-effort
hkm_sys_munmap(raw, hdr->size);
}
// else: Successfully cached in whale cache (no munmap!)
}
#else
free(raw); // Fallback (should not happen)
#endif
break;
default:
fprintf(stderr, "[hakmem] BigCache eviction: unknown method %d\n", hdr->method);
free(raw); // Fallback
break;
}
}
// ============================================================================
// Public API
// ============================================================================
// Thread-safe one-time initialization
static void hak_init_impl(void);
static pthread_once_t g_init_once = PTHREAD_ONCE_INIT;
void hak_init(void) {
(void)pthread_once(&g_init_once, hak_init_impl);
}
static void hak_init_impl(void) {
g_initializing = 1;
// Phase 6.X P0 FIX (2025-10-24): Initialize Box 3 (Syscall Layer) FIRST!
// This MUST be called before ANY allocation (Tiny/Mid/Large/Learner)
// dlsym() initializes function pointers to real libc (bypasses LD_PRELOAD)
hkm_syscall_init();
// NEW Phase 6.11.1: Initialize debug timing
hkm_timing_init();
// NEW Phase 6.11.1: Initialize whale fast-path cache
hkm_whale_init();
// NEW Phase Hybrid: Initialize Mid Range MT allocator (8-32KB, mimalloc-style)
mid_mt_init();
// NEW Phase 6.8: Initialize configuration system (replaces init_free_policy + init_thp_policy)
hak_config_init();
// Phase 6.16: Initialize FrozenPolicy (SACS-3)
hkm_policy_init();
// Phase 6.15 P0.3: Configure EVO sampling from environment variable
// HAKMEM_EVO_SAMPLE: 0=disabled (default), N=sample every 2^N calls
// Example: HAKMEM_EVO_SAMPLE=10 → sample every 1024 calls
// HAKMEM_EVO_SAMPLE=16 → sample every 65536 calls
char* evo_sample_str = getenv("HAKMEM_EVO_SAMPLE");
if (evo_sample_str && atoi(evo_sample_str) > 0) {
int freq = atoi(evo_sample_str);
if (freq >= 64) {
fprintf(stderr, "[hakmem] Warning: HAKMEM_EVO_SAMPLE=%d too large, using 63\n", freq);
freq = 63;
}
g_evo_sample_mask = (1ULL << freq) - 1;
HAKMEM_LOG("EVO sampling enabled: every 2^%d = %llu calls\n",
freq, (unsigned long long)(g_evo_sample_mask + 1));
} else {
g_evo_sample_mask = 0; // Disabled by default
HAKMEM_LOG("EVO sampling disabled (HAKMEM_EVO_SAMPLE not set or 0)\n");
}
#ifdef __linux__
// Record baseline KPIs
memset(g_latency_histogram, 0, sizeof(g_latency_histogram));
g_latency_samples = 0;
get_page_faults(&g_baseline_soft_pf, &g_baseline_hard_pf);
g_baseline_rss_kb = get_rss_kb();
HAKMEM_LOG("Baseline: soft_pf=%lu, hard_pf=%lu, rss=%lu KB\n",
(unsigned long)g_baseline_soft_pf,
(unsigned long)g_baseline_hard_pf,
(unsigned long)g_baseline_rss_kb);
#endif
HAKMEM_LOG("Initialized (PoC version)\n");
HAKMEM_LOG("Sampling rate: 1/%d\n", SAMPLING_RATE);
HAKMEM_LOG("Max sites: %d\n", MAX_SITES);
// Bench preset: Tiny-only (disable non-essential subsystems)
{
char* bt = getenv("HAKMEM_BENCH_TINY_ONLY");
if (bt && atoi(bt) != 0) {
g_bench_tiny_only = 1;
}
}
// Under LD_PRELOAD, enforce safer defaults for Tiny path unless overridden
{
char* ldpre = getenv("LD_PRELOAD");
if (ldpre && strstr(ldpre, "libhakmem.so")) {
g_ldpreload_mode = 1;
// Default LD-safe mode if not set: 1 (Tiny-only)
char* lds = getenv("HAKMEM_LD_SAFE");
if (!lds) { setenv("HAKMEM_LD_SAFE", "1", 0); } // value is read by the wrappers
if (!getenv("HAKMEM_TINY_TLS_SLL")) {
setenv("HAKMEM_TINY_TLS_SLL", "0", 0); // disable TLS SLL by default
}
if (!getenv("HAKMEM_TINY_USE_SUPERSLAB")) {
setenv("HAKMEM_TINY_USE_SUPERSLAB", "0", 0); // disable SuperSlab path by default
}
}
}
// Runtime safety toggle
char* safe_free_env = getenv("HAKMEM_SAFE_FREE");
if (safe_free_env && atoi(safe_free_env) != 0) {
g_strict_free = 1;
HAKMEM_LOG("Strict free safety enabled (HAKMEM_SAFE_FREE=1)\n");
} else {
// Heuristic: if loaded via LD_PRELOAD, enable strict free by default
char* ldpre = getenv("LD_PRELOAD");
if (ldpre && strstr(ldpre, "libhakmem.so")) {
g_ldpreload_mode = 1;
g_strict_free = 1;
HAKMEM_LOG("Strict free safety auto-enabled under LD_PRELOAD\n");
}
}
// Invalid free logging toggle (default off to avoid spam under LD_PRELOAD)
char* invlog = getenv("HAKMEM_INVALID_FREE_LOG");
if (invlog && atoi(invlog) != 0) {
g_invalid_free_log = 1;
HAKMEM_LOG("Invalid free logging enabled (HAKMEM_INVALID_FREE_LOG=1)\n");
}
// Phase 7.4: Cache HAKMEM_INVALID_FREE to eliminate 44% CPU overhead
// Perf showed getenv() on hot path consumed 43.96% CPU time (26.41% strcmp + 17.55% getenv)
char* inv = getenv("HAKMEM_INVALID_FREE");
if (inv && strcmp(inv, "fallback") == 0) {
g_invalid_free_mode = 0; // fallback mode: route invalid frees to libc
HAKMEM_LOG("Invalid free mode: fallback to libc (HAKMEM_INVALID_FREE=fallback)\n");
} else {
// Under LD_PRELOAD, prefer safety: default to fallback unless explicitly overridden
char* ldpre = getenv("LD_PRELOAD");
if (ldpre && strstr(ldpre, "libhakmem.so")) {
g_ldpreload_mode = 1;
g_invalid_free_mode = 0;
HAKMEM_LOG("Invalid free mode: fallback to libc (auto under LD_PRELOAD)\n");
} else {
g_invalid_free_mode = 1; // default: skip invalid-free check
HAKMEM_LOG("Invalid free mode: skip check (default)\n");
}
}
// NEW Phase 6.8: Feature-gated initialization (check g_hakem_config flags)
if (HAK_ENABLED_ALLOC(HAKMEM_FEATURE_POOL)) {
hak_pool_init();
}
// NEW Phase 6.13: L2.5 LargePool (64KB-1MB allocations)
hak_l25_pool_init();
if (!g_bench_tiny_only && HAK_ENABLED_CACHE(HAKMEM_FEATURE_BIGCACHE)) {
hak_bigcache_init();
hak_bigcache_set_free_callback(bigcache_free_callback);
}
if (!g_bench_tiny_only && HAK_ENABLED_LEARNING(HAKMEM_FEATURE_ELO)) {
hak_elo_init();
// Phase 6.11.4 P0-2: Initialize cached strategy to default (strategy 0)
atomic_store(&g_cached_strategy_id, 0);
}
if (!g_bench_tiny_only && HAK_ENABLED_MEMORY(HAKMEM_FEATURE_BATCH_MADVISE)) {
hak_batch_init();
}
if (!g_bench_tiny_only && HAK_ENABLED_LEARNING(HAKMEM_FEATURE_EVOLUTION)) {
hak_evo_init();
}
if (!g_bench_tiny_only) {
// Phase 6.16: Initialize ACE stats (sampling) default off
hkm_ace_stats_init();
// Phase 6.16: Initialize sampling profiler default off
hkm_prof_init();
// Size histogram sampling (optional)
hkm_size_hist_init();
}
if (!g_bench_tiny_only) {
// Start CAP learner (optional, env-gated)
hkm_learner_init();
}
// NEW Phase 6.10: Site Rules (MVP: always ON)
// MT note: default disabled unless HAKMEM_SITE_RULES=1
char* sr_env = getenv("HAKMEM_SITE_RULES");
g_site_rules_enabled = (sr_env && atoi(sr_env) != 0);
if (!g_bench_tiny_only && g_site_rules_enabled) {
hak_site_rules_init();
}
// NEW Phase 6.12: Tiny Pool (≤1KB allocations)
hak_tiny_init();
// Env: optional Tiny flush on exit (memory efficiency evaluation)
{
char* tf = getenv("HAKMEM_TINY_FLUSH_ON_EXIT");
if (tf && atoi(tf) != 0) {
g_flush_tiny_on_exit = 1;
}
char* ud = getenv("HAKMEM_TINY_ULTRA_DEBUG");
if (ud && atoi(ud) != 0) {
g_ultra_debug_on_exit = 1;
}
// Register exit hook if any of the debug/flush toggles are on
// or when path debug is requested.
if (g_flush_tiny_on_exit || g_ultra_debug_on_exit || getenv("HAKMEM_TINY_PATH_DEBUG")) {
atexit(hak_flush_tiny_exit);
}
}
// NEW Phase ACE: Initialize Adaptive Control Engine
hkm_ace_controller_init(&g_ace_controller);
if (g_ace_controller.enabled) {
hkm_ace_controller_start(&g_ace_controller);
HAKMEM_LOG("ACE Learning Layer enabled and started\n");
}
g_initializing = 0;
// Publish that initialization is complete
atomic_thread_fence(memory_order_seq_cst);
g_initialized = 1;
}
void hak_shutdown(void) {
if (!g_initialized) return;
// NEW Phase ACE: Shutdown Adaptive Control Engine FIRST (before other subsystems)
hkm_ace_controller_destroy(&g_ace_controller);
if (!g_bench_tiny_only) {
printf("[hakmem] Shutting down...\n");
hak_print_stats();
}
// NEW Phase 6.9: Shutdown L2 Pool
if (!g_bench_tiny_only) hak_pool_shutdown();
// NEW Phase 6.13: Shutdown L2.5 LargePool
if (!g_bench_tiny_only) hak_l25_pool_shutdown();
// NEW: Shutdown BigCache Box
if (!g_bench_tiny_only) hak_bigcache_shutdown();
// NEW Phase 6.2: Shutdown ELO Strategy Selection
if (!g_bench_tiny_only) hak_elo_shutdown();
// NEW Phase 6.3: Shutdown madvise Batching
if (!g_bench_tiny_only) hak_batch_shutdown();
// NEW Phase 6.10: Shutdown Site Rules
if (!g_bench_tiny_only) hak_site_rules_shutdown();
// NEW Phase 6.12: Print Tiny Pool statistics
if (!g_bench_tiny_only) hak_tiny_print_stats();
// NEW Phase 6.11.1: Print whale cache statistics
if (!g_bench_tiny_only) {
hkm_whale_dump_stats();
// NEW Phase 6.11.1: Shutdown whale cache
hkm_whale_shutdown();
}
// NEW Phase 6.11.1: Shutdown debug timing (must be last!)
if (!g_bench_tiny_only) hkm_timing_shutdown();
// Phase 6.16: Dump sampling profiler
if (!g_bench_tiny_only) hkm_prof_shutdown();
// Stop learner thread
if (!g_bench_tiny_only) hkm_learner_shutdown();
// Stop Tiny background components (e.g., Intelligence Engine)
hak_tiny_shutdown();
g_initialized = 0;
}
// Phase 9.1: Force inline for performance (reduces call overhead by ~30-40%)
__attribute__((always_inline))
inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t0); // Profiling (build-time gated)
#endif
if (!g_initialized) hak_init();
// ========================================================================
// Phase 6-3: Tiny Fast Path (System tcache style, 3-4 instruction fast path)
// ========================================================================
#ifdef HAKMEM_TINY_FAST_PATH
if (size <= TINY_FAST_THRESHOLD) {
// Ultra-simple TLS cache pop (bypasses Magazine/SuperSlab)
extern void* tiny_fast_alloc(size_t);
extern void tiny_fast_init(void);
extern __thread int g_tiny_fast_initialized;
if (__builtin_expect(!g_tiny_fast_initialized, 0)) {
tiny_fast_init();
}
void* ptr = tiny_fast_alloc(size);
if (ptr) return ptr;
// Fall through to slow path on failure
}
#endif
// ========================================================================
uintptr_t site_id = (uintptr_t)site;
// Phase 6.12: Tiny Pool fast-path (≤1KB allocations)
// Priority: highest for tiny allocations (most frequent)
if (__builtin_expect(size <= TINY_MAX_SIZE, 1)) {
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_tiny);
#endif
void* tiny_ptr = NULL;
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
// Phase 6-1.7: Box Theory Refactoring (3-4 instruction fast path)
tiny_ptr = hak_tiny_alloc_fast_wrapper(size);
#elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
// Phase 6-1.5: Ultra Simple (alignment guessing)
tiny_ptr = hak_tiny_alloc_ultra_simple(size);
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
// Phase 6-1.6: Metadata header
tiny_ptr = hak_tiny_alloc_metadata(size);
#else
// Default: Standard Tiny path
tiny_ptr = hak_tiny_alloc(size);
#endif
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_TINY_ALLOC, t_tiny);
#endif
if (tiny_ptr) {
// NEW Phase ACE: Track allocation for learning
hkm_ace_track_alloc();
// Tiny Pool hit! Return immediately (no header needed)
return tiny_ptr;
}
// DEBUG: Tiny Pool returned NULL - fallback to other paths
static int log_count = 0;
if (log_count < 3) {
fprintf(stderr, "[DEBUG] tiny_alloc(%zu) returned NULL, falling back\n", size);
log_count++;
}
// Tiny Pool miss: fallback to other paths below
}
// Record size histogram (sampling) — moved after Tiny fast-path to
// keep hottest path minimal. Tiny hits skip histogram to reduce overhead.
hkm_size_hist_record(size);
// Phase Hybrid: Mid Range MT fast-path (8-32KB allocations)
// Priority: second highest (after Tiny Pool)
// Uses mimalloc-style per-thread segments for optimal MT performance
if (__builtin_expect(mid_is_in_range(size), 0)) {
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_mid);
#endif
void* mid_ptr = mid_mt_alloc(size);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_POOL_GET, t_mid);
#endif
if (mid_ptr) {
// Mid MT hit! Return immediately (no header, lock-free)
return mid_ptr;
}
// Mid MT miss: fallback to other paths below (should be rare)
}
// Phase 6.11.4 P0-1 & P0-2: Compile-time guard + cached strategy update
// Phase 6.15 P0.3: Restored with environment variable control (default disabled)
#if HAKMEM_FEATURE_EVOLUTION
// Only sample if enabled via HAKMEM_EVO_SAMPLE environment variable
if (g_evo_sample_mask > 0) {
static _Atomic uint64_t tick_counter = 0;
if ((atomic_fetch_add(&tick_counter, 1) & g_evo_sample_mask) == 0) {
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
uint64_t now_ns = now.tv_sec * 1000000000ULL + now.tv_nsec;
// P0-2: Update cached strategy when window closes
if (hak_evo_tick(now_ns)) {
// Window closed, update cached strategy
int new_strategy = hak_elo_select_strategy();
atomic_store(&g_cached_strategy_id, new_strategy);
}
}
}
#endif
// Phase 6.11.4 P0-2: Always use cached strategy (LEARN/FROZEN/CANARY all use same path)
size_t threshold;
if (HAK_ENABLED_LEARNING(HAKMEM_FEATURE_ELO)) {
// ELO enabled: use cached strategy (updated by hak_evo_tick)
int strategy_id = atomic_load(&g_cached_strategy_id);
threshold = hak_elo_get_threshold(strategy_id);
} else {
// ELO disabled: use default threshold (2MB - mimalloc's large threshold)
threshold = 2097152; // 2MB
}
// Phase SACS-3: BigCache only for very large blocks (>= threshold)
if (HAK_ENABLED_CACHE(HAKMEM_FEATURE_BIGCACHE) && size >= threshold) {
void* cached_ptr = NULL;
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_bc);
#endif
if (hak_bigcache_try_get(size, site_id, &cached_ptr)) {
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_BIGCACHE_GET, t_bc);
#endif
// Cache hit! Return immediately
return cached_ptr;
}
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_BIGCACHE_GET, t_bc);
#endif
}
// Phase SACS-3: No Site Rules in tier selection (size-only decision)
// Phase 6.16 SACS-3: L1 via ACE unified path
if (size > TINY_MAX_SIZE && size < threshold) {
const FrozenPolicy* pol = hkm_policy_get();
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_ace);
#endif
void* l1 = hkm_ace_alloc(size, site_id, pol);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_POOL_GET, t_ace);
#endif
if (l1) return l1;
}
// Phase SACS-3: For < threshold, prefer malloc; for >= threshold prefer mmap
void* ptr;
if (size >= threshold) {
// Large allocation (L2): use mmap (enables batch madvise)
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_mmap);
#endif
ptr = hak_alloc_mmap_impl(size);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_SYSCALL_MMAP, t_mmap);
#endif
} else {
// Small/medium allocation (L0/L1): use malloc (faster for <2MB)
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_malloc);
#endif
ptr = hak_alloc_malloc_impl(size);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_FALLBACK_MALLOC, t_malloc);
#endif
}
if (!ptr) return NULL;
// NEW Phase 6.5: Record allocation size for distribution signature (gated)
if (g_evo_sample_mask > 0) {
hak_evo_record_size(size);
}
// NEW: Set alloc_site and class_bytes in header (for BigCache Phase 2)
AllocHeader* hdr = (AllocHeader*)((char*)ptr - HEADER_SIZE);
// Verify magic (fail-fast if header corrupted)
if (hdr->magic != HAKMEM_MAGIC) {
fprintf(stderr, "[hakmem] ERROR: Invalid magic in allocated header!\n");
return ptr; // Return anyway, but log error
}
// Set allocation site (for per-site cache reuse)
hdr->alloc_site = site_id;
// Set size class for caching (L2 only → threshold class)
if (size >= threshold) {
hdr->class_bytes = threshold; // cacheable at L2 threshold
} else {
hdr->class_bytes = 0; // Not cacheable
}
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_HAK_ALLOC, t0); // Profiling (build-time gated)
#endif
return ptr;
}
// Phase 9.1: Force inline for performance (reduces call overhead by ~30-40%)
// Phase 6-1.7: Disable inline for box refactor to avoid recursive inlining
#ifndef HAKMEM_TINY_PHASE6_BOX_REFACTOR
__attribute__((always_inline))
inline
#endif
void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t0); // Profiling (build-time gated)
#endif
(void)site; // Not used yet (will be used in BigCache Phase 2)
(void)size; // Size stored in header
if (!ptr) {
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_HAK_FREE, t0);
#endif
return;
}
// OPTIMIZATION PHASE 2+1 (2025-11-01): Check Tiny Pool FIRST
// Phase 2: Ultra-fast owner_slab with TLS range check (1-2 cycles negative lookup)
// Phase 1: Reorder to avoid Mid MT mutex overhead for Tiny allocations (90% of mixed workload)
//
// Target: +12-13% improvement (16.24 → 18.4-18.6 M ops/sec)
// - Tiny allocations (90%): Skip Mid MT mutex entirely → ~12% improvement
// - Mid allocations (10%): Fast negative lookup from owner_slab → minimal overhead
TinySlab* tiny_slab = hak_tiny_owner_slab(ptr);
if (tiny_slab) {
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
// Phase 6-1.7: Box Theory Refactoring (2-3 instruction fast path)
// Box 6 handles both same-thread (fast) and cross-thread (remote) internally
hak_tiny_free_fast_wrapper(ptr);
return;
#elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
// Phase 6-1.5: Only use ultra-simple free on same-thread pointers.
// Cross-thread frees must go through the full tiny free path
// to ensure proper remote-queue handling and slab reuse.
pthread_t self_pt = pthread_self();
if (__builtin_expect(pthread_equal(tiny_slab->owner_tid, self_pt), 1)) {
hak_tiny_free_ultra_simple(ptr);
return;
}
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
// Phase 6-1.6: Metadata header
hak_tiny_free_metadata(ptr);
return;
#endif
// Fallback: full tiny free (handles cross-thread case correctly)
hak_tiny_free(ptr);
return;
}
// Phase Hybrid: Mid Range MT check (8-32KB, headerless)
{
size_t mid_block_size = 0;
int mid_class_idx = 0;
// First check if ptr is in current thread's segment (fast path)
for (int i = 0; i < MID_NUM_CLASSES; i++) {
MidThreadSegment* seg = &g_mid_segments[i];
if (seg->chunk_base && ptr >= seg->chunk_base && ptr < seg->end) {
*(void**)ptr = seg->free_list;
seg->free_list = ptr;
seg->used_count--;
return;
}
}
// Not in current thread's segment - try registry (mutex + binary search)
if (mid_registry_lookup(ptr, &mid_block_size, &mid_class_idx)) {
mid_mt_free(ptr, mid_block_size);
return;
}
}
// DISABLED: SuperSlab Registry lookup causes false positives
// Problem: L25 allocations aligned to 1MB boundary are misidentified as SuperSlabs
// causing crashes when checking magic number on unmapped/invalid memory
// TODO: Fix SuperSlab registry to avoid false positives (descriptor-based check?)
#if 0
SuperSlab* ss = hak_super_lookup(ptr);
if (ss) {
hak_tiny_free(ptr);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_HAK_FREE, t0);
#endif
return;
}
#endif
// Mid Pool headerless fast route: use page descriptor before header read
{
extern int hak_pool_mid_lookup(void* ptr, size_t* out_size);
extern void hak_pool_free_fast(void* ptr, uintptr_t site_id);
size_t mid_sz = 0;
if (hak_pool_mid_lookup(ptr, &mid_sz)) {
// For Mid, header read is unnecessary; free directly via pool.
hak_pool_free_fast(ptr, (uintptr_t)site);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_HAK_FREE, t0);
#endif
return;
}
}
// L2.5 headerless route: use page descriptor before header read
{
extern int hak_l25_lookup(void* ptr, size_t* out_size);
extern void hak_l25_pool_free_fast(void* ptr, uintptr_t site_id);
size_t l25_sz = 0;
if (hak_l25_lookup(ptr, &l25_sz)) {
// Stats (optional): count as large free
hkm_ace_stat_large_free();
hak_l25_pool_free_fast(ptr, (uintptr_t)site);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_HAK_FREE, t0);
#endif
return;
}
}
// NEW Phase 6.5: Measure free latency (start timing)
// Gate by EVO sampling mask to avoid per-op overhead when disabled
int _do_evo = (g_evo_sample_mask > 0);
struct timespec start_time, end_time;
if (_do_evo) {
clock_gettime(CLOCK_MONOTONIC, &start_time);
}
// Helper macro to record latency before returning (build-time gated timing)
#if HAKMEM_DEBUG_TIMING
#define RECORD_FREE_LATENCY() do { \
if (_do_evo) { \
clock_gettime(CLOCK_MONOTONIC, &end_time); \
uint64_t ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + \
(end_time.tv_nsec - start_time.tv_nsec); \
hak_evo_record_latency((double)ns); \
if (hak_evo_is_canary()) { \
hak_evo_record_canary_result(0, (double)ns); \
} \
} \
HKM_TIME_END(HKM_CAT_HAK_FREE, t0); \
} while(0)
#else
#define RECORD_FREE_LATENCY() do { \
if (_do_evo) { \
clock_gettime(CLOCK_MONOTONIC, &end_time); \
uint64_t ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + \
(end_time.tv_nsec - start_time.tv_nsec); \
hak_evo_record_latency((double)ns); \
if (hak_evo_is_canary()) { \
hak_evo_record_canary_result(0, (double)ns); \
} \
} \
} while(0)
#endif
// Get raw pointer (before header)
void* raw = (char*)ptr - HEADER_SIZE;
#ifdef __linux__
if (g_strict_free) {
// Safety: ensure header address is mapped before touching it (optional)
long _ps = sysconf(_SC_PAGESIZE);
void* _pg = (void*)((uintptr_t)raw & ~((uintptr_t)_ps - 1));
unsigned char _vec;
if (mincore(_pg, (size_t)_ps, &_vec) != 0) {
// Not a valid mapped region → fallback directly to libc free
extern void __libc_free(void*);
__libc_free(ptr);
RECORD_FREE_LATENCY();
return;
}
}
#endif
// Read header
AllocHeader* hdr = (AllocHeader*)raw;
// NEW: Verify magic (fail-fast if corrupted or not from hakmem)
if (hdr->magic != HAKMEM_MAGIC) {
if (g_invalid_free_log) {
fprintf(stderr, "[hakmem] ERROR: Invalid magic 0x%X (expected 0x%X) - possible corruption or non-hakmem pointer\n",
hdr->magic, HAKMEM_MAGIC);
}
// Phase 7.4: Use cached mode (eliminates 44% CPU overhead from getenv on hot path!)
// OLD CODE (44% CPU time!): const char* inv = getenv("HAKMEM_INVALID_FREE");
// if (inv && strcmp(inv, "fallback") == 0) mode_skip = 0;
int mode_skip = g_invalid_free_mode; // 1 = skip, 0 = fallback to libc
if (mode_skip) {
// Skip freeing unknown pointer to avoid abort (possible mmap region). Log only.
RECORD_FREE_LATENCY();
return;
} else {
fprintf(stderr, "[hakmem] Attempting fallback to system free()...\n");
extern void __libc_free(void*);
__libc_free(ptr);
RECORD_FREE_LATENCY();
return;
}
}
// Phase SACS-3: BigCache put only for L2 (class_bytes >= 2MB)
if (HAK_ENABLED_CACHE(HAKMEM_FEATURE_BIGCACHE) && hdr->class_bytes >= 2097152) {
// Pass actual allocated size (hdr->size), not class_bytes!
// This prevents buffer overflow when BigCache returns undersized blocks
if (hak_bigcache_put(ptr, hdr->size, hdr->alloc_site)) {
RECORD_FREE_LATENCY();
return; // Successfully cached, skip actual free
}
}
// Phase 6.9.1: Pool allocations are now handled via header method
// (no separate detection needed, just dispatch on method)
// Dispatch to correct free function
switch (hdr->method) {
case ALLOC_METHOD_POOL:
// Phase 6.9.1: Pool allocation - return to pool
if (HAK_ENABLED_ALLOC(HAKMEM_FEATURE_POOL)) {
// Stats: record free in ACE L1 Mid
hkm_ace_stat_mid_free();
hak_pool_free(ptr, hdr->size, hdr->alloc_site);
} else {
// Pool disabled, shouldn't happen (fail-fast)
fprintf(stderr, "[hakmem] ERROR: POOL allocation but POOL feature disabled!\n");
}
RECORD_FREE_LATENCY();
return;
case ALLOC_METHOD_L25_POOL:
// Phase 6.13: L2.5 Pool allocation - return to pool
hkm_ace_stat_large_free();
hak_l25_pool_free(ptr, hdr->size, hdr->alloc_site);
RECORD_FREE_LATENCY();
return;
case ALLOC_METHOD_MALLOC:
free(raw);
break;
case ALLOC_METHOD_MMAP:
// Phase 6.4 P1: Apply free policy (Hot/Warm/Cold)
if (g_hakem_config.free_policy == FREE_POLICY_KEEP) {
// KEEP: do nothing (retain the VA mapping, no madvise)
RECORD_FREE_LATENCY();
return;
} else if (g_hakem_config.free_policy == FREE_POLICY_ADAPTIVE) {
// ADAPTIVE: classify as Hot/Warm/Cold
FreeThermal thermal = hak_classify_thermal(hdr->size);
switch (thermal) {
case FREE_THERMAL_HOT:
// HOT (< 1MB): do nothing (likely to be reused immediately)
RECORD_FREE_LATENCY();
return;
case FREE_THERMAL_WARM:
// WARM (1-2MB): MADV_FREE only (no munmap; release physical pages, keep VA)
#ifdef __linux__
madvise(raw, hdr->size, MADV_FREE);
#endif
RECORD_FREE_LATENCY();
return;
case FREE_THERMAL_COLD:
// COLD (>= 2MB): batch (Phase 6.8: feature-gated)
if (HAK_ENABLED_MEMORY(HAKMEM_FEATURE_BATCH_MADVISE) && hdr->size >= BATCH_MIN_SIZE) {
hak_batch_add(raw, hdr->size);
RECORD_FREE_LATENCY();
return;
}
// Small blocks: immediate munmap
#ifdef __linux__
// Phase 6.11.1: Try whale cache first
if (hkm_whale_put(raw, hdr->size) != 0) {
hkm_sys_munmap(raw, hdr->size);
}
#else
free(raw);
#endif
break;
}
} else {
// BATCH (default): Phase 6.8 feature-gated
// - Keep VA mapped for reuse (mimalloc strategy)
// - Only MADV_FREE on batch flush (release physical pages)
// - munmap happens on cold eviction only
if (HAK_ENABLED_MEMORY(HAKMEM_FEATURE_BATCH_MADVISE) && hdr->size >= BATCH_MIN_SIZE) {
hak_batch_add(raw, hdr->size);
RECORD_FREE_LATENCY();
return;
}
// Small blocks: immediate munmap (not worth batching)
#ifdef __linux__
// Phase 6.11.1: Try whale cache first
if (hkm_whale_put(raw, hdr->size) != 0) {
hkm_sys_munmap(raw, hdr->size);
}
#else
free(raw);
#endif
}
break;
default:
fprintf(stderr, "[hakmem] ERROR: Unknown allocation method: %d\n", hdr->method);
break;
}
// Record latency for all paths that reach here
RECORD_FREE_LATENCY();
#undef RECORD_FREE_LATENCY
}
void hak_print_stats(void) {
printf("\n========================================\n");
printf("hakmem ELO-based Profiling Statistics\n");
printf("========================================\n");
printf("\nOptimization Stats:\n");
printf(" malloc() calls: %llu\n", (unsigned long long)g_malloc_count);
hak_elo_print_leaderboard();
printf("========================================\n\n");
}
// ============================================================================
// Phase 6.15 P0: Standard C Library Wrappers (for LD_PRELOAD)
// ============================================================================
#ifdef HAKMEM_FORCE_LIBC_ALLOC_BUILD
// Sanitizer/diagnostic builds: bypass hakmem allocator completely.
void* malloc(size_t size) {
extern void* __libc_malloc(size_t);
return __libc_malloc(size);
}
void free(void* ptr) {
if (!ptr) return;
extern void __libc_free(void*);
__libc_free(ptr);
}
void* calloc(size_t nmemb, size_t size) {
extern void* __libc_calloc(size_t, size_t);
return __libc_calloc(nmemb, size);
}
void* realloc(void* ptr, size_t size) {
extern void* __libc_realloc(void*, size_t);
return __libc_realloc(ptr, size);
}
#else
// malloc wrapper - intercepts system malloc() calls
void* malloc(size_t size) {
// Recursion guard: if we're inside the allocator already, fall back to libc
if (g_hakmem_lock_depth > 0) {
// Nested call detected - fallback to system malloc
extern void* __libc_malloc(size_t);
return __libc_malloc(size);
}
// Initialization guard: during hak_init() bootstrap, use libc directly
if (__builtin_expect(g_initializing != 0, 0)) {
extern void* __libc_malloc(size_t);
return __libc_malloc(size);
}
if (__builtin_expect(hak_force_libc_alloc(), 0)) {
extern void* __libc_malloc(size_t);
return __libc_malloc(size);
}
// LD safe modes: 1=tiny-only, 2=pass-through
// Determine LD_PRELOAD mode early (before hak_init) to avoid misrouting
int ld_mode = hak_ld_env_mode();
if (ld_mode) {
// Avoid mixing with jemalloc-managed programs (e.g., redis)
if (hak_ld_block_jemalloc() && hak_jemalloc_loaded()) {
extern void* __libc_malloc(size_t);
return __libc_malloc(size);
}
// Before hakmem initialization completes, always delegate to libc
if (!g_initialized || g_initializing) {
extern void* __libc_malloc(size_t);
return __libc_malloc(size);
}
const char* lds = getenv("HAKMEM_LD_SAFE");
int mode = (lds ? atoi(lds) : 1);
if (mode >= 2 || size > TINY_MAX_SIZE) {
extern void* __libc_malloc(size_t);
return __libc_malloc(size);
}
}
// ========================================================================
// Phase 6-3: Tiny Fast Path (System tcache style, 3-4 instruction fast path)
// ========================================================================
#ifdef HAKMEM_TINY_FAST_PATH
if (size <= TINY_FAST_THRESHOLD) {
// Ultra-simple TLS cache pop (bypasses Magazine/SuperSlab)
extern void* tiny_fast_alloc(size_t);
extern void tiny_fast_init(void);
extern __thread int g_tiny_fast_initialized;
if (__builtin_expect(!g_tiny_fast_initialized, 0)) {
tiny_fast_init();
}
void* ptr = tiny_fast_alloc(size);
if (ptr) return ptr;
// Fall through to slow path on failure
}
#endif
// ========================================================================
// First-level call: enter allocator (no global lock)
g_hakmem_lock_depth++;
void* ptr = hak_alloc_at(size, HAK_CALLSITE());
g_hakmem_lock_depth--;
return ptr;
}
// free wrapper - intercepts system free() calls
void free(void* ptr) {
if (!ptr) return; // NULL check
// Recursion guard: if we're inside the allocator already, fall back to libc
if (g_hakmem_lock_depth > 0) {
// Nested call detected - fallback to system free
extern void __libc_free(void*);
__libc_free(ptr);
return;
}
if (__builtin_expect(g_initializing != 0, 0)) {
extern void __libc_free(void*);
__libc_free(ptr);
return;
}
if (__builtin_expect(hak_force_libc_alloc(), 0)) {
extern void __libc_free(void*);
__libc_free(ptr);
return;
}
// In LD_PRELOAD mode, before hakmem initialization completes, always delegate
{
if (hak_ld_env_mode()) {
if (hak_ld_block_jemalloc() && hak_jemalloc_loaded()) {
extern void __libc_free(void*);
__libc_free(ptr);
return;
}
if (!g_initialized || g_initializing) {
extern void __libc_free(void*);
__libc_free(ptr);
return;
}
}
}
// ========================================================================
// Phase 6 Fast Path: Ultra-Simple Free (when enabled)
// ========================================================================
// This bypasses free.part.0 complexity (38.43% overhead in perf analysis)
// - free.part.0: 15.83% → eliminated!
// - mid_lookup: 9.55% → eliminated for tiny!
// - pthread locks: 8.81% → eliminated!
// Two variants:
// Phase 6-1.5: Alignment guessing (3-4 instructions, 235 M ops/sec)
// Phase 6-1.6: Metadata header (1-2 instructions, ~480 M ops/sec expected)
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
g_hakmem_lock_depth++;
hak_tiny_free_ultra_simple(ptr);
g_hakmem_lock_depth--;
return;
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
g_hakmem_lock_depth++;
hak_tiny_free_metadata(ptr);
g_hakmem_lock_depth--;
return;
#endif
// ========================================================================
g_hakmem_lock_depth++;
hak_free_at(ptr, 0, HAK_CALLSITE());
g_hakmem_lock_depth--;
}
// calloc wrapper - intercepts system calloc() calls
void* calloc(size_t nmemb, size_t size) {
// Recursion guard
if (g_hakmem_lock_depth > 0) {
// Nested call detected - fallback to system calloc
extern void* __libc_calloc(size_t, size_t);
return __libc_calloc(nmemb, size);
}
if (__builtin_expect(g_initializing != 0, 0)) {
extern void* __libc_calloc(size_t, size_t);
return __libc_calloc(nmemb, size);
}
// Overflow check before any multiplication
if (size != 0 && nmemb > (SIZE_MAX / size)) {
errno = ENOMEM;
return NULL;
}
if (__builtin_expect(hak_force_libc_alloc(), 0)) {
extern void* __libc_calloc(size_t, size_t);
return __libc_calloc(nmemb, size);
}
// Determine LD_PRELOAD mode early (before hak_init)
int ld_mode = hak_ld_env_mode();
if (ld_mode) {
if (hak_ld_block_jemalloc() && hak_jemalloc_loaded()) {
extern void* __libc_calloc(size_t, size_t);
return __libc_calloc(nmemb, size);
}
if (!g_initialized || g_initializing) {
extern void* __libc_calloc(size_t, size_t);
return __libc_calloc(nmemb, size);
}
const char* lds = getenv("HAKMEM_LD_SAFE");
int mode = (lds ? atoi(lds) : 1);
size_t total = nmemb * size; // safe: overflow checked above
if (mode >= 2 || total > TINY_MAX_SIZE) {
extern void* __libc_calloc(size_t, size_t);
return __libc_calloc(nmemb, size);
}
}
g_hakmem_lock_depth++;
size_t total_size = nmemb * size; // safe: overflow checked above
void* ptr = hak_alloc_at(total_size, HAK_CALLSITE());
if (ptr) {
memset(ptr, 0, total_size); // calloc zeros memory
}
g_hakmem_lock_depth--;
return ptr;
}
// realloc wrapper - intercepts system realloc() calls
void* realloc(void* ptr, size_t size) {
// Recursion guard
if (g_hakmem_lock_depth > 0) {
// Nested call detected - fallback to system realloc
extern void* __libc_realloc(void*, size_t);
return __libc_realloc(ptr, size);
}
if (__builtin_expect(g_initializing != 0, 0)) {
extern void* __libc_realloc(void*, size_t);
return __libc_realloc(ptr, size);
}
if (__builtin_expect(hak_force_libc_alloc(), 0)) {
extern void* __libc_realloc(void*, size_t);
return __libc_realloc(ptr, size);
}
// Determine LD_PRELOAD mode early (before hak_init)
int ld_mode = hak_ld_env_mode();
if (ld_mode) {
if (hak_ld_block_jemalloc() && hak_jemalloc_loaded()) {
extern void* __libc_realloc(void*, size_t);
return __libc_realloc(ptr, size);
}
if (!g_initialized || g_initializing) {
extern void* __libc_realloc(void*, size_t);
return __libc_realloc(ptr, size);
}
const char* lds = getenv("HAKMEM_LD_SAFE");
int mode = (lds ? atoi(lds) : 1);
// Pass-through mode, or resizing beyond Tiny range → route to libc
if (mode >= 2 || size > TINY_MAX_SIZE) {
extern void* __libc_realloc(void*, size_t);
return __libc_realloc(ptr, size);
}
// Tiny-only safe mode: if the existing pointer is NOT Tiny-managed,
// do not touch it — delegate to libc to avoid header mismatches.
if (ptr != NULL && !hak_tiny_is_managed(ptr)) {
extern void* __libc_realloc(void*, size_t);
return __libc_realloc(ptr, size);
}
}
g_hakmem_lock_depth++;
void* new_ptr = NULL;
if (!ptr) {
// realloc(NULL, size) = malloc(size)
new_ptr = hak_alloc_at(size, HAK_CALLSITE());
} else if (size == 0) {
// realloc(ptr, 0) = free(ptr)
hak_free_at(ptr, 0, HAK_CALLSITE());
new_ptr = NULL;
} else {
// Allocate new block
new_ptr = hak_alloc_at(size, HAK_CALLSITE());
if (new_ptr) {
// Get old size from header
void* raw = (char*)ptr - HEADER_SIZE;
AllocHeader* hdr = (AllocHeader*)raw;
if (hdr->magic == HAKMEM_MAGIC) {
size_t old_size = hdr->size - HEADER_SIZE; // User-visible size
size_t copy_size = (old_size < size) ? old_size : size;
memcpy(new_ptr, ptr, copy_size);
} else {
// Invalid header, copy what we can (best effort)
memcpy(new_ptr, ptr, size);
}
// Free old block
hak_free_at(ptr, 0, HAK_CALLSITE());
}
}
g_hakmem_lock_depth--;
return new_ptr;
}
#endif // HAKMEM_FORCE_LIBC_ALLOC_BUILD