#include "hakmem_tiny.h"
#include "hakmem_tiny_config.h"     // Centralized configuration
#include "hakmem_tiny_superslab.h"  // Phase 6.22: SuperSlab allocator
#include "hakmem_super_registry.h"  // Phase 8.2: SuperSlab registry for memory profiling
#include "hakmem_internal.h"
#include "hakmem_syscall.h"         // Phase 6.X P0 Fix: Box 3 syscall layer (bypasses LD_PRELOAD)
#include "hakmem_tiny_magazine.h"

// Phase 1 modules (must come AFTER hakmem_tiny.h for TinyPool definition)
#include "hakmem_tiny_batch_refill.h"  // Phase 1: Batch refill/spill for mini-magazine
#include "hakmem_tiny_stats.h"         // Phase 1: Batched statistics (replaces XOR RNG)

// Phase 2B modules
#include "tiny_api.h"  // Consolidated: stats_api, query_api, rss_api, registry_api
#include "tiny_tls.h"
#include "tiny_debug.h"
#include "tiny_mmap_gate.h"
#include "tiny_debug_ring.h"
#include "tiny_route.h"
#include "tiny_tls_guard.h"
#include "tiny_ready.h"
#include "hakmem_tiny_tls_list.h"
#include "hakmem_tiny_remote_target.h"  // Phase 2C-1: Remote target queue
#include "hakmem_tiny_bg_spill.h"       // Phase 2C-2: Background spill queue
// NOTE: hakmem_tiny_tls_ops.h included later (after type definitions)
#include "tiny_system.h"  // Consolidated: stdio, stdlib, string, etc.
#include "hakmem_prof.h"
#include "hakmem_trace.h"  // Optional USDT (perf) tracepoints

extern uint64_t g_bytes_allocated;  // from hakmem_tiny_superslab.c

// Build-time gate for debug counters (path/ultra). Default OFF.
#ifndef HAKMEM_DEBUG_COUNTERS
#define HAKMEM_DEBUG_COUNTERS 0
#endif

int g_debug_fast0 = 0;
int g_debug_remote_guard = 0;
int g_remote_force_notify = 0;

// Tiny free safety (debug)
int g_tiny_safe_free = 1;         // ULTRATHINK FIX: Enable by default to catch double-frees. env: HAKMEM_SAFE_FREE=1
int g_tiny_safe_free_strict = 0;  // env: HAKMEM_SAFE_FREE_STRICT=1
int g_tiny_force_remote = 0;      // env: HAKMEM_TINY_FORCE_REMOTE=1

// Build-time gate: Minimal Tiny front (bench-only)
static inline int superslab_trace_enabled(void) {
    static int g_ss_trace_flag = -1;
    if (__builtin_expect(g_ss_trace_flag == -1, 0)) {
        const char* tr = getenv("HAKMEM_TINY_SUPERSLAB_TRACE");
        g_ss_trace_flag = (tr && atoi(tr) != 0) ? 1 : 0;
    }
    return g_ss_trace_flag;
}
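
/*
 * Note: this lazily-cached getenv() pattern recurs throughout this file
 * (bench_mode_enabled, rf_trace_enabled, hot_slot_enabled, ...). A minimal
 * sketch of the idiom, with hypothetical names used only for illustration:
 *
 *   static int g_flag = -1;                    // -1 = not parsed yet
 *   if (__builtin_expect(g_flag == -1, 0)) {   // parse once, off the hot path
 *       const char* e = getenv("HAKMEM_EXAMPLE_FLAG");
 *       g_flag = (e && atoi(e) != 0) ? 1 : 0;
 *   }
 *   return g_flag;                             // later calls: one load + compare
 *
 * Caching avoids repeated getenv() calls on hot paths (the Phase 7.4 comment
 * below credits exactly such a fix with a large speedup). The flag is not
 * atomic, so two threads may race on the first call; both compute the same
 * idempotent 0/1 value, which is benign here.
 */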

// When enabled, physically excludes optional front tiers from the hot path
// (UltraFront/Quick/Frontend/HotMag/SS-try/BumpShadow), leaving:
//   SLL → TLS Magazine → SuperSlab → (remaining slow path)
#ifndef HAKMEM_TINY_MINIMAL_FRONT
#define HAKMEM_TINY_MINIMAL_FRONT 0
#endif

// Strict front: compile-out optional front tiers but keep baseline structure intact
#ifndef HAKMEM_TINY_STRICT_FRONT
#define HAKMEM_TINY_STRICT_FRONT 0
#endif

// Bench-only fast path knobs (defaults)
#ifndef HAKMEM_TINY_BENCH_REFILL
#define HAKMEM_TINY_BENCH_REFILL 8
#endif
// Optional per-class overrides (bench-only)
#ifndef HAKMEM_TINY_BENCH_REFILL8
#define HAKMEM_TINY_BENCH_REFILL8 HAKMEM_TINY_BENCH_REFILL
#endif
#ifndef HAKMEM_TINY_BENCH_REFILL16
#define HAKMEM_TINY_BENCH_REFILL16 HAKMEM_TINY_BENCH_REFILL
#endif
#ifndef HAKMEM_TINY_BENCH_REFILL32
#define HAKMEM_TINY_BENCH_REFILL32 HAKMEM_TINY_BENCH_REFILL
#endif
#ifndef HAKMEM_TINY_BENCH_REFILL64
#define HAKMEM_TINY_BENCH_REFILL64 HAKMEM_TINY_BENCH_REFILL
#endif

// Bench-only warmup amounts (pre-fill TLS SLL on first alloc per class)
#ifndef HAKMEM_TINY_BENCH_WARMUP8
#define HAKMEM_TINY_BENCH_WARMUP8 64
#endif
#ifndef HAKMEM_TINY_BENCH_WARMUP16
#define HAKMEM_TINY_BENCH_WARMUP16 96
#endif
#ifndef HAKMEM_TINY_BENCH_WARMUP32
#define HAKMEM_TINY_BENCH_WARMUP32 160
#endif
#ifndef HAKMEM_TINY_BENCH_WARMUP64
#define HAKMEM_TINY_BENCH_WARMUP64 192
#endif
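
/*
 * Build-time tuning sketch (illustrative, not a shipped configuration): the
 * per-class refill knobs above fall back to the global HAKMEM_TINY_BENCH_REFILL,
 * so a bench build can override a single class from the compiler command line
 * without touching this file, e.g.
 *
 *   cc -DHAKMEM_TINY_BENCH_REFILL=16 -DHAKMEM_TINY_BENCH_REFILL8=32 ...
 *
 * keeps the 16/32/64-byte classes at 16 blocks per refill while the 8-byte
 * class refills 32. These knobs only matter for the bench-only fast path
 * (see the HAKMEM_TINY_BENCH_FASTPATH block just below).
 */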

#ifdef HAKMEM_TINY_BENCH_FASTPATH
static __thread unsigned char g_tls_bench_warm_done[4];
#endif

#if HAKMEM_DEBUG_COUNTERS
#define HAK_PATHDBG_INC(arr, idx)  do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0)
#else
#define HAK_PATHDBG_INC(arr, idx)  do { (void)(idx); } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0)
#endif

// Simple scalar debug increment (no-op when HAKMEM_DEBUG_COUNTERS=0)
#if HAKMEM_DEBUG_COUNTERS
#define HAK_DBG_INC(var) do { (var)++; } while(0)
#else
#define HAK_DBG_INC(var) do { (void)0; } while(0)
#endif
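
/*
 * Usage sketch for the counter gates above (the macro/counter pairing here is
 * illustrative, not a statement about where each counter is actually bumped):
 *
 *   HAK_DBG_INC(g_superslab_alloc_count);        // counts only when HAKMEM_DEBUG_COUNTERS=1
 *   HAK_PATHDBG_INC(g_rf_hit_slab, class_idx);   // additionally gated by g_path_debug_enabled at runtime
 *
 * With HAKMEM_DEBUG_COUNTERS=0 (the default above) the no-op variants do not
 * even evaluate the counter argument, so counters that only exist under the
 * gate can still be referenced unconditionally, and release builds pay nothing
 * for the instrumentation.
 */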

// Return helper: record tiny alloc stat (guarded) then return pointer
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr);

// Inject route commit into return helper so any successful allocation commits a fingerprint
// CRITICAL FIX (Phase 7-1.3): Guard legacy macro to allow Phase 7 override
// Phase 7 defines HAK_RET_ALLOC with header write in tiny_alloc_fast.inc.h
#ifndef HAK_RET_ALLOC
#ifdef HAKMEM_ENABLE_STATS
// Optional: sampling (enabled at build time). The hot path stays a direct inline call (no indirect branch).
#ifdef HAKMEM_TINY_STAT_SAMPLING
static __thread unsigned g_tls_stat_accum_alloc[TINY_NUM_CLASSES];
static int g_stat_rate_lg = 0;  // 0 = record every call, otherwise record once every 2^lg calls
static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) {
    if (__builtin_expect(g_stat_rate_lg == 0, 1)) { stats_record_alloc(cls); return; }
    unsigned m = (1u << g_stat_rate_lg) - 1u;
    if (((++g_tls_stat_accum_alloc[cls]) & m) == 0u) stats_record_alloc(cls);
}
#else
static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { stats_record_alloc(cls); }
#endif
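
/*
 * Worked example of the sampling mask above: with g_stat_rate_lg = 4 the mask
 * is m = (1u << 4) - 1u = 0x0F, so stats_record_alloc() fires on every 16th
 * allocation of a class (whenever the per-thread counter's low 4 bits wrap to
 * zero), roughly dividing the number of recording calls by 16 at the cost of
 * sampled rather than exact counts. g_stat_rate_lg = 0 keeps exact
 * per-allocation accounting on the likely branch.
 */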
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); hkm_stat_alloc((cls)); ROUTE_COMMIT((cls), 0x7F); return (ptr); } while(0)
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#else
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); ROUTE_COMMIT((cls), 0x7F); return (ptr); } while(0)
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#endif
|
2025-11-08 04:50:41 +09:00
|
|
|
|
#endif // HAK_RET_ALLOC
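
/*
 * Usage sketch for HAK_RET_ALLOC (illustrative; the real fast-path callers
 * live in the alloc .inc files): every successful allocation is expected to
 * return through this macro so the debug tracker, the optional stats hook,
 * and the route fingerprint commit cannot be forgotten on any tier.
 *
 *   void* p = superslab_tls_bump_fast(cls);
 *   if (p) HAK_RET_ALLOC(cls, p);   // track + stat + ROUTE_COMMIT(cls, 0x7F) + return p
 *   // ...fall through to slower tiers on a miss...
 *
 * Note the macro contains a `return`, so it may only be used inside functions
 * whose return value is the allocated pointer.
 */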

// Free-side stats: compile-time zero when stats disabled
#ifdef HAKMEM_ENABLE_STATS
#define HAK_STAT_FREE(cls) do { stats_record_free((cls)); } while(0)
#else
#define HAK_STAT_FREE(cls) do { } while(0)
#endif

// Forward declarations for static helpers used before definition
struct TinySlab;  // forward
static void move_to_free_list(int class_idx, struct TinySlab* target_slab);
static void move_to_full_list(int class_idx, struct TinySlab* target_slab);
static void release_slab(struct TinySlab* slab);
static TinySlab* allocate_new_slab(int class_idx);
static void tiny_tls_cache_drain(int class_idx);
static void tiny_apply_mem_diet(void);

// Phase 6.23: SuperSlab allocation forward declaration
static inline void* hak_tiny_alloc_superslab(int class_idx);
static inline void* superslab_tls_bump_fast(int class_idx);
static SuperSlab* superslab_refill(int class_idx);
static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx);
static inline uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap);
// Forward decl: used by tiny_spec_pop_path before its definition
// Phase 6-1.7: Export for box refactor (Box 5 needs access from hakmem.c)
// Note: Remove 'inline' to provide linkable definition for LTO
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
int sll_refill_small_from_ss(int class_idx, int max_take);
#else
static inline int sll_refill_small_from_ss(int class_idx, int max_take);
#endif
static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss);
static void* __attribute__((cold, noinline)) tiny_slow_alloc_fast(int class_idx);
static inline void tiny_remote_drain_owner(struct TinySlab* slab);
static void tiny_remote_drain_locked(struct TinySlab* slab);
// Ultra-fast try-only variant: attempt a direct SuperSlab bump/freelist pop
// without any refill or slow-path work. Returns NULL on miss.
/* moved below TinyTLSSlab definition */

// Step 3d: Forced inlining for readability + performance (306M target)
__attribute__((always_inline))
static inline void* hak_tiny_alloc_wrapper(int class_idx);

// Helpers for SuperSlab active block accounting (atomic, saturating dec)
static inline __attribute__((always_inline)) void ss_active_add(SuperSlab* ss, uint32_t n) {
    atomic_fetch_add_explicit(&ss->total_active_blocks, n, memory_order_relaxed);
}
static inline __attribute__((always_inline)) void ss_active_inc(SuperSlab* ss) {
    atomic_fetch_add_explicit(&ss->total_active_blocks, 1u, memory_order_relaxed);
}
// EXTRACTED: ss_active_dec_one() moved to hakmem_tiny_superslab.h (Phase 2C-2)

// Front refill count global config (declare before init.inc uses them)
extern int g_refill_count_global;
extern int g_refill_count_hot;
extern int g_refill_count_mid;
extern int g_refill_count_class[TINY_NUM_CLASSES];

// Step 3d: Forced inlining for slow path (maintain monolithic performance)
// Phase 6-1.7: Export for box refactor (Box 5 needs access from hakmem.c)
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class_idx);
#else
static void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class_idx);
#endif

// ---------------------------------------------------------------------------
// Box: adopt_gate_try (implementation moved from header for robust linkage)
// ---------------------------------------------------------------------------
#include "box/adopt_gate_box.h"
extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
extern int g_super_reg_class_size[TINY_NUM_CLASSES];
extern unsigned long long g_adopt_gate_calls[];
extern unsigned long long g_adopt_gate_success[];
extern unsigned long long g_reg_scan_attempts[];
extern unsigned long long g_reg_scan_hits[];

SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) {
    g_adopt_gate_calls[class_idx]++;
    ROUTE_MARK(13);
    SuperSlab* ss = tiny_refill_try_fast(class_idx, tls);
    if (ss) { g_adopt_gate_success[class_idx]++; return ss; }
    g_reg_scan_attempts[class_idx]++;
    int reg_size = g_super_reg_class_size[class_idx];
    int scan_limit = tiny_reg_scan_max();
    if (scan_limit > reg_size) scan_limit = reg_size;
    uint32_t self_tid = tiny_self_u32();
    for (int i = 0; i < scan_limit; i++) {
        SuperSlab* cand = g_super_reg_by_class[class_idx][i];
        if (!(cand && cand->magic == SUPERSLAB_MAGIC)) continue;
        // Fast path: use nonempty_mask / freelist_mask to locate candidates in O(1)
        uint32_t mask = cand->nonempty_mask;
        // Fallback to atomic freelist_mask for cross-thread visibility
        if (mask == 0) {
            mask = atomic_load_explicit(&cand->freelist_mask, memory_order_acquire);
        }
        if (mask == 0) continue;  // No visible freelists in this SS
        int cap = ss_slabs_capacity(cand);
        // Iterate set bits only
        while (mask) {
            int sidx = __builtin_ctz(mask);
            mask &= (mask - 1);  // clear lowest set bit
            if (sidx >= cap) continue;
            SlabHandle h = slab_try_acquire(cand, sidx, self_tid);
            if (!slab_is_valid(&h)) continue;
            if (slab_remote_pending(&h)) {
                slab_drain_remote_full(&h);
            }
            if (slab_is_safe_to_bind(&h)) {
                tiny_tls_bind_slab(tls, h.ss, h.slab_idx);
                g_adopt_gate_success[class_idx]++;
                g_reg_scan_hits[class_idx]++;
                ROUTE_MARK(14); ROUTE_COMMIT(class_idx, 0x07);
                slab_release(&h);
                return h.ss;
            }
            slab_release(&h);
        }
    }
    return NULL;
}
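
/*
 * Flow summary for adopt_gate_try() (descriptive, derived from the code above):
 *   1. tiny_refill_try_fast(): reuse something this thread already knows about.
 *   2. Bounded registry scan (at most tiny_reg_scan_max() SuperSlabs of this
 *      class): skip candidates whose nonempty_mask has no set bits, falling
 *      back to the atomic freelist_mask when the cheap read shows empty.
 *   3. For each set bit: try to acquire that slab, drain any pending remote
 *      frees, and bind it to this thread's TLS when it is safe to do so.
 * The per-class counters (g_adopt_gate_calls/success, g_reg_scan_attempts/hits)
 * expose the hit rate of each stage in debug dumps.
 */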

// ============================================================================
// Global State
// ============================================================================

// Global pool instance (extern declared in hakmem_tiny.h)
TinyPool g_tiny_pool;
int g_tiny_initialized = 0;  // Not static (extern in header for inline access)

// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
//
// Results:
//   Phase 1 (Push - deferred free):      +1 instruction, zero benefit
//   Phase 2 (Pull - background refill): +77 instructions, -3% performance
//
// Root cause: both phases optimized the SLOW PATH (bitmap scan), but the benchmark hits the FAST PATH 99.9% of the time
//   - TLS Magazine capacity: 2048 items
//   - Benchmark working set: 100 items
//   - Magazine hit rate: 100% after warmup
//   - Slow path never executed!
//
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
//   - glibc: ~40 instructions/op (5-7× faster)
//   - Gap is architectural (bitmap vs free-list, research features)
//
// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
// Decision: Enable by default (proven production-ready)
static int g_wrap_tiny_enabled = 1;  // ON by default (faster than glibc!)
// Optional: allow limited trylock-based refill during wrapper calls
static int g_wrap_tiny_refill = 0;

// Remote-free drain controls
static int g_remote_drain_thresh = 32;   // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
static int g_remote_drain_tryrate = 16;  // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)

// ACE Learning Layer: Per-class remote drain thresholds
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};

// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
// Old: XOR RNG sampling (10-15 ns overhead)
// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
static int g_tiny_count_sample_exp = 8;  // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)

// Step 2: Slab Registry (Hash Table)
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];

PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];

// Registry lock
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;

// Phase 6.14: Runtime toggle for Registry ON/OFF
// O(N) sequential access is faster than O(1) random access for small N (8-32 slabs):
// L1 cache hit rate 95%+ (sequential) vs 50-70% (random hash).
static int g_use_registry = 1;  // Default ON for thread-safety

// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
static int g_tiny_refill_max = 64;       // HAKMEM_TINY_REFILL_MAX (default 64)
static int g_tiny_refill_max_hot = 192;  // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)

// hakmem_tiny_tls_list.h already included at top
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
static int g_tls_list_enable = 1;
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static int g_fast_enable = 1;
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
static int g_ultra_bump_shadow = 0;  // HAKMEM_TINY_BUMP_SHADOW=1
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];

typedef void* (*TinyHotAllocFn)(void);
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
static __thread void* g_fast_head[TINY_NUM_CLASSES];
static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);

static uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
static uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
static uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
static uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
static uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
static uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];

// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
// NEW: Per-thread active slabs (up to 2 per class)
static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];

static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
    TinySlab* cand = g_tls_active_slab_a[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    cand = g_tls_active_slab_b[class_idx];
    if (cand) {
        uintptr_t base = (uintptr_t)cand->base;
        if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
            return cand;
        }
    }
    return NULL;
}
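
/*
 * tls_active_owner_for_ptr() answers "does ptr fall inside one of the (at most
 * two) slabs this thread is currently allocating from for this class?" using
 * two range checks and no registry lookup. Illustration, assuming a slab size
 * of 64 KiB (TINY_SLAB_SIZE's actual value is defined elsewhere): for a slab
 * based at 0x7f0000010000, any ptr in [0x7f0000010000, 0x7f0000020000) returns
 * that slab; anything else falls through to NULL and the caller must take the
 * slower ownership lookup.
 */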

// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line.
extern int g_use_superslab;

#if !HAKMEM_BUILD_RELEASE
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
    if (!ptr) return;
    if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
            tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
        } else {
            int slab_idx = slab_index_for(ss, ptr);
            if (slab_idx < 0) {
                tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
            } else {
                // Fail-Fast: class vs SuperSlab size_class must be consistent.
                if (ss->size_class != cls) {
                    tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
                }
                size_t blk = g_tiny_class_sizes[cls];
                uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
                uintptr_t delta = (uintptr_t)ptr - base;
                if (blk == 0 || (delta % blk) != 0) {
                    tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
                } else if (delta / blk >= ss->slabs[slab_idx].capacity) {
                    tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
                }
            }
        }
    }
    if (!__builtin_expect(g_debug_remote_guard, 0)) return;
    if (!g_use_superslab) return;
    SuperSlab* ss = hak_super_lookup(ptr);
    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
    int slab_idx = slab_index_for(ss, ptr);
    if (slab_idx >= 0) {
        tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
    }
}
#else
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
#endif
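
/*
 * Checks performed by the fail-fast path above, in order (each aborts through
 * tiny_failfast_abort_ptr with a distinct tag so the failure site is obvious):
 *   1. lookup_fail        - returned pointer is not inside any registered SuperSlab
 *   2. slab_idx_mismatch  - pointer maps to no valid slab index within that SuperSlab
 *   3. class_mismatch     - SuperSlab's size_class differs from the requested class
 *   4. misaligned         - offset from the slab base is not a multiple of the block size
 *   5. out_of_capacity    - block index lies beyond the slab's capacity
 * All of this is compiled out in release builds and only runs when
 * tiny_refill_failfast_level() >= 2, so normal allocations pay nothing for it.
 */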

// Debug counters for SuperSlab investigation
#if HAKMEM_DEBUG_COUNTERS
int g_superslab_alloc_count = 0;
int g_superslab_fail_count = 0;
int g_superslab_free_count = 0;       // Phase 7.6: Track SuperSlab frees
int g_empty_superslab_count = 0;      // Phase 7.6: Track empty SuperSlabs detected
int g_magazine_push_count = 0;        // Phase 7.6: Track Magazine pushes
int g_tiny_free_with_slab_count = 0;  // Phase 7.6: Track tiny_free_with_slab calls
#endif

// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
#define EMPTY_SUPERSLAB_RESERVE 0  // Keep up to N empty SuperSlabs per class (default)
static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES];  // One empty SuperSlab per class
static int g_empty_counts[TINY_NUM_CLASSES] = {0};       // Count of empty SuperSlabs
static int g_empty_reserve = -1;                         // Env: HAKMEM_TINY_SS_RESERVE (default=1)
static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_ss_partial_enable = 1;  // Enable partial SuperSlab release by default
static uint32_t g_ss_partial_interval = 4;
static _Atomic uint32_t g_ss_partial_epoch = 0;

// Phase 6.24: Unified TLS slab cache (Medium fix)
// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
static __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];

// ----------------------------------------------------------------------------
// Per-class partial SuperSlab slot (single-slot publish/adopt)
// ----------------------------------------------------------------------------
// Small ring of partial SuperSlabs per class (publish/adopt)
#ifndef SS_PARTIAL_RING
#define SS_PARTIAL_RING 64
#endif
static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];
static __thread int g_tls_adopt_cd[TINY_NUM_CLASSES];
static int g_adopt_cool_period = -1;  // env: HAKMEM_TINY_SS_ADOPT_COOLDOWN

// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};
_Atomic int g_ss_remote_seen = 0;           // becomes 1 when any remote free occurs
static int g_ss_adopt_env = -2;             // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
static _Atomic int g_ss_adopt_runtime = 0;  // 0=inactive, 1=active
static _Atomic int g_ss_adopt_log_once = 0;

static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
    if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
        fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
                reason ? reason : "unknown", class_idx);
    }
}

static inline void tiny_adopt_gate_parse_env(void) {
    if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
        const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
        if (!env || *env == '\0') {
            g_ss_adopt_env = 0;   // auto
        } else if (*env == '0') {
            g_ss_adopt_env = -1;  // forced OFF
            atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
        } else {
            g_ss_adopt_env = 1;   // forced ON
            atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
            tiny_adopt_gate_log_activation("env", -1);
        }
    }
}

int tiny_adopt_gate_should_publish(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}

int tiny_adopt_gate_should_adopt(void) {
    tiny_adopt_gate_parse_env();
    if (g_ss_adopt_env == 1) return 1;
    if (g_ss_adopt_env == -1) return 0;
    return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
}

void tiny_adopt_gate_on_remote_seen(int class_idx) {
    tiny_adopt_gate_parse_env();
    atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
    if (g_ss_adopt_env == -1) return;
    int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
    if (prev == 0) {
        tiny_adopt_gate_log_activation("remote", class_idx);
    }
}
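
/*
 * Adopt-gate control summary (derived from the functions above):
 *
 *   HAKMEM_TINY_SS_ADOPT unset/empty -> auto: publish/adopt stays off until the
 *                                       first remote free flips g_ss_adopt_runtime
 *                                       via tiny_adopt_gate_on_remote_seen().
 *   HAKMEM_TINY_SS_ADOPT=0           -> forced OFF; remote frees never enable it.
 *   HAKMEM_TINY_SS_ADOPT=<non-zero>  -> forced ON from the first call.
 *
 * Single-threaded workloads therefore never pay for publish/adopt, while
 * cross-thread frees switch it on exactly once (with a one-shot stderr log).
 */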

// TLS hint: last adopted SuperSlab/slab to avoid rescans
#include "tiny_sticky.h"

// Mailbox box
#include "box/mailbox_box.h"

// Publish pipeline counters (visibility)
unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_same_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0};

// Slab-ring counters (debug)
unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0};
unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES] = {0};

// Slab entry encoding helpers (used by Bench/Slab-ring paths)
static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) {
    return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu);
}
static inline SuperSlab* slab_entry_ss(uintptr_t ent) {
    // SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base
    return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
}
static inline int slab_entry_idx(uintptr_t ent) {
    return (int)(ent & 0x3Fu);
}
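
/*
 * Worked example of the entry encoding (assumes, as the comment above states,
 * that SuperSlab bases are aligned to SUPERSLAB_SIZE_MIN >= 1 MiB, so their low
 * bits are zero and a slab index < 64 fits there; addresses are illustrative):
 *
 *   ss = 0x7f3a00200000, slab_idx = 5
 *   ent = 0x7f3a00200000 | 5 = 0x7f3a00200005
 *   slab_entry_ss(ent)  -> 0x7f3a00200000   (mask off the low 1 MiB bits)
 *   slab_entry_idx(ent) -> 5                (low 6 bits)
 *
 * This lets a (SuperSlab*, slab index) pair travel through a single atomic
 * uintptr_t slot in the mailbox/ring structures below.
 */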

// ----------------------------------------------------------------------------
// Bench Mode Publish Mailbox (small per-class mailbox, BENCH_MAILBOX_WIDTH slots)
// ----------------------------------------------------------------------------
static int g_bench_mode = -1;  // env: HAKMEM_TINY_BENCH_MODE=1
static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES];
#ifndef BENCH_MAILBOX_WIDTH
#define BENCH_MAILBOX_WIDTH 16
#endif
static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH];

static inline int bench_mode_enabled(void) {
    if (__builtin_expect(g_bench_mode == -1, 0)) {
        const char* b = getenv("HAKMEM_TINY_BENCH_MODE");
        g_bench_mode = (b && atoi(b) != 0) ? 1 : 0;
    }
    return g_bench_mode;
}

static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!bench_mode_enabled()) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed);
    idx &= (BENCH_MAILBOX_WIDTH - 1);
    atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release);
}

static inline uintptr_t bench_pub_pop(int class_idx) {
    if (!bench_mode_enabled()) return (uintptr_t)0;
    for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) {
        uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
        if (ent) return ent;
    }
    return 0;
}
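
/*
 * Publish/adopt usage sketch for the bench mailbox (illustrative; the real
 * producers and consumers live in other parts of the tiny allocator):
 *
 *   // Freeing thread notices a slab with reusable blocks:
 *   bench_pub_push(cls, ss, slab_idx);       // overwrite a round-robin slot
 *
 *   // Allocating thread, on a refill miss:
 *   uintptr_t ent = bench_pub_pop(cls);      // claim any non-empty slot
 *   if (ent) {
 *       SuperSlab* ss = slab_entry_ss(ent);
 *       int idx = slab_entry_idx(ent);
 *       // ...try to acquire ss/idx and bind it to TLS...
 *   }
 *
 * Push overwrites whatever was in the chosen slot (stale entries are simply
 * dropped), and pop uses an atomic exchange so each published entry is claimed
 * by at most one thread.
 */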

// ----------------------------------------------------------------------------
// Slab-Granular Partial Publish/Adopt (encoded entries)
// ----------------------------------------------------------------------------
#ifndef SLAB_PARTIAL_RING
#define SLAB_PARTIAL_RING 128
#endif
static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING];
static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES];

// ----------------------------------------------------------------------------
// Refill-stage counters (per class)
// ----------------------------------------------------------------------------
unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES] = {0};

// Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0)
unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0};

// Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE)
unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0};

// Refill item source breakdown (freelist vs carve)
unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0};
unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0};

static int g_rf_trace_en = -1;
static inline int rf_trace_enabled(void) {
    if (__builtin_expect(g_rf_trace_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_RF_TRACE");
        g_rf_trace_en = (e && atoi(e) != 0) ? 1 : 0;
    }
    return g_rf_trace_en;
}

static inline unsigned long long rf_now_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec;
}

// moved to tiny_sticky.c

// moved to tiny_remote.c

// moved to tiny_mailbox.c

// Publish-side counters (debug)
unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES] = {0};

// Free pipeline counters
unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0};

// Front Gate Breakdown (debug counters)
unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0};

// Free-side trigger counters
unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0};
unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0};

// Adopt/Registry gate counters
unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0};
unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0};

unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0};
unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_hits[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_full[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES] = {0};
unsigned long long g_fast_lookup_none = 0;

// ----------------------------------------------------------------------------
// Live Superslab cap (must-adopt-before-mmap support)
// ----------------------------------------------------------------------------
static int g_live_cap_env = -2;  // -2=unparsed, -1=disabled, >=0=cap value
__thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0};
static inline int live_cap_for_class(int class_idx) {
    if (__builtin_expect(g_live_cap_env == -2, 0)) {
        const char* s = getenv("HAKMEM_SS_LIVE_CAP");
        if (!s) g_live_cap_env = -1; else { int v = atoi(s); g_live_cap_env = (v > 0 ? v : -1); }
    }
    (void)class_idx;
    return g_live_cap_env;
}

// ----------------------------------------------------------------------------
// Hot Slot (global simple path)
// ----------------------------------------------------------------------------
static int g_hot_slot_en = -1;  // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot)
static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES];
static inline int hot_slot_enabled(void) {
    if (__builtin_expect(g_hot_slot_en == -1, 0)) {
        const char* s = getenv("HAKMEM_HOT_SLOT");
        g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0;
    }
    return g_hot_slot_en || bench_mode_enabled();
}
static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) {
    if (!hot_slot_enabled()) return;
    uintptr_t ent = slab_entry_make(ss, slab_idx);
    atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release);
}
static inline uintptr_t hot_slot_pop(int class_idx) {
    if (!hot_slot_enabled()) return (uintptr_t)0;
    return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel);
}
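
/*
 * Design note (descriptive): the hot slot is the cheapest form of the
 * publish/adopt machinery: one atomic uintptr_t per class. push simply
 * overwrites the slot (any previously published entry is dropped), and pop
 * exchanges it with 0 so a published slab is adopted by exactly one thread.
 * Compare with bench_pub_push/pop above (a small per-class ring) and
 * slab_partial_publish/adopt below (a larger ring with requeue on overflow);
 * all three carry the same slab_entry_make() encoding.
 */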
|
|
|
|
|
|
|
|
|
|
|
|
// moved to tiny_publish.c
|
|
|
|
|
|
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) {
|
2025-11-05 12:31:14 +09:00
|
|
|
|
if (!ss) return;
|
|
|
|
|
|
uintptr_t ent = slab_entry_make(ss, slab_idx);
|
|
|
|
|
|
for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
|
|
|
|
|
|
uintptr_t expected = 0;
|
|
|
|
|
|
if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent,
|
|
|
|
|
|
memory_order_release, memory_order_relaxed)) {
|
|
|
|
|
|
g_slab_publish_dbg[class_idx]++;
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
// Ring full: round-robin replace and try to requeue the displaced entry into another empty slot
|
|
|
|
|
|
uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
|
|
|
|
|
|
uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel);
|
|
|
|
|
|
if (old) {
|
|
|
|
|
|
for (int t = 0; t < 8; t++) {
|
|
|
|
|
|
uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
|
|
|
|
|
|
uintptr_t expected = 0;
|
|
|
|
|
|
if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old,
|
|
|
|
|
|
memory_order_release, memory_order_relaxed)) {
|
|
|
|
|
|
g_slab_requeue_dbg[class_idx]++;
|
|
|
|
|
|
old = 0; break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
g_slab_publish_dbg[class_idx]++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) {
|
|
|
|
for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
|
|
|
|
|
|
uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
|
|
|
|
|
|
if (ent) return ent;
|
|
|
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
}
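// Publish/adopt sketch for the slab-level ring above (illustrative): the ring
// is best-effort supply, not a queue with guarantees. Publish first tries any
// empty slot, then round-robin replaces one and attempts to requeue the
// displaced entry; adopt simply swaps out the first non-empty slot it finds.
//
//   slab_partial_publish(ci, ss, idx);        // owner: expose a partial slab
//   uintptr_t ent = slab_partial_adopt(ci);   // elsewhere: take one (0 = none)
//
// Dropping a displaced entry after the eight requeue attempts is acceptable:
// the slab stays reachable through its SuperSlab, it is merely no longer
// advertised in the ring.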
|
|
|
|
|
|
|
|
|
|
|
|
void ss_partial_publish(int class_idx, SuperSlab* ss) {
|
|
|
|
|
|
if (!ss) return;
|
|
|
|
|
|
// Gate by listed flag to avoid repeated publishes of the same SS
|
|
|
|
|
|
unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel);
|
|
|
|
|
|
if (prev != 0u) return; // already listed
|
|
|
|
|
|
|
|
|
|
|
|
// CRITICAL: Release ownership of all slabs so adopters can claim them!
|
|
|
|
|
|
// Without this, ss_owner_try_acquire() will fail (requires owner_tid==0).
|
|
|
|
|
|
// The publishing thread must stop using this SS after publishing.
|
|
|
|
|
|
int cap_pub = ss_slabs_capacity(ss);
|
|
|
|
|
|
for (int s = 0; s < cap_pub; s++) {
|
|
|
|
|
|
uint32_t prev_tid = __atomic_exchange_n(&ss->slabs[s].owner_tid, 0u, __ATOMIC_RELEASE);
if (__builtin_expect(g_debug_remote_guard && prev_tid != 0u, 0)) {
uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev_tid;
|
|
|
|
|
|
tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE,
|
|
|
|
|
|
(uint16_t)ss->size_class,
|
|
|
|
|
|
&ss->slabs[s],
|
|
|
|
|
|
aux);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// CRITICAL: Unbind current thread's TLS if it points to this SS!
|
|
|
|
|
|
// Otherwise, the publishing thread will continue allocating from the published SS,
|
|
|
|
|
|
// racing with adopters who acquire ownership.
|
|
|
|
|
|
extern __thread TinyTLSSlab g_tls_slabs[];
|
|
|
|
|
|
if (g_tls_slabs[class_idx].ss == ss) {
|
|
|
|
|
|
g_tls_slabs[class_idx].ss = NULL;
|
|
|
|
|
|
g_tls_slabs[class_idx].meta = NULL;
|
|
|
|
|
|
g_tls_slabs[class_idx].slab_base = NULL;
|
|
|
|
|
|
g_tls_slabs[class_idx].slab_idx = 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1)
|
|
|
|
|
|
int best = -1; uint32_t best_score = 0;
|
|
|
|
|
|
for (int s = 0; s < cap_pub; s++) {
|
|
|
|
|
|
TinySlabMeta* m = &ss->slabs[s];
|
|
|
|
|
|
uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed);
|
|
|
|
|
|
int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0);
|
|
|
|
|
|
unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u;
|
|
|
|
|
|
uint32_t score = rc
|
|
|
|
|
|
+ (m->freelist ? (1u<<30) : 0u)
|
|
|
|
|
|
+ (listed ? (1u<<29) : 0u)
|
|
|
|
|
|
+ (has_remote ? 1u : 0u);
|
|
|
|
|
|
if (score > best_score) { best_score = score; best = s; }
|
|
|
|
|
|
}
|
|
|
|
|
if (best >= 0 && best < 256) {
|
|
|
|
|
|
ss->publish_hint = (uint8_t)best;
|
|
|
|
|
|
// Box: Ready push — provide slab-level candidate to adopters
|
|
|
|
|
|
tiny_ready_push(class_idx, ss, best);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
ss->publish_hint = 0xFF;
|
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < SS_PARTIAL_RING; i++) {
|
|
|
|
|
|
SuperSlab* expected = NULL;
|
|
|
|
|
|
if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss,
|
|
|
|
|
|
memory_order_release, memory_order_relaxed)) {
|
|
|
|
|
|
g_ss_publish_dbg[class_idx]++;
|
|
|
|
|
|
return; // published
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
// Ring full: replace one entry in round-robin to avoid dropping supply
|
|
|
|
|
|
uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed);
|
|
|
|
|
|
idx %= SS_PARTIAL_RING;
|
|
|
|
|
|
SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel);
|
|
|
|
|
|
if (old) {
|
|
|
|
|
|
// NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads
|
|
|
|
|
|
// that just adopted from it. Draining without ownership checks causes freelist corruption.
|
|
|
|
|
|
// The adopter will drain when needed (with proper ownership checks in tiny_refill.h).
|
|
|
|
|
|
//
|
|
|
|
|
|
// Previous code (UNSAFE):
|
|
|
|
|
|
// for (int s = 0; s < cap; s++) {
|
|
|
|
|
|
// ss_remote_drain_to_freelist(old, s); // ← Race with concurrent adopter!
|
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
|
|
// Keep listed=1 while in overflow so it stays eligible for adopt
|
|
|
|
|
|
// Push old into overflow stack (holding box for later adoption)
|
|
|
|
|
|
SuperSlab* head;
|
|
|
|
|
|
do {
|
|
|
|
|
|
head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
|
|
|
|
|
|
old->partial_next = head;
|
|
|
|
|
|
} while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old,
|
|
|
|
|
|
memory_order_release, memory_order_relaxed));
|
|
|
|
|
|
}
|
|
|
|
|
|
g_ss_publish_dbg[class_idx]++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
SuperSlab* ss_partial_adopt(int class_idx) {
|
|
|
|
|
|
for (int i = 0; i < SS_PARTIAL_RING; i++) {
|
|
|
|
|
|
SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel);
|
|
|
|
|
|
if (ss) {
|
|
|
|
|
|
// Clear listed flag on adopt to allow future publish of this SS
|
|
|
|
|
|
atomic_store_explicit(&ss->listed, 0u, memory_order_release);
|
|
|
|
|
|
g_ss_adopt_dbg[class_idx]++;
|
|
|
|
|
|
return ss;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
// Fallback: adopt from overflow stack (LIFO)
|
|
|
|
|
|
while (1) {
|
|
|
|
|
|
SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
|
|
|
|
|
|
if (!head) break;
|
|
|
|
|
|
SuperSlab* next = head->partial_next;
|
|
|
|
|
|
if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next,
|
|
|
|
|
|
memory_order_acq_rel, memory_order_relaxed)) {
|
|
|
|
|
|
atomic_store_explicit(&head->listed, 0u, memory_order_release);
|
|
|
|
|
|
g_ss_adopt_dbg[class_idx]++;
|
|
|
|
|
|
return head;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
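// Cross-thread hand-off sketch for ss_partial_publish/ss_partial_adopt
// (illustrative): the publisher releases ownership of every slab, unbinds its
// own TLS cache, then makes the SuperSlab visible; an adopter later claims it
// and re-acquires ownership slab by slab (publish_hint is a good first pick).
//
//   // Thread A, done with this SuperSlab for class ci:
//   ss_partial_publish(ci, ss);
//
//   // Thread B, hungry for class ci:
//   SuperSlab* s = ss_partial_adopt(ci);
//   // if (s) acquire a slab via the ss_owner_try_acquire() path (tiny_refill.h)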
|
|
|
|
|
|
|
|
|
|
|
|
static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) {
|
// Canonical binding:
// - ss->size_class defines block size for this SuperSlab
// - slab_idx is the owning slab index within ss
// - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx)
|
|
|
|
tls->ss = ss;
|
|
|
|
|
|
tls->slab_idx = (uint8_t)slab_idx;
|
|
|
|
|
|
tls->meta = &ss->slabs[slab_idx];
|
|
|
|
|
|
tls->slab_base = tiny_slab_base_for(ss, slab_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static inline uint32_t tiny_tls_default_refill(uint32_t cap) {
|
|
|
|
|
|
if (cap == 0u) return 8u;
|
|
|
|
|
|
uint32_t low = (cap >= 32u) ? (cap / 4u) : 8u;
|
|
|
|
|
|
if (low < 4u) low = 4u;
|
|
|
|
|
|
return low;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static inline uint32_t tiny_tls_default_spill(uint32_t cap) {
|
|
|
|
|
|
if (cap == 0u) return 0u;
|
|
|
|
|
|
uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u);
|
|
|
|
|
|
if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP;
|
|
|
|
|
|
if (spill < cap) spill = cap;
|
|
|
|
|
|
return (uint32_t)spill;
|
|
|
|
|
|
}
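// Worked example for the two defaults above, assuming TINY_TLS_MAG_CAP does not
// clamp: cap = 64 gives refill_low = 64/4 = 16 and spill_high = 64 + 32 = 96;
// cap = 16 gives refill_low = 8 (floor) and spill_high = 24. The refill
// threshold sits near 25% of capacity and the spill threshold near 150%, which
// leaves hysteresis between refilling and spilling.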
|
|
|
|
|
|
|
|
|
|
|
|
static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) {
|
|
|
|
|
|
atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release);
|
|
|
|
|
|
atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed);
|
|
|
|
|
|
atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed);
|
|
|
|
|
|
atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) {
|
|
|
|
|
|
atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release);
|
|
|
|
|
|
atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) {
|
|
|
|
|
|
uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire);
|
|
|
|
|
|
if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) {
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire);
|
|
|
|
|
|
if (target_cap != 0u && tls->cap != target_cap) {
|
|
|
|
|
|
tls->cap = target_cap;
|
|
|
|
|
|
uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed);
|
|
|
|
|
|
if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap);
|
|
|
|
|
|
tls->refill_low = target_refill;
|
|
|
|
|
|
uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed);
|
|
|
|
|
|
if (target_spill < target_cap) target_spill = target_cap;
|
|
|
|
|
|
tls->spill_high = target_spill;
|
|
|
|
|
|
}
|
|
|
|
|
|
uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire);
|
|
|
|
|
|
if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) {
|
|
|
|
|
|
g_tls_trim_seen[class_idx] = trim_epoch;
|
|
|
|
|
|
if (tls->count > tls->cap) {
|
|
|
|
|
|
tls_list_spill_excess(class_idx, tls);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
g_tls_param_seen[class_idx] = seq;
|
|
|
|
|
|
}
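// Parameter hand-off sketch for the publish/refresh pair above (illustrative;
// assumes some controller such as a background tuner exists): the controller
// publishes targets and bumps the per-class sequence, and each worker lazily
// re-reads them the next time it notices the sequence changed.
//
//   // controller side:
//   tiny_tls_publish_targets(ci, /*cap=*/128);
//   tiny_tls_request_trim(ci, epoch);          // epoch: caller-chosen, nonzero
//
//   // worker side, on its refill/spill path:
//   tiny_tls_refresh_params(ci, &g_tls_lists[ci]);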
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_fastcache.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Functions: tiny_fast_pop(), tiny_fast_push() - 28 lines (lines 377-404)
|
|
|
|
|
|
// Forward declarations for functions defined in hakmem_tiny_fastcache.inc.h
|
|
|
|
|
|
static inline void* tiny_fast_pop(int class_idx);
|
|
|
|
|
|
static inline int tiny_fast_push(int class_idx, void* ptr);
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_hot_pop.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Functions: tiny_hot_pop_class0(), tiny_hot_pop_class1(), tiny_hot_pop_class2(), tiny_hot_pop_class3()
|
|
|
|
|
|
// 88 lines (lines 407-494)
|
|
|
|
|
|
|
static __attribute__((cold, noinline, unused)) void* tiny_slow_alloc_fast(int class_idx) {
|
|
|
|
int tls_enabled = g_tls_list_enable;
|
|
|
|
|
|
TinyTLSList* tls = &g_tls_lists[class_idx];
|
|
|
|
|
|
pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
|
|
|
|
|
|
pthread_mutex_lock(lock);
|
|
|
|
|
|
|
|
|
|
|
|
TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
|
|
|
|
|
|
if (slab) {
|
|
|
|
|
|
g_tiny_pool.free_slabs[class_idx] = slab->next;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
slab = allocate_new_slab(class_idx);
|
|
|
|
|
|
if (!slab) {
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
slab->next = NULL;
|
|
|
|
|
|
|
|
|
|
|
|
if (atomic_load_explicit(&slab->remote_head, memory_order_acquire)) {
|
|
|
|
|
|
tiny_remote_drain_locked(slab);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int block_idx = hak_tiny_find_free_block(slab);
|
|
|
|
|
|
if (block_idx < 0) {
|
|
|
|
|
|
slab->next = g_tiny_pool.free_slabs[class_idx];
|
|
|
|
|
|
g_tiny_pool.free_slabs[class_idx] = slab;
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
hak_tiny_set_used(slab, block_idx);
|
|
|
|
|
|
slab->free_count--;
|
|
|
|
|
|
size_t block_size = g_tiny_class_sizes[class_idx];
|
|
|
|
|
|
uint8_t* base = (uint8_t*)slab->base;
|
|
|
|
|
|
void* ret = (void*)(base + ((size_t)block_idx * block_size));
|
|
|
|
|
|
g_tiny_pool.alloc_count[class_idx]++;
|
|
|
|
|
|
|
|
|
|
|
|
uint16_t cap = g_fast_cap_defaults[class_idx];
|
|
|
|
|
|
uint16_t count = g_fast_count[class_idx];
|
|
|
|
|
|
uint16_t fast_need = (cap > count) ? (uint16_t)(cap - count) : 0;
|
|
|
|
|
|
if (fast_need > slab->free_count) fast_need = (uint16_t)slab->free_count;
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t tls_need = 0;
|
|
|
|
|
|
if (tls_enabled && tls_list_needs_refill(tls)) {
|
|
|
|
|
|
uint32_t target = tls_list_refill_threshold(tls);
|
|
|
|
|
|
if (tls->count < target) {
|
|
|
|
|
|
tls_need = target - tls->count;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
uint32_t remaining = slab->free_count;
|
|
|
|
|
|
if (fast_need > remaining) fast_need = (uint16_t)remaining;
|
|
|
|
|
|
remaining -= fast_need;
|
|
|
|
|
|
if (tls_need > remaining) tls_need = remaining;
|
|
|
|
|
|
|
|
|
|
|
|
while (fast_need > 0) {
|
|
|
|
|
|
int extra_idx = hak_tiny_find_free_block(slab);
|
|
|
|
|
|
if (extra_idx < 0) break;
|
|
|
|
|
|
hak_tiny_set_used(slab, extra_idx);
|
|
|
|
|
|
slab->free_count--;
|
|
|
|
|
|
void* extra = (void*)(base + ((size_t)extra_idx * block_size));
|
|
|
|
|
|
if (!tiny_fast_push(class_idx, extra)) {
|
|
|
|
|
|
if (tls_enabled) {
|
|
|
|
|
|
tiny_tls_list_guard_push(class_idx, tls, extra);
|
|
|
|
|
|
tls_list_push(tls, extra);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
fast_need--;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
while (tls_enabled && tls_need > 0) {
|
|
|
|
|
|
int extra_idx = hak_tiny_find_free_block(slab);
|
|
|
|
|
|
if (extra_idx < 0) break;
|
|
|
|
|
|
hak_tiny_set_used(slab, extra_idx);
|
|
|
|
|
|
slab->free_count--;
|
|
|
|
|
|
void* extra = (void*)(base + ((size_t)extra_idx * block_size));
|
|
|
|
|
|
tiny_tls_list_guard_push(class_idx, tls, extra);
|
|
|
|
|
|
tls_list_push(tls, extra);
|
|
|
|
|
|
tls_need--;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (slab->free_count == 0) {
|
|
|
|
|
|
move_to_full_list(class_idx, slab);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
slab->next = g_tiny_pool.free_slabs[class_idx];
|
|
|
|
|
|
g_tiny_pool.free_slabs[class_idx] = slab;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return ret;
|
|
|
|
|
|
}
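// Note on the slow path above: after handing back one block it opportunistically
// carves extra blocks from the same slab, first topping up the per-class fast
// cache (toward g_fast_cap_defaults) and then the TLS list (toward its refill
// threshold), never exceeding slab->free_count. Rough example with assumed
// numbers: fast cap 16 with 4 cached and a TLS target of 32 with 20 present
// gives fast_need = 12 and tls_need = 12, so a slab with 30 free blocks donates
// 24 and keeps 6 on its own freelist.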
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: tiny_fast_refill_and_take() - 39 lines (lines 584-622)
|
|
|
|
|
|
// Hot-path cheap sampling counter to avoid rand() in allocation path
|
|
|
|
|
|
// Phase 9.4: TLS single-linked freelist (mimalloc-inspired) for hottest classes (≤128B/≤256B)
|
int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable
// Phase 6-1.7: Export TLS variables for box refactor (Box 5/6 need access from hakmem.c)
// CRITICAL FIX: Explicit initializers prevent SEGV from uninitialized TLS in worker threads
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
__thread void* g_tls_sll_head[TINY_NUM_CLASSES] = {0};
__thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0};
#else
static __thread void* g_tls_sll_head[TINY_NUM_CLASSES] = {0};
static __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0};
#endif
|
|
|
|
|
|
static int g_tiny_ultra = 0; // HAKMEM_TINY_ULTRA=1 for SLL-only ultra mode
|
|
|
|
|
|
static int g_ultra_validate = 0; // HAKMEM_TINY_ULTRA_VALIDATE=1 to enable per-pop validation
|
|
|
|
|
|
// Ultra debug counters
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
static __attribute__((unused)) uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0};
static uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0};
#endif
|
|
|
|
|
|
|
|
|
|
|
|
// Path counters (normal mode visibility): lightweight, for debugging/bench only
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
static __attribute__((unused)) uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0};
// New: slow/bitmap/bump/bin instrumentation
static __attribute__((unused)) uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0};
#endif
|
|
|
|
|
|
static int g_path_debug_enabled = 0;
|
|
|
|
|
|
|
|
|
|
|
|
// Spill hysteresis (getenv is kept out of the free hot path)
|
|
|
|
|
|
static int g_spill_hyst = 32; // default margin (configured at init; never getenv on hot path)
|
|
|
|
|
|
|
|
|
|
|
|
// Optional per-class refill batch overrides (0=use global defaults)
|
|
|
|
|
|
static int g_refill_max_c[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static int g_refill_max_hot_c[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static inline __attribute__((always_inline)) int tiny_refill_max_for_class(int class_idx) {
|
|
|
|
|
|
int v = g_refill_max_c[class_idx];
|
|
|
|
|
|
if (v > 0) return v;
|
|
|
|
|
|
if (class_idx <= 3) {
|
|
|
|
|
|
int hv = g_refill_max_hot_c[class_idx];
|
|
|
|
|
|
if (hv > 0) return hv;
|
|
|
|
|
|
return g_tiny_refill_max_hot;
|
|
|
|
|
|
}
|
|
|
|
|
|
return g_tiny_refill_max;
|
|
|
|
|
|
}
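// Precedence of the batch-size knobs read above: a positive per-class value in
// g_refill_max_c[] wins outright; otherwise hot classes (0..3) fall back to
// g_refill_max_hot_c[] and then to g_tiny_refill_max_hot, while classes >= 4
// use g_tiny_refill_max. Example: g_refill_max_c[2] = 48 makes class 2 refill
// in batches of 48 regardless of the hot/global settings.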
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 9.5: Frontend/Backend split - Tiny FastCache (array stack)
|
|
|
|
|
|
// Enabled via HAKMEM_TINY_FASTCACHE=1 (default: 0)
|
|
|
|
|
|
// Compile-out: define HAKMEM_TINY_NO_FRONT_CACHE=1 to exclude this path
|
|
|
|
|
|
#define TINY_FASTCACHE_CAP 128
|
|
|
|
|
|
typedef struct __attribute__((aligned(64))) {
|
|
|
|
|
|
void* items[TINY_FASTCACHE_CAP];
|
|
|
|
|
|
int top;
|
|
|
|
|
|
int _pad[15];
|
|
|
|
|
|
} TinyFastCache;
|
|
|
|
|
|
static int g_fastcache_enable = 0; // HAKMEM_TINY_FASTCACHE=1
|
|
|
|
|
|
static __thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
|
|
|
|
|
|
static int g_frontend_enable = 0; // HAKMEM_TINY_FRONTEND=1 (experimental ultra-fast frontend)
|
|
|
|
|
|
// SLL capacity multiplier for hot tiny classes (env: HAKMEM_SLL_MULTIPLIER)
|
|
|
|
|
|
static int g_sll_multiplier = 2;
|
|
|
|
|
|
// Cached thread id (uint32) to avoid repeated pthread_self() in hot paths
|
|
|
|
|
|
static __thread uint32_t g_tls_tid32;
|
|
|
|
|
|
static __thread int g_tls_tid32_inited;
|
|
|
|
|
|
// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
|
|
|
|
|
|
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
|
|
|
|
|
|
inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
|
|
|
|
|
|
#else
|
|
|
|
|
|
static inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
|
|
|
|
|
|
#endif
|
|
|
|
|
|
if (__builtin_expect(!g_tls_tid32_inited, 0)) {
|
|
|
|
|
|
g_tls_tid32 = (uint32_t)(uintptr_t)pthread_self();
|
|
|
|
|
|
g_tls_tid32_inited = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
return g_tls_tid32;
|
|
|
|
|
|
}
|
|
|
|
|
|
// Cached pthread_t as-is for APIs that require pthread_t comparison
|
|
|
|
|
|
static __thread pthread_t g_tls_pt_self;
|
|
|
|
|
|
static __thread int g_tls_pt_inited;
|
|
|
|
|
|
// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
|
|
|
|
|
|
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
|
|
|
|
|
|
inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
|
|
|
|
|
|
#else
|
|
|
|
|
|
static inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
|
|
|
|
|
|
#endif
|
|
|
|
|
|
if (__builtin_expect(!g_tls_pt_inited, 0)) {
|
|
|
|
|
|
g_tls_pt_self = pthread_self();
|
|
|
|
|
|
g_tls_pt_inited = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
return g_tls_pt_self;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#include "tiny_refill.h"
|
// tiny_mmap_gate.h already included at top
|
|
|
|
#include "tiny_publish.h"
|
|
|
|
|
|
|
|
|
|
|
|
static int g_sll_cap_override[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_SLL_CAP_C{0..7}
|
|
|
|
|
|
// Optional prefetch on SLL pop (guarded by env: HAKMEM_TINY_PREFETCH=1)
|
|
|
|
|
|
static int g_tiny_prefetch = 0;
|
|
|
|
|
|
|
|
|
|
|
|
// Small-class magazine pre-initialization (to avoid cap==0 checks on hot path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Hot-class small TLS magazine (storage and enable switch)
|
|
|
|
|
|
typedef struct {
|
|
|
|
|
|
void* slots[128];
|
|
|
|
|
|
uint16_t top; // 0..128
|
|
|
|
|
|
uint16_t cap; // =128
|
|
|
|
|
|
} TinyHotMag;
|
|
|
|
|
|
static int g_hotmag_cap_default = 128; // default capacity (env override)
|
|
|
|
|
|
static int g_hotmag_refill_default = 32; // default refill batch (env override)
|
|
|
|
|
|
static int g_hotmag_enable = 0; // default OFF (for A/B testing); can be enabled via env
|
|
|
|
|
|
static uint16_t g_hotmag_cap_current[TINY_NUM_CLASSES];
|
|
|
|
|
|
static uint8_t g_hotmag_cap_locked[TINY_NUM_CLASSES];
|
|
|
|
|
|
static uint16_t g_hotmag_refill_current[TINY_NUM_CLASSES];
|
|
|
|
|
|
static uint8_t g_hotmag_refill_locked[TINY_NUM_CLASSES];
|
|
|
|
|
|
static uint8_t g_hotmag_class_en[TINY_NUM_CLASSES]; // 0=disabled for class, 1=enabled
|
|
|
|
|
|
static __thread TinyHotMag g_tls_hot_mag[TINY_NUM_CLASSES];
|
|
|
|
|
|
// Inline helpers
|
|
|
|
|
|
|
|
|
|
|
|
#include "hakmem_tiny_hotmag.inc.h"
|
|
|
|
|
|
|
|
|
|
|
|
// Size-specialized tiny alloc (32B/64B) via function pointers (A/B用)
|
|
|
|
|
|
// TinyQuickSlot: 1 cache line per class (quick 6 items + small metadata)
|
|
|
|
|
|
// Opt-in via HAKMEM_TINY_QUICK=1
|
|
|
|
|
|
// NOTE: This type definition must come BEFORE the Phase 2D-1 includes below
|
|
|
|
|
|
typedef struct __attribute__((aligned(64))) {
|
|
|
|
|
|
void* items[6]; // 48B
|
|
|
|
|
|
uint8_t top; // 1B (0..6)
|
|
|
|
|
|
uint8_t _pad1; // 1B
|
|
|
|
|
|
uint16_t _pad2; // 2B
|
|
|
|
|
|
uint32_t _pad3; // 4B (padding to 64B)
|
|
|
|
|
|
} TinyQuickSlot;
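// Layout check for TinyQuickSlot: 6 pointers (48B on LP64) + top (1B) + the
// explicit padding (1B + 2B + 4B) declare 56B of fields, and aligned(64) rounds
// each per-class instance up to exactly one cache line, so a quick pop touches
// a single line.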
|
|
|
|
|
|
static int g_quick_enable = 0; // HAKMEM_TINY_QUICK=1
|
|
|
|
|
|
static __thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES]; // compile-out via guards below
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 2D-1: Hot-path inline function extractions
|
|
|
|
|
|
// NOTE: These includes require TinyFastCache, TinyQuickSlot, and TinyTLSSlab to be fully defined
|
|
|
|
|
|
#include "hakmem_tiny_hot_pop.inc.h" // 4 functions: tiny_hot_pop_class{0..3}
|
|
|
|
|
|
#include "hakmem_tiny_fastcache.inc.h" // 5 functions: tiny_fast_pop/push, fastcache_pop/push, quick_pop
|
|
|
|
|
|
#include "hakmem_tiny_refill.inc.h" // 8 functions: refill operations
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra-Simple front (small per-class stack) — combines tiny front to minimize
|
|
|
|
|
|
// instructions and memory touches on alloc/free. Uses existing TLS bump shadow
|
|
|
|
|
|
// (g_tls_bcur/bend) when enabled to avoid per-alloc header writes.
|
|
|
|
|
|
// UltraFront capacity for 32/64B fast pop
|
|
|
|
|
|
#ifndef ULTRA_FRONT_CAP
|
|
|
|
|
|
#define ULTRA_FRONT_CAP 64
|
|
|
|
|
|
#endif
|
|
|
|
|
|
typedef struct __attribute__((aligned(64))) {
|
|
|
|
|
|
void* slots[ULTRA_FRONT_CAP];
|
|
|
|
|
|
uint16_t top; // 0..ULTRA_FRONT_CAP
|
|
|
|
|
|
uint16_t _pad;
|
|
|
|
|
|
} TinyUltraFront;
|
|
|
|
|
|
static int g_ultra_simple = 0; // HAKMEM_TINY_ULTRA_SIMPLE=1
|
|
|
|
|
|
static __thread TinyUltraFront g_tls_ultra[TINY_NUM_CLASSES];
|
|
|
|
|
|
// Inline helpers
|
|
|
|
|
|
#include "hakmem_tiny_ultra_front.inc.h"
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra-Bump TLS shadow (bench/opt-in): keep a TLS-only bump window
|
|
|
|
|
|
// to avoid per-alloc header writes. Header is updated per-chunk reservation.
|
|
|
|
|
|
// NOTE: Non-static because used in hakmem_tiny_refill.inc.h
|
|
|
|
|
|
int g_bump_chunk = 32; // HAKMEM_TINY_BUMP_CHUNK (blocks)
|
__thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES] = {0};
__thread uint8_t* g_tls_bend[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
|
|
|
|
// SLL small refill batch for specialized class (32/64B)
|
|
|
|
|
|
// Specialized order toggle: 1 = mag-first, 0 = sll-first
|
|
|
|
|
|
// HotMag helpers (for classes 0..3)
|
|
|
|
|
|
static inline int is_hot_class(int class_idx) { return class_idx >= 0 && class_idx <= 3; }
|
|
|
|
|
|
|
|
|
|
|
|
// Optional front (Ultra/HotMag) push helper: compile-out in release builds
|
|
|
|
|
|
static inline int tiny_optional_push(int class_idx, void* ptr) {
|
|
|
|
|
|
#if HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
(void)class_idx;
|
|
|
|
|
|
(void)ptr;
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
#else
|
|
|
|
|
|
if (__builtin_expect(g_ultra_simple && is_hot_class(class_idx), 0)) {
|
|
|
|
|
|
if (__builtin_expect(ultra_push(class_idx, ptr), 0)) {
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (__builtin_expect(is_hot_class(class_idx), 0)) {
|
|
|
|
|
|
if (__builtin_expect(hotmag_push(class_idx, ptr), 0)) {
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
#endif
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra-Simple helpers
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 9.6: Deferred Intelligence (event queue + background)
|
|
|
|
|
|
// Extended event for FLINT Intelligence (lightweight; recorded off hot path only)
|
|
|
|
|
|
// Observability, ACE, and intelligence helpers
|
|
|
|
|
|
#include "hakmem_tiny_intel.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_rss.c (Phase 2B-2)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED: static int get_rss_kb_self(void) {
|
|
|
|
|
|
// EXTRACTED: FILE* f = fopen("/proc/self/status", "r");
|
|
|
|
|
|
// EXTRACTED: if (!f) return 0;
|
|
|
|
|
|
// EXTRACTED: char buf[256];
|
|
|
|
|
|
// EXTRACTED: int kb = 0;
|
|
|
|
|
|
// EXTRACTED: while (fgets(buf, sizeof(buf), f)) {
|
|
|
|
|
|
// EXTRACTED: if (strncmp(buf, "VmRSS:", 6) == 0) {
|
|
|
|
|
|
// EXTRACTED: char* p = buf;
|
|
|
|
|
|
// EXTRACTED: while (*p && (*p < '0' || *p > '9')) {
|
|
|
|
|
|
// EXTRACTED: p++;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: kb = atoi(p);
|
|
|
|
|
|
// EXTRACTED: break;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: fclose(f);
|
|
|
|
|
|
// EXTRACTED: return kb;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
|
|
|
|
|
|
// On a miss, optionally take just one block and return it immediately instead of
// bulk-refilling the magazine.
// Env: HAKMEM_TINY_REFILL_ONE_ON_MISS=1 to enable (default: 0)
|
|
|
|
|
|
int g_refill_one_on_miss = 0;
|
|
|
|
|
|
|
|
|
|
|
|
// Frontend fill target per class (adaptive)
|
|
|
|
|
|
// NOTE: Non-static because used in hakmem_tiny_refill.inc.h
|
|
|
|
|
|
_Atomic uint32_t g_frontend_fill_target[TINY_NUM_CLASSES];
|
|
|
|
|
|
|
|
|
|
|
|
// Forward declarations for helpers referenced by frontend_refill_fc
|
|
|
|
|
|
static inline int ultra_batch_for_class(int class_idx);
|
|
|
|
|
|
enum { HAK_TIER_SLL=1, HAK_TIER_MAG=2, HAK_TIER_SLAB=3, HAK_TIER_SUPER=4, HAK_TIER_FRONT=5 };
|
|
|
|
|
|
|
|
|
|
|
|
static inline uint16_t hak_thread_id16(void) {
|
|
|
|
|
|
// best-effort compress cached thread id to 16 bits
|
|
|
|
|
|
uint32_t tid = tiny_self_u32();
|
|
|
|
|
|
return (uint16_t)(tid ^ (tid >> 16));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static inline void eventq_push_ex(int class_idx, uint32_t size, uint8_t tier, uint8_t flags,
|
|
|
|
|
|
uint32_t site_id, uint16_t lat_bucket) {
|
|
|
|
|
|
(void)flags;
|
|
|
|
|
|
|
|
|
|
|
|
(void)lat_bucket;
|
|
|
|
|
|
(void)site_id;
|
|
|
|
|
|
|
|
|
|
|
|
if (!g_int_engine) return;
|
|
|
|
|
|
// Lightweight sampling: if mask set, log 1 out of 2^N
|
|
|
|
|
|
unsigned m = g_int_sample_mask;
|
|
|
|
|
|
if (m != 0) {
|
|
|
|
|
|
unsigned x = g_tls_ev_seq++;
|
|
|
|
|
|
if ((x & m) != 0) return;
|
|
|
|
|
|
}
|
|
|
|
|
|
uint32_t t = atomic_fetch_add_explicit(&g_ev_tail, 1u, memory_order_relaxed);
|
|
|
|
|
|
AllocEvent ev;
|
|
|
|
|
|
ev.ts_ns = g_int_event_ts ? hak_now_ns() : 0;
|
|
|
|
|
|
ev.size = size;
|
|
|
|
|
|
ev.site_id = 0; // keep minimal
|
|
|
|
|
|
ev.latency_bucket = 0;
|
|
|
|
|
|
ev.tier_hit = tier;
|
|
|
|
|
|
ev.flags = 0;
|
|
|
|
|
|
ev.class_idx = (uint16_t)class_idx;
|
|
|
|
|
|
ev.thread_id = 0;
|
|
|
|
|
|
g_ev_ring[t & EVENTQ_MASK] = ev; // best-effort overwrite on overflow
|
|
|
|
|
|
}
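// Sampling math for the guard above: g_int_sample_mask is expected to be of the
// form 2^N - 1, so only events with (seq & mask) == 0 are recorded, i.e. 1 in
// 2^N. Example: mask = 15 keeps 1 in 16 events; mask = 0 keeps everything. The
// ring write is unsynchronized beyond the tail fetch_add, so on overflow the
// oldest events are simply overwritten.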
|
|
|
|
|
|
|
|
|
|
|
|
// Background refill workers and intelligence engine
|
|
|
|
|
|
#include "hakmem_tiny_background.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_fastcache.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Functions: fastcache_pop(), fastcache_push(), quick_pop() - 25 lines (lines 873-896)
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra-fast try-only variant: attempt a direct SuperSlab bump/freelist pop
|
|
|
|
|
|
// without any refill or slow-path work. Returns NULL on miss.
|
|
|
|
|
|
static inline void* hak_tiny_alloc_superslab_try_fast(int class_idx) {
|
|
|
|
|
|
if (!g_use_superslab) return NULL;
|
|
|
|
|
|
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
|
|
|
|
|
|
TinySlabMeta* meta = tls->meta;
|
|
|
|
|
|
if (!meta) return NULL;
|
|
|
|
|
|
// Try linear (bump) allocation first when freelist is empty
|
|
|
|
|
|
if (meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) {
|
|
|
|
|
|
size_t block_size = g_tiny_class_sizes[tls->ss->size_class];
|
|
|
|
|
|
void* block = tls->slab_base + ((size_t)meta->used * block_size);
|
|
|
|
|
|
meta->used++;
|
|
|
|
|
|
// Track active blocks in SuperSlab for conservative reclamation
|
|
|
|
|
|
ss_active_inc(tls->ss);
|
|
|
|
|
|
return block;
|
|
|
|
|
|
}
|
|
|
|
|
|
// Do not pop freelist here (keep magazine/SLL handling consistent)
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
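// Bump-path arithmetic for the try-fast helper above (illustrative numbers):
// with block_size = 64 and meta->used = 10, the returned block is
// slab_base + 10 * 64 and used becomes 11. The freelist is deliberately left
// alone here so freed blocks keep flowing through the magazine/SLL layers.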
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Functions: quick_refill_from_sll(), quick_refill_from_mag() - 31 lines (lines 918-949)
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: sll_refill_small_from_ss() - 45 lines (lines 952-996)
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 2C-3: TLS operations module (included after helper function definitions)
|
|
|
|
|
|
#include "hakmem_tiny_tls_ops.h"
|
|
|
|
|
|
|
|
|
|
|
|
// New TLS list refill: owner-only bulk take from TLS-cached SuperSlab slab
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_tls_ops.h (Phase 2C-3)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: tls_refill_from_tls_slab() - 101 lines
|
|
|
|
|
|
// Hot path refill operation, moved to inline function in header
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_tls_ops.h (Phase 2C-3)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: tls_list_spill_excess() - 97 lines
|
|
|
|
|
|
// Hot path spill operation, moved to inline function in header
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: superslab_tls_bump_fast() - 45 lines (lines 1016-1060)
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: frontend_refill_fc() - 44 lines (lines 1063-1106)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// SLL capacity policy: for hot tiny classes (0..3), allow larger SLL up to multiplier * mag_cap
|
|
|
|
|
|
// for >=4 keep current conservative half (to limit footprint).
|
|
|
|
|
|
static inline uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap) {
|
|
|
|
|
|
// Absolute override
|
|
|
|
|
|
if (g_sll_cap_override[class_idx] > 0) {
|
|
|
|
|
|
uint32_t cap = (uint32_t)g_sll_cap_override[class_idx];
|
|
|
|
|
|
if (cap > TINY_TLS_MAG_CAP) cap = TINY_TLS_MAG_CAP;
|
|
|
|
|
|
return cap;
|
|
|
|
|
|
}
|
|
|
|
|
|
uint32_t cap = mag_cap;
|
|
|
|
|
|
if (class_idx <= 3) {
|
|
|
|
|
|
uint32_t mult = (g_sll_multiplier > 0 ? (uint32_t)g_sll_multiplier : 1u);
|
|
|
|
|
|
uint64_t want = (uint64_t)cap * (uint64_t)mult;
|
|
|
|
|
|
if (want > (uint64_t)TINY_TLS_MAG_CAP) cap = TINY_TLS_MAG_CAP; else cap = (uint32_t)want;
|
|
|
|
|
|
} else if (class_idx >= 4) {
|
|
|
|
|
|
cap = (mag_cap > 1u ? (mag_cap / 2u) : 1u);
|
|
|
|
|
|
}
|
|
|
|
|
|
return cap;
|
|
|
|
|
|
}
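// Worked example for the SLL capacity policy above, assuming g_sll_multiplier = 2
// and no override: a hot class (0..3) with mag_cap = 128 gets 256 (clamped to
// TINY_TLS_MAG_CAP if that is smaller), while a class >= 4 with the same
// mag_cap gets 64. HAKMEM_TINY_SLL_CAP_C3=512 would override class 3 outright,
// again clamped to TINY_TLS_MAG_CAP.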
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: bulk_mag_to_sll_if_room() - 22 lines (lines 1133-1154)
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra helpers forward declarations (defined later)
|
|
|
|
|
|
static inline int ultra_sll_cap_for_class(int class_idx);
|
|
|
|
|
|
static inline int ultra_validate_sll_head(int class_idx, void* head);
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra-mode (SLL-only) helpers
|
|
|
|
|
|
// Ultra batch overrides via env: HAKMEM_TINY_ULTRA_BATCH_C{0..7}
|
|
|
|
|
|
static int g_ultra_batch_override[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static int g_ultra_sll_cap_override[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
|
|
|
|
|
|
static inline int ultra_batch_for_class(int class_idx) {
|
|
|
|
|
|
int ov = g_ultra_batch_override[class_idx];
|
|
|
|
|
|
if (ov > 0) return ov;
|
|
|
|
|
|
switch (class_idx) {
|
|
|
|
|
|
case 0: return 64; // 8B
|
|
|
|
|
|
case 1: return 96; // 16B (best in A/B testing)
case 2: return 96; // 32B (best in A/B testing)
case 3: return 224; // 64B (best in A/B testing)
|
|
|
|
|
|
case 4: return 64; // 128B
|
|
|
|
|
|
default: return 32; // others
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: ultra_refill_sll() - 56 lines (lines 1178-1233)
|
|
|
|
|
|
|
|
|
|
|
|
#include "hakmem_tiny_remote.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Internal Helpers
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// Step 2: Slab Registry Operations
|
|
|
|
|
|
|
|
|
|
|
|
// Hash function for slab_base (64KB aligned)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_registry.c (Phase 2B-3)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED: static inline int registry_hash(uintptr_t slab_base) {
|
|
|
|
|
|
// EXTRACTED: return (slab_base >> 16) & SLAB_REGISTRY_MASK;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
|
|
|
|
|
|
// Register slab in hash table (returns 1 on success, 0 on failure)
|
|
|
|
|
|
// EXTRACTED: static int registry_register(uintptr_t slab_base, TinySlab* owner) {
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_lock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: int hash = registry_hash(slab_base);
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: // Linear probing (max 8 attempts)
|
|
|
|
|
|
// EXTRACTED: for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) {
|
|
|
|
|
|
// EXTRACTED: int idx = (hash + i) & SLAB_REGISTRY_MASK;
|
|
|
|
|
|
// EXTRACTED: SlabRegistryEntry* entry = &g_slab_registry[idx];
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: if (entry->slab_base == 0) {
|
|
|
|
|
|
// EXTRACTED: // Empty slot found
|
|
|
|
|
|
// EXTRACTED: entry->slab_base = slab_base;
|
|
|
|
|
|
// EXTRACTED: atomic_store_explicit(&entry->owner, owner, memory_order_release);
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: return 1;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: // Registry full (collision limit exceeded)
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: return 0;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
|
|
|
|
|
|
// Unregister slab from hash table
|
|
|
|
|
|
// EXTRACTED: static void registry_unregister(uintptr_t slab_base) {
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_lock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: int hash = registry_hash(slab_base);
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: // Linear probing search
|
|
|
|
|
|
// EXTRACTED: for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) {
|
|
|
|
|
|
// EXTRACTED: int idx = (hash + i) & SLAB_REGISTRY_MASK;
|
|
|
|
|
|
// EXTRACTED: SlabRegistryEntry* entry = &g_slab_registry[idx];
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: if (entry->slab_base == slab_base) {
|
|
|
|
|
|
// EXTRACTED: // Found - clear entry (atomic store prevents TOCTOU race)
|
|
|
|
|
|
// EXTRACTED: atomic_store_explicit(&entry->owner, NULL, memory_order_release);
|
|
|
|
|
|
// EXTRACTED: entry->slab_base = 0;
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: return;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: if (entry->slab_base == 0) {
|
|
|
|
|
|
// EXTRACTED: // Empty slot - not found
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: return;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
|
|
|
|
|
|
// Lookup slab by base address (O(1) average)
|
|
|
|
|
|
static TinySlab* registry_lookup(uintptr_t slab_base) {
|
|
|
|
|
|
// Lock-free read with atomic owner access (MT-safe)
|
|
|
|
|
|
int hash = registry_hash(slab_base);
|
|
|
|
|
|
|
|
|
|
|
|
// Linear probing search
|
|
|
|
|
|
for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) {
|
|
|
|
|
|
int idx = (hash + i) & SLAB_REGISTRY_MASK;
|
|
|
|
|
|
SlabRegistryEntry* entry = &g_slab_registry[idx];
|
|
|
|
|
|
|
|
|
|
|
|
if (entry->slab_base == slab_base) {
|
|
|
|
|
|
// Atomic load to prevent TOCTOU race with registry_unregister()
|
|
|
|
|
|
TinySlab* owner = atomic_load_explicit(&entry->owner, memory_order_acquire);
|
|
|
|
|
|
if (!owner) return NULL; // Entry cleared by unregister
|
|
|
|
|
|
return owner;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (entry->slab_base == 0) {
|
|
|
|
|
|
return NULL; // Empty slot - not found
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return NULL; // Not found after max probes
|
|
|
|
|
|
}
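// Probe-sequence sketch for the lookup above (registry_hash was extracted to
// hakmem_tiny_registry.c and, per the extracted comment earlier, is
// (slab_base >> 16) & SLAB_REGISTRY_MASK): probing visits
//   idx = (hash + i) & SLAB_REGISTRY_MASK   for i = 0 .. SLAB_REGISTRY_MAX_PROBE-1
// and stops early at the first empty slot. With 64KB-aligned slab bases,
// neighbouring slabs land in neighbouring buckets, so probes stay short in the
// common case.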
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_slab_mgmt.inc (Phase 2D-4 FINAL)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: allocate_new_slab() - 79 lines (lines 952-1030)
|
|
|
|
|
|
// Allocate new slab for a class
|
|
|
|
|
|
|
|
|
|
|
|
// Function: release_slab() - 23 lines (lines 1033-1055)
|
|
|
|
|
|
// Release a slab back to system
|
|
|
|
|
|
|
|
|
|
|
|
// Step 2: Find slab owner by pointer (O(1) via hash table registry, or O(N) fallback)
|
|
|
|
|
|
TinySlab* hak_tiny_owner_slab(void* ptr) {
|
|
|
|
|
|
if (!ptr || !g_tiny_initialized) return NULL;
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 6.14: Runtime toggle between Registry (O(1)) and List (O(N))
|
|
|
|
|
|
if (g_use_registry) {
|
|
|
|
|
|
// O(1) lookup via hash table
|
|
|
|
|
|
uintptr_t slab_base = (uintptr_t)ptr & ~(TINY_SLAB_SIZE - 1);
|
|
|
|
|
|
TinySlab* slab = registry_lookup(slab_base);
|
|
|
|
|
|
if (!slab) return NULL;
|
|
|
|
|
|
// SAFETY: validate membership (ptr must be inside [base, base+64KB))
|
|
|
|
|
|
uintptr_t start = (uintptr_t)slab->base;
|
|
|
|
|
|
uintptr_t end = start + TINY_SLAB_SIZE;
|
|
|
|
|
|
if ((uintptr_t)ptr < start || (uintptr_t)ptr >= end) {
|
|
|
|
|
|
return NULL; // false positive from registry → treat as non-Tiny
|
|
|
|
|
|
}
|
|
|
|
|
|
return slab;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// O(N) fallback: linear search through all slab lists (lock per class)
|
|
|
|
|
|
for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) {
|
|
|
|
|
|
pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
|
|
|
|
|
|
pthread_mutex_lock(lock);
|
|
|
|
|
|
// Search free slabs
|
|
|
|
|
|
for (TinySlab* slab = g_tiny_pool.free_slabs[class_idx]; slab; slab = slab->next) {
|
|
|
|
|
|
uintptr_t slab_start = (uintptr_t)slab->base;
|
|
|
|
|
|
uintptr_t slab_end = slab_start + TINY_SLAB_SIZE;
|
|
|
|
|
|
if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) {
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return slab;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
// Search full slabs
|
|
|
|
|
|
for (TinySlab* slab = g_tiny_pool.full_slabs[class_idx]; slab; slab = slab->next) {
|
|
|
|
|
|
uintptr_t slab_start = (uintptr_t)slab->base;
|
|
|
|
|
|
uintptr_t slab_end = slab_start + TINY_SLAB_SIZE;
|
|
|
|
|
|
if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) {
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return slab;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
}
|
|
|
|
|
|
return NULL; // Not found
|
|
|
|
|
|
}
|
|
|
|
|
|
}
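// Address-to-slab math used above: TINY_SLAB_SIZE is a power of two (64KB per
// the membership comment), so
//   slab_base = (uintptr_t)ptr & ~(TINY_SLAB_SIZE - 1)
// rounds any interior pointer down to its slab base, and the
// [base, base + TINY_SLAB_SIZE) range check then rejects registry false
// positives. The O(N) branch exists only for the g_use_registry = 0 fallback.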
|
|
|
|
|
|
|
|
|
|
|
|
// Function: move_to_full_list() - 20 lines (lines 1104-1123)
|
|
|
|
|
|
// Move slab to full list
|
|
|
|
|
|
|
|
|
|
|
|
// Function: move_to_free_list() - 20 lines (lines 1126-1145)
|
|
|
|
|
|
// Move slab to free list
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Public API
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 2D-2: Initialization function (extracted to hakmem_tiny_init.inc)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
#include "hakmem_tiny_init.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// 3-Layer Architecture (2025-11-01 Simplification)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Layer 1: TLS Bump Allocator (ultra-fast, 2-3 instructions/op)
|
|
|
|
|
|
#include "hakmem_tiny_bump.inc.h"
|
|
|
|
|
|
|
|
|
|
|
|
// Layer 2: TLS Small Magazine (fast, 5-10 instructions/op)
|
|
|
|
|
|
#include "hakmem_tiny_smallmag.inc.h"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 6 Fast Path Options (mutually exclusive)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Choose ONE of the following Phase 6 optimizations:
|
|
|
|
|
|
//
|
|
|
|
|
|
// Phase 6-1.5: Alignment Guessing (LEGACY - committed 2025-11-02)
|
|
|
|
|
|
// - Enable: -DHAKMEM_TINY_PHASE6_ULTRA_SIMPLE=1
|
|
|
|
|
|
// - Speed: 235 M ops/sec
|
|
|
|
|
|
// - Memory: 0% overhead
|
|
|
|
|
|
// - Method: Guess size class from pointer alignment (__builtin_ctzl)
|
|
|
|
|
|
// - Risk: Alignment assumptions may break with future changes
|
|
|
|
|
|
//
|
|
|
|
|
|
// Phase 6-1.6: Metadata Header (NEW - recommended for production)
|
|
|
|
|
|
// - Enable: -DHAKMEM_TINY_PHASE6_METADATA=1
|
|
|
|
|
|
// - Speed: 450-480 M ops/sec (expected, Phase 6-1 level)
|
|
|
|
|
|
// - Memory: ~6-12% overhead (8 bytes/allocation)
|
|
|
|
|
|
// - Method: Store pool_type + size_class in 8-byte header
|
|
|
|
|
|
// - Benefit: Extends to ALL pools (Tiny/Mid/L25/Whale)
|
|
|
|
|
|
// - Eliminates: Registry lookups, mid_lookup, owner checks
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|

// Forward declarations for Phase 6 alloc/free functions
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
void* hak_tiny_alloc_ultra_simple(size_t size);
void hak_tiny_free_ultra_simple(void* ptr);
#endif

#if defined(HAKMEM_TINY_PHASE6_METADATA) && defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
#error "Cannot enable both PHASE6_METADATA and PHASE6_ULTRA_SIMPLE"
#endif

// Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check
#if HAKMEM_TINY_PHASE6_BOX_REFACTOR

#if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
#error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options"
#endif

// Box 1: Atomic Operations (Layer 0 - Foundation)
#include "tiny_atomic.h"

// Box 5: Allocation Fast Path (Layer 1 - 3-4 instructions)
#include "tiny_alloc_fast.inc.h"

// Box 6: Free Fast Path (Layer 2 - 2-3 instructions)
#include "tiny_free_fast.inc.h"

// ---------------- Refill count (Front) global config ----------------
// Parsed once at init; hot path reads plain ints (no getenv).
int g_refill_count_global = 0;                     // HAKMEM_TINY_REFILL_COUNT
int g_refill_count_hot = 0;                        // HAKMEM_TINY_REFILL_COUNT_HOT
int g_refill_count_mid = 0;                        // HAKMEM_TINY_REFILL_COUNT_MID
int g_refill_count_class[TINY_NUM_CLASSES] = {0};  // HAKMEM_TINY_REFILL_COUNT_C{0..7}
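
// Illustrative sketch only, guarded by the hypothetical HAKMEM_TINY_DOC_EXAMPLES
// macro so it compiles out by default: one way the globals above could be
// populated once at init so the hot path never calls getenv(). The function
// name and parsing details are hypothetical; the real logic lives in the init
// path (hakmem_tiny_init.inc). Only the environment variable names are taken
// from the comments above.
#ifdef HAKMEM_TINY_DOC_EXAMPLES
#include <stdio.h>   // snprintf
#include <stdlib.h>  // getenv, atoi

static void doc_parse_refill_env(void) {
    const char* v;
    if ((v = getenv("HAKMEM_TINY_REFILL_COUNT")) != NULL)     g_refill_count_global = atoi(v);
    if ((v = getenv("HAKMEM_TINY_REFILL_COUNT_HOT")) != NULL) g_refill_count_hot = atoi(v);
    if ((v = getenv("HAKMEM_TINY_REFILL_COUNT_MID")) != NULL) g_refill_count_mid = atoi(v);
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        char name[48];
        snprintf(name, sizeof(name), "HAKMEM_TINY_REFILL_COUNT_C%d", i);
        if ((v = getenv(name)) != NULL) g_refill_count_class[i] = atoi(v);
    }
}
#endif // HAKMEM_TINY_DOC_EXAMPLES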

// Export wrapper functions for hakmem.c to call
// Phase 6-1.7 Optimization: Remove diagnostic overhead, rely on LTO for inlining
void* hak_tiny_alloc_fast_wrapper(size_t size) {
    // Diagnostic removed - use HAKMEM_TINY_FRONT_DIAG in tiny_alloc_fast_pop if needed
    return tiny_alloc_fast(size);
}

void hak_tiny_free_fast_wrapper(void* ptr) {
    tiny_free_fast(ptr);
}

#elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
// Phase 6-1.5: Alignment guessing (legacy)

// Refill count globals (needed for compatibility)
int g_refill_count_global = 0;
int g_refill_count_hot = 0;
int g_refill_count_mid = 0;
int g_refill_count_class[TINY_NUM_CLASSES] = {0};

#include "hakmem_tiny_ultra_simple.inc"

// Wrapper functions for hakmem.c compatibility (not used in ULTRA_SIMPLE but needed for linking)
void* hak_tiny_alloc_fast_wrapper(size_t size) {
    return hak_tiny_alloc_ultra_simple(size);
}

void hak_tiny_free_fast_wrapper(void* ptr) {
    hak_tiny_free_ultra_simple(ptr);
}

#elif defined(HAKMEM_TINY_PHASE6_METADATA)
// Phase 6-1.6: Metadata header (recommended)
#include "hakmem_tiny_metadata.inc"
#endif

// Layer 1-3: Main allocation function (simplified)
// Build-time configurable via: -DHAKMEM_TINY_USE_NEW_3LAYER=1
#ifndef HAKMEM_TINY_USE_NEW_3LAYER
#define HAKMEM_TINY_USE_NEW_3LAYER 0  // default OFF (legacy path)
#endif

#if HAKMEM_TINY_USE_NEW_3LAYER
#include "hakmem_tiny_alloc_new.inc"
#else
// Old 6-7 layer architecture (backup)
#include "hakmem_tiny_alloc.inc"
#endif

#include "hakmem_tiny_slow.inc"

// Free path implementations
#include "hakmem_tiny_free.inc"

// ============================================================================
// EXTRACTED TO hakmem_tiny_lifecycle.inc (Phase 2D-3)
// ============================================================================
// Function: hak_tiny_trim() - 116 lines (lines 1164-1279)
// Public trim and cleanup operation for lifecycle management

// Forward decl for internal registry lookup used by ultra safety validation
static TinySlab* registry_lookup(uintptr_t slab_base);

// Ultra helpers: per-class SLL cap and pointer validation
static inline int ultra_sll_cap_for_class(int class_idx) {
    int ov = g_ultra_sll_cap_override[class_idx];
    if (ov > 0) return ov;
    switch (class_idx) {
        case 0:  return 256;  // 8B
        case 1:  return 384;  // 16B (best in A/B tests)
        case 2:  return 384;  // 32B (best in A/B tests)
        case 3:  return 768;  // 64B (best in A/B tests)
        case 4:  return 256;  // 128B
        default: return 128;  // others
    }
}

static inline int ultra_validate_sll_head(int class_idx, void* head) {
    uintptr_t base = ((uintptr_t)head) & ~(TINY_SLAB_SIZE - 1);
    TinySlab* owner = registry_lookup(base);
    if (!owner) return 0;
    uintptr_t start = (uintptr_t)owner->base;
    if ((uintptr_t)head < start || (uintptr_t)head >= start + TINY_SLAB_SIZE) return 0;
    return (owner->class_idx == class_idx);
}

// Optional: wrapper TLS guard (detect wrapper re-entry via a TLS counter)
#ifndef HAKMEM_WRAPPER_TLS_GUARD
#define HAKMEM_WRAPPER_TLS_GUARD 0
#endif
#if HAKMEM_WRAPPER_TLS_GUARD
extern __thread int g_tls_in_wrapper;
#endif

// ============================================================================
// EXTRACTED TO hakmem_tiny_lifecycle.inc (Phase 2D-3)
// ============================================================================
// Function: tiny_tls_cache_drain() - 90 lines (lines 1314-1403)
// Static function for draining TLS caches
//
// Function: tiny_apply_mem_diet() - 20 lines (lines 1405-1424)
// Static function for memory diet mode application
//
// Phase 2D-3: Lifecycle management functions (226 lines total)
#include "hakmem_tiny_lifecycle.inc"

// Phase 2D-4 (FINAL): Slab management functions (142 lines total)
#include "hakmem_tiny_slab_mgmt.inc"

// ============================================================================
// ACE Learning Layer: Runtime parameter setters
// ============================================================================

void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold) {
    // Validate inputs
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) {
        return;
    }
    if (threshold < 16 || threshold > 2048) {
        return;
    }

    // Set per-class threshold (used by remote free drain logic)
    g_remote_drain_thresh_per_class[class_idx] = (int)threshold;
}
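
// Illustrative usage sketch only, guarded by the hypothetical
// HAKMEM_TINY_DOC_EXAMPLES macro so it compiles out by default: how a
// learning/tuning layer might call the setter above. The caller name and the
// chosen threshold value are hypothetical.
#ifdef HAKMEM_TINY_DOC_EXAMPLES
static void doc_ace_tune_example(void) {
    // Raise the remote-free drain threshold for class 3 (64B) to 512.
    // Out-of-range class indices or thresholds (<16 or >2048) are ignored.
    hkm_ace_set_drain_threshold(3, 512u);
}
#endif // HAKMEM_TINY_DOC_EXAMPLES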