Refactor: Extract 5 Box modules from hakmem_tiny.c (-52% size reduction)
Split hakmem_tiny.c (2081 lines) into focused modules for better maintainability. ## Changes **hakmem_tiny.c**: 2081 → 995 lines (-1086 lines, -52% reduction) ## Extracted Modules (5 boxes) 1. **config_box** (211 lines) - Size class tables, integrity counters - Debug flags, benchmark macros - HAK_RET_ALLOC/HAK_STAT_FREE instrumentation 2. **publish_box** (419 lines) - Publish/Adopt counters and statistics - Bench mailbox, partial ring - Live cap/Hot slot management - TLS helper functions (tiny_tls_default_*) 3. **globals_box** (256 lines) - Global variable declarations (~70 variables) - TinyPool instance and initialization flag - TLS variables (g_tls_lists, g_fast_head, g_fast_count) - SuperSlab configuration (partial ring, empty reserves) - Adopt gate functions 4. **phase6_wrappers_box** (122 lines) - Phase 6 Box Theory wrapper layer - hak_tiny_alloc_fast_wrapper() - hak_tiny_free_fast_wrapper() - Diagnostic instrumentation 5. **ace_guard_box** (100 lines) - ACE Learning Layer (hkm_ace_set_drain_threshold) - FastCache API (tiny_fc_room, tiny_fc_push_bulk) - Tiny Guard debugging system (5 functions) ## Benefits - **Readability**: Giant 2k file → focused 1k core + 5 coherent modules - **Maintainability**: Each box has clear responsibility and boundaries - **Build**: All modules compile successfully ✅ ## Technical Details - Phase 1: ChatGPT extracted config_box + publish_box (-625 lines) - Phase 2-4: Claude extracted globals_box + phase6_wrappers_box + ace_guard_box (-461 lines) - All extractions use .inc files (same translation unit, preserves static/TLS linkage) - Fixed Makefile: Added tiny_sizeclass_hist_box.o to OBJS_BASE and BENCH_HAKMEM_OBJS_BASE 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
4
Makefile
4
Makefile
@ -190,7 +190,7 @@ LDFLAGS += $(EXTRA_LDFLAGS)
|
||||
|
||||
# Targets
|
||||
TARGET = test_hakmem
|
||||
OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o
|
||||
OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o hakmem_smallmid_superslab.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o
|
||||
OBJS = $(OBJS_BASE)
|
||||
|
||||
# Shared library
|
||||
@ -222,7 +222,7 @@ endif
|
||||
# Benchmark targets
|
||||
BENCH_HAKMEM = bench_allocators_hakmem
|
||||
BENCH_SYSTEM = bench_allocators_system
|
||||
BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o
|
||||
BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o hakmem_tiny_superslab.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_mid_mt.o hakmem_super_registry.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_local_box.o core/box/free_remote_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/unified_batch_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/page_arena.o core/front/tiny_ring_cache.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o
|
||||
BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE)
|
||||
ifeq ($(POOL_TLS_PHASE1),1)
|
||||
BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o
|
||||
|
||||
1112
core/hakmem_tiny.c
1112
core/hakmem_tiny.c
File diff suppressed because it is too large
Load Diff
100
core/hakmem_tiny_ace_guard_box.inc
Normal file
100
core/hakmem_tiny_ace_guard_box.inc
Normal file
@ -0,0 +1,100 @@
|
||||
// ============================================================================
|
||||
// ACE Learning Layer: Runtime parameter setters
|
||||
// ============================================================================
|
||||
|
||||
void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold) {
|
||||
// Validate inputs
|
||||
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) {
|
||||
return;
|
||||
}
|
||||
if (threshold < 16 || threshold > 2048) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Set per-class threshold (used by remote free drain logic)
|
||||
g_remote_drain_thresh_per_class[class_idx] = (int)threshold;
|
||||
}
|
||||
#include "tiny_fc_api.h"
|
||||
int tiny_fc_room(int class_idx) {
|
||||
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
|
||||
TinyFastCache* fc = &g_fast_cache[class_idx];
|
||||
// Effective per-class cap comes from g_fast_cap (env-tunable),
|
||||
// clamped by the static storage capacity TINY_FASTCACHE_CAP.
|
||||
uint16_t eff_cap = g_fast_cap[class_idx];
|
||||
if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP;
|
||||
int room = (int)eff_cap - fc->top;
|
||||
return room > 0 ? room : 0;
|
||||
}
|
||||
|
||||
int tiny_fc_push_bulk(int class_idx, void** arr, int n) {
|
||||
if (!arr || n <= 0) return 0;
|
||||
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
|
||||
TinyFastCache* fc = &g_fast_cache[class_idx];
|
||||
uint16_t eff_cap = g_fast_cap[class_idx];
|
||||
if (eff_cap > TINY_FASTCACHE_CAP) eff_cap = TINY_FASTCACHE_CAP;
|
||||
int room = (int)eff_cap - fc->top;
|
||||
if (room <= 0) return 0;
|
||||
int take = n < room ? n : room;
|
||||
// Forward fill with light unrolling to reduce branch overhead
|
||||
int i = 0;
|
||||
for (; i + 3 < take; i += 4) {
|
||||
fc->items[fc->top++] = arr[i];
|
||||
fc->items[fc->top++] = arr[i + 1];
|
||||
fc->items[fc->top++] = arr[i + 2];
|
||||
fc->items[fc->top++] = arr[i + 3];
|
||||
}
|
||||
for (; i < take; i++) {
|
||||
fc->items[fc->top++] = arr[i];
|
||||
}
|
||||
return take;
|
||||
}
|
||||
|
||||
// ========= Tiny Guard (targeted debug; low overhead when disabled) =========
|
||||
static int g_tiny_guard_enabled = -1;
|
||||
static int g_tiny_guard_class = 2;
|
||||
static int g_tiny_guard_limit = 8;
|
||||
static __thread int g_tiny_guard_seen = 0;
|
||||
|
||||
static inline int tiny_guard_enabled_runtime(void) {
|
||||
if (__builtin_expect(g_tiny_guard_enabled == -1, 0)) {
|
||||
const char* e = getenv("HAKMEM_TINY_GUARD");
|
||||
g_tiny_guard_enabled = (e && *e && *e != '0') ? 1 : 0;
|
||||
const char* ec = getenv("HAKMEM_TINY_GUARD_CLASS");
|
||||
if (ec && *ec) g_tiny_guard_class = atoi(ec);
|
||||
const char* el = getenv("HAKMEM_TINY_GUARD_MAX");
|
||||
if (el && *el) g_tiny_guard_limit = atoi(el);
|
||||
if (g_tiny_guard_limit <= 0) g_tiny_guard_limit = 8;
|
||||
}
|
||||
return g_tiny_guard_enabled;
|
||||
}
|
||||
|
||||
int tiny_guard_is_enabled(void) { return tiny_guard_enabled_runtime(); }
|
||||
|
||||
static void tiny_guard_dump_bytes(const char* tag, const uint8_t* p, size_t n) {
|
||||
fprintf(stderr, "[TGUARD] %s:", tag);
|
||||
for (size_t i = 0; i < n; i++) fprintf(stderr, " %02x", p[i]);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
void tiny_guard_on_alloc(int cls, void* base, void* user, size_t stride) {
|
||||
if (!tiny_guard_enabled_runtime() || cls != g_tiny_guard_class) return;
|
||||
if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return;
|
||||
uint8_t* b = (uint8_t*)base;
|
||||
uint8_t* u = (uint8_t*)user;
|
||||
fprintf(stderr, "[TGUARD] alloc cls=%d base=%p user=%p stride=%zu hdr=%02x\n",
|
||||
cls, base, user, stride, b[0]);
|
||||
// 隣接ヘッダ可視化(前後)
|
||||
tiny_guard_dump_bytes("around_base", b, (stride >= 8 ? 8 : stride));
|
||||
tiny_guard_dump_bytes("next_header", b + stride, 4);
|
||||
}
|
||||
|
||||
void tiny_guard_on_invalid(void* user_ptr, uint8_t hdr) {
|
||||
if (!tiny_guard_enabled_runtime()) return;
|
||||
if (g_tiny_guard_seen++ >= g_tiny_guard_limit) return;
|
||||
uint8_t* u = (uint8_t*)user_ptr;
|
||||
fprintf(stderr, "[TGUARD] invalid header at user=%p hdr=%02x prev=%02x next=%02x\n",
|
||||
user_ptr, hdr, *(u - 2), *(u));
|
||||
tiny_guard_dump_bytes("dump_before", u - 8, 8);
|
||||
tiny_guard_dump_bytes("dump_after", u, 8);
|
||||
}
|
||||
|
||||
211
core/hakmem_tiny_config_box.inc
Normal file
211
core/hakmem_tiny_config_box.inc
Normal file
@ -0,0 +1,211 @@
|
||||
// hakmem_tiny_config_box.inc
|
||||
// Box: Tiny allocator configuration, debug counters, and return helpers.
|
||||
// Extracted from hakmem_tiny.c to reduce file size and isolate config logic.
|
||||
|
||||
// ============================================================================
|
||||
// Size class table (Box 3 dependency)
|
||||
// ============================================================================
|
||||
// Phase E1-CORRECT: ALL classes have 1-byte header
|
||||
// These sizes represent TOTAL BLOCK SIZE (stride) = [Header 1B][Data N-1B]
|
||||
// Usable data = stride - 1 (implicit)
|
||||
const size_t g_tiny_class_sizes[TINY_NUM_CLASSES] = {
|
||||
8, // Class 0: 8B total = [Header 1B][Data 7B]
|
||||
16, // Class 1: 16B total = [Header 1B][Data 15B]
|
||||
32, // Class 2: 32B total = [Header 1B][Data 31B]
|
||||
64, // Class 3: 64B total = [Header 1B][Data 63B]
|
||||
128, // Class 4: 128B total = [Header 1B][Data 127B]
|
||||
256, // Class 5: 256B total = [Header 1B][Data 255B]
|
||||
512, // Class 6: 512B total = [Header 1B][Data 511B]
|
||||
1024 // Class 7: 1024B total = [Header 1B][Data 1023B]
|
||||
};
|
||||
|
||||
// ============================================================================
|
||||
// Phase 16: Dynamic Tiny Max Size (ENV: HAKMEM_TINY_MAX_CLASS)
|
||||
// Phase 17-1: Auto-adjust when Small-Mid enabled
|
||||
// ============================================================================
|
||||
|
||||
// Forward declaration for Small-Mid check
|
||||
extern bool smallmid_is_enabled(void);
|
||||
|
||||
// Get dynamic max size for Tiny allocator based on ENV configuration
|
||||
// Default: 1023B (C0-C7), can be reduced to 255B (C0-C5)
|
||||
// Phase 17-1: Auto-reduces to 255B when Small-Mid is enabled
|
||||
size_t tiny_get_max_size(void) {
|
||||
static int g_max_class = -1;
|
||||
if (__builtin_expect(g_max_class == -1, 0)) {
|
||||
const char* env = getenv("HAKMEM_TINY_MAX_CLASS");
|
||||
if (env && *env) {
|
||||
int max_class = atoi(env);
|
||||
if (max_class >= 0 && max_class < TINY_NUM_CLASSES) {
|
||||
g_max_class = max_class;
|
||||
} else {
|
||||
g_max_class = 7; // Default: all classes (C0-C7)
|
||||
}
|
||||
} else {
|
||||
g_max_class = 7; // Default: all classes
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 17-1: Auto-adjust when Small-Mid enabled
|
||||
// Small-Mid handles 256B-1KB, so Tiny should only handle 0-255B
|
||||
int effective_class = g_max_class;
|
||||
if (smallmid_is_enabled() && effective_class > 5) {
|
||||
effective_class = 5; // Limit to C0-C5 (0-255B)
|
||||
}
|
||||
|
||||
// Map class to max usable size (stride - 1)
|
||||
// C0=8B, C1=16B, C2=32B, C3=64B, C4=128B, C5=256B, C6=512B, C7=1024B
|
||||
static const size_t class_to_max_size[TINY_NUM_CLASSES] = {
|
||||
7, 15, 31, 63, 127, 255, 511, 1023
|
||||
};
|
||||
return class_to_max_size[effective_class];
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// PRIORITY 1-4: Integrity Check Counters
|
||||
// ============================================================================
|
||||
_Atomic uint64_t g_integrity_check_class_bounds = 0;
|
||||
_Atomic uint64_t g_integrity_check_freelist = 0;
|
||||
_Atomic uint64_t g_integrity_check_canary = 0;
|
||||
_Atomic uint64_t g_integrity_check_header = 0;
|
||||
|
||||
// Build-time gate for debug counters (path/ultra). Default OFF.
|
||||
#ifndef HAKMEM_DEBUG_COUNTERS
|
||||
#define HAKMEM_DEBUG_COUNTERS 0
|
||||
#endif
|
||||
|
||||
int g_debug_fast0 = 0;
|
||||
int g_debug_remote_guard = 0;
|
||||
int g_remote_force_notify = 0;
|
||||
// Tiny free safety (debug)
|
||||
int g_tiny_safe_free = 0; // Default OFF for performance; env: HAKMEM_SAFE_FREE=1 でON
|
||||
int g_tiny_safe_free_strict = 0; // env: HAKMEM_SAFE_FREE_STRICT=1
|
||||
int g_tiny_force_remote = 0; // env: HAKMEM_TINY_FORCE_REMOTE=1
|
||||
|
||||
// Build-time gate: Minimal Tiny front (bench-only)
|
||||
|
||||
static inline int superslab_trace_enabled(void) {
|
||||
static int g_ss_trace_flag = -1;
|
||||
if (__builtin_expect(g_ss_trace_flag == -1, 0)) {
|
||||
const char* tr = getenv("HAKMEM_TINY_SUPERSLAB_TRACE");
|
||||
g_ss_trace_flag = (tr && atoi(tr) != 0) ? 1 : 0;
|
||||
}
|
||||
return g_ss_trace_flag;
|
||||
}
|
||||
// When enabled, physically excludes optional front tiers from the hot path
|
||||
// (UltraFront/Quick/Frontend/HotMag/SS-try/BumpShadow), leaving:
|
||||
// SLL → TLS Magazine → SuperSlab → (remaining slow path)
|
||||
#ifndef HAKMEM_TINY_MINIMAL_FRONT
|
||||
#define HAKMEM_TINY_MINIMAL_FRONT 1
|
||||
#endif
|
||||
// Strict front: compile-out optional front tiers but keep baseline structure intact
|
||||
#ifndef HAKMEM_TINY_STRICT_FRONT
|
||||
#define HAKMEM_TINY_STRICT_FRONT 0
|
||||
#endif
|
||||
|
||||
// Bench-only fast path knobs (defaults)
|
||||
#ifndef HAKMEM_TINY_BENCH_REFILL
|
||||
#define HAKMEM_TINY_BENCH_REFILL 8
|
||||
#endif
|
||||
// Optional per-class overrides (bench-only)
|
||||
#ifndef HAKMEM_TINY_BENCH_REFILL8
|
||||
#define HAKMEM_TINY_BENCH_REFILL8 HAKMEM_TINY_BENCH_REFILL
|
||||
#endif
|
||||
#ifndef HAKMEM_TINY_BENCH_REFILL16
|
||||
#define HAKMEM_TINY_BENCH_REFILL16 HAKMEM_TINY_BENCH_REFILL
|
||||
#endif
|
||||
#ifndef HAKMEM_TINY_BENCH_REFILL32
|
||||
#define HAKMEM_TINY_BENCH_REFILL32 HAKMEM_TINY_BENCH_REFILL
|
||||
#endif
|
||||
#ifndef HAKMEM_TINY_BENCH_REFILL64
|
||||
#define HAKMEM_TINY_BENCH_REFILL64 HAKMEM_TINY_BENCH_REFILL
|
||||
#endif
|
||||
|
||||
// Bench-only warmup amounts (pre-fill TLS SLL on first alloc per class)
|
||||
#ifndef HAKMEM_TINY_BENCH_WARMUP8
|
||||
#define HAKMEM_TINY_BENCH_WARMUP8 64
|
||||
#endif
|
||||
#ifndef HAKMEM_TINY_BENCH_WARMUP16
|
||||
#define HAKMEM_TINY_BENCH_WARMUP16 96
|
||||
#endif
|
||||
#ifndef HAKMEM_TINY_BENCH_WARMUP32
|
||||
#define HAKMEM_TINY_BENCH_WARMUP32 160
|
||||
#endif
|
||||
#ifndef HAKMEM_TINY_BENCH_WARMUP64
|
||||
#define HAKMEM_TINY_BENCH_WARMUP64 192
|
||||
#endif
|
||||
|
||||
#ifdef HAKMEM_TINY_BENCH_FASTPATH
|
||||
static __thread unsigned char g_tls_bench_warm_done[4];
|
||||
#endif
|
||||
|
||||
#if HAKMEM_DEBUG_COUNTERS
|
||||
#define HAK_PATHDBG_INC(arr, idx) do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0)
|
||||
#define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0)
|
||||
#else
|
||||
#define HAK_PATHDBG_INC(arr, idx) do { (void)(idx); } while(0)
|
||||
#define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0)
|
||||
#endif
|
||||
// Simple scalar debug increment (no-op when HAKMEM_DEBUG_COUNTERS=0)
|
||||
#if HAKMEM_DEBUG_COUNTERS
|
||||
#define HAK_DBG_INC(var) do { (var)++; } while(0)
|
||||
#else
|
||||
#define HAK_DBG_INC(var) do { (void)0; } while(0)
|
||||
#endif
|
||||
// Return helper: record tiny alloc stat (guarded) then return pointer
|
||||
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr);
|
||||
|
||||
// ========== HAK_RET_ALLOC: Single Definition Point ==========
|
||||
// Choose implementation based on HAKMEM_TINY_HEADER_CLASSIDX
|
||||
// - Phase 7 enabled: Write header and return user pointer
|
||||
// - Phase 7 disabled: Legacy behavior (stats + route + return)
|
||||
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
#if HAKMEM_BUILD_RELEASE
|
||||
// Phase E1-CORRECT: ALL classes have 1-byte headers (including C7)
|
||||
// Ultra-fast inline macro (3-4 instructions)
|
||||
#define HAK_RET_ALLOC(cls, base_ptr) do { \
|
||||
*(uint8_t*)(base_ptr) = HEADER_MAGIC | ((cls) & HEADER_CLASS_MASK); \
|
||||
return (void*)((uint8_t*)(base_ptr) + 1); \
|
||||
} while(0)
|
||||
#else
|
||||
// Debug: Keep full validation via tiny_region_id_write_header()
|
||||
#define HAK_RET_ALLOC(cls, ptr) return tiny_region_id_write_header((ptr), (cls))
|
||||
#endif
|
||||
#else
|
||||
// Legacy: Stats and routing before return
|
||||
#ifdef HAKMEM_ENABLE_STATS
|
||||
// Optional: sampling(ビルド時に有効化)。ホットパスは直接インライン呼び出し(間接分岐なし)。
|
||||
#ifdef HAKMEM_TINY_STAT_SAMPLING
|
||||
static __thread unsigned g_tls_stat_accum_alloc[TINY_NUM_CLASSES];
|
||||
static int g_stat_rate_lg = 0; // 0=毎回、それ以外=2^lgごと
|
||||
static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) {
|
||||
if (__builtin_expect(g_stat_rate_lg == 0, 1)) { stats_record_alloc(cls); return; }
|
||||
unsigned m = (1u << g_stat_rate_lg) - 1u;
|
||||
if (((++g_tls_stat_accum_alloc[cls]) & m) == 0u) stats_record_alloc(cls);
|
||||
}
|
||||
#else
|
||||
static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { stats_record_alloc(cls); }
|
||||
#endif
|
||||
#define HAK_RET_ALLOC(cls, ptr) do { \
|
||||
tiny_debug_track_alloc_ret((cls), (ptr)); \
|
||||
hkm_stat_alloc((cls)); \
|
||||
ROUTE_COMMIT((cls), 0x7F); \
|
||||
return (ptr); \
|
||||
} while(0)
|
||||
#else
|
||||
#define HAK_RET_ALLOC(cls, ptr) do { \
|
||||
tiny_debug_track_alloc_ret((cls), (ptr)); \
|
||||
ROUTE_COMMIT((cls), 0x7F); \
|
||||
return (ptr); \
|
||||
} while(0)
|
||||
#endif
|
||||
#endif // HAKMEM_TINY_HEADER_CLASSIDX
|
||||
|
||||
// Free-side stats: compile-time zero when stats disabled
|
||||
#ifdef HAKMEM_ENABLE_STATS
|
||||
#define HAK_STAT_FREE(cls) do { stats_record_free((cls)); } while(0)
|
||||
#else
|
||||
#define HAK_STAT_FREE(cls) do { } while(0)
|
||||
#endif
|
||||
|
||||
256
core/hakmem_tiny_globals_box.inc
Normal file
256
core/hakmem_tiny_globals_box.inc
Normal file
@ -0,0 +1,256 @@
|
||||
// ============================================================================
|
||||
// Global State
|
||||
// ============================================================================
|
||||
|
||||
// Global pool instance (extern declared in hakmem_tiny.h)
|
||||
TinyPool g_tiny_pool;
|
||||
int g_tiny_initialized = 0; // Not static (extern in header for inline access)
|
||||
// Runtime toggle: allow Tiny allocations even inside malloc/free wrappers
|
||||
// Phase 7.3 LESSONS LEARNED: Async optimization (Phase 1+2) FAILED
|
||||
//
|
||||
// Results:
|
||||
// Phase 1 (Push - deferred free): +1 instruction, zero benefit
|
||||
// Phase 2 (Pull - background refill): +77 instructions, -3% performance
|
||||
//
|
||||
// Root cause: Both optimized SLOW PATH (bitmap scan), but benchmark hits FAST PATH 99.9%
|
||||
// - TLS Magazine capacity: 2048 items
|
||||
// - Benchmark working set: 100 items
|
||||
// - Magazine hit rate: 100% after warmup
|
||||
// - Slow path never executed!
|
||||
//
|
||||
// Real bottleneck: FAST PATH (TLS magazine access) = 228 instructions/op
|
||||
// - glibc: ~40 instructions/op (5-7× faster)
|
||||
// - Gap is architectural (bitmap vs free-list, research features)
|
||||
//
|
||||
// Phase 7.4: getenv fix achieved 86% speedup → now FASTER than glibc!
|
||||
// Results: 120-164 M ops/sec (vs glibc 105 M ops/sec) = 15-57% faster ✅
|
||||
// Decision: Enable by default (proven production-ready)
|
||||
static int g_wrap_tiny_enabled = 1; // ON by default (faster than glibc!)
|
||||
// Optional: allow limited trylock-based refill during wrapper calls
|
||||
static int g_wrap_tiny_refill = 0;
|
||||
// Remote-free drain controls
|
||||
static int g_remote_drain_thresh = 32; // HAKMEM_TINY_REMOTE_DRAIN_THRESHOLD (global fallback)
|
||||
static int g_remote_drain_tryrate = 16; // HAKMEM_TINY_REMOTE_DRAIN_TRYRATE (1/N probability)
|
||||
|
||||
// ACE Learning Layer: Per-class remote drain thresholds
|
||||
int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES] = {32, 32, 32, 32, 32, 32, 32, 32};
|
||||
// Sampled counter updates (Phase 3: Replaced with batched TLS counters)
|
||||
// Old: XOR RNG sampling (10-15 ns overhead)
|
||||
// New: Batched stats in hakmem_tiny_stats.h (0.5 ns overhead)
|
||||
static int g_tiny_count_sample_exp = 8; // HAKMEM_TINY_COUNT_SAMPLE (kept for compatibility)
|
||||
|
||||
// Step 2: Slab Registry (Hash Table)
|
||||
SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];
|
||||
|
||||
PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];
|
||||
|
||||
// Registry lock
|
||||
pthread_mutex_t g_tiny_registry_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
// Phase 6.14: Runtime toggle for Registry ON/OFF (default OFF)
|
||||
// O(N) Sequential Access is faster than O(1) Random Access for Small-N (8-32 slabs)
|
||||
// Reason: L1 cache hit率 95%+ (Sequential) vs 50-70% (Random Hash)
|
||||
static int g_use_registry = 1; // Default ON for thread-safety
|
||||
|
||||
// TLS Magazines (P1) definitions moved to hakmem_tiny_magazine.h
|
||||
// Upper bound for one-shot refill from a slab when the magazine is low (runtime tunable)
|
||||
static int g_tiny_refill_max = 64; // HAKMEM_TINY_REFILL_MAX (default 64)
|
||||
static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192)
|
||||
|
||||
// hakmem_tiny_tls_list.h already included at top
|
||||
static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
|
||||
static int g_tls_list_enable = 0; // Default OFF for bench; override via HAKMEM_TINY_TLS_LIST=1
|
||||
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
|
||||
static int g_fast_enable = 1;
|
||||
static int g_fastcache_enable = 1; // Default ON (array stack for C0-C3); override via HAKMEM_TINY_FASTCACHE=0
|
||||
static uint16_t g_fast_cap[TINY_NUM_CLASSES];
|
||||
static int g_ultra_bump_shadow = 0; // HAKMEM_TINY_BUMP_SHADOW=1
|
||||
static uint8_t g_fast_cap_locked[TINY_NUM_CLASSES];
|
||||
|
||||
|
||||
typedef void* (*TinyHotAllocFn)(void);
|
||||
static TinyHotAllocFn g_hot_alloc_fn[TINY_NUM_CLASSES];
|
||||
static __thread void* g_fast_head[TINY_NUM_CLASSES];
|
||||
static __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
|
||||
static inline void tls_list_spill_excess(int class_idx, TinyTLSList* tls);
|
||||
|
||||
uint64_t g_tls_hit_count[TINY_NUM_CLASSES];
|
||||
uint64_t g_tls_miss_count[TINY_NUM_CLASSES];
|
||||
uint64_t g_tls_spill_ss_count[TINY_NUM_CLASSES];
|
||||
uint64_t g_tls_spill_owner_count[TINY_NUM_CLASSES];
|
||||
uint64_t g_tls_spill_mag_count[TINY_NUM_CLASSES];
|
||||
uint64_t g_tls_spill_requeue_count[TINY_NUM_CLASSES];
|
||||
|
||||
// Legacy magazine definitions have been moved to hakmem_tiny_magazine.h
|
||||
// NEW: Per-thread active slabs (up to 2 per class)
|
||||
static __thread TinySlab* g_tls_active_slab_a[TINY_NUM_CLASSES];
|
||||
static __thread TinySlab* g_tls_active_slab_b[TINY_NUM_CLASSES];
|
||||
|
||||
static inline __attribute__((always_inline)) TinySlab* tls_active_owner_for_ptr(int class_idx, void* ptr) {
|
||||
TinySlab* cand = g_tls_active_slab_a[class_idx];
|
||||
if (cand) {
|
||||
uintptr_t base = (uintptr_t)cand->base;
|
||||
if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
|
||||
return cand;
|
||||
}
|
||||
}
|
||||
cand = g_tls_active_slab_b[class_idx];
|
||||
if (cand) {
|
||||
uintptr_t base = (uintptr_t)cand->base;
|
||||
if ((uintptr_t)ptr >= base && (uintptr_t)ptr < base + (uintptr_t)TINY_SLAB_SIZE) {
|
||||
return cand;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Phase 6.23: SuperSlab support (mimalloc-style fast allocation)
|
||||
// Runtime toggle (global, defined in hakmem_config.c). Default is ON for Box Refactor line.
|
||||
extern int g_use_superslab;
|
||||
|
||||
#if !HAKMEM_BUILD_RELEASE
|
||||
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) {
|
||||
if (!ptr) return;
|
||||
// ✅ Phase E1-CORRECT: ptr is USER pointer, convert to BASE
|
||||
void* base_ptr = ptr ? (void*)((uint8_t*)ptr - 1) : NULL;
|
||||
if (g_use_superslab && __builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
|
||||
SuperSlab* ss = hak_super_lookup(ptr);
|
||||
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) {
|
||||
tiny_failfast_abort_ptr("alloc_ret_lookup", ss, -1, ptr, "lookup_fail");
|
||||
} else {
|
||||
int slab_idx = slab_index_for(ss, base_ptr);
|
||||
if (slab_idx < 0) {
|
||||
tiny_failfast_abort_ptr("alloc_ret_slabidx", ss, slab_idx, ptr, "slab_idx_mismatch");
|
||||
} else {
|
||||
// Fail-Fast: class vs SuperSlab size_class must be consistent.
|
||||
TinySlabMeta* meta = &ss->slabs[slab_idx];
|
||||
if (meta->class_idx != (uint8_t)cls) {
|
||||
tiny_failfast_abort_ptr("alloc_ret_cls_mismatch", ss, slab_idx, ptr, "class_mismatch");
|
||||
}
|
||||
size_t blk = g_tiny_class_sizes[cls];
|
||||
uintptr_t base = (uintptr_t)tiny_slab_base_for(ss, slab_idx);
|
||||
uintptr_t delta = (uintptr_t)base_ptr - base;
|
||||
if (blk == 0 || (delta % blk) != 0) {
|
||||
tiny_failfast_abort_ptr("alloc_ret_align", ss, slab_idx, ptr, "misaligned");
|
||||
} else if (delta / blk >= ss_slab_meta_capacity_get(ss, slab_idx)) {
|
||||
tiny_failfast_abort_ptr("alloc_ret_range", ss, slab_idx, ptr, "out_of_capacity");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!__builtin_expect(g_debug_remote_guard, 0)) return;
|
||||
if (!g_use_superslab) return;
|
||||
SuperSlab* ss = hak_super_lookup(ptr);
|
||||
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
|
||||
int slab_idx = slab_index_for(ss, base_ptr);
|
||||
if (slab_idx >= 0) {
|
||||
tiny_remote_track_on_alloc(ss, slab_idx, ptr, "alloc_ret", 0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
static inline void tiny_debug_track_alloc_ret(int cls, void* ptr) { (void)cls; (void)ptr; }
|
||||
#endif
|
||||
|
||||
// Debug counters for SuperSlab investigation
|
||||
#if HAKMEM_DEBUG_COUNTERS
|
||||
int g_superslab_alloc_count = 0;
|
||||
int g_superslab_fail_count = 0;
|
||||
int g_superslab_free_count = 0; // Phase 7.6: Track SuperSlab frees
|
||||
int g_empty_superslab_count = 0; // Phase 7.6: Track empty SuperSlabs detected
|
||||
int g_magazine_push_count = 0; // Phase 7.6: Track Magazine pushes
|
||||
int g_tiny_free_with_slab_count = 0; // Phase 7.6: Track tiny_free_with_slab calls
|
||||
#endif
|
||||
|
||||
// Phase 7.6: Deferred deallocation - keep some empty SuperSlabs as reserve
|
||||
// Phase 8.1: Reduced from 2 to 1 (-2 MB overhead, minimal performance impact)
|
||||
// Phase 8.2: Testing Reserve 0 vs 1 (benchmarking in progress)
|
||||
#define EMPTY_SUPERSLAB_RESERVE 0 // Keep up to N empty SuperSlabs per class (default)
|
||||
static SuperSlab* g_empty_superslabs[TINY_NUM_CLASSES]; // One empty SuperSlab per class
|
||||
static int g_empty_counts[TINY_NUM_CLASSES] = {0}; // Count of empty SuperSlabs
|
||||
static int g_empty_reserve = -1; // Env: HAKMEM_TINY_SS_RESERVE (default=1)
|
||||
static pthread_mutex_t g_empty_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static int g_ss_partial_enable = 1; // Enable partial SuperSlab release by default
|
||||
static uint32_t g_ss_partial_interval = 4;
|
||||
static _Atomic uint32_t g_ss_partial_epoch = 0;
|
||||
|
||||
// Phase 6.24: Unified TLS slab cache (Medium fix)
|
||||
// Reduces TLS reads from 3 to 1 (cache-line aligned for performance)
|
||||
// Phase E4: 64B alignment for L1 cache optimization
|
||||
__thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES] __attribute__((aligned(64)));
|
||||
static _Atomic uint32_t g_tls_target_cap[TINY_NUM_CLASSES];
|
||||
static _Atomic uint32_t g_tls_target_refill[TINY_NUM_CLASSES];
|
||||
static _Atomic uint32_t g_tls_target_spill[TINY_NUM_CLASSES];
|
||||
static _Atomic uint64_t g_tls_trim_epoch[TINY_NUM_CLASSES];
|
||||
static _Atomic uint32_t g_tls_param_seq[TINY_NUM_CLASSES];
|
||||
static __thread uint32_t g_tls_param_seen[TINY_NUM_CLASSES];
|
||||
static __thread uint64_t g_tls_trim_seen[TINY_NUM_CLASSES];
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Per-class partial SuperSlab slot (single-slot publish/adopt)
|
||||
// ----------------------------------------------------------------------------
|
||||
// Small ring of partial SuperSlabs per class (publish/adopt)
|
||||
#ifndef SS_PARTIAL_RING
|
||||
#define SS_PARTIAL_RING 64
|
||||
#endif
|
||||
static _Atomic(SuperSlab*) g_ss_partial_ring[TINY_NUM_CLASSES][SS_PARTIAL_RING];
|
||||
static _Atomic(uint32_t) g_ss_partial_rr[TINY_NUM_CLASSES];
|
||||
static _Atomic(SuperSlab*) g_ss_partial_over[TINY_NUM_CLASSES];
|
||||
static __thread int g_tls_adopt_cd[TINY_NUM_CLASSES];
|
||||
static int g_adopt_cool_period = -1; // env: HAKMEM_TINY_SS_ADOPT_COOLDOWN
|
||||
|
||||
// Debug counters (per class): publish/adopt hits (visible when HAKMEM_DEBUG_COUNTERS)
|
||||
unsigned long long g_ss_publish_dbg[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_ss_adopt_dbg[TINY_NUM_CLASSES] = {0};
|
||||
_Atomic int g_ss_remote_seen = 0; // becomes 1 when any remote free occurs
|
||||
static int g_ss_adopt_env = -2; // -2=unparsed, -1=forced OFF, 0=auto, 1=forced ON
|
||||
static _Atomic int g_ss_adopt_runtime = 0; // 0=inactive, 1=active
|
||||
static _Atomic int g_ss_adopt_log_once = 0;
|
||||
|
||||
static void tiny_adopt_gate_log_activation(const char* reason, int class_idx) {
|
||||
if (atomic_exchange_explicit(&g_ss_adopt_log_once, 1, memory_order_acq_rel) == 0) {
|
||||
fprintf(stderr, "[ADOPT_GATE] activated (reason=%s class=%d)\n",
|
||||
reason ? reason : "unknown", class_idx);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void tiny_adopt_gate_parse_env(void) {
|
||||
if (__builtin_expect(g_ss_adopt_env == -2, 0)) {
|
||||
const char* env = getenv("HAKMEM_TINY_SS_ADOPT");
|
||||
if (!env || *env == '\0') {
|
||||
g_ss_adopt_env = 0; // auto
|
||||
} else if (*env == '0') {
|
||||
g_ss_adopt_env = -1; // forced OFF
|
||||
atomic_store_explicit(&g_ss_adopt_runtime, 0, memory_order_release);
|
||||
} else {
|
||||
g_ss_adopt_env = 1; // forced ON
|
||||
atomic_store_explicit(&g_ss_adopt_runtime, 1, memory_order_release);
|
||||
tiny_adopt_gate_log_activation("env", -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int tiny_adopt_gate_should_publish(void) {
|
||||
tiny_adopt_gate_parse_env();
|
||||
if (g_ss_adopt_env == 1) return 1;
|
||||
if (g_ss_adopt_env == -1) return 0;
|
||||
return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
|
||||
}
|
||||
|
||||
int tiny_adopt_gate_should_adopt(void) {
|
||||
tiny_adopt_gate_parse_env();
|
||||
if (g_ss_adopt_env == 1) return 1;
|
||||
if (g_ss_adopt_env == -1) return 0;
|
||||
return atomic_load_explicit(&g_ss_adopt_runtime, memory_order_acquire) != 0;
|
||||
}
|
||||
|
||||
void tiny_adopt_gate_on_remote_seen(int class_idx) {
|
||||
tiny_adopt_gate_parse_env();
|
||||
atomic_store_explicit(&g_ss_remote_seen, 1, memory_order_relaxed);
|
||||
if (g_ss_adopt_env == -1) return;
|
||||
int prev = atomic_exchange_explicit(&g_ss_adopt_runtime, 1, memory_order_acq_rel);
|
||||
if (prev == 0) {
|
||||
tiny_adopt_gate_log_activation("remote", class_idx);
|
||||
}
|
||||
}
|
||||
|
||||
// Publish/adopt instrumentation, bench mailboxes, and TLS target helpers
|
||||
122
core/hakmem_tiny_phase6_wrappers_box.inc
Normal file
122
core/hakmem_tiny_phase6_wrappers_box.inc
Normal file
@ -0,0 +1,122 @@
|
||||
// Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check
|
||||
#if HAKMEM_TINY_PHASE6_BOX_REFACTOR
|
||||
#if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
|
||||
#error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options"
|
||||
#endif
|
||||
|
||||
// Box 1: Atomic Operations (Layer 0 - Foundation)
|
||||
#include "tiny_atomic.h"
|
||||
|
||||
// Box 5: Allocation Fast Path (Layer 1 - 3-4 instructions)
|
||||
#include "tiny_alloc_fast.inc.h"
|
||||
|
||||
// Box 6: Free Fast Path (Layer 2 - 2-3 instructions)
|
||||
#include "tiny_free_fast.inc.h"
|
||||
|
||||
// ---------------- Refill count (Front) global config ----------------
|
||||
// Parsed once at init; hot path reads plain ints (no getenv).
|
||||
int g_refill_count_global = 0; // HAKMEM_TINY_REFILL_COUNT
|
||||
int g_refill_count_hot = 0; // HAKMEM_TINY_REFILL_COUNT_HOT
|
||||
int g_refill_count_mid = 0; // HAKMEM_TINY_REFILL_COUNT_MID
|
||||
int g_refill_count_class[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_REFILL_COUNT_C{0..7}
|
||||
|
||||
// Export wrapper functions for hakmem.c to call
|
||||
// Phase 6-1.7 Optimization: Remove diagnostic overhead, rely on LTO for inlining
|
||||
void* hak_tiny_alloc_fast_wrapper(size_t size) {
|
||||
// Phase E5: Ultra fast path (8-instruction alloc, bypasses all layers)
|
||||
// Enable with: HAKMEM_ULTRA_FAST_PATH=1 (compile-time)
|
||||
#if HAKMEM_ULTRA_FAST_PATH
|
||||
void* ret = tiny_alloc_fast_ultra(size);
|
||||
if (ret) return ret;
|
||||
// Miss → fallback to full fast path
|
||||
#endif
|
||||
|
||||
// Bench-only ultra-short path: bypass diagnostics and pointer tracking
|
||||
// Enable with: HAKMEM_BENCH_FAST_FRONT=1
|
||||
static int g_bench_fast_front = -1;
|
||||
if (__builtin_expect(g_bench_fast_front == -1, 0)) {
|
||||
const char* e = getenv("HAKMEM_BENCH_FAST_FRONT");
|
||||
g_bench_fast_front = (e && *e && *e != '0') ? 1 : 0;
|
||||
}
|
||||
if (__builtin_expect(g_bench_fast_front, 0)) {
|
||||
return tiny_alloc_fast(size);
|
||||
}
|
||||
|
||||
static _Atomic uint64_t wrapper_call_count = 0;
|
||||
uint64_t call_num = atomic_fetch_add(&wrapper_call_count, 1);
|
||||
|
||||
// Pointer tracking init (first call only)
|
||||
PTR_TRACK_INIT();
|
||||
|
||||
// PRIORITY 3: Periodic canary validation (every 1000 ops)
|
||||
periodic_canary_check(call_num, "hak_tiny_alloc_fast_wrapper");
|
||||
|
||||
// Box I: Periodic full integrity check (every 5000 ops)
|
||||
#if HAKMEM_INTEGRITY_LEVEL >= 3
|
||||
if ((call_num % 5000) == 0) {
|
||||
extern void integrity_periodic_full_check(const char*);
|
||||
integrity_periodic_full_check("periodic check in alloc wrapper");
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !HAKMEM_BUILD_RELEASE
|
||||
if (call_num > 14250 && call_num < 14280 && size <= 1024) {
|
||||
fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu size=%zu\n", call_num, size);
|
||||
fflush(stderr);
|
||||
}
|
||||
#endif
|
||||
|
||||
void* result = tiny_alloc_fast(size);
|
||||
|
||||
#if !HAKMEM_BUILD_RELEASE
|
||||
if (call_num > 14250 && call_num < 14280 && size <= 1024) {
|
||||
fprintf(stderr, "[HAK_TINY_ALLOC_FAST_WRAPPER] call=%lu returned %p\n", call_num, result);
|
||||
fflush(stderr);
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
void hak_tiny_free_fast_wrapper(void* ptr) {
|
||||
// Phase E5: Ultra fast path (6-8 instruction free)
|
||||
#if HAKMEM_ULTRA_FAST_PATH
|
||||
tiny_free_fast_ultra(ptr);
|
||||
return;
|
||||
#endif
|
||||
|
||||
static _Atomic uint64_t free_call_count = 0;
|
||||
uint64_t call_num = atomic_fetch_add(&free_call_count, 1);
|
||||
if (call_num > 14135 && call_num < 14145) {
|
||||
fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu ptr=%p\n", call_num, ptr);
|
||||
fflush(stderr);
|
||||
}
|
||||
tiny_free_fast(ptr);
|
||||
if (call_num > 14135 && call_num < 14145) {
|
||||
fprintf(stderr, "[HAK_TINY_FREE_FAST_WRAPPER] call=%lu completed\n", call_num);
|
||||
fflush(stderr);
|
||||
}
|
||||
}
|
||||
|
||||
#elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
|
||||
// Phase 6-1.5: Alignment guessing (legacy)
|
||||
|
||||
// Refill count globals (needed for compatibility)
|
||||
int g_refill_count_global = 0;
|
||||
int g_refill_count_hot = 0;
|
||||
int g_refill_count_mid = 0;
|
||||
int g_refill_count_class[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
#include "hakmem_tiny_ultra_simple.inc"
|
||||
|
||||
// Wrapper functions for hakmem.c compatibility (not used in ULTRA_SIMPLE but needed for linking)
|
||||
void* hak_tiny_alloc_fast_wrapper(size_t size) {
|
||||
return hak_tiny_alloc_ultra_simple(size);
|
||||
}
|
||||
|
||||
void hak_tiny_free_fast_wrapper(void* ptr) {
|
||||
hak_tiny_free_ultra_simple(ptr);
|
||||
}
|
||||
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
|
||||
// Phase 6-1.6: Metadata header (recommended)
|
||||
#include "hakmem_tiny_metadata.inc"
|
||||
#endif
|
||||
419
core/hakmem_tiny_publish_box.inc
Normal file
419
core/hakmem_tiny_publish_box.inc
Normal file
@ -0,0 +1,419 @@
|
||||
// hakmem_tiny_publish_box.inc
|
||||
// Box: Publish/adopt instrumentation, bench mailboxes, and TLS target helpers.
|
||||
// Extracted from hakmem_tiny.c to keep hot-path logic focused.
|
||||
|
||||
// TLS hint: last adopted SuperSlab/slab to avoid rescans
|
||||
#include "tiny_sticky.h"
|
||||
|
||||
// Mailbox box
|
||||
#include "box/mailbox_box.h"
|
||||
|
||||
// Publish pipeline counters (visibility)
|
||||
unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_pub_same_empty[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_remote_transitions[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_mailbox_register_calls[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_mailbox_slow_discoveries[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
// Slab-ring counters (debug)
|
||||
unsigned long long g_slab_publish_dbg[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_slab_adopt_dbg[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_slab_requeue_dbg[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_slab_miss_dbg[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
// Slab entry encoding helpers (used by Bench/Slab-ring paths)
|
||||
static inline uintptr_t slab_entry_make(SuperSlab* ss, int slab_idx) {
|
||||
return ((uintptr_t)ss) | ((uintptr_t)slab_idx & 0x3Fu);
|
||||
}
|
||||
static inline SuperSlab* slab_entry_ss(uintptr_t ent) {
|
||||
// SuperSlab is aligned to at least 1MB; clear low 1MB bits to recover base
|
||||
return (SuperSlab*)(ent & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
|
||||
}
|
||||
static inline int slab_entry_idx(uintptr_t ent) {
|
||||
return (int)(ent & 0x3Fu);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Bench Mode Publish Mailbox (single-slot per class)
|
||||
// ----------------------------------------------------------------------------
|
||||
static int g_bench_mode = -1; // env: HAKMEM_TINY_BENCH_MODE=1
|
||||
static _Atomic(uintptr_t) g_bench_mailbox_rr[TINY_NUM_CLASSES];
|
||||
#ifndef BENCH_MAILBOX_WIDTH
|
||||
#define BENCH_MAILBOX_WIDTH 16
|
||||
#endif
|
||||
static _Atomic(uintptr_t) g_bench_mailbox[TINY_NUM_CLASSES][BENCH_MAILBOX_WIDTH];
|
||||
|
||||
static inline int bench_mode_enabled(void) {
|
||||
if (__builtin_expect(g_bench_mode == -1, 0)) {
|
||||
const char* b = getenv("HAKMEM_TINY_BENCH_MODE");
|
||||
g_bench_mode = (b && atoi(b) != 0) ? 1 : 0;
|
||||
}
|
||||
return g_bench_mode;
|
||||
}
|
||||
|
||||
static inline void bench_pub_push(int class_idx, SuperSlab* ss, int slab_idx) {
|
||||
if (!bench_mode_enabled()) return;
|
||||
uintptr_t ent = slab_entry_make(ss, slab_idx);
|
||||
uint32_t idx = atomic_fetch_add_explicit(&g_bench_mailbox_rr[class_idx], 1u, memory_order_relaxed);
|
||||
idx &= (BENCH_MAILBOX_WIDTH - 1);
|
||||
atomic_store_explicit(&g_bench_mailbox[class_idx][idx], ent, memory_order_release);
|
||||
}
|
||||
|
||||
static inline uintptr_t bench_pub_pop(int class_idx) {
|
||||
if (!bench_mode_enabled()) return (uintptr_t)0;
|
||||
for (int i = 0; i < BENCH_MAILBOX_WIDTH; i++) {
|
||||
uintptr_t ent = atomic_exchange_explicit(&g_bench_mailbox[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
|
||||
if (ent) return ent;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Slab-Granular Partial Publish/Adopt (encoded entries)
|
||||
// ----------------------------------------------------------------------------
|
||||
#ifndef SLAB_PARTIAL_RING
|
||||
#define SLAB_PARTIAL_RING 128
|
||||
#endif
|
||||
static _Atomic(uintptr_t) g_slab_partial_ring[TINY_NUM_CLASSES][SLAB_PARTIAL_RING];
|
||||
static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES];
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Refill-stage counters (per class)
|
||||
// ----------------------------------------------------------------------------
|
||||
unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_hit_reg[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_mmap_calls[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
// Diagnostic: refill early return counters (to debug why g_rf_hit_slab is 0)
|
||||
unsigned long long g_rf_early_no_ss[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_early_no_meta[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_early_no_room[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_early_want_zero[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
// Refill timing (ns) per class and per stage (env: HAKMEM_TINY_RF_TRACE)
|
||||
unsigned long long g_rf_time_total_ns[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_time_hot_ns[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_time_bench_ns[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_time_mail_ns[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_time_slab_ns[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
// Refill item source breakdown (freelist vs carve)
|
||||
unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
static int g_rf_trace_en = -1;
|
||||
static inline int rf_trace_enabled(void) {
|
||||
if (__builtin_expect(g_rf_trace_en == -1, 0)) {
|
||||
const char* e = getenv("HAKMEM_TINY_RF_TRACE");
|
||||
g_rf_trace_en = (e && atoi(e) != 0) ? 1 : 0;
|
||||
}
|
||||
return g_rf_trace_en;
|
||||
}
|
||||
|
||||
static inline unsigned long long rf_now_ns(void) {
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (unsigned long long)ts.tv_sec * 1000000000ull + (unsigned long long)ts.tv_nsec;
|
||||
}
|
||||
|
||||
// Publish-side counters (debug)
|
||||
unsigned long long g_pub_bench_hits[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_pub_hot_hits[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_pub_mail_hits[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
// Free pipeline counters
|
||||
unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
// Front Gate Breakdown (debug counters)
|
||||
unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
// Free-side trigger counters
|
||||
unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0};
|
||||
|
||||
// Adopt/Registry gate counters
|
||||
unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_push_hits[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_push_full[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_push_disabled[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_push_zero_cap[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_push_gate_disabled[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_push_gate_zero_cap[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_spare_attempts[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_spare_disabled[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_spare_empty[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_spare_lookup_fail[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_spare_bad_index[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_lookup_ss[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_lookup_slab[TINY_NUM_CLASSES] = {0};
|
||||
unsigned long long g_fast_lookup_none = 0;
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Live Superslab cap (must-adopt-before-mmap support)
|
||||
// ----------------------------------------------------------------------------
|
||||
static int g_live_cap_env = -2; // -2=unparsed, -1=disabled, >=0=cap value
|
||||
__thread int g_tls_live_ss[TINY_NUM_CLASSES] = {0};
|
||||
static inline int live_cap_for_class(int class_idx) {
|
||||
if (__builtin_expect(g_live_cap_env == -2, 0)) {
|
||||
const char* s = getenv("HAKMEM_SS_LIVE_CAP");
|
||||
if (!s) g_live_cap_env = -1; else { int v = atoi(s); g_live_cap_env = (v>0? v : -1); }
|
||||
}
|
||||
(void)class_idx;
|
||||
return g_live_cap_env;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Hot Slot (global simple path)
|
||||
// ----------------------------------------------------------------------------
|
||||
static int g_hot_slot_en = -1; // env: HAKMEM_HOT_SLOT=1 (bench mode implies hot slot)
|
||||
static _Atomic(uintptr_t) g_hot_slot[TINY_NUM_CLASSES];
|
||||
static inline int hot_slot_enabled(void) {
|
||||
if (__builtin_expect(g_hot_slot_en == -1, 0)) {
|
||||
const char* s = getenv("HAKMEM_HOT_SLOT");
|
||||
g_hot_slot_en = (s && atoi(s) != 0) ? 1 : 0;
|
||||
}
|
||||
return g_hot_slot_en || bench_mode_enabled();
|
||||
}
|
||||
static inline void hot_slot_push(int class_idx, SuperSlab* ss, int slab_idx) {
|
||||
if (!hot_slot_enabled()) return;
|
||||
uintptr_t ent = slab_entry_make(ss, slab_idx);
|
||||
atomic_exchange_explicit(&g_hot_slot[class_idx], ent, memory_order_release);
|
||||
}
|
||||
static inline uintptr_t hot_slot_pop(int class_idx) {
|
||||
if (!hot_slot_enabled()) return (uintptr_t)0;
|
||||
return atomic_exchange_explicit(&g_hot_slot[class_idx], (uintptr_t)0, memory_order_acq_rel);
|
||||
}
|
||||
|
||||
static __attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) {
|
||||
if (!ss) return;
|
||||
uintptr_t ent = slab_entry_make(ss, slab_idx);
|
||||
for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
|
||||
uintptr_t expected = 0;
|
||||
if (atomic_compare_exchange_strong_explicit(&g_slab_partial_ring[class_idx][i], &expected, ent,
|
||||
memory_order_release, memory_order_relaxed)) {
|
||||
g_slab_publish_dbg[class_idx]++;
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Ring full: round-robin replace and try to requeue the displaced entry into another empty slot
|
||||
uint32_t idx = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
|
||||
uintptr_t old = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][idx], ent, memory_order_acq_rel);
|
||||
if (old) {
|
||||
for (int t = 0; t < 8; t++) {
|
||||
uint32_t j = atomic_fetch_add_explicit(&g_slab_partial_rr2[class_idx], 1u, memory_order_relaxed) % SLAB_PARTIAL_RING;
|
||||
uintptr_t expected = 0;
|
||||
if (atomic_compare_exchange_weak_explicit(&g_slab_partial_ring[class_idx][j], &expected, old,
|
||||
memory_order_release, memory_order_relaxed)) {
|
||||
g_slab_requeue_dbg[class_idx]++;
|
||||
old = 0; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
g_slab_publish_dbg[class_idx]++;
|
||||
}
|
||||
|
||||
static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) {
|
||||
for (int i = 0; i < SLAB_PARTIAL_RING; i++) {
|
||||
uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel);
|
||||
if (ent) return ent;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ss_partial_publish(int class_idx, SuperSlab* ss) {
|
||||
if (!ss) return;
|
||||
// Gate by listed flag to avoid repeated publishes of the same SS
|
||||
unsigned prev = atomic_exchange_explicit(&ss->listed, 1u, memory_order_acq_rel);
|
||||
if (prev != 0u) return; // already listed
|
||||
|
||||
// CRITICAL: Release ownership of all slabs so adopters can claim them!
|
||||
// Without this, ss_owner_try_acquire() will fail (requires owner_tid==0).
|
||||
// The publishing thread must stop using this SS after publishing.
|
||||
int cap_pub = ss_slabs_capacity(ss);
|
||||
for (int s = 0; s < cap_pub; s++) {
|
||||
// TODO Phase 3d-B: Add atomic accessor when implementing Hot/Cold split
|
||||
TinySlabMeta* meta = ss_slab_meta_ptr(ss, s);
|
||||
uint8_t prev = __atomic_exchange_n(&meta->owner_tid_low, 0u, __ATOMIC_RELEASE);
|
||||
if (__builtin_expect(g_debug_remote_guard && prev != 0u, 0)) {
|
||||
uintptr_t aux = ((uintptr_t)s << 32) | (uintptr_t)prev;
|
||||
tiny_debug_ring_record(TINY_RING_EVENT_OWNER_RELEASE,
|
||||
(uint16_t)ss_slab_meta_class_idx_get(ss, s),
|
||||
meta,
|
||||
aux);
|
||||
}
|
||||
}
|
||||
|
||||
// CRITICAL: Unbind current thread's TLS if it points to this SS!
|
||||
// Otherwise, the publishing thread will continue allocating from the published SS,
|
||||
// racing with adopters who acquire ownership.
|
||||
extern __thread TinyTLSSlab g_tls_slabs[];
|
||||
if (g_tls_slabs[class_idx].ss == ss) {
|
||||
g_tls_slabs[class_idx].ss = NULL;
|
||||
g_tls_slabs[class_idx].meta = NULL;
|
||||
g_tls_slabs[class_idx].slab_base = NULL;
|
||||
g_tls_slabs[class_idx].slab_idx = 0;
|
||||
}
|
||||
|
||||
// Compute a quick best-slab hint for adopters (prefer slabs marked slab_listed=1)
|
||||
int best = -1; uint32_t best_score = 0;
|
||||
for (int s = 0; s < cap_pub; s++) {
|
||||
TinySlabMeta* m = &ss->slabs[s];
|
||||
uint32_t rc = atomic_load_explicit(&ss->remote_counts[s], memory_order_relaxed);
|
||||
int has_remote = (atomic_load_explicit(&ss->remote_heads[s], memory_order_acquire) != 0);
|
||||
unsigned listed = atomic_load_explicit(&ss->slab_listed[s], memory_order_relaxed) ? 1u : 0u;
|
||||
uint32_t score = rc
|
||||
+ (m->freelist ? (1u<<30) : 0u)
|
||||
+ (listed ? (1u<<29) : 0u)
|
||||
+ (has_remote ? 1u : 0u);
|
||||
if (score > best_score) { best_score = score; best = s; }
|
||||
}
|
||||
if (best >= 0 && best < 256) {
|
||||
ss->publish_hint = (uint8_t)best;
|
||||
// Box: Ready push — provide slab-level candidate to adopters
|
||||
tiny_ready_push(class_idx, ss, best);
|
||||
} else {
|
||||
ss->publish_hint = 0xFF;
|
||||
}
|
||||
for (int i = 0; i < SS_PARTIAL_RING; i++) {
|
||||
SuperSlab* expected = NULL;
|
||||
if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss,
|
||||
memory_order_release, memory_order_relaxed)) {
|
||||
g_ss_publish_dbg[class_idx]++;
|
||||
return; // published
|
||||
}
|
||||
}
|
||||
// Ring full: replace one entry in round-robin to avoid dropping supply
|
||||
uint32_t idx = atomic_fetch_add_explicit(&g_ss_partial_rr[class_idx], 1u, memory_order_relaxed);
|
||||
idx %= SS_PARTIAL_RING;
|
||||
SuperSlab* old = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][idx], ss, memory_order_acq_rel);
|
||||
if (old) {
|
||||
// NOTE: Do NOT drain here! The old SuperSlab may have slabs owned by other threads
|
||||
// that just adopted from it. Draining without ownership checks causes freelist corruption.
|
||||
// The adopter will drain when needed (with proper ownership checks in tiny_refill.h).
|
||||
//
|
||||
// Previous code (UNSAFE):
|
||||
// for (int s = 0; s < cap; s++) {
|
||||
// ss_remote_drain_to_freelist(old, s); // ← Race with concurrent adopter!
|
||||
// }
|
||||
|
||||
// Keep listed=1 while in overflow so it stays eligible for adopt
|
||||
// Push old into overflow stack (待機箱)
|
||||
SuperSlab* head;
|
||||
do {
|
||||
head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
|
||||
old->partial_next = head;
|
||||
} while (!atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, old,
|
||||
memory_order_release, memory_order_relaxed));
|
||||
}
|
||||
g_ss_publish_dbg[class_idx]++;
|
||||
}
|
||||
|
||||
SuperSlab* ss_partial_adopt(int class_idx) {
|
||||
for (int i = 0; i < SS_PARTIAL_RING; i++) {
|
||||
SuperSlab* ss = atomic_exchange_explicit(&g_ss_partial_ring[class_idx][i], NULL, memory_order_acq_rel);
|
||||
if (ss) {
|
||||
// Clear listed flag on adopt to allow future publish of this SS
|
||||
atomic_store_explicit(&ss->listed, 0u, memory_order_release);
|
||||
g_ss_adopt_dbg[class_idx]++;
|
||||
return ss;
|
||||
}
|
||||
}
|
||||
// Fallback: adopt from overflow stack (LIFO)
|
||||
while (1) {
|
||||
SuperSlab* head = atomic_load_explicit(&g_ss_partial_over[class_idx], memory_order_acquire);
|
||||
if (!head) break;
|
||||
SuperSlab* next = head->partial_next;
|
||||
if (atomic_compare_exchange_weak_explicit(&g_ss_partial_over[class_idx], &head, next,
|
||||
memory_order_acq_rel, memory_order_relaxed)) {
|
||||
atomic_store_explicit(&head->listed, 0u, memory_order_release);
|
||||
g_ss_adopt_dbg[class_idx]++;
|
||||
return head;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx) {
|
||||
// Canonical binding under Phase 12:
|
||||
// - Per-slab TinySlabMeta.class_idx defines class for this slab
|
||||
// - slab_idx is the owning slab index within ss
|
||||
// - slab_base is ALWAYS derived from tiny_slab_base_for(ss, slab_idx)
|
||||
tls->ss = ss;
|
||||
tls->slab_idx = (uint8_t)slab_idx;
|
||||
tls->meta = &ss->slabs[slab_idx];
|
||||
tls->slab_base = tiny_slab_base_for(ss, slab_idx);
|
||||
}
|
||||
|
||||
static inline uint32_t tiny_tls_default_refill(uint32_t cap) {
|
||||
if (cap == 0u) return 8u;
|
||||
uint32_t low = (cap >= 32u) ? (cap / 4u) : 8u;
|
||||
if (low < 4u) low = 4u;
|
||||
return low;
|
||||
}
|
||||
|
||||
static inline uint32_t tiny_tls_default_spill(uint32_t cap) {
|
||||
if (cap == 0u) return 0u;
|
||||
uint64_t spill = (uint64_t)cap + (uint64_t)(cap / 2u);
|
||||
if (spill > TINY_TLS_MAG_CAP) spill = TINY_TLS_MAG_CAP;
|
||||
if (spill < cap) spill = cap;
|
||||
return (uint32_t)spill;
|
||||
}
|
||||
|
||||
static inline void tiny_tls_publish_targets(int class_idx, uint32_t cap) {
|
||||
atomic_store_explicit(&g_tls_target_cap[class_idx], cap, memory_order_release);
|
||||
atomic_store_explicit(&g_tls_target_refill[class_idx], tiny_tls_default_refill(cap), memory_order_relaxed);
|
||||
atomic_store_explicit(&g_tls_target_spill[class_idx], tiny_tls_default_spill(cap), memory_order_relaxed);
|
||||
atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
|
||||
}
|
||||
|
||||
static inline void tiny_tls_request_trim(int class_idx, uint64_t epoch) {
|
||||
atomic_store_explicit(&g_tls_trim_epoch[class_idx], epoch, memory_order_release);
|
||||
atomic_fetch_add_explicit(&g_tls_param_seq[class_idx], 1u, memory_order_release);
|
||||
}
|
||||
|
||||
static inline void tiny_tls_refresh_params(int class_idx, TinyTLSList* tls) {
|
||||
uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_acquire);
|
||||
if (__builtin_expect(seq == g_tls_param_seen[class_idx], 1)) {
|
||||
return;
|
||||
}
|
||||
uint32_t target_cap = atomic_load_explicit(&g_tls_target_cap[class_idx], memory_order_acquire);
|
||||
if (target_cap != 0u && tls->cap != target_cap) {
|
||||
tls->cap = target_cap;
|
||||
uint32_t target_refill = atomic_load_explicit(&g_tls_target_refill[class_idx], memory_order_relaxed);
|
||||
if (target_refill == 0u) target_refill = tiny_tls_default_refill(target_cap);
|
||||
tls->refill_low = target_refill;
|
||||
uint32_t target_spill = atomic_load_explicit(&g_tls_target_spill[class_idx], memory_order_relaxed);
|
||||
if (target_spill < target_cap) target_spill = target_cap;
|
||||
tls->spill_high = target_spill;
|
||||
}
|
||||
uint64_t trim_epoch = atomic_load_explicit(&g_tls_trim_epoch[class_idx], memory_order_acquire);
|
||||
if (trim_epoch != 0u && g_tls_trim_seen[class_idx] != trim_epoch) {
|
||||
g_tls_trim_seen[class_idx] = trim_epoch;
|
||||
if (tls->count > tls->cap) {
|
||||
tls_list_spill_excess(class_idx, tls);
|
||||
}
|
||||
}
|
||||
g_tls_param_seen[class_idx] = seq;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user