Phase ALLOC-TINY-FAST-DUALHOT-1 & Optimization Roadmap Update

Add comprehensive design docs and research boxes:
- docs/analysis/ALLOC_TINY_FAST_DUALHOT_1_DESIGN.md: ALLOC DUALHOT investigation
- docs/analysis/FREE_TINY_FAST_DUALHOT_1_DESIGN.md: FREE DUALHOT final specs
- docs/analysis/FREE_TINY_FAST_HOTCOLD_OPT_1_DESIGN.md: Hot/Cold split research
- docs/analysis/POOL_MID_INUSE_DEFERRED_DN_BATCH_DESIGN.md: Deferred batching design
- docs/analysis/POOL_MID_INUSE_DEFERRED_REGRESSION_ANALYSIS.md: Stats overhead findings
- docs/analysis/MID_DESC_CACHE_BENCHMARK_2025-12-12.md: Cache measurement results
- docs/analysis/LAST_MATCH_CACHE_IMPLEMENTATION.md: TLS cache investigation

Research boxes (SS page table):
- core/box/ss_pt_env_box.h: HAKMEM_SS_LOOKUP_KIND gate
- core/box/ss_pt_types_box.h: 2-level page table structures
- core/box/ss_pt_lookup_box.h: ss_pt_lookup() implementation
- core/box/ss_pt_register_box.h: Page table registration
- core/box/ss_pt_impl.c: Global definitions

Updates:
- docs/specs/ENV_VARS_COMPLETE.md: HOTCOLD, DEFERRED, SS_LOOKUP env vars
- core/box/hak_free_api.inc.h: FREE-DISPATCH-SSOT integration
- core/box/pool_mid_inuse_deferred_box.h: Deferred API updates
- core/box/pool_mid_inuse_deferred_stats_box.h: Stats collection
- core/hakmem_super_registry: SS page table integration

Current Status:
- FREE-TINY-FAST-DUALHOT-1: +13% improvement, ready for adoption
- ALLOC-TINY-FAST-DUALHOT-1: -2% regression, frozen as research box
- Next: optimization roadmap prioritized by ROI (current gap to mimalloc: 2.5x)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Moe Charm (CI) committed on 2025-12-13 05:35:46 +09:00
commit d9991f39ff (parent b917357034)
18 changed files with 1721 additions and 25 deletions

core/box/hak_free_api.inc.h

@ -224,19 +224,42 @@ void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
// ========== Mid/L25/Tiny Registry Lookup (Headerless) ==========
// MIDCAND: Could be Mid/Large/C7, needs registry lookup
// Phase MID-V3: Try v3 ownership first (RegionIdBox-based)
// ENV-controlled, default OFF
if (__builtin_expect(mid_v3_enabled(), 0)) {
// Phase FREE-DISPATCH-SSOT: Single Source of Truth for region lookup
// ENV: HAKMEM_FREE_DISPATCH_SSOT (default: 0 for backward compat, 1 for optimized)
// Problem: Old code did region_id_lookup TWICE in MID-V3 path (once inside mid_hot_v3_free, once after)
// Fix: Do lookup ONCE at top, dispatch based on kind
static int g_free_dispatch_ssot = -1;
if (__builtin_expect(g_free_dispatch_ssot == -1, 0)) {
const char* env = getenv("HAKMEM_FREE_DISPATCH_SSOT");
g_free_dispatch_ssot = (env && *env == '1') ? 1 : 0;
}
if (g_free_dispatch_ssot && __builtin_expect(mid_v3_enabled(), 0)) {
// SSOT=1: Single lookup, then dispatch
extern RegionLookupV6 region_id_lookup_cached_v6(void* ptr);
RegionLookupV6 lk = region_id_lookup_cached_v6(ptr);
if (lk.kind == REGION_KIND_MID_V3) {
// Owned by MID-V3: call free handler directly (no internal lookup)
// Note: We pass the pre-looked-up info implicitly via TLS cache
mid_hot_v3_free(ptr);
if (mid_v3_debug_enabled()) {
static _Atomic int free_log_count = 0;
if (atomic_fetch_add(&free_log_count, 1) < 10) {
fprintf(stderr, "[MID_V3] Free SSOT: ptr=%p\n", ptr);
}
}
goto done;
}
// Not MID-V3: fall through to other dispatch paths below
} else if (__builtin_expect(mid_v3_enabled(), 0)) {
// SSOT=0: Legacy double-lookup path (for A/B comparison)
// RegionIdBox lookup to check if v3 owns this pointer
// mid_hot_v3_free() will check internally and return early if not owned
mid_hot_v3_free(ptr);
// Check if v3 actually owned it by doing a quick verification
// For now, we'll use the existence check via RegionIdBox
// If v3 handled it, it would have returned already
// We need to check if v3 took ownership - simplified: always check other paths too
// Better approach: mid_hot_v3_free returns bool or we check ownership first
// For safety, check ownership explicitly before continuing
// This prevents double-free if v3 handled it
extern RegionLookupV6 region_id_lookup_v6(void* ptr);
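
Note: the change above collapses two region lookups into one. A minimal stand-alone sketch of that dispatch shape is below; RegionLookupStub and region_lookup_stub are simplified stand-ins for RegionLookupV6 / region_id_lookup_cached_v6, and the gate uses the same lazy getenv caching as the real HAKMEM_FREE_DISPATCH_SSOT check.

#include <stdio.h>
#include <stdlib.h>

typedef enum { REGION_KIND_UNKNOWN = 0, REGION_KIND_MID_V3 } region_kind_t;
typedef struct { region_kind_t kind; } RegionLookupStub;

static RegionLookupStub region_lookup_stub(void* ptr) {
    (void)ptr;                       // placeholder: a real lookup consults RegionIdBox
    return (RegionLookupStub){ REGION_KIND_UNKNOWN };
}

static int ssot_enabled(void) {      // lazily cached ENV gate, default OFF
    static int g = -1;
    if (g == -1) {
        const char* e = getenv("HAKMEM_FREE_DISPATCH_SSOT");
        g = (e && *e == '1') ? 1 : 0;
    }
    return g;
}

static void free_dispatch(void* ptr) {
    if (ssot_enabled()) {
        RegionLookupStub lk = region_lookup_stub(ptr);   // single lookup at the top
        if (lk.kind == REGION_KIND_MID_V3) {
            puts("MID-V3 owns it: free directly, no second lookup");
            return;
        }
        puts("not MID-V3: fall through to the other dispatch paths");
    } else {
        puts("legacy path: handler re-checks ownership internally (double lookup)");
    }
}

int main(void) {
    int x;
    free_dispatch(&x);
    return 0;
}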

core/box/pool_mid_inuse_deferred_box.h

@ -72,6 +72,7 @@ static void mid_inuse_deferred_thread_cleanup(void* arg) {
(void)arg;
if (hak_pool_mid_inuse_deferred_enabled()) {
mid_inuse_deferred_drain();
mid_inuse_deferred_stats_flush_tls_to_global();
}
}
@ -193,15 +194,16 @@ static inline void mid_inuse_deferred_drain(void) {
MID_INUSE_DEFERRED_STAT_ADD(decs_drained, n);
// Atomic subtract (batched count)
uint64_t old = atomic_fetch_sub_explicit(&d->in_use, n, memory_order_relaxed);
int old = atomic_fetch_sub_explicit(&d->in_use, (int)n, memory_order_relaxed);
int nv = old - (int)n;
// Check for empty transition
if (old >= n && old - n == 0) {
if (nv <= 0) {
// Fire once per empty transition
// Use atomic_exchange to ensure only ONE thread enqueues DONTNEED
if (d->pending_dn == 0) {
d->pending_dn = 1;
if (atomic_exchange_explicit(&d->pending_dn, 1, memory_order_acq_rel) == 0) {
MID_INUSE_DEFERRED_STAT_INC(empty_transitions);
hak_batch_add_page(page, POOL_PAGE_SIZE);
hak_batch_add_page(d->page, POOL_PAGE_SIZE);
}
}
}
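
Note: the drain path now uses a signed counter plus an atomic exchange so that exactly one thread enqueues the DONTNEED batch per empty transition. A reduced, self-contained sketch of that logic (PageDescStub and hak_batch_add_page_stub are simplified stand-ins; POOL_PAGE_SIZE here is an illustrative value, not the real constant):

#include <stdatomic.h>
#include <stdio.h>

#define POOL_PAGE_SIZE (64 * 1024)   // illustrative value only

typedef struct {
    _Atomic int in_use;              // signed so the empty check tolerates reaching or passing zero
    _Atomic int pending_dn;          // 0/1 latch: has DONTNEED already been enqueued?
    void*       page;
} PageDescStub;

static void hak_batch_add_page_stub(void* page, size_t len) {
    printf("enqueue MADV_DONTNEED for %p (%zu bytes)\n", page, len);
}

// Apply a batch of n deferred decrements to one page descriptor.
static void drain_one(PageDescStub* d, unsigned n) {
    int old = atomic_fetch_sub_explicit(&d->in_use, (int)n, memory_order_relaxed);
    int nv  = old - (int)n;
    if (nv <= 0) {
        // Empty transition: atomic_exchange guarantees only one thread sees 0 -> 1.
        if (atomic_exchange_explicit(&d->pending_dn, 1, memory_order_acq_rel) == 0) {
            hak_batch_add_page_stub(d->page, POOL_PAGE_SIZE);
        }
    }
}

int main(void) {
    static char page[POOL_PAGE_SIZE];
    PageDescStub d = { 3, 0, page };
    drain_one(&d, 3);   // drops in_use to 0 -> enqueues exactly once
    drain_one(&d, 0);   // pending_dn already set -> no second enqueue
    return 0;
}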

core/box/pool_mid_inuse_deferred_stats_box.h

@ -18,6 +18,15 @@
#include <stdio.h>
#include <stdlib.h>
static inline int hak_pool_mid_inuse_deferred_stats_enabled(void) {
static int g = -1;
if (__builtin_expect(g == -1, 0)) {
const char* e = getenv("HAKMEM_POOL_MID_INUSE_DEFERRED_STATS");
g = (e && *e == '1') ? 1 : 0; // default OFF
}
return g;
}
// Statistics structure
typedef struct {
_Atomic uint64_t mid_inuse_deferred_hit; // Total deferred decrements
@ -27,21 +36,58 @@ typedef struct {
_Atomic uint64_t empty_transitions; // Pages that went to 0
} MidInuseDeferredStats;
typedef struct {
uint64_t mid_inuse_deferred_hit;
uint64_t drain_calls;
uint64_t pages_drained;
uint64_t decs_drained;
uint64_t empty_transitions;
} MidInuseDeferredStatsTls;
// Global stats instance
static MidInuseDeferredStats g_mid_inuse_deferred_stats;
// Stats increment macros (inline for hot path)
static __thread MidInuseDeferredStatsTls g_mid_inuse_deferred_stats_tls;
static inline MidInuseDeferredStatsTls* mid_inuse_deferred_stats_tls(void) {
return &g_mid_inuse_deferred_stats_tls;
}
static inline void mid_inuse_deferred_stats_flush_tls_to_global(void) {
if (!hak_pool_mid_inuse_deferred_stats_enabled()) return;
MidInuseDeferredStatsTls* tls = mid_inuse_deferred_stats_tls();
if (!tls->mid_inuse_deferred_hit && !tls->drain_calls) return;
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.mid_inuse_deferred_hit, tls->mid_inuse_deferred_hit, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.drain_calls, tls->drain_calls, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.pages_drained, tls->pages_drained, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.decs_drained, tls->decs_drained, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.empty_transitions, tls->empty_transitions, memory_order_relaxed);
*tls = (MidInuseDeferredStatsTls){0};
}
// Stats increment macros (hot path): default OFF, per-thread counters.
#define MID_INUSE_DEFERRED_STAT_INC(field) \
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.field, 1, memory_order_relaxed)
do { \
if (__builtin_expect(hak_pool_mid_inuse_deferred_stats_enabled(), 0)) { \
mid_inuse_deferred_stats_tls()->field++; \
} \
} while (0)
#define MID_INUSE_DEFERRED_STAT_ADD(field, n) \
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.field, (n), memory_order_relaxed)
do { \
if (__builtin_expect(hak_pool_mid_inuse_deferred_stats_enabled(), 0)) { \
mid_inuse_deferred_stats_tls()->field += (uint64_t)(n); \
} \
} while (0)
// Dump stats on exit (if ENV var set)
static void mid_inuse_deferred_stats_dump(void) {
// Only dump if deferred is enabled
const char* e = getenv("HAKMEM_POOL_MID_INUSE_DEFERRED");
if (!e || *e != '1') return;
if (!hak_pool_mid_inuse_deferred_stats_enabled()) return;
// Best-effort flush for the current thread (other threads flush at thread-exit cleanup).
mid_inuse_deferred_stats_flush_tls_to_global();
uint64_t hits = atomic_load_explicit(&g_mid_inuse_deferred_stats.mid_inuse_deferred_hit, memory_order_relaxed);
uint64_t drains = atomic_load_explicit(&g_mid_inuse_deferred_stats.drain_calls, memory_order_relaxed);
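
Note: the stats rework replaces always-on atomic increments with plain per-thread counters that are flushed to the atomic globals once per thread (this is the overhead the regression analysis attributes to the old macros). A self-contained illustration of that pattern, using generic names rather than the HAKMEM structs:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t g_hits;        // global, touched only at flush time
static __thread uint64_t t_hits;       // hot-path counter, no cache-line contention

static void stats_flush_tls_to_global(void) {
    if (t_hits) {
        atomic_fetch_add_explicit(&g_hits, t_hits, memory_order_relaxed);
        t_hits = 0;
    }
}

static void* worker(void* arg) {
    (void)arg;
    for (int i = 0; i < 1000000; i++)
        t_hits++;                      // plain increment on the hot path
    stats_flush_tls_to_global();       // one atomic add per thread
    return NULL;
}

int main(void) {                        // build with -pthread
    pthread_t th[4];
    for (int i = 0; i < 4; i++) pthread_create(&th[i], NULL, worker, NULL);
    for (int i = 0; i < 4; i++) pthread_join(th[i], NULL);
    printf("hits = %llu\n", (unsigned long long)atomic_load(&g_hits));
    return 0;
}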

core/box/ss_pt_env_box.h (new file, +27)

@ -0,0 +1,27 @@
#ifndef SS_PT_ENV_BOX_H
#define SS_PT_ENV_BOX_H
#include <stdlib.h>
#include <string.h>
// HAKMEM_SS_LOOKUP_KIND=hash|pt (default hash)
static inline int hak_ss_lookup_pt_enabled(void) {
static int g = -1;
if (__builtin_expect(g == -1, 0)) {
const char* e = getenv("HAKMEM_SS_LOOKUP_KIND");
g = (e && strcmp(e, "pt") == 0) ? 1 : 0;
}
return g;
}
// HAKMEM_SS_PT_STATS=1 (default 0, OFF)
static inline int hak_ss_pt_stats_enabled(void) {
static int g = -1;
if (__builtin_expect(g == -1, 0)) {
const char* e = getenv("HAKMEM_SS_PT_STATS");
g = (e && *e == '1') ? 1 : 0;
}
return g;
}
#endif
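
Note: both gates cache the getenv() result in a static on first call, so the environment must be decided before the first lookup. A small usage sketch (assumes core/box is on the include path and that neither gate has been evaluated yet):

#include <stdio.h>
#include <stdlib.h>
#include "ss_pt_env_box.h"

int main(void) {
    // Must run before the first hak_ss_lookup_pt_enabled() call,
    // because its result is cached on first use.
    setenv("HAKMEM_SS_LOOKUP_KIND", "pt", 1);
    setenv("HAKMEM_SS_PT_STATS", "1", 1);

    printf("pt lookup enabled: %d\n", hak_ss_lookup_pt_enabled());
    printf("pt stats enabled : %d\n", hak_ss_pt_stats_enabled());
    return 0;
}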

core/box/ss_pt_impl.c (new file, +7)

@ -0,0 +1,7 @@
#include "ss_pt_types_box.h"
// Global page table (2MB BSS)
SsPtL1 g_ss_pt = {0};
// TLS stats
__thread SsPtStats t_ss_pt_stats = {0};

core/box/ss_pt_lookup_box.h (new file, +36)

@ -0,0 +1,36 @@
#ifndef SS_PT_LOOKUP_BOX_H
#define SS_PT_LOOKUP_BOX_H
#include "ss_pt_types_box.h"
#include "ss_pt_env_box.h"
// O(1) lookup (hot path, lock-free)
static inline struct SuperSlab* ss_pt_lookup(void* addr) {
uintptr_t p = (uintptr_t)addr;
// Out-of-range check (>> 48 for LA57 compatibility)
if (__builtin_expect(p >> 48, 0)) {
if (hak_ss_pt_stats_enabled()) t_ss_pt_stats.pt_out_of_range++;
return NULL; // Fallback to hash handled by caller
}
uint32_t l1_idx = SS_PT_L1_INDEX(addr);
uint32_t l2_idx = SS_PT_L2_INDEX(addr);
// L1 load (acquire)
SsPtL2* l2 = atomic_load_explicit(&g_ss_pt.l2[l1_idx], memory_order_acquire);
if (__builtin_expect(l2 == NULL, 0)) {
if (hak_ss_pt_stats_enabled()) t_ss_pt_stats.pt_miss++;
return NULL;
}
// L2 load (acquire)
struct SuperSlab* ss = atomic_load_explicit(&l2->entries[l2_idx], memory_order_acquire);
if (hak_ss_pt_stats_enabled()) {
if (ss) t_ss_pt_stats.pt_hit++;
else t_ss_pt_stats.pt_miss++;
}
return ss;
}
#endif
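
Note: the two indexes come straight from the address bits: bits [0..18] are the offset inside a 512KB chunk, bits [19..29] select the L2 slot, and bits [30..47] select the L1 slot. A tiny worked example with the constants restated inline so it compiles stand-alone:

#include <stdint.h>
#include <stdio.h>

#define CHUNK_LG 19   // 512KB chunks
#define L2_BITS  11   // 2048 entries per L2
#define L1_BITS  18   // 256K L1 entries

int main(void) {
    uintptr_t addr = (uintptr_t)0x7f12345678f0ull;   // arbitrary example address
    uint32_t l2 = (uint32_t)((addr >> CHUNK_LG) & ((1u << L2_BITS) - 1));
    uint32_t l1 = (uint32_t)((addr >> (CHUNK_LG + L2_BITS)) & ((1u << L1_BITS) - 1));
    printf("addr=%#llx -> l1=%u l2=%u\n", (unsigned long long)addr, l1, l2);
    // Every address inside the same 512KB chunk maps to the same (l1, l2),
    // so one registration per chunk covers the whole chunk.
    return 0;
}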

core/box/ss_pt_register_box.h (new file, +74)

@ -0,0 +1,74 @@
#ifndef SS_PT_REGISTER_BOX_H
#define SS_PT_REGISTER_BOX_H
#include "ss_pt_types_box.h"
#include <sys/mman.h>
// Register single 512KB chunk (cold path)
static inline void ss_pt_register_chunk(void* chunk_base, struct SuperSlab* ss) {
uintptr_t p = (uintptr_t)chunk_base;
// Out-of-range check
if (p >> 48) return;
uint32_t l1_idx = SS_PT_L1_INDEX(chunk_base);
uint32_t l2_idx = SS_PT_L2_INDEX(chunk_base);
// Ensure L2 exists
SsPtL2* l2 = atomic_load_explicit(&g_ss_pt.l2[l1_idx], memory_order_acquire);
if (l2 == NULL) {
SsPtL2* new_l2 = (SsPtL2*)mmap(NULL, sizeof(SsPtL2),
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (new_l2 == MAP_FAILED) return;
SsPtL2* expected = NULL;
if (!atomic_compare_exchange_strong_explicit(&g_ss_pt.l2[l1_idx],
&expected, new_l2, memory_order_acq_rel, memory_order_acquire)) {
munmap(new_l2, sizeof(SsPtL2));
l2 = expected;
} else {
l2 = new_l2;
}
}
// Store SuperSlab pointer (release)
atomic_store_explicit(&l2->entries[l2_idx], ss, memory_order_release);
}
// Unregister single chunk (NULL store, L2 never freed)
static inline void ss_pt_unregister_chunk(void* chunk_base) {
uintptr_t p = (uintptr_t)chunk_base;
if (p >> 48) return;
uint32_t l1_idx = SS_PT_L1_INDEX(chunk_base);
uint32_t l2_idx = SS_PT_L2_INDEX(chunk_base);
SsPtL2* l2 = atomic_load_explicit(&g_ss_pt.l2[l1_idx], memory_order_acquire);
if (l2) {
atomic_store_explicit(&l2->entries[l2_idx], NULL, memory_order_release);
}
}
// Register all chunks of a SuperSlab (1MB=2 chunks, 2MB=4 chunks)
static inline void ss_pt_register(struct SuperSlab* ss, void* base, int lg_size) {
size_t size = (size_t)1 << lg_size;
size_t chunk_size = (size_t)1 << SS_PT_CHUNK_LG; // 512KB
size_t n_chunks = size / chunk_size;
for (size_t i = 0; i < n_chunks; i++) {
ss_pt_register_chunk((char*)base + i * chunk_size, ss);
}
}
static inline void ss_pt_unregister(void* base, int lg_size) {
size_t size = (size_t)1 << lg_size;
size_t chunk_size = (size_t)1 << SS_PT_CHUNK_LG;
size_t n_chunks = size / chunk_size;
for (size_t i = 0; i < n_chunks; i++) {
ss_pt_unregister_chunk((char*)base + i * chunk_size);
}
}
#endif
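
Note: registration is per 512KB chunk, so the "1MB=2 chunks, 2MB=4 chunks" comment above corresponds to lg_size 20 and 21. A quick check of that arithmetic:

#include <stdio.h>

#define SS_PT_CHUNK_LG 19   // 512KB, as defined in ss_pt_types_box.h

int main(void) {
    for (int lg = 20; lg <= 21; lg++) {                 // 1MB and 2MB SuperSlabs
        size_t n_chunks = ((size_t)1 << lg) >> SS_PT_CHUNK_LG;
        printf("lg_size=%d -> %zu chunk registrations\n", lg, n_chunks);
    }
    return 0;
}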

core/box/ss_pt_types_box.h (new file, +49)

@ -0,0 +1,49 @@
#ifndef SS_PT_TYPES_BOX_H
#define SS_PT_TYPES_BOX_H
#include <stdatomic.h>
#include <stdint.h>
// Constants (18/11 split as per design)
#define SS_PT_CHUNK_LG 19 // 512KB
#define SS_PT_L2_BITS 11 // 2K entries per L2
#define SS_PT_L1_BITS 18 // 256K L1 entries
#define SS_PT_L2_SIZE (1u << SS_PT_L2_BITS) // 2048
#define SS_PT_L1_SIZE (1u << SS_PT_L1_BITS) // 262144
#define SS_PT_L2_MASK (SS_PT_L2_SIZE - 1)
#define SS_PT_L1_MASK (SS_PT_L1_SIZE - 1)
// Index extraction macros
#define SS_PT_L1_INDEX(addr) \
((uint32_t)(((uintptr_t)(addr) >> (SS_PT_CHUNK_LG + SS_PT_L2_BITS)) & SS_PT_L1_MASK))
#define SS_PT_L2_INDEX(addr) \
((uint32_t)(((uintptr_t)(addr) >> SS_PT_CHUNK_LG) & SS_PT_L2_MASK))
// Forward declaration
struct SuperSlab;
// L2 page: 2K entries (16KB)
typedef struct SsPtL2 {
_Atomic(struct SuperSlab*) entries[SS_PT_L2_SIZE];
} SsPtL2;
// L1 table: 256K entries (2MB)
typedef struct SsPtL1 {
_Atomic(SsPtL2*) l2[SS_PT_L1_SIZE];
} SsPtL1;
// Global page table (defined in ss_pt_impl.c)
extern SsPtL1 g_ss_pt;
// Stats (TLS to avoid contention, aggregate on dump)
typedef struct SsPtStats {
uint64_t pt_hit;
uint64_t pt_miss;
uint64_t pt_out_of_range;
} SsPtStats;
extern __thread SsPtStats t_ss_pt_stats;
#endif
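
Note: with 8-byte pointers the footprint follows directly from these constants: the L1 array is 2^18 * 8 B = 2 MB of BSS, each lazily mmapped L2 page is 2^11 * 8 B = 16 KB, and 18 + 11 + 19 bits together span the full 48-bit address range. A compile-time check of that arithmetic (assumes core/box on the include path and 64-bit pointers):

#include <assert.h>
#include "ss_pt_types_box.h"

static_assert(sizeof(void*) == 8, "sketch assumes 64-bit pointers");
static_assert(sizeof(SsPtL2) == (1u << SS_PT_L2_BITS) * 8, "L2 page = 16KB");
static_assert(sizeof(SsPtL1) == (1u << SS_PT_L1_BITS) * 8, "L1 table = 2MB");
static_assert(SS_PT_L1_BITS + SS_PT_L2_BITS + SS_PT_CHUNK_LG == 48,
              "the table spans the 48-bit address space");

int main(void) { return 0; }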

core/hakmem_super_registry (hak_super_register / unregister path)

@ -4,6 +4,7 @@
#include "box/ss_addr_map_box.h" // Phase 9-1: SuperSlab address map
#include "box/ss_cold_start_box.inc.h" // Phase 11+: Cold Start prewarm defaults
#include "hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#include "box/ss_pt_register_box.h" // Phase 9-2: Page table registration
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
@ -135,6 +136,11 @@ int hak_super_register(uintptr_t base, SuperSlab* ss) {
// Phase 9-1: Also register in new hash table (for optimized lookup)
ss_map_insert(&g_ss_addr_map, (void*)base, ss);
// Phase 9-2: Register in page table (if enabled)
if (hak_ss_lookup_pt_enabled()) {
ss_pt_register(ss, (void*)base, lg);
}
pthread_mutex_unlock(&g_super_reg_lock);
return 1;
}
@ -214,6 +220,12 @@ hash_removed:
// Phase 12: per-class registry no longer keyed; no per-class removal required.
}
// Phase 9-2: Remove from page table (if enabled)
// Need to determine lg_size for unregistration
if (hak_ss_lookup_pt_enabled() && ss) {
ss_pt_unregister((void*)base, ss->lg_size);
}
// Phase 9-1: Also remove from new hash table
ss_map_remove(&g_ss_addr_map, (void*)base);

core/hakmem_super_registry (hak_super_lookup path)

@ -20,6 +20,8 @@
#include "hakmem_tiny_superslab.h" // For SuperSlab and SUPERSLAB_MAGIC
#include "box/ss_addr_map_box.h" // Phase 9-1: O(1) hash table lookup
#include "box/super_reg_box.h" // Phase X: profile-aware logical registry sizing
#include "box/ss_pt_lookup_box.h" // Phase 9-2: O(1) page table lookup
#include "box/ss_pt_env_box.h" // Phase 9-2: ENV gate for PT vs hash
// Registry configuration
// Increased from 4096 to 32768 to avoid registry exhaustion under
@ -115,13 +117,22 @@ static inline int hak_super_hash(uintptr_t base, int lg_size) {
// Lookup SuperSlab by pointer (lock-free, thread-safe)
// Returns: SuperSlab* if found, NULL otherwise
// Phase 9-1: Optimized with hash table O(1) lookup (replaced linear probing)
// Phase 9-2: Dispatch between page table (O(1) absolute) vs hash table (O(1) amortized)
static inline SuperSlab* hak_super_lookup(void* ptr) {
if (!g_super_reg_initialized) return NULL;
// Phase 9-1: Use new O(1) hash table lookup
SuperSlab* ss = NULL;
// Phase 9-2: Try page table first if enabled
if (hak_ss_lookup_pt_enabled()) {
ss = ss_pt_lookup(ptr);
if (ss) return ss;
// Fallback to hash on miss (out_of_range or not registered)
}
// Phase 9-1: Use hash table lookup
// Replaces old linear probing (50-80 cycles → 10-20 cycles)
SuperSlab* ss = ss_map_lookup(&g_ss_addr_map, ptr);
ss = ss_map_lookup(&g_ss_addr_map, ptr);
// Fallback: If hash map misses (e.g., map not populated yet), probe the
// legacy registry table to avoid NULL for valid SuperSlabs.