Phase ALLOC-TINY-FAST-DUALHOT-1 & Optimization Roadmap Update

Add comprehensive design docs and research boxes:
- docs/analysis/ALLOC_TINY_FAST_DUALHOT_1_DESIGN.md: ALLOC DUALHOT investigation
- docs/analysis/FREE_TINY_FAST_DUALHOT_1_DESIGN.md: FREE DUALHOT final specs
- docs/analysis/FREE_TINY_FAST_HOTCOLD_OPT_1_DESIGN.md: Hot/Cold split research
- docs/analysis/POOL_MID_INUSE_DEFERRED_DN_BATCH_DESIGN.md: Deferred batching design
- docs/analysis/POOL_MID_INUSE_DEFERRED_REGRESSION_ANALYSIS.md: Stats overhead findings
- docs/analysis/MID_DESC_CACHE_BENCHMARK_2025-12-12.md: Cache measurement results
- docs/analysis/LAST_MATCH_CACHE_IMPLEMENTATION.md: TLS cache investigation

Research boxes (SS page table):
- core/box/ss_pt_env_box.h: HAKMEM_SS_LOOKUP_KIND gate
- core/box/ss_pt_types_box.h: 2-level page table structures
- core/box/ss_pt_lookup_box.h: ss_pt_lookup() implementation
- core/box/ss_pt_register_box.h: Page table registration
- core/box/ss_pt_impl.c: Global definitions

Updates:
- docs/specs/ENV_VARS_COMPLETE.md: HOTCOLD, DEFERRED, SS_LOOKUP env vars
- core/box/hak_free_api.inc.h: FREE-DISPATCH-SSOT integration
- core/box/pool_mid_inuse_deferred_box.h: Deferred API updates
- core/box/pool_mid_inuse_deferred_stats_box.h: Stats collection
- core/hakmem_super_registry: SS page table integration

Current Status:
- FREE-TINY-FAST-DUALHOT-1: +13% improvement, ready for adoption
- ALLOC-TINY-FAST-DUALHOT-1: -2% regression, frozen as research box
- Next: optimization roadmap prioritized by ROI (current gap to mimalloc: 2.5x)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Moe Charm (CI) committed on 2025-12-13 05:35:46 +09:00
commit d9991f39ff (parent b917357034)
18 changed files with 1721 additions and 25 deletions

core/box/hak_free_api.inc.h

@ -224,19 +224,42 @@ void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
// ========== Mid/L25/Tiny Registry Lookup (Headerless) ==========
// MIDCAND: Could be Mid/Large/C7, needs registry lookup
// Phase MID-V3: Try v3 ownership first (RegionIdBox-based)
// ENV-controlled, default OFF
if (__builtin_expect(mid_v3_enabled(), 0)) {
// Phase FREE-DISPATCH-SSOT: Single Source of Truth for region lookup
// ENV: HAKMEM_FREE_DISPATCH_SSOT (default: 0 for backward compat, 1 for optimized)
// Problem: Old code did region_id_lookup TWICE in MID-V3 path (once inside mid_hot_v3_free, once after)
// Fix: Do lookup ONCE at top, dispatch based on kind
static int g_free_dispatch_ssot = -1;
if (__builtin_expect(g_free_dispatch_ssot == -1, 0)) {
const char* env = getenv("HAKMEM_FREE_DISPATCH_SSOT");
g_free_dispatch_ssot = (env && *env == '1') ? 1 : 0;
}
if (g_free_dispatch_ssot && __builtin_expect(mid_v3_enabled(), 0)) {
// SSOT=1: Single lookup, then dispatch
extern RegionLookupV6 region_id_lookup_cached_v6(void* ptr);
RegionLookupV6 lk = region_id_lookup_cached_v6(ptr);
if (lk.kind == REGION_KIND_MID_V3) {
// Owned by MID-V3: call free handler directly (no internal lookup)
// Note: We pass the pre-looked-up info implicitly via TLS cache
mid_hot_v3_free(ptr);
if (mid_v3_debug_enabled()) {
static _Atomic int free_log_count = 0;
if (atomic_fetch_add(&free_log_count, 1) < 10) {
fprintf(stderr, "[MID_V3] Free SSOT: ptr=%p\n", ptr);
}
}
goto done;
}
// Not MID-V3: fall through to other dispatch paths below
} else if (__builtin_expect(mid_v3_enabled(), 0)) {
// SSOT=0: Legacy double-lookup path (for A/B comparison)
// RegionIdBox lookup to check if v3 owns this pointer
// mid_hot_v3_free() will check internally and return early if not owned
mid_hot_v3_free(ptr);
// Check if v3 actually owned it by doing a quick verification
// For now, we'll use the existence check via RegionIdBox
// If v3 handled it, it would have returned already
// We need to check if v3 took ownership - simplified: always check other paths too
// Better approach: mid_hot_v3_free returns bool or we check ownership first
// For safety, check ownership explicitly before continuing
// This prevents double-free if v3 handled it
extern RegionLookupV6 region_id_lookup_v6(void* ptr);
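
Note: the change above collapses two region lookups into one. A minimal stand-alone sketch of that dispatch shape is below; RegionLookupStub and region_lookup_stub are simplified stand-ins for RegionLookupV6 / region_id_lookup_cached_v6, and the gate uses the same lazy getenv caching as the real HAKMEM_FREE_DISPATCH_SSOT check.

#include <stdio.h>
#include <stdlib.h>

typedef enum { REGION_KIND_UNKNOWN = 0, REGION_KIND_MID_V3 } region_kind_t;
typedef struct { region_kind_t kind; } RegionLookupStub;

static RegionLookupStub region_lookup_stub(void* ptr) {
    (void)ptr;                       // placeholder: a real lookup consults RegionIdBox
    return (RegionLookupStub){ REGION_KIND_UNKNOWN };
}

static int ssot_enabled(void) {      // lazily cached ENV gate, default OFF
    static int g = -1;
    if (g == -1) {
        const char* e = getenv("HAKMEM_FREE_DISPATCH_SSOT");
        g = (e && *e == '1') ? 1 : 0;
    }
    return g;
}

static void free_dispatch(void* ptr) {
    if (ssot_enabled()) {
        RegionLookupStub lk = region_lookup_stub(ptr);   // single lookup at the top
        if (lk.kind == REGION_KIND_MID_V3) {
            puts("MID-V3 owns it: free directly, no second lookup");
            return;
        }
        puts("not MID-V3: fall through to the other dispatch paths");
    } else {
        puts("legacy path: handler re-checks ownership internally (double lookup)");
    }
}

int main(void) {
    int x;
    free_dispatch(&x);
    return 0;
}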

core/box/pool_mid_inuse_deferred_box.h

@ -72,6 +72,7 @@ static void mid_inuse_deferred_thread_cleanup(void* arg) {
(void)arg;
if (hak_pool_mid_inuse_deferred_enabled()) {
mid_inuse_deferred_drain();
mid_inuse_deferred_stats_flush_tls_to_global();
}
}
@ -193,15 +194,16 @@ static inline void mid_inuse_deferred_drain(void) {
MID_INUSE_DEFERRED_STAT_ADD(decs_drained, n);
// Atomic subtract (batched count)
uint64_t old = atomic_fetch_sub_explicit(&d->in_use, n, memory_order_relaxed);
int old = atomic_fetch_sub_explicit(&d->in_use, (int)n, memory_order_relaxed);
int nv = old - (int)n;
// Check for empty transition
if (old >= n && old - n == 0) {
if (nv <= 0) {
// Fire once per empty transition
// Use atomic_exchange to ensure only ONE thread enqueues DONTNEED
if (d->pending_dn == 0) {
d->pending_dn = 1;
if (atomic_exchange_explicit(&d->pending_dn, 1, memory_order_acq_rel) == 0) {
MID_INUSE_DEFERRED_STAT_INC(empty_transitions);
hak_batch_add_page(page, POOL_PAGE_SIZE);
hak_batch_add_page(d->page, POOL_PAGE_SIZE);
}
}
}
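
Note: the drain path now uses a signed counter plus an atomic exchange so that exactly one thread enqueues the DONTNEED batch per empty transition. A reduced, self-contained sketch of that logic (PageDescStub and hak_batch_add_page_stub are simplified stand-ins; POOL_PAGE_SIZE here is an illustrative value, not the real constant):

#include <stdatomic.h>
#include <stdio.h>

#define POOL_PAGE_SIZE (64 * 1024)   // illustrative value only

typedef struct {
    _Atomic int in_use;              // signed so the empty check tolerates reaching or passing zero
    _Atomic int pending_dn;          // 0/1 latch: has DONTNEED already been enqueued?
    void*       page;
} PageDescStub;

static void hak_batch_add_page_stub(void* page, size_t len) {
    printf("enqueue MADV_DONTNEED for %p (%zu bytes)\n", page, len);
}

// Apply a batch of n deferred decrements to one page descriptor.
static void drain_one(PageDescStub* d, unsigned n) {
    int old = atomic_fetch_sub_explicit(&d->in_use, (int)n, memory_order_relaxed);
    int nv  = old - (int)n;
    if (nv <= 0) {
        // Empty transition: atomic_exchange guarantees only one thread sees 0 -> 1.
        if (atomic_exchange_explicit(&d->pending_dn, 1, memory_order_acq_rel) == 0) {
            hak_batch_add_page_stub(d->page, POOL_PAGE_SIZE);
        }
    }
}

int main(void) {
    static char page[POOL_PAGE_SIZE];
    PageDescStub d = { 3, 0, page };
    drain_one(&d, 3);   // drops in_use to 0 -> enqueues exactly once
    drain_one(&d, 0);   // pending_dn already set -> no second enqueue
    return 0;
}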

core/box/pool_mid_inuse_deferred_stats_box.h

@ -18,6 +18,15 @@
#include <stdio.h>
#include <stdlib.h>
static inline int hak_pool_mid_inuse_deferred_stats_enabled(void) {
static int g = -1;
if (__builtin_expect(g == -1, 0)) {
const char* e = getenv("HAKMEM_POOL_MID_INUSE_DEFERRED_STATS");
g = (e && *e == '1') ? 1 : 0; // default OFF
}
return g;
}
// Statistics structure
typedef struct {
_Atomic uint64_t mid_inuse_deferred_hit; // Total deferred decrements
@ -27,21 +36,58 @@ typedef struct {
_Atomic uint64_t empty_transitions; // Pages that went to 0
} MidInuseDeferredStats;
typedef struct {
uint64_t mid_inuse_deferred_hit;
uint64_t drain_calls;
uint64_t pages_drained;
uint64_t decs_drained;
uint64_t empty_transitions;
} MidInuseDeferredStatsTls;
// Global stats instance
static MidInuseDeferredStats g_mid_inuse_deferred_stats;
// Stats increment macros (inline for hot path)
static __thread MidInuseDeferredStatsTls g_mid_inuse_deferred_stats_tls;
static inline MidInuseDeferredStatsTls* mid_inuse_deferred_stats_tls(void) {
return &g_mid_inuse_deferred_stats_tls;
}
static inline void mid_inuse_deferred_stats_flush_tls_to_global(void) {
if (!hak_pool_mid_inuse_deferred_stats_enabled()) return;
MidInuseDeferredStatsTls* tls = mid_inuse_deferred_stats_tls();
if (!tls->mid_inuse_deferred_hit && !tls->drain_calls) return;
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.mid_inuse_deferred_hit, tls->mid_inuse_deferred_hit, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.drain_calls, tls->drain_calls, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.pages_drained, tls->pages_drained, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.decs_drained, tls->decs_drained, memory_order_relaxed);
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.empty_transitions, tls->empty_transitions, memory_order_relaxed);
*tls = (MidInuseDeferredStatsTls){0};
}
// Stats increment macros (hot path): default OFF, per-thread counters.
#define MID_INUSE_DEFERRED_STAT_INC(field) \
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.field, 1, memory_order_relaxed)
do { \
if (__builtin_expect(hak_pool_mid_inuse_deferred_stats_enabled(), 0)) { \
mid_inuse_deferred_stats_tls()->field++; \
} \
} while (0)
#define MID_INUSE_DEFERRED_STAT_ADD(field, n) \
atomic_fetch_add_explicit(&g_mid_inuse_deferred_stats.field, (n), memory_order_relaxed)
do { \
if (__builtin_expect(hak_pool_mid_inuse_deferred_stats_enabled(), 0)) { \
mid_inuse_deferred_stats_tls()->field += (uint64_t)(n); \
} \
} while (0)
// Dump stats on exit (if ENV var set)
static void mid_inuse_deferred_stats_dump(void) {
// Only dump if deferred is enabled
const char* e = getenv("HAKMEM_POOL_MID_INUSE_DEFERRED");
if (!e || *e != '1') return;
if (!hak_pool_mid_inuse_deferred_stats_enabled()) return;
// Best-effort flush for the current thread (other threads flush at thread-exit cleanup).
mid_inuse_deferred_stats_flush_tls_to_global();
uint64_t hits = atomic_load_explicit(&g_mid_inuse_deferred_stats.mid_inuse_deferred_hit, memory_order_relaxed);
uint64_t drains = atomic_load_explicit(&g_mid_inuse_deferred_stats.drain_calls, memory_order_relaxed);
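
Note: the stats rework replaces always-on atomic increments with plain per-thread counters that are flushed to the atomic globals once per thread (this is the overhead the regression analysis attributes to the old macros). A self-contained illustration of that pattern, using generic names rather than the HAKMEM structs:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t g_hits;        // global, touched only at flush time
static __thread uint64_t t_hits;       // hot-path counter, no cache-line contention

static void stats_flush_tls_to_global(void) {
    if (t_hits) {
        atomic_fetch_add_explicit(&g_hits, t_hits, memory_order_relaxed);
        t_hits = 0;
    }
}

static void* worker(void* arg) {
    (void)arg;
    for (int i = 0; i < 1000000; i++)
        t_hits++;                      // plain increment on the hot path
    stats_flush_tls_to_global();       // one atomic add per thread
    return NULL;
}

int main(void) {                        // build with -pthread
    pthread_t th[4];
    for (int i = 0; i < 4; i++) pthread_create(&th[i], NULL, worker, NULL);
    for (int i = 0; i < 4; i++) pthread_join(th[i], NULL);
    printf("hits = %llu\n", (unsigned long long)atomic_load(&g_hits));
    return 0;
}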

core/box/ss_pt_env_box.h (new file, +27)

@ -0,0 +1,27 @@
#ifndef SS_PT_ENV_BOX_H
#define SS_PT_ENV_BOX_H
#include <stdlib.h>
#include <string.h>
// HAKMEM_SS_LOOKUP_KIND=hash|pt (default hash)
static inline int hak_ss_lookup_pt_enabled(void) {
static int g = -1;
if (__builtin_expect(g == -1, 0)) {
const char* e = getenv("HAKMEM_SS_LOOKUP_KIND");
g = (e && strcmp(e, "pt") == 0) ? 1 : 0;
}
return g;
}
// HAKMEM_SS_PT_STATS=1 (default 0, OFF)
static inline int hak_ss_pt_stats_enabled(void) {
static int g = -1;
if (__builtin_expect(g == -1, 0)) {
const char* e = getenv("HAKMEM_SS_PT_STATS");
g = (e && *e == '1') ? 1 : 0;
}
return g;
}
#endif
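
Note: both gates cache the getenv() result in a static on first call, so the environment must be decided before the first lookup. A small usage sketch (assumes core/box is on the include path and that neither gate has been evaluated yet):

#include <stdio.h>
#include <stdlib.h>
#include "ss_pt_env_box.h"

int main(void) {
    // Must run before the first hak_ss_lookup_pt_enabled() call,
    // because its result is cached on first use.
    setenv("HAKMEM_SS_LOOKUP_KIND", "pt", 1);
    setenv("HAKMEM_SS_PT_STATS", "1", 1);

    printf("pt lookup enabled: %d\n", hak_ss_lookup_pt_enabled());
    printf("pt stats enabled : %d\n", hak_ss_pt_stats_enabled());
    return 0;
}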

core/box/ss_pt_impl.c (new file, +7)

@ -0,0 +1,7 @@
#include "ss_pt_types_box.h"
// Global page table (2MB BSS)
SsPtL1 g_ss_pt = {0};
// TLS stats
__thread SsPtStats t_ss_pt_stats = {0};

core/box/ss_pt_lookup_box.h (new file, +36)

@ -0,0 +1,36 @@
#ifndef SS_PT_LOOKUP_BOX_H
#define SS_PT_LOOKUP_BOX_H
#include "ss_pt_types_box.h"
#include "ss_pt_env_box.h"
// O(1) lookup (hot path, lock-free)
static inline struct SuperSlab* ss_pt_lookup(void* addr) {
uintptr_t p = (uintptr_t)addr;
// Out-of-range check (>> 48 for LA57 compatibility)
if (__builtin_expect(p >> 48, 0)) {
if (hak_ss_pt_stats_enabled()) t_ss_pt_stats.pt_out_of_range++;
return NULL; // Fallback to hash handled by caller
}
uint32_t l1_idx = SS_PT_L1_INDEX(addr);
uint32_t l2_idx = SS_PT_L2_INDEX(addr);
// L1 load (acquire)
SsPtL2* l2 = atomic_load_explicit(&g_ss_pt.l2[l1_idx], memory_order_acquire);
if (__builtin_expect(l2 == NULL, 0)) {
if (hak_ss_pt_stats_enabled()) t_ss_pt_stats.pt_miss++;
return NULL;
}
// L2 load (acquire)
struct SuperSlab* ss = atomic_load_explicit(&l2->entries[l2_idx], memory_order_acquire);
if (hak_ss_pt_stats_enabled()) {
if (ss) t_ss_pt_stats.pt_hit++;
else t_ss_pt_stats.pt_miss++;
}
return ss;
}
#endif
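
Note: the two indexes come straight from the address bits: bits [0..18] are the offset inside a 512KB chunk, bits [19..29] select the L2 slot, and bits [30..47] select the L1 slot. A tiny worked example with the constants restated inline so it compiles stand-alone:

#include <stdint.h>
#include <stdio.h>

#define CHUNK_LG 19   // 512KB chunks
#define L2_BITS  11   // 2048 entries per L2
#define L1_BITS  18   // 256K L1 entries

int main(void) {
    uintptr_t addr = (uintptr_t)0x7f12345678f0ull;   // arbitrary example address
    uint32_t l2 = (uint32_t)((addr >> CHUNK_LG) & ((1u << L2_BITS) - 1));
    uint32_t l1 = (uint32_t)((addr >> (CHUNK_LG + L2_BITS)) & ((1u << L1_BITS) - 1));
    printf("addr=%#llx -> l1=%u l2=%u\n", (unsigned long long)addr, l1, l2);
    // Every address inside the same 512KB chunk maps to the same (l1, l2),
    // so one registration per chunk covers the whole chunk.
    return 0;
}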

core/box/ss_pt_register_box.h (new file, +74)

@ -0,0 +1,74 @@
#ifndef SS_PT_REGISTER_BOX_H
#define SS_PT_REGISTER_BOX_H
#include "ss_pt_types_box.h"
#include <sys/mman.h>
// Register single 512KB chunk (cold path)
static inline void ss_pt_register_chunk(void* chunk_base, struct SuperSlab* ss) {
uintptr_t p = (uintptr_t)chunk_base;
// Out-of-range check
if (p >> 48) return;
uint32_t l1_idx = SS_PT_L1_INDEX(chunk_base);
uint32_t l2_idx = SS_PT_L2_INDEX(chunk_base);
// Ensure L2 exists
SsPtL2* l2 = atomic_load_explicit(&g_ss_pt.l2[l1_idx], memory_order_acquire);
if (l2 == NULL) {
SsPtL2* new_l2 = (SsPtL2*)mmap(NULL, sizeof(SsPtL2),
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (new_l2 == MAP_FAILED) return;
SsPtL2* expected = NULL;
if (!atomic_compare_exchange_strong_explicit(&g_ss_pt.l2[l1_idx],
&expected, new_l2, memory_order_acq_rel, memory_order_acquire)) {
munmap(new_l2, sizeof(SsPtL2));
l2 = expected;
} else {
l2 = new_l2;
}
}
// Store SuperSlab pointer (release)
atomic_store_explicit(&l2->entries[l2_idx], ss, memory_order_release);
}
// Unregister single chunk (NULL store, L2 never freed)
static inline void ss_pt_unregister_chunk(void* chunk_base) {
uintptr_t p = (uintptr_t)chunk_base;
if (p >> 48) return;
uint32_t l1_idx = SS_PT_L1_INDEX(chunk_base);
uint32_t l2_idx = SS_PT_L2_INDEX(chunk_base);
SsPtL2* l2 = atomic_load_explicit(&g_ss_pt.l2[l1_idx], memory_order_acquire);
if (l2) {
atomic_store_explicit(&l2->entries[l2_idx], NULL, memory_order_release);
}
}
// Register all chunks of a SuperSlab (1MB=2 chunks, 2MB=4 chunks)
static inline void ss_pt_register(struct SuperSlab* ss, void* base, int lg_size) {
size_t size = (size_t)1 << lg_size;
size_t chunk_size = (size_t)1 << SS_PT_CHUNK_LG; // 512KB
size_t n_chunks = size / chunk_size;
for (size_t i = 0; i < n_chunks; i++) {
ss_pt_register_chunk((char*)base + i * chunk_size, ss);
}
}
static inline void ss_pt_unregister(void* base, int lg_size) {
size_t size = (size_t)1 << lg_size;
size_t chunk_size = (size_t)1 << SS_PT_CHUNK_LG;
size_t n_chunks = size / chunk_size;
for (size_t i = 0; i < n_chunks; i++) {
ss_pt_unregister_chunk((char*)base + i * chunk_size);
}
}
#endif
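
Note: registration is per 512KB chunk, so the "1MB=2 chunks, 2MB=4 chunks" comment above corresponds to lg_size 20 and 21. A quick check of that arithmetic:

#include <stdio.h>

#define SS_PT_CHUNK_LG 19   // 512KB, as defined in ss_pt_types_box.h

int main(void) {
    for (int lg = 20; lg <= 21; lg++) {                 // 1MB and 2MB SuperSlabs
        size_t n_chunks = ((size_t)1 << lg) >> SS_PT_CHUNK_LG;
        printf("lg_size=%d -> %zu chunk registrations\n", lg, n_chunks);
    }
    return 0;
}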

core/box/ss_pt_types_box.h (new file, +49)

@ -0,0 +1,49 @@
#ifndef SS_PT_TYPES_BOX_H
#define SS_PT_TYPES_BOX_H
#include <stdatomic.h>
#include <stdint.h>
// Constants (18/11 split as per design)
#define SS_PT_CHUNK_LG 19 // 512KB
#define SS_PT_L2_BITS 11 // 2K entries per L2
#define SS_PT_L1_BITS 18 // 256K L1 entries
#define SS_PT_L2_SIZE (1u << SS_PT_L2_BITS) // 2048
#define SS_PT_L1_SIZE (1u << SS_PT_L1_BITS) // 262144
#define SS_PT_L2_MASK (SS_PT_L2_SIZE - 1)
#define SS_PT_L1_MASK (SS_PT_L1_SIZE - 1)
// Index extraction macros
#define SS_PT_L1_INDEX(addr) \
((uint32_t)(((uintptr_t)(addr) >> (SS_PT_CHUNK_LG + SS_PT_L2_BITS)) & SS_PT_L1_MASK))
#define SS_PT_L2_INDEX(addr) \
((uint32_t)(((uintptr_t)(addr) >> SS_PT_CHUNK_LG) & SS_PT_L2_MASK))
// Forward declaration
struct SuperSlab;
// L2 page: 2K entries (16KB)
typedef struct SsPtL2 {
_Atomic(struct SuperSlab*) entries[SS_PT_L2_SIZE];
} SsPtL2;
// L1 table: 256K entries (2MB)
typedef struct SsPtL1 {
_Atomic(SsPtL2*) l2[SS_PT_L1_SIZE];
} SsPtL1;
// Global page table (defined in ss_pt_impl.c)
extern SsPtL1 g_ss_pt;
// Stats (TLS to avoid contention, aggregate on dump)
typedef struct SsPtStats {
uint64_t pt_hit;
uint64_t pt_miss;
uint64_t pt_out_of_range;
} SsPtStats;
extern __thread SsPtStats t_ss_pt_stats;
#endif
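
Note: with 8-byte pointers the footprint follows directly from these constants: the L1 array is 2^18 * 8 B = 2 MB of BSS, each lazily mmapped L2 page is 2^11 * 8 B = 16 KB, and 18 + 11 + 19 bits together span the full 48-bit address range. A compile-time check of that arithmetic (assumes core/box on the include path and 64-bit pointers):

#include <assert.h>
#include "ss_pt_types_box.h"

static_assert(sizeof(void*) == 8, "sketch assumes 64-bit pointers");
static_assert(sizeof(SsPtL2) == (1u << SS_PT_L2_BITS) * 8, "L2 page = 16KB");
static_assert(sizeof(SsPtL1) == (1u << SS_PT_L1_BITS) * 8, "L1 table = 2MB");
static_assert(SS_PT_L1_BITS + SS_PT_L2_BITS + SS_PT_CHUNK_LG == 48,
              "the table spans the 48-bit address space");

int main(void) { return 0; }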

core/hakmem_super_registry (hak_super_register / unregister path)

@ -4,6 +4,7 @@
#include "box/ss_addr_map_box.h" // Phase 9-1: SuperSlab address map
#include "box/ss_cold_start_box.inc.h" // Phase 11+: Cold Start prewarm defaults
#include "hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#include "box/ss_pt_register_box.h" // Phase 9-2: Page table registration
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
@ -135,6 +136,11 @@ int hak_super_register(uintptr_t base, SuperSlab* ss) {
// Phase 9-1: Also register in new hash table (for optimized lookup)
ss_map_insert(&g_ss_addr_map, (void*)base, ss);
// Phase 9-2: Register in page table (if enabled)
if (hak_ss_lookup_pt_enabled()) {
ss_pt_register(ss, (void*)base, lg);
}
pthread_mutex_unlock(&g_super_reg_lock);
return 1;
}
@ -214,6 +220,12 @@ hash_removed:
// Phase 12: per-class registry no longer keyed; no per-class removal required.
}
// Phase 9-2: Remove from page table (if enabled)
// Need to determine lg_size for unregistration
if (hak_ss_lookup_pt_enabled() && ss) {
ss_pt_unregister((void*)base, ss->lg_size);
}
// Phase 9-1: Also remove from new hash table
ss_map_remove(&g_ss_addr_map, (void*)base);

core/hakmem_super_registry (hak_super_lookup path)

@ -20,6 +20,8 @@
#include "hakmem_tiny_superslab.h" // For SuperSlab and SUPERSLAB_MAGIC
#include "box/ss_addr_map_box.h" // Phase 9-1: O(1) hash table lookup
#include "box/super_reg_box.h" // Phase X: profile-aware logical registry sizing
#include "box/ss_pt_lookup_box.h" // Phase 9-2: O(1) page table lookup
#include "box/ss_pt_env_box.h" // Phase 9-2: ENV gate for PT vs hash
// Registry configuration
// Increased from 4096 to 32768 to avoid registry exhaustion under
@ -115,13 +117,22 @@ static inline int hak_super_hash(uintptr_t base, int lg_size) {
// Lookup SuperSlab by pointer (lock-free, thread-safe)
// Returns: SuperSlab* if found, NULL otherwise
// Phase 9-1: Optimized with hash table O(1) lookup (replaced linear probing)
// Phase 9-2: Dispatch between page table (O(1) absolute) vs hash table (O(1) amortized)
static inline SuperSlab* hak_super_lookup(void* ptr) {
if (!g_super_reg_initialized) return NULL;
// Phase 9-1: Use new O(1) hash table lookup
SuperSlab* ss = NULL;
// Phase 9-2: Try page table first if enabled
if (hak_ss_lookup_pt_enabled()) {
ss = ss_pt_lookup(ptr);
if (ss) return ss;
// Fallback to hash on miss (out_of_range or not registered)
}
// Phase 9-1: Use hash table lookup
// Replaces old linear probing (50-80 cycles → 10-20 cycles)
SuperSlab* ss = ss_map_lookup(&g_ss_addr_map, ptr);
ss = ss_map_lookup(&g_ss_addr_map, ptr);
// Fallback: If hash map misses (e.g., map not populated yet), probe the
// legacy registry table to avoid NULL for valid SuperSlabs.