hakmem/core/tiny_alloc_fast.inc.h

// tiny_alloc_fast.inc.h - Box 5: Allocation Fast Path (3-4 instructions)
// Purpose: Ultra-fast TLS freelist pop (inspired by System tcache & Mid-Large HAKX +171%)
// Invariant: Hit rate > 95% → 3-4 instructions, Miss → refill from backend
// Design: "Simple Front + Smart Back" - Front is dumb & fast, Back is smart
#pragma once
#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Debug-only counters. They exist only when HAKMEM_DEBUG_COUNTERS evaluates to
// non-zero at preprocessing time (an undefined macro evaluates to 0 in #if).
// All are declared here as unsized arrays and defined elsewhere; the two this
// header writes (g_rf_total_calls, g_free_via_tls_sll) are indexed by
// size-class index.
// NOTE(review): the remaining counters are presumably per-class as well;
// confirm against their definitions.

// Refill-stage counters (defined in hakmem_tiny.c)
extern unsigned long long g_rf_total_calls[];  // incremented once per tiny_alloc_fast_refill() call
extern unsigned long long g_rf_hit_bench[];
extern unsigned long long g_rf_hit_hot[];
extern unsigned long long g_rf_hit_mail[];
extern unsigned long long g_rf_hit_slab[];     // per the note in tiny_alloc_fast_refill(), updated inside sll_refill_small_from_ss()
extern unsigned long long g_rf_hit_ss[];
extern unsigned long long g_rf_hit_reg[];
extern unsigned long long g_rf_mmap_calls[];

// Publish hits (defined in hakmem_tiny.c); not referenced in this header
extern unsigned long long g_pub_mail_hits[];
extern unsigned long long g_pub_bench_hits[];
extern unsigned long long g_pub_hot_hits[];

// Free pipeline (defined in hakmem_tiny.c)
extern unsigned long long g_free_via_tls_sll[];  // incremented by tiny_alloc_fast_pop() on every hit
#endif

// ========== Box 5: Allocation Fast Path ==========
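// Fast Allocation layer of the Box Theory design: pops directly from the TLS freelist (3-4 instructions).
// Invariants:
// - If the TLS freelist is non-empty, return immediately (no lock, no sync)
// - On a miss, delegate to the Backend (Box 3: SuperSlab)
// - Cross-thread allocation is not considered here (the Backend handles it)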

// External TLS variables (defined in hakmem_tiny.c)
// g_tls_sll_head[c]  - first free block of the calling thread's list for class c
//                      (NULL when the list is empty). Each free block stores the
//                      pointer to the next block in its first pointer-sized word.
// g_tls_sll_count[c] - length counter for that list, maintained by the
//                      pop/push/reset helpers in this header.
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];

// External backend functions
// sll_refill_small_from_ss: asked to move up to `max_take` blocks of the class
//   onto the TLS list; this header treats a result > 0 as "blocks were added".
//   NOTE(review): exact return contract lives in the backend; confirm there.
// hak_tiny_alloc_slow: last-resort allocation; this header treats NULL as failure.
// hak_tiny_size_to_class: maps a byte size to a class index; this header treats
//   a negative result as "not a tiny size".
extern int sll_refill_small_from_ss(int class_idx, int max_take);
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
extern int hak_tiny_size_to_class(size_t size);

// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
// A value <= 0 is treated as "not configured" by tiny_alloc_fast_refill(), which
// resolves the batch size as: per-class, then hot (classes 0..3) or mid
// (classes 4 and up), then global, then a built-in default.
extern int g_refill_count_global;
extern int g_refill_count_hot;
extern int g_refill_count_mid;
extern int g_refill_count_class[TINY_NUM_CLASSES];

// External macros
// HAK_RET_ALLOC(cls, ptr): statement tiny_alloc_fast() uses to hand a block back
// to its caller. A definition made before this point takes precedence; this
// fallback simply returns `ptr` and ignores `cls`.
#ifndef HAK_RET_ALLOC
#define HAK_RET_ALLOC(cls, ptr) return (ptr)
#endif

// ========== RDTSC Profiling (lightweight) ==========
#ifdef __x86_64__
// Read the CPU time-stamp counter (x86-64 builds only).
// RDTSC delivers the low 32 bits in EAX and the high 32 bits in EDX; the two
// halves are combined into a single 64-bit tick count.
static inline uint64_t tiny_fast_rdtsc(void) {
    unsigned int tsc_low;
    unsigned int tsc_high;
    __asm__ __volatile__ ("rdtsc" : "=a" (tsc_low), "=d" (tsc_high));

    uint64_t ticks = (uint64_t)tsc_high;
    ticks <<= 32;
    ticks |= tsc_low;
    return ticks;
}
#else
// Other targets have no counter wired up: always 0. The callers in this file
// use a zero start stamp to mean "do not record", so no samples are taken.
static inline uint64_t tiny_fast_rdtsc(void) {
    return 0;
}
#endif

// Profiling state (enable with HAKMEM_TINY_PROFILE=1).
// The four counters are thread-local: each thread accumulates its own totals.
static __thread uint64_t g_tiny_alloc_hits = 0;     // timed fast-path pops
static __thread uint64_t g_tiny_alloc_cycles = 0;   // ticks spent in those pops
static __thread uint64_t g_tiny_refill_calls = 0;   // timed refill calls
static __thread uint64_t g_tiny_refill_cycles = 0;  // ticks spent in those refills
// Shared tri-state switch: -1 = environment not consulted yet, 0 = off, 1 = on.
static int g_tiny_profile_enabled = -1;

// Report whether profiling is switched on.
// The first call reads HAKMEM_TINY_PROFILE and caches the answer; profiling is
// on when the variable is set, non-empty and does not begin with '0'.
// Returns: 1 when enabled, 0 otherwise.
static inline int tiny_profile_enabled(void) {
    int state = g_tiny_profile_enabled;
    if (__builtin_expect(state != -1, 1)) {
        return state;  // already resolved
    }

    const char* setting = getenv("HAKMEM_TINY_PROFILE");
    if (setting == NULL || setting[0] == '\0' || setting[0] == '0') {
        state = 0;
    } else {
        state = 1;
    }
    g_tiny_profile_enabled = state;
    return state;
}

// Print the profiling summary to stderr at program shutdown.
//
// Registered via __attribute__((destructor)). Nothing is printed when profiling
// is disabled or when no sample was recorded.
//
// The counters are thread-local, so the report covers only the thread on which
// this destructor runs; totals from other threads are not aggregated.
// Because the function is `static` in a header, each translation unit that
// includes it registers its own copy.
//
// The 64-bit counters are printed with %llu / unsigned long long so the values
// are not truncated on targets where unsigned long is 32 bits wide.
static void tiny_fast_print_profile(void) __attribute__((destructor));
static void tiny_fast_print_profile(void) {
    if (!tiny_profile_enabled()) {
        return;
    }
    if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) {
        return;
    }

    fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
    if (g_tiny_alloc_hits > 0) {
        // The guard above keeps the average free of a division by zero.
        fprintf(stderr, "[ALLOC HIT] count=%llu, avg_cycles=%llu\n",
                (unsigned long long)g_tiny_alloc_hits,
                (unsigned long long)(g_tiny_alloc_cycles / g_tiny_alloc_hits));
    }
    if (g_tiny_refill_calls > 0) {
        fprintf(stderr, "[REFILL]    count=%llu, avg_cycles=%llu\n",
                (unsigned long long)g_tiny_refill_calls,
                (unsigned long long)(g_tiny_refill_cycles / g_tiny_refill_calls));
    }
    fprintf(stderr, "===================================================\n\n");
}

// ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========

// Pop one block from the calling thread's freelist for `class_idx`.
//
// Returns: the block that was at the head of the list, or NULL when the list
// is empty (the caller should then try refill / slow path).
//
// The list manipulation itself is a load of the head, a NULL test, a load of
// the successor stored in the block's first word, and a store of the new head.
// On top of that the function checks the profiling switch, decrements the
// length counter and, when enabled at compile time, bumps a debug counter.
//
// `class_idx` is used as an array index without a range check.
static inline void* tiny_alloc_fast_pop(int class_idx) {
    // A zero stamp doubles as "do not record" in the bookkeeping below.
    const uint64_t t0 = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;

    // Box boundary: take the first block of the TLS freelist.
    // Ownership: the list is thread-local, so no ownership check is needed.
    void* const block = g_tls_sll_head[class_idx];
    if (__builtin_expect(block == NULL, 0)) {
        // Miss: nothing to hand out; the caller is expected to refill.
        return NULL;
    }

    // Unlink: the successor pointer lives in the block's first word.
    void** const link = (void**)block;
    g_tls_sll_head[class_idx] = *link;

    // Keep the length counter in step without letting it wrap below zero.
    if (g_tls_sll_count[class_idx] != 0) {
        g_tls_sll_count[class_idx] -= 1;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Compile-time gated hit counter (no cost when the macro is off).
    // NOTE(review): this counter is declared under "Free pipeline" but is
    // incremented on the allocation pop; confirm that is intended.
    g_free_via_tls_sll[class_idx]++;
#endif

    if (t0 != 0) {
        g_tiny_alloc_cycles += tiny_fast_rdtsc() - t0;
        g_tiny_alloc_hits += 1;
    }
    return block;
}

// ========== Refill Path: Backend Integration ==========

// Ask the backend to put more blocks of `class_idx` on the TLS freelist.
//
// Returns: the value returned by sll_refill_small_from_ss(); tiny_alloc_fast()
// treats a result > 0 as "blocks were added".
//
// Batch size: resolved once per thread and class, then cached. Precedence is
//   g_refill_count_class[class_idx]  >  g_refill_count_hot (classes 0..3) or
//   g_refill_count_mid (classes 4 and up)  >  g_refill_count_global  >  16,
// where a value <= 0 means "not configured". The result is clamped to 8..256.
// NOTE(review): the g_refill_count_* values are presumably filled from the
// HAKMEM_TINY_REFILL_COUNT* environment variables at init; confirm in
// hakmem_tiny.c.
// - Smaller count (8-16): better for diverse workloads, faster warmup
// - Larger count (64-128): better for homogeneous workloads, fewer refills
static inline int tiny_alloc_fast_refill(int class_idx) {
    const uint64_t t0 = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;

    // Thread-local cache of the resolved batch size; 0 = not resolved yet.
    static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
    int batch = s_refill_count[class_idx];
    if (__builtin_expect(batch == 0, 0)) {
        // Only plain ints are read here; no getenv on this path.
        const int per_class = g_refill_count_class[class_idx];
        const int per_tier = (class_idx <= 3) ? g_refill_count_hot
                                              : g_refill_count_mid;

        if (per_class > 0) {
            batch = per_class;
        } else if (per_tier > 0) {
            batch = per_tier;
        } else if (g_refill_count_global > 0) {
            batch = g_refill_count_global;
        } else {
            batch = 16;  // built-in default (smaller = less overhead per refill)
        }

        // Clamp to a sane range.
        if (batch < 8) {
            batch = 8;      // floor: avoid thrashing
        } else if (batch > 256) {
            batch = 256;    // ceiling: avoid excessive TLS memory
        }

        s_refill_count[class_idx] = batch;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Count refill calls (compile-time gated).
    g_rf_total_calls[class_idx]++;
#endif

    // Box boundary: delegate to the backend (Box 3: SuperSlab).
    // g_rf_hit_slab is updated inside sll_refill_small_from_ss(), not here.
    const int refilled = sll_refill_small_from_ss(class_idx, batch);

    if (t0 != 0) {
        g_tiny_refill_cycles += tiny_fast_rdtsc() - t0;
        g_tiny_refill_calls += 1;
    }

    return refilled;
}

// ========== Combined Fast Path (Alloc + Refill) ==========

// Allocate `size` bytes through the tiny front end.
//
// Returns: a block on success; NULL when hak_tiny_size_to_class() rejects the
// size (negative class) or when every stage below comes back empty.
//
// Stages, each tried only if the previous one produced nothing:
//   1. pop from the TLS freelist
//   2. refill the TLS freelist from the backend, then pop again
//   3. hak_tiny_alloc_slow()
// A successful block is always handed back through HAK_RET_ALLOC.
//
// Example usage:
//   void* ptr = tiny_alloc_fast(64);
//   if (!ptr) {
//       // OOM handling
//   }
static inline void* tiny_alloc_fast(size_t size) {
    // Size -> class index; a negative class means the size is not a tiny size.
    const int cls = hak_tiny_size_to_class(size);
    if (__builtin_expect(cls < 0, 0)) {
        return NULL;
    }

    // Stage 1: TLS freelist pop (the expected common case).
    void* blk = tiny_alloc_fast_pop(cls);

    if (__builtin_expect(blk == NULL, 0)) {
        // Stage 2: refill from the backend (Box 3: SuperSlab) and retry.
        if (__builtin_expect(tiny_alloc_fast_refill(cls) > 0, 1)) {
            blk = tiny_alloc_fast_pop(cls);
        }

        // Stage 3: refill failed or the list is still empty -> slow path.
        if (blk == NULL) {
            blk = hak_tiny_alloc_slow(size, cls);
        }

        if (blk == NULL) {
            return NULL;  // every stage came back empty
        }
    }

    HAK_RET_ALLOC(cls, blk);
    return blk;  // not reached with the fallback HAK_RET_ALLOC, which returns
}

// ========== Push to TLS Freelist (for free path) ==========

// Put `ptr` at the front of the calling thread's freelist for `class_idx`.
// Helper for the free fast path (Box 6).
//
// The previous head is written into the first pointer-sized word of `ptr`, so
// the block must be writable and at least that large.
// No ownership or range checks are performed: the caller (Box 6) must ensure
// the block belongs to the current thread and that `class_idx` is valid.
static inline void tiny_alloc_fast_push(int class_idx, void* ptr) {
    // Box boundary: link the block in front of the current head.
    void** const link = (void**)ptr;
    *link = g_tls_sll_head[class_idx];

    g_tls_sll_head[class_idx] = ptr;
    g_tls_sll_count[class_idx] += 1;
}

// ========== Statistics & Diagnostics ==========

// Snapshot of one size class's TLS freelist state (for debugging/profiling),
// as returned by tiny_alloc_fast_stats().
typedef struct {
    int class_idx;  // size-class index the snapshot describes
    void* head;  // list head at snapshot time (g_tls_sll_head[class_idx]); NULL when empty
    uint32_t count;  // length counter at snapshot time (g_tls_sll_count[class_idx])
} TinyAllocFastStats;

static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
    TinyAllocFastStats stats = {
        .class_idx = class_idx,
        .head = g_tls_sll_head[class_idx],
        .count = g_tls_sll_count[class_idx]
    };
    return stats;
}

// Forget the calling thread's freelist for `class_idx` (testing/benchmarking).
// WARNING: the blocks that were on the list are simply dropped, not returned
// anywhere, so this leaks memory. Only use in controlled test environments.
static inline void tiny_alloc_fast_reset(int class_idx) {
    g_tls_sll_count[class_idx] = 0;
    g_tls_sll_head[class_idx] = NULL;
}

// ========== Performance Notes ==========
//
// Expected metrics (based on System tcache & HAKX +171% results):
// - Fast path hit rate: 95%+ (workload dependent)
// - Fast path latency: 3-4 instructions (1-2 cycles on modern CPUs)
// - Miss penalty: ~20-50 instructions (refill from SuperSlab)
// - Throughput improvement: +10-25% vs current multi-layer design
//
// Key optimizations:
// 1. `__builtin_expect` for branch prediction (hot path first)
// 2. `static inline` for zero-cost abstraction
// 3. TLS variables (no atomic ops, no locks)
// 4. Minimal work in fast path (defer stats/accounting to backend)
//
// Comparison with current design:
// - Current: 20+ instructions (Magazine → SuperSlab → ACE → ...)
// - New: 3-4 instructions (TLS freelist pop only)
// - Reduction: -80% instructions in hot path
//
// Inspired by:
// - System tcache (glibc malloc) - 3-4 instruction fast path
// - HAKX Mid-Large (+171%) - "Simple Front + Smart Back"
// - Box Theory - Clear boundaries, minimal coupling
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								// tiny_alloc_fast.inc.h - Box 5: Allocation Fast Path (3-4 instructions)
 								// Purpose: Ultra-fast TLS freelist pop (inspired by System tcache & Mid-Large HAKX +171%)
 								// Invariant: Hit rate > 95% → 3-4 instructions, Miss → refill from backend
 								// Design: "Simple Front + Smart Back" - Front is dumb & fast, Back is smart
 								#pragma once
 								#include "tiny_atomic.h"
 								#include "hakmem_tiny.h"
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity: global → hot/mid → per-class precedence
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
+								#include <stdio.h>
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
 								// ========== Debug Counters (compile-time gated) ==========
 								#if HAKMEM_DEBUG_COUNTERS
 								// Refill-stage counters (defined in hakmem_tiny.c)
 								extern unsigned long long g_rf_total_calls[];
 								extern unsigned long long g_rf_hit_bench[];
 								extern unsigned long long g_rf_hit_hot[];
 								extern unsigned long long g_rf_hit_mail[];
 								extern unsigned long long g_rf_hit_slab[];
 								extern unsigned long long g_rf_hit_ss[];
 								extern unsigned long long g_rf_hit_reg[];
 								extern unsigned long long g_rf_mmap_calls[];
 								// Publish hits (defined in hakmem_tiny.c)
 								extern unsigned long long g_pub_mail_hits[];
 								extern unsigned long long g_pub_bench_hits[];
 								extern unsigned long long g_pub_hot_hits[];
 								// Free pipeline (defined in hakmem_tiny.c)
 								extern unsigned long long g_free_via_tls_sll[];
 								#endif
 								// ========== Box 5: Allocation Fast Path ==========
 								// 箱理論の Fast Allocation 層。TLS freelist から直接 pop（3-4命令）。
 								// 不変条件:
 								// - TLS freelist が非空なら即座に return (no lock, no sync)
 								// - Miss なら Backend (Box 3: SuperSlab) に委譲
 								// - Cross-thread allocation は考慮しない（Backend が処理）
 								// External TLS variables (defined in hakmem_tiny.c)
 								extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
 								extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
 								// External backend functions
 								extern int sll_refill_small_from_ss(int class_idx, int max_take);
 								extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
 								extern int hak_tiny_size_to_class(size_t size);
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity: global → hot/mid → per-class precedence
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
+								// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
 								extern int g_refill_count_global;
 								extern int g_refill_count_hot;
 								extern int g_refill_count_mid;
 								extern int g_refill_count_class[TINY_NUM_CLASSES];
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								// External macros
 								#ifndef HAK_RET_ALLOC
 								#define HAK_RET_ALLOC(cls, ptr) return (ptr)
 								#endif
-												Add RDTSC profiling - Identify refill bottleneck

Profiling Results:
- Fast path: 143 cycles (10.4% of time) ✅ Good
- Refill: 19,624 cycles (89.6% of time) 🚨 Bottleneck!

Refill is 137x slower than fast path and dominates total cost.
Only happens 6.3% of the time but takes 90% of execution time.

Next: Optimize sll_refill_small_from_ss() backend.

											
										
										
											2025-11-05 06:35:03 +00:00
+								// ========== RDTSC Profiling (lightweight) ==========
 								#ifdef __x86_64__
 								static inline uint64_t tiny_fast_rdtsc(void) {
 								    unsigned int lo, hi;
 								    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
 								    return ((uint64_t)hi << 32) | lo;
 								}
 								#else
 								static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
 								#endif
 								// Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1)
 								static __thread uint64_t g_tiny_alloc_hits = 0;
 								static __thread uint64_t g_tiny_alloc_cycles = 0;
 								static __thread uint64_t g_tiny_refill_calls = 0;
 								static __thread uint64_t g_tiny_refill_cycles = 0;
 								static int g_tiny_profile_enabled = -1;  // -1: uninitialized
 								static inline int tiny_profile_enabled(void) {
 								    if (__builtin_expect(g_tiny_profile_enabled == -1, 0)) {
 								        const char* env = getenv("HAKMEM_TINY_PROFILE");
 								        g_tiny_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
 								    }
 								    return g_tiny_profile_enabled;
 								}
 								// Print profiling results at exit
 								static void tiny_fast_print_profile(void) __attribute__((destructor));
 								static void tiny_fast_print_profile(void) {
 								    if (!tiny_profile_enabled()) return;
 								    if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) return;
 								    fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
 								    if (g_tiny_alloc_hits > 0) {
 								        fprintf(stderr, "[ALLOC HIT] count=%lu, avg_cycles=%lu\n",
 								                (unsigned long)g_tiny_alloc_hits,
 								                (unsigned long)(g_tiny_alloc_cycles / g_tiny_alloc_hits));
 								    }
 								    if (g_tiny_refill_calls > 0) {
 								        fprintf(stderr, "[REFILL]    count=%lu, avg_cycles=%lu\n",
 								                (unsigned long)g_tiny_refill_calls,
 								                (unsigned long)(g_tiny_refill_cycles / g_tiny_refill_calls));
 								    }
 								    fprintf(stderr, "===================================================\n\n");
 								}
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								// ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========
 								// Allocation fast path (inline for zero-cost)
 								// Returns: pointer on success, NULL on miss (caller should try refill/slow)
 								//
 								// Assembly (x86-64, optimized):
 								//   mov    rax, QWORD PTR g_tls_sll_head[class_idx]  ; Load head
 								//   test   rax, rax                                   ; Check NULL
 								//   je     .miss                                      ; If empty, miss
 								//   mov    rdx, QWORD PTR [rax]                       ; Load next
 								//   mov    QWORD PTR g_tls_sll_head[class_idx], rdx  ; Update head
 								//   ret                                               ; Return ptr
 								// .miss:
 								//   ; Fall through to refill
 								//
 								// Expected: 3-4 instructions on hit (1 load, 1 test, 1 load, 1 store)
 								static inline void* tiny_alloc_fast_pop(int class_idx) {
-												Add RDTSC profiling - Identify refill bottleneck

Profiling Results:
- Fast path: 143 cycles (10.4% of time) ✅ Good
- Refill: 19,624 cycles (89.6% of time) 🚨 Bottleneck!

Refill is 137x slower than fast path and dominates total cost.
Only happens 6.3% of the time but takes 90% of execution time.

Next: Optimize sll_refill_small_from_ss() backend.

											
										
										
											2025-11-05 06:35:03 +00:00
+								    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								    // Box Boundary: TLS freelist の先頭を pop
 								    // Ownership: TLS なので所有権チェック不要（同一スレッド保証）
 								    void* head = g_tls_sll_head[class_idx];
 								    if (__builtin_expect(head != NULL, 1)) {
 								        // Fast path hit: 3 instructions
 								        g_tls_sll_head[class_idx] = *(void**)head;  // Pop: next = *head
 								        // Optional: update count (for stats, can be disabled)
 								        if (g_tls_sll_count[class_idx] > 0) {
 								            g_tls_sll_count[class_idx]--;
 								        }
 								#if HAKMEM_DEBUG_COUNTERS
 								        // Track TLS freelist hits (compile-time gated, zero runtime cost when disabled)
 								        g_free_via_tls_sll[class_idx]++;
 								#endif
-												Add RDTSC profiling - Identify refill bottleneck

Profiling Results:
- Fast path: 143 cycles (10.4% of time) ✅ Good
- Refill: 19,624 cycles (89.6% of time) 🚨 Bottleneck!

Refill is 137x slower than fast path and dominates total cost.
Only happens 6.3% of the time but takes 90% of execution time.

Next: Optimize sll_refill_small_from_ss() backend.

											
										
										
											2025-11-05 06:35:03 +00:00
+								        if (start) {
 								            g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
 								            g_tiny_alloc_hits++;
 								        }
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								        return head;
 								    }
 								    // Fast path miss → NULL (caller should refill)
 								    return NULL;
 								}
 								// ========== Refill Path: Backend Integration ==========
 								// Refill TLS freelist from backend (SuperSlab/ACE/Learning layer)
 								// Returns: number of blocks refilled
 								//
 								// This integrates with existing HAKMEM infrastructure:
 								// - SuperSlab provides memory chunks
 								// - ACE provides adaptive capacity learning
 								// - L25 provides mid-large integration
 								//
 								// Refill count is tunable via HAKMEM_TINY_REFILL_COUNT (default: 32)
 								// - Smaller count (8-16): better for diverse workloads, faster warmup
 								// - Larger count (64-128): better for homogeneous workloads, fewer refills
 								static inline int tiny_alloc_fast_refill(int class_idx) {
-												Add RDTSC profiling - Identify refill bottleneck

Profiling Results:
- Fast path: 143 cycles (10.4% of time) ✅ Good
- Refill: 19,624 cycles (89.6% of time) 🚨 Bottleneck!

Refill is 137x slower than fast path and dominates total cost.
Only happens 6.3% of the time but takes 90% of execution time.

Next: Optimize sll_refill_small_from_ss() backend.

											
										
										
											2025-11-05 06:35:03 +00:00
+								    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity: global → hot/mid → per-class precedence
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
+								    // Tunable refill count (cached per-class in TLS for performance)
 								    static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
 								    int cnt = s_refill_count[class_idx];
 								    if (__builtin_expect(cnt == 0, 0)) {
-												Add RDTSC profiling - Identify refill bottleneck

Profiling Results:
- Fast path: 143 cycles (10.4% of time) ✅ Good
- Refill: 19,624 cycles (89.6% of time) 🚨 Bottleneck!

Refill is 137x slower than fast path and dominates total cost.
Only happens 6.3% of the time but takes 90% of execution time.

Next: Optimize sll_refill_small_from_ss() backend.

											
										
										
											2025-11-05 06:35:03 +00:00
+								        int def = 16;  // Default: 16 (smaller = less overhead per refill)
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity: global → hot/mid → per-class precedence
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
+								        int v = def;
 								        // Resolve precedence without getenv on hot path (values parsed at init)
 								        if (g_refill_count_class[class_idx] > 0) {
 								            v = g_refill_count_class[class_idx];
 								        } else if (class_idx <= 3 && g_refill_count_hot > 0) {
 								            v = g_refill_count_hot;
 								        } else if (class_idx >= 4 && g_refill_count_mid > 0) {
 								            v = g_refill_count_mid;
 								        } else if (g_refill_count_global > 0) {
 								            v = g_refill_count_global;
 								        }
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
 								        // Clamp to sane range (avoid pathological cases)
 								        if (v < 8) v = 8;        // Minimum: avoid thrashing
 								        if (v > 256) v = 256;    // Maximum: avoid excessive TLS memory
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity; precedence from lowest to highest: global → hot/mid → per-class
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
+								        s_refill_count[class_idx] = v;
 								        cnt = v;
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								    }
 								#if HAKMEM_DEBUG_COUNTERS
 								    // Track refill calls (compile-time gated)
 								    g_rf_total_calls[class_idx]++;
 								#endif
 								    // Box Boundary: Delegate to Backend (Box 3: SuperSlab)
 								    // This gives us ACE, Learning layer, L25 integration for free!
 								    // Note: g_rf_hit_slab counter is incremented inside sll_refill_small_from_ss()
-												Tiny: add per-class refill count tuning infrastructure (ChatGPT)

External AI (ChatGPT Pro) implemented hierarchical refill count tuning:
- Move getenv() from hot path to init (performance hygiene)
- Add per-class granularity; precedence from lowest to highest: global → hot/mid → per-class
- Environment variables:
  * HAKMEM_TINY_REFILL_COUNT (global default)
  * HAKMEM_TINY_REFILL_COUNT_HOT (classes 0-3)
  * HAKMEM_TINY_REFILL_COUNT_MID (classes 4-7)
  * HAKMEM_TINY_REFILL_COUNT_C{0..7} (per-class override)

Performance impact: Neutral (no tuning applied yet, default=16)
- Larson 4-thread: 4.19M ops/s (unchanged)
- No measurable overhead from init-time parsing

Code quality improvement:
- Better separation: hot path reads plain ints (no syscalls)
- Future-proof: enables A/B testing per size class
- Documentation: ENV_VARS.md updated

Note: Per Ultrathink's advice, further tuning deferred until bottleneck
visualization (superslab_refill branch analysis) is complete.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <external-ai@openai.com>

											
										
										
											2025-11-05 17:45:11 +09:00
+								    int refilled = sll_refill_small_from_ss(class_idx, cnt);
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
-												Add RDTSC profiling - Identify refill bottleneck

Profiling Results:
- Fast path: 143 cycles (10.4% of time) ✅ Good
- Refill: 19,624 cycles (89.6% of time) 🚨 Bottleneck!

Refill is 137x slower than fast path and dominates total cost.
Only happens 6.3% of the time but takes 90% of execution time.

Next: Optimize sll_refill_small_from_ss() backend.

											
										
										
											2025-11-05 06:35:03 +00:00
+								    if (start) {
 								        g_tiny_refill_cycles += (tiny_fast_rdtsc() - start);
 								        g_tiny_refill_calls++;
 								    }
-												Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

											
										
										
											2025-11-05 12:31:14 +09:00
+								    return refilled;
 								}
 								// ========== Combined Fast Path (Alloc + Refill) ==========
 								// Allocate a tiny block for `size`, trying the cheapest source first.
 								//
 								// Order of attempts:
 								//   1. tiny_alloc_fast_pop() on the size class (TLS freelist pop).
 								//   2. On a miss, tiny_alloc_fast_refill() restocks the freelist; if it
 								//      reports a positive count, pop once more.
 								//   3. If the refill reported nothing, or the retry pop still came back
 								//      empty, delegate to hak_tiny_alloc_slow().
 								//
 								// Returns: block pointer on success; NULL when hak_tiny_size_to_class()
 								//          yields a negative class for `size`, or when the slow path
 								//          also returns NULL.
 								//
 								// NOTE(review): HAK_RET_ALLOC is defined outside this section; it is
 								// presumably the macro that hands the block back to the caller — confirm
 								// at its definition. The trailing `return` keeps the function well-formed
 								// either way.
 								//
 								// Typical call:
 								//   void* p = tiny_alloc_fast(64);
 								//   if (p == NULL) { /* not a tiny size, or allocation failed */ }
 								static inline void* tiny_alloc_fast(size_t size) {
 								    // Map the request to a size class; a negative class means "not ours".
 								    int class_idx = hak_tiny_size_to_class(size);
 								    if (__builtin_expect(class_idx < 0, 0)) {
 								        return NULL;
 								    }
 								    // Step 1: the common case is expected to be a plain freelist pop.
 								    void* blk = tiny_alloc_fast_pop(class_idx);
 								    if (__builtin_expect(blk != NULL, 1)) {
 								        HAK_RET_ALLOC(class_idx, blk);
 								    }
 								    // Step 2: freelist was empty; restock from the backend and retry once.
 								    if (__builtin_expect(tiny_alloc_fast_refill(class_idx) > 0, 1)) {
 								        blk = tiny_alloc_fast_pop(class_idx);
 								        if (blk != NULL) {
 								            HAK_RET_ALLOC(class_idx, blk);
 								        }
 								    }
 								    // Step 3: nothing could be refilled (or the retry missed); take the
 								    // slow path, which is the last resort before reporting failure.
 								    blk = hak_tiny_alloc_slow(size, class_idx);
 								    if (blk == NULL) {
 								        return NULL;
 								    }
 								    HAK_RET_ALLOC(class_idx, blk);
 								    return blk;
 								}
 								// ========== Push to TLS Freelist (for free path) ==========
 								// Link `ptr` in as the new head of the TLS freelist for `class_idx`.
 								// Helper intended for Box 6 (Free Fast Path).
 								//
 								// The first pointer-sized word of the block is overwritten with the
 								// previous head, so the block's own memory serves as the list node.
 								//
 								// Contract: no validation happens here. `ptr` is dereferenced
 								// unconditionally and `class_idx` is used as an array index as-is; the
 								// caller (Box 6) must already have verified that the block belongs to
 								// the current thread.
 								static inline void tiny_alloc_fast_push(int class_idx, void* ptr) {
 								    void** head_slot = &g_tls_sll_head[class_idx];
 								    void** next_link = (void**)ptr;
 								    // Chain the old head behind the block before publishing it as head.
 								    *next_link = *head_slot;
 								    *head_slot = ptr;
 								    g_tls_sll_count[class_idx] += 1;
 								}
 								// ========== Statistics & Diagnostics ==========
 								// Get TLS freelist stats (for debugging/profiling)
 								// TinyAllocFastStats is the by-value snapshot returned by
 								// tiny_alloc_fast_stats(): it copies the freelist head and count of one
 								// size class at the moment of the call.
 								typedef struct {
 								    int class_idx;  // Size-class index the snapshot was taken for
 								    void* head;  // Copy of g_tls_sll_head[class_idx]; may be NULL
 								    uint32_t count;  // Copy of g_tls_sll_count[class_idx]
 								} TinyAllocFastStats;
 								static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
 								    TinyAllocFastStats stats = {
 								        .class_idx = class_idx,
 								        .head = g_tls_sll_head[class_idx],
 								        .count = g_tls_sll_count[class_idx]
 								    };
 								    return stats;
 								}
 								// Forget the TLS freelist for `class_idx` (testing/benchmarking only).
 								// WARNING: the list is dropped, not drained — the head is simply set to
 								// NULL and the count to 0, so any blocks that were on the list are
 								// leaked. Only use in controlled test environments.
 								static inline void tiny_alloc_fast_reset(int class_idx) {
 								    g_tls_sll_count[class_idx] = 0;
 								    g_tls_sll_head[class_idx] = NULL;
 								}
 								// ========== Performance Notes ==========
 								//
 								// Expected metrics (based on System tcache & HAKX +171% results):
 								// - Fast path hit rate: 95%+ (workload dependent)
 								// - Fast path latency: 3-4 instructions (1-2 cycles on modern CPUs)
 								// - Miss penalty: ~20-50 instructions (refill from SuperSlab)
 								// - Throughput improvement: +10-25% vs current multi-layer design
 								//
 								// Key optimizations:
 								// 1. `__builtin_expect` for branch prediction (hot path first)
 								// 2. `static inline` for zero-cost abstraction
 								// 3. TLS variables (no atomic ops, no locks)
 								// 4. Minimal work in fast path (defer stats/accounting to backend)
 								//
 								// Comparison with current design:
 								// - Current: 20+ instructions (Magazine → SuperSlab → ACE → ...)
 								// - New: 3-4 instructions (TLS freelist pop only)
 								// - Reduction: -80% instructions in hot path
 								//
 								// Inspired by:
 								// - System tcache (glibc malloc) - 3-4 instruction fast path
 								// - HAKX Mid-Large (+171%) - "Simple Front + Smart Back"
 								// - Box Theory - Clear boundaries, minimal coupling