// hakmem_tiny_refill_p0.inc.h
// ChatGPT Pro P0: Complete Batch Refill (for the TLS SLL)
//
// Purpose: Optimize sll_refill_small_from_ss with batch carving
// Based on: tls_refill_from_tls_slab (hakmem_tiny_tls_ops.h:115-126)
//
// Key optimization: ss_active_inc × 64 → ss_active_add × 1
//
// Maintains: the existing g_tls_sll_head fast path (no changes to the hot path!)
//
// Compile-time gate for the P0 batch refill path (0 = disabled, 1 = enabled)
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 0
#endif
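
// Illustration (sketch only, not part of the build): the legacy per-object refill updates the
// SuperSlab active counter once per block, roughly
//     for (i = 0; i < n; i++) { p = pop_or_carve_one(); ss_active_inc(ss); push_to_sll(p); }
// while the P0 path carves a whole run and folds the n counter updates into a single call:
//     trc_linear_carve(slab_base, bs, meta, batch, &chain);
//     trc_splice_to_sll(class_idx, &chain, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
//     ss_active_add(ss, batch);
// pop_or_carve_one()/push_to_sll() above are placeholder names; the real helpers are
// trc_linear_carve()/trc_splice_to_sll() used in sll_refill_batch_from_ss() below.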

#ifndef HAKMEM_TINY_REFILL_P0_INC_H
#define HAKMEM_TINY_REFILL_P0_INC_H

// Debug counters (compile-time gated)
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_hit_slab[];
// Diagnostic counters for refill early returns
extern unsigned long long g_rf_early_no_ss[];     // !g_use_superslab
extern unsigned long long g_rf_early_no_meta[];   // !meta
extern unsigned long long g_rf_early_no_room[];   // room <= 0
extern unsigned long long g_rf_early_want_zero[]; // want == 0
#endif

// Refill TLS SLL from SuperSlab with batch carving (P0 optimization)
#include <stdio.h>      // Standard headers used below (harmless if the parent TU already
#include <stdlib.h>     // includes them): fprintf/getenv/atoi/stdatomic/fixed-width ints
#include <stdatomic.h>
#include <stdint.h>
#include "tiny_refill_opt.h"
#include "tiny_fc_api.h"
#include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe()

// Optional P0 diagnostic logging helper
static inline int p0_should_log(void) {
    static int en = -1;
    if (__builtin_expect(en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_P0_LOG");
        en = (e && *e && *e != '0') ? 1 : 0;
    }
    return en;
}
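
// Example (shell, illustrative; "./your_app" is a placeholder binary name):
//   HAKMEM_TINY_P0_LOG=1 ./your_app   # enables the [P0_*] diagnostic logging below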

static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
    // Conservative guard: class 7 (1KB) uses the legacy path by default until fully stabilized.
    // Opt-in via HAKMEM_TINY_P0_C7_ENABLE=1
    if (__builtin_expect(class_idx == 7, 0)) {
        static int c7_en = -1;
        if (c7_en == -1) {
            const char* e = getenv("HAKMEM_TINY_P0_C7_ENABLE");
            c7_en = (e && *e && *e != '0') ? 1 : 0;
        }
        if (!c7_en) return 0;
    }
    // Runtime A/B kill switch (defensive). Set HAKMEM_TINY_P0_DISABLE=1 to bypass the P0 path.
    do {
        static int g_p0_disable = -1;
        if (__builtin_expect(g_p0_disable == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DISABLE");
            g_p0_disable = (e && *e && *e != '0') ? 1 : 0;
        }
        if (__builtin_expect(g_p0_disable, 0)) {
            return 0;
        }
    } while (0);

    if (!g_use_superslab || max_take <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        if (!g_use_superslab) g_rf_early_no_ss[class_idx]++;
#endif
        return 0;
    }

    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    uint32_t active_before = 0;
    if (tls->ss) {
        active_before = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
    }

    // CRITICAL DEBUG: Log class 7 pre-warm
    if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
        fprintf(stderr, "[P0_DEBUG_C7] Entry: tls->ss=%p tls->meta=%p max_take=%d\n",
                (void*)tls->ss, (void*)tls->meta, max_take);
    }

    if (!tls->ss) {
        // Try to obtain a SuperSlab for this class
        if (superslab_refill(class_idx) == NULL) {
            if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
                fprintf(stderr, "[P0_DEBUG_C7] superslab_refill() returned NULL\n");
            }
            return 0;
        }
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] After superslab_refill(): tls->ss=%p tls->meta=%p\n",
                    (void*)tls->ss, (void*)tls->meta);
        }
    }
    TinySlabMeta* meta = tls->meta;
    if (!meta) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_meta[class_idx]++;
#endif
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] meta is NULL after superslab_refill, returning 0\n");
        }
        return 0;
    }

    // Optional: Direct-FC fast path for class 5 (256B) / class 7 (1024B)
    // env:
    //  - HAKMEM_TINY_P0_DIRECT_FC    (default ON for class 5)
    //  - HAKMEM_TINY_P0_DIRECT_FC_C7 (default OFF for class 7)
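    // Example (shell, illustrative; "./your_app" is a placeholder binary name):
    //   HAKMEM_TINY_P0_DIRECT_FC_C7=1 HAKMEM_TINY_P0_DRAIN_THRESH=32 ./your_app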
    do {
        static int g_direct_fc = -1;
        static int g_direct_fc_c7 = -1;
        if (__builtin_expect(g_direct_fc == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DIRECT_FC");
            // Default ON when unset
            g_direct_fc = (e && *e && *e == '0') ? 0 : 1;
        }
        if (__builtin_expect(g_direct_fc_c7 == -1, 0)) {
            const char* e7 = getenv("HAKMEM_TINY_P0_DIRECT_FC_C7");
            // Default OFF for class 7 (1KB) until stability is fully verified; opt-in via env
            g_direct_fc_c7 = (e7 && *e7) ? ((*e7 == '0') ? 0 : 1) : 0;
        }
        if (__builtin_expect((g_direct_fc && class_idx == 5) || (g_direct_fc_c7 && class_idx == 7), 0)) {
            int room = tiny_fc_room(class_idx);
            if (room <= 0) return 0;
            // Drain only if above threshold
            uint32_t rmt = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            static int g_drain_th = -1;
            if (__builtin_expect(g_drain_th == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_P0_DRAIN_THRESH");
                g_drain_th = (e && *e) ? atoi(e) : 64;
                if (g_drain_th < 0) g_drain_th = 0;
            }
            if (rmt >= (uint32_t)g_drain_th) {
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, tls->meta);
                }
            }
            // Gather pointers without writing into objects
            void* out[128]; int produced = 0;
            if (room > 128) room = 128; // defensive: out[] holds at most 128 entries
            TinySlabMeta* m = tls->meta;
            size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
            uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
            while (produced < room) {
                if (__builtin_expect(m->freelist != NULL, 0)) {
                    void* p = m->freelist; m->freelist = *(void**)p; m->used++;
                    out[produced++] = p;
                    continue;
                }
                if (__builtin_expect(m->carved < m->capacity, 1)) {
                    void* p = (void*)(base + ((size_t)m->carved * bs));
                    m->carved++; m->used++;
                    out[produced++] = p;
                    continue;
                }
                // Need to move to another slab with space
                if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break;
                // Rebind
                tls = &g_tls_slabs[class_idx];
                m = tls->meta;
                base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
            }
            if (produced > 0) {
                ss_active_add(tls->ss, (uint32_t)produced);
                int pushed = tiny_fc_push_bulk(class_idx, out, produced);
                (void)pushed; // should equal 'produced' since the batch was sized to the FC room
                if (p0_should_log()) {
                    static _Atomic int g_logged = 0;
                    int exp = 0;
                    if (atomic_compare_exchange_strong(&g_logged, &exp, 1)) {
                        fprintf(stderr, "[P0_DIRECT_FC_TAKE] cls=%d take=%d room=%d drain_th=%d remote_cnt=%u\n",
                                class_idx, produced, room, g_drain_th, rmt);
                    }
                }
                return produced;
            }
            // fallthrough to regular path
        }
    } while (0);

    // Compute how many we can actually push into the SLL without overflow
    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
    if (room <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_room[class_idx]++;
#endif
        return 0;
    }

    // For hot tiny classes (0..3), allow an env override to increase the batch size
    uint32_t want = (uint32_t)max_take;
    if (class_idx <= 3) {
        static int g_hot_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_hot_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256; // clamp
            g_hot_override = v;
        }
        if (g_hot_override > 0) want = (uint32_t)g_hot_override;
    } else {
        // Mid classes (>=4): optional override for the batch size
        static int g_mid_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_mid_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256; // clamp
            g_mid_override = v;
        }
        if (g_mid_override > 0) want = (uint32_t)g_mid_override;
    }
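    // Example (shell, illustrative; "./your_app" is a placeholder binary name):
    //   HAKMEM_TINY_REFILL_COUNT_HOT=128 ./your_app   # take up to 128 blocks per refill for classes 0..3
    // Values above 256 are clamped; 0 or a negative value leaves the default max_take in effect,
    // and the request is still capped by the SLL room just below.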
    if (want > (uint32_t)room) want = (uint32_t)room;
    if (want == 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_want_zero[class_idx]++;
#endif
        return 0;
    }

    // Effective stride: class block size + 1-byte header for classes 0..6
    size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
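    // e.g. class 5 (256B) carves 257-byte strides (256B payload + 1B header),
    // while class 7 (1024B) carves plain 1024-byte strides (no header byte).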

    int total_taken = 0;

    // === P0 Batch Carving Loop ===
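    // Each iteration: (1) drain pending remote frees into the slab freelist, (2) pop whatever
    // the freelist holds, (3) linearly carve the remainder from uncarved capacity, splicing
    // both into the TLS SLL, until 'want' is satisfied or the class runs out of slabs.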
    while (want > 0) {
        // Calculate the slab base for validation (accounts for the 2048-byte offset in slab 0)
        uintptr_t ss_base = 0;
        uintptr_t ss_limit = 0;
        if (tls->ss && tls->slab_idx >= 0) {
            uint8_t* slab_base = tiny_slab_base_for(tls->ss, tls->slab_idx);
            ss_base = (uintptr_t)slab_base;
            // Limit is the end of the current slab
            ss_limit = ss_base + SLAB_SIZE;
            if (tls->slab_idx == 0) {
                ss_limit = ss_base + (SLAB_SIZE - SUPERSLAB_SLAB0_DATA_OFFSET);
            }
        }

        // CRITICAL FIX: Drain the remote queue BEFORE popping from the freelist.
        // Without this, blocks in both the freelist and the remote queue can be double-allocated
        // (Thread A pops from the freelist, Thread B adds to the remote queue, Thread A drains
        // the remote queue → overwrites user data).
        // OPTIMIZATION: Only drain if the remote queue is non-empty (check the atomic counter)
        if (tls->ss && tls->slab_idx >= 0) {
            uint32_t remote_count = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            if (remote_count > 0) {
                // Runtime A/B: allow skipping the remote drain to isolate its effect
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, meta);
                }
            }
        }

        // Handle freelist items first (usually 0)
        TinyRefillChain chain;
        uint32_t from_freelist = trc_pop_from_freelist(
            meta, class_idx, ss_base, ss_limit, bs, want, &chain);
        if (from_freelist > 0) {
            trc_splice_to_sll(class_idx, &chain, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
            // FIX: Blocks from the freelist were decremented when freed, so increment when allocated
            ss_active_add(tls->ss, from_freelist);
            // FIX: Keep TinySlabMeta::used consistent with the non-P0 path
            meta->used = (uint16_t)((uint32_t)meta->used + from_freelist);
            extern unsigned long long g_rf_freelist_items[];
            g_rf_freelist_items[class_idx] += from_freelist;
            total_taken += from_freelist;
            want -= from_freelist;
            if (want == 0) break;
        }

        // === Linear Carve (P0 Key Optimization!) ===
        // Use the monotonic 'carved' counter to track linear progression ('used' can decrement on free)
        if (meta->carved >= meta->capacity) {
            // Slab exhausted, try to get another
            if (superslab_refill(class_idx) == NULL) break;
            // CRITICAL FIX: Reload the tls pointer after superslab_refill() binds a new slab
            tls = &g_tls_slabs[class_idx];
            meta = tls->meta;
            if (!meta) break;
            continue;
        }

        uint32_t available = meta->capacity - meta->carved;
        uint32_t batch = want;
        if (batch > available) batch = available;
        if (batch == 0) break;

        // Get slab base
        uint8_t* slab_base = tls->slab_base ? tls->slab_base
                                            : tiny_slab_base_for(tls->ss, tls->slab_idx);

        // Diagnostic log (one-shot)
        static _Atomic int g_carve_log_printed = 0;
        if (atomic_load(&g_carve_log_printed) == 0 &&
            atomic_exchange(&g_carve_log_printed, 1) == 0) {
            fprintf(stderr, "[BATCH_CARVE] cls=%d slab=%d used=%u cap=%u batch=%u base=%p bs=%zu\n",
                    class_idx, tls->slab_idx, (unsigned)meta->used, (unsigned)meta->capacity, batch,
                    (void*)slab_base, bs);
            fflush(stderr);
        }

        TinyRefillChain carve;
        trc_linear_carve(slab_base, bs, meta, batch, &carve);
        trc_splice_to_sll(class_idx, &carve, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
        // FIX: Update the SuperSlab active counter (was missing!)
        ss_active_add(tls->ss, batch);
        extern unsigned long long g_rf_carve_items[];
        g_rf_carve_items[class_idx] += batch;

        total_taken += batch;
        want -= batch;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Track successful SLL refills from SuperSlab (compile-time gated)
    // NOTE: Incremented unconditionally to verify the counter is working
    g_rf_hit_slab[class_idx]++;
#endif

    // Note: 'meta' can be NULL here if the last superslab_refill() in the loop failed to bind a slab.
    if (tls->ss && meta && p0_should_log()) {
        uint32_t active_after = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
        int32_t delta = (int32_t)active_after - (int32_t)active_before;
        if ((int32_t)total_taken != delta) {
            fprintf(stderr,
                    "[P0_COUNTER_MISMATCH] cls=%d slab=%d taken=%d active_delta=%d used=%u carved=%u cap=%u freelist=%p\n",
                    class_idx, tls->slab_idx, total_taken, delta,
                    (unsigned)meta->used, (unsigned)meta->carved, (unsigned)meta->capacity,
                    meta->freelist);
        } else {
            fprintf(stderr,
                    "[P0_COUNTER_OK] cls=%d slab=%d taken=%d active_delta=%d\n",
                    class_idx, tls->slab_idx, total_taken, delta);
        }
    }
    return total_taken;
}
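
// Illustrative caller sketch (assumption: the real call site lives in the SLL refill path
// outside this header; the argument values are examples, not the actual tuning):
//   int taken = sll_refill_batch_from_ss(class_idx, TINY_TLS_MAG_CAP);
//   if (taken == 0) { /* fall back to the legacy per-object refill */ }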
#endif // HAKMEM_TINY_REFILL_P0_INC_H