// hakmem_tiny_refill_p0.inc.h
// ChatGPT Pro P0: Complete Batch Refill (for the TLS SLL)
//
// Purpose: Optimize sll_refill_small_from_ss with batch carving
// Based on: tls_refill_from_tls_slab (hakmem_tiny_tls_ops.h:115-126)
//
// Key optimization: ss_active_inc × 64 → ss_active_add × 1
//
// Maintains: the existing g_tls_sll_head fast path (no changes to the hot path!)
//
// Compile-time gate for the P0 batch refill path (0 = disabled, 1 = enabled)
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 0
#endif
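
// Illustration (sketch only, not part of the build): the legacy per-object refill updates the
// SuperSlab active counter once per block, roughly
//     for (i = 0; i < n; i++) { p = pop_or_carve_one(); ss_active_inc(ss); push_to_sll(p); }
// while the P0 path carves a whole run and folds the n counter updates into a single call:
//     trc_linear_carve(slab_base, bs, meta, batch, &chain);
//     trc_splice_to_sll(class_idx, &chain, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
//     ss_active_add(ss, batch);
// pop_or_carve_one()/push_to_sll() above are placeholder names; the real helpers are
// trc_linear_carve()/trc_splice_to_sll() used in sll_refill_batch_from_ss() below.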

#ifndef HAKMEM_TINY_REFILL_P0_INC_H
#define HAKMEM_TINY_REFILL_P0_INC_H

// Debug counters (compile-time gated)
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_hit_slab[];
// Diagnostic counters for refill early returns
extern unsigned long long g_rf_early_no_ss[];     // !g_use_superslab
extern unsigned long long g_rf_early_no_meta[];   // !meta
extern unsigned long long g_rf_early_no_room[];   // room <= 0
extern unsigned long long g_rf_early_want_zero[]; // want == 0
#endif

// Refill TLS SLL from SuperSlab with batch carving (P0 optimization)
#include <stdio.h>      // Standard headers used below (harmless if the parent TU already
#include <stdlib.h>     // includes them): fprintf/getenv/atoi/stdatomic/fixed-width ints
#include <stdatomic.h>
#include <stdint.h>
#include "tiny_refill_opt.h"
#include "tiny_fc_api.h"
#include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe()

// Optional P0 diagnostic logging helper
static inline int p0_should_log(void) {
    static int en = -1;
    if (__builtin_expect(en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_P0_LOG");
        en = (e && *e && *e != '0') ? 1 : 0;
    }
    return en;
}
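
// Example (shell, illustrative; "./your_app" is a placeholder binary name):
//   HAKMEM_TINY_P0_LOG=1 ./your_app   # enables the [P0_*] diagnostic logging below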

static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
    // Conservative guard: class 7 (1KB) uses the legacy path by default until fully stabilized.
    // Opt-in via HAKMEM_TINY_P0_C7_ENABLE=1
    if (__builtin_expect(class_idx == 7, 0)) {
        static int c7_en = -1;
        if (c7_en == -1) {
            const char* e = getenv("HAKMEM_TINY_P0_C7_ENABLE");
            c7_en = (e && *e && *e != '0') ? 1 : 0;
        }
        if (!c7_en) return 0;
    }
    // Runtime A/B kill switch (defensive). Set HAKMEM_TINY_P0_DISABLE=1 to bypass the P0 path.
    do {
        static int g_p0_disable = -1;
        if (__builtin_expect(g_p0_disable == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DISABLE");
            g_p0_disable = (e && *e && *e != '0') ? 1 : 0;
        }
        if (__builtin_expect(g_p0_disable, 0)) {
            return 0;
        }
    } while (0);

    if (!g_use_superslab || max_take <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        if (!g_use_superslab) g_rf_early_no_ss[class_idx]++;
#endif
        return 0;
    }

    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    uint32_t active_before = 0;
    if (tls->ss) {
        active_before = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
    }

    // CRITICAL DEBUG: Log class 7 pre-warm
    if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
        fprintf(stderr, "[P0_DEBUG_C7] Entry: tls->ss=%p tls->meta=%p max_take=%d\n",
                (void*)tls->ss, (void*)tls->meta, max_take);
    }

    if (!tls->ss) {
        // Try to obtain a SuperSlab for this class
        if (superslab_refill(class_idx) == NULL) {
            if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
                fprintf(stderr, "[P0_DEBUG_C7] superslab_refill() returned NULL\n");
            }
            return 0;
        }
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] After superslab_refill(): tls->ss=%p tls->meta=%p\n",
                    (void*)tls->ss, (void*)tls->meta);
        }
    }
    TinySlabMeta* meta = tls->meta;
    if (!meta) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_meta[class_idx]++;
#endif
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] meta is NULL after superslab_refill, returning 0\n");
        }
        return 0;
    }

    // Optional: Direct-FC fast path for class 5 (256B) / class 7 (1024B)
    // env:
    //  - HAKMEM_TINY_P0_DIRECT_FC    (default ON for class 5)
    //  - HAKMEM_TINY_P0_DIRECT_FC_C7 (default OFF for class 7)
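    // Example (shell, illustrative; "./your_app" is a placeholder binary name):
    //   HAKMEM_TINY_P0_DIRECT_FC_C7=1 HAKMEM_TINY_P0_DRAIN_THRESH=32 ./your_app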
    do {
        static int g_direct_fc = -1;
        static int g_direct_fc_c7 = -1;
        if (__builtin_expect(g_direct_fc == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DIRECT_FC");
            // Default ON when unset
            g_direct_fc = (e && *e && *e == '0') ? 0 : 1;
        }
        if (__builtin_expect(g_direct_fc_c7 == -1, 0)) {
            const char* e7 = getenv("HAKMEM_TINY_P0_DIRECT_FC_C7");
            // Default OFF for class 7 (1KB) until stability is fully verified; opt-in via env
            g_direct_fc_c7 = (e7 && *e7) ? ((*e7 == '0') ? 0 : 1) : 0;
        }
        if (__builtin_expect((g_direct_fc && class_idx == 5) || (g_direct_fc_c7 && class_idx == 7), 0)) {
            int room = tiny_fc_room(class_idx);
            if (room <= 0) return 0;
            // Drain only if above threshold
            uint32_t rmt = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            static int g_drain_th = -1;
            if (__builtin_expect(g_drain_th == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_P0_DRAIN_THRESH");
                g_drain_th = (e && *e) ? atoi(e) : 64;
                if (g_drain_th < 0) g_drain_th = 0;
            }
            if (rmt >= (uint32_t)g_drain_th) {
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, tls->meta);
                }
            }
            // Gather pointers without writing into objects
            void* out[128]; int produced = 0;
            if (room > 128) room = 128; // defensive: out[] holds at most 128 entries
            TinySlabMeta* m = tls->meta;
            size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
            uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
            while (produced < room) {
                if (__builtin_expect(m->freelist != NULL, 0)) {
                    void* p = m->freelist; m->freelist = *(void**)p; m->used++;
                    out[produced++] = p;
                    continue;
                }
                if (__builtin_expect(m->carved < m->capacity, 1)) {
                    void* p = (void*)(base + ((size_t)m->carved * bs));
                    m->carved++; m->used++;
                    out[produced++] = p;
                    continue;
                }
                // Need to move to another slab with space
                if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break;
                // Rebind
                tls = &g_tls_slabs[class_idx];
                m = tls->meta;
                base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
            }
            if (produced > 0) {
                ss_active_add(tls->ss, (uint32_t)produced);
                int pushed = tiny_fc_push_bulk(class_idx, out, produced);
                (void)pushed; // should equal 'produced' since the batch was sized to the FC room
                if (p0_should_log()) {
                    static _Atomic int g_logged = 0;
                    int exp = 0;
                    if (atomic_compare_exchange_strong(&g_logged, &exp, 1)) {
                        fprintf(stderr, "[P0_DIRECT_FC_TAKE] cls=%d take=%d room=%d drain_th=%d remote_cnt=%u\n",
                                class_idx, produced, room, g_drain_th, rmt);
                    }
                }
                return produced;
            }
            // fallthrough to regular path
        }
    } while (0);

    // Compute how many we can actually push into the SLL without overflow
    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
    if (room <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_room[class_idx]++;
#endif
        return 0;
    }

    // For hot tiny classes (0..3), allow an env override to increase the batch size
    uint32_t want = (uint32_t)max_take;
    if (class_idx <= 3) {
        static int g_hot_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_hot_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256; // clamp
            g_hot_override = v;
        }
        if (g_hot_override > 0) want = (uint32_t)g_hot_override;
    } else {
        // Mid classes (>=4): optional override for the batch size
        static int g_mid_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_mid_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256; // clamp
            g_mid_override = v;
        }
        if (g_mid_override > 0) want = (uint32_t)g_mid_override;
    }
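    // Example (shell, illustrative; "./your_app" is a placeholder binary name):
    //   HAKMEM_TINY_REFILL_COUNT_HOT=128 ./your_app   # take up to 128 blocks per refill for classes 0..3
    // Values above 256 are clamped; 0 or a negative value leaves the default max_take in effect,
    // and the request is still capped by the SLL room just below.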
    if (want > (uint32_t)room) want = (uint32_t)room;
    if (want == 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_want_zero[class_idx]++;
#endif
        return 0;
    }

    // Effective stride: class block size + 1-byte header for classes 0..6
    size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
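    // e.g. class 5 (256B) carves 257-byte strides (256B payload + 1B header),
    // while class 7 (1024B) carves plain 1024-byte strides (no header byte).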

    int total_taken = 0;

    // === P0 Batch Carving Loop ===
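    // Each iteration: (1) drain pending remote frees into the slab freelist, (2) pop whatever
    // the freelist holds, (3) linearly carve the remainder from uncarved capacity, splicing
    // both into the TLS SLL, until 'want' is satisfied or the class runs out of slabs.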
    while (want > 0) {
        // Calculate the slab base for validation (accounts for the 2048-byte offset in slab 0)
        uintptr_t ss_base = 0;
        uintptr_t ss_limit = 0;
        if (tls->ss && tls->slab_idx >= 0) {
            uint8_t* slab_base = tiny_slab_base_for(tls->ss, tls->slab_idx);
            ss_base = (uintptr_t)slab_base;
            // Limit is the end of the current slab
            ss_limit = ss_base + SLAB_SIZE;
            if (tls->slab_idx == 0) {
                ss_limit = ss_base + (SLAB_SIZE - SUPERSLAB_SLAB0_DATA_OFFSET);
            }
        }

        // CRITICAL FIX: Drain the remote queue BEFORE popping from the freelist.
        // Without this, blocks in both the freelist and the remote queue can be double-allocated
        // (Thread A pops from the freelist, Thread B adds to the remote queue, Thread A drains
        // the remote queue → overwrites user data).
        // OPTIMIZATION: Only drain if the remote queue is non-empty (check the atomic counter)
        if (tls->ss && tls->slab_idx >= 0) {
            uint32_t remote_count = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            if (remote_count > 0) {
                // Runtime A/B: allow skipping the remote drain to isolate its effect
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, meta);
                }
            }
        }

        // Handle freelist items first (usually 0)
        TinyRefillChain chain;
        uint32_t from_freelist = trc_pop_from_freelist(
            meta, class_idx, ss_base, ss_limit, bs, want, &chain);
        if (from_freelist > 0) {
            trc_splice_to_sll(class_idx, &chain, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
            // FIX: Blocks from the freelist were decremented when freed, so increment when allocated
            ss_active_add(tls->ss, from_freelist);
            // FIX: Keep TinySlabMeta::used consistent with the non-P0 path
            meta->used = (uint16_t)((uint32_t)meta->used + from_freelist);
            extern unsigned long long g_rf_freelist_items[];
            g_rf_freelist_items[class_idx] += from_freelist;
            total_taken += from_freelist;
            want -= from_freelist;
            if (want == 0) break;
        }

        // === Linear Carve (P0 Key Optimization!) ===
        // Use the monotonic 'carved' counter to track linear progression ('used' can decrement on free)
        if (meta->carved >= meta->capacity) {
            // Slab exhausted, try to get another
            if (superslab_refill(class_idx) == NULL) break;
            // CRITICAL FIX: Reload the tls pointer after superslab_refill() binds a new slab
            tls = &g_tls_slabs[class_idx];
            meta = tls->meta;
            if (!meta) break;
            continue;
        }

        uint32_t available = meta->capacity - meta->carved;
        uint32_t batch = want;
        if (batch > available) batch = available;
        if (batch == 0) break;

        // Get slab base
        uint8_t* slab_base = tls->slab_base ? tls->slab_base
                                            : tiny_slab_base_for(tls->ss, tls->slab_idx);

        // Diagnostic log (one-shot)
        static _Atomic int g_carve_log_printed = 0;
        if (atomic_load(&g_carve_log_printed) == 0 &&
            atomic_exchange(&g_carve_log_printed, 1) == 0) {
            fprintf(stderr, "[BATCH_CARVE] cls=%d slab=%d used=%u cap=%u batch=%u base=%p bs=%zu\n",
                    class_idx, tls->slab_idx, (unsigned)meta->used, (unsigned)meta->capacity, batch,
                    (void*)slab_base, bs);
            fflush(stderr);
        }

        TinyRefillChain carve;
        trc_linear_carve(slab_base, bs, meta, batch, &carve);
        trc_splice_to_sll(class_idx, &carve, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
        // FIX: Update the SuperSlab active counter (was missing!)
        ss_active_add(tls->ss, batch);
        extern unsigned long long g_rf_carve_items[];
        g_rf_carve_items[class_idx] += batch;

        total_taken += batch;
        want -= batch;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Track successful SLL refills from SuperSlab (compile-time gated)
    // NOTE: Incremented unconditionally to verify the counter is working
    g_rf_hit_slab[class_idx]++;
#endif

    // Note: 'meta' can be NULL here if the last superslab_refill() in the loop failed to bind a slab.
    if (tls->ss && meta && p0_should_log()) {
        uint32_t active_after = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
        int32_t delta = (int32_t)active_after - (int32_t)active_before;
        if ((int32_t)total_taken != delta) {
            fprintf(stderr,
                    "[P0_COUNTER_MISMATCH] cls=%d slab=%d taken=%d active_delta=%d used=%u carved=%u cap=%u freelist=%p\n",
                    class_idx, tls->slab_idx, total_taken, delta,
                    (unsigned)meta->used, (unsigned)meta->carved, (unsigned)meta->capacity,
                    meta->freelist);
        } else {
            fprintf(stderr,
                    "[P0_COUNTER_OK] cls=%d slab=%d taken=%d active_delta=%d\n",
                    class_idx, tls->slab_idx, total_taken, delta);
        }
    }
    return total_taken;
}
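
// Illustrative caller sketch (assumption: the real call site lives in the SLL refill path
// outside this header; the argument values are examples, not the actual tuning):
//   int taken = sll_refill_batch_from_ss(class_idx, TINY_TLS_MAG_CAP);
//   if (taken == 0) { /* fall back to the legacy per-object refill */ }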
#endif // HAKMEM_TINY_REFILL_P0_INC_H