# Commit 72b38bc994: Phase E3-FINAL: Fix Box API offset bugs - ALL classes now use correct offsets

Author: Moe Charm (CI)
File: hakmem/core/hakmem_tiny_refill_p0.inc.h
## Root Cause Analysis (GPT5)

**Physical Layout Constraints**:
- Class 0: 8B = [1B header][7B payload] → offset 1 needs 9B = ❌ IMPOSSIBLE
- Class 1-6: >=16B = [1B header][15B+ payload] → offset 1 = ✅ POSSIBLE
- Class 7: 1KB → offset 0 (legacy compatibility)

**Correct Specification** (sketched as code below):
- HAKMEM_TINY_HEADER_CLASSIDX != 0:
  - Class 0, 7: next at offset 0 (overwrites header when on freelist)
  - Class 1-6: next at offset 1 (after header)
- HAKMEM_TINY_HEADER_CLASSIDX == 0:
  - All classes: next at offset 0
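
A minimal sketch of this rule as code (hypothetical helper; only the offset logic is taken from this commit, the real implementation lives in `core/box/tiny_next_ptr_box.h`):

```c
static inline size_t tiny_next_offset(int class_idx) {
#if HAKMEM_TINY_HEADER_CLASSIDX != 0
    // Class 0 (8B) and class 7 (1KB legacy): next at offset 0.
    // Classes 1-6: next just past the 1-byte header.
    return (class_idx == 0 || class_idx == 7) ? (size_t)0 : (size_t)1;
#else
    return 0; // headerless build: next always at offset 0
#endif
}
```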

**Previous Bug**:
- Attempted "ALL classes offset 1" unification
- Class 0 with offset 1 caused immediate SEGV (9B > 8B block size)
- Mixed 2-arg/3-arg API caused confusion

## Fixes Applied

### 1. Restored 3-Argument Box API (core/box/tiny_next_ptr_box.h)
```c
// Correct signatures
void tiny_next_write(int class_idx, void* base, void* next_value)
void* tiny_next_read(int class_idx, const void* base)

// Correct offset calculation
size_t offset = (class_idx == 0 || class_idx == 7) ? 0 : 1;
```
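
Typical usage looks like the following (hedged sketch; `freelist_push`/`freelist_pop` are illustrative names, not functions from this commit):

```c
static inline void freelist_push(int class_idx, void** head, void* blk) {
    tiny_next_write(class_idx, blk, *head); // blk->next = old head
    *head = blk;
}

static inline void* freelist_pop(int class_idx, void** head) {
    void* blk = *head;
    if (blk) *head = tiny_next_read(class_idx, blk); // head = blk->next
    return blk;
}
```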

### 2. Updated 123+ Call Sites Across 34 Files
- hakmem_tiny_hot_pop_v4.inc.h (4 locations)
- hakmem_tiny_fastcache.inc.h (3 locations)
- hakmem_tiny_tls_list.h (12 locations)
- superslab_inline.h (5 locations)
- tiny_fastcache.h (3 locations)
- ptr_trace.h (macro definitions)
- tls_sll_box.h (2 locations)
- + 27 additional files

Pattern: `tiny_next_read(base)` → `tiny_next_read(class_idx, base)`
Pattern: `tiny_next_write(base, next)` → `tiny_next_write(class_idx, base, next)`
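
In context, a migrated call site looks like this (excerpt mirroring the freelist pop in `hakmem_tiny_refill_p0.inc.h` below; the commented "before" form is reconstructed, not from the diff):

```c
// Before (2-arg API, implicit offset):
//   m->freelist = tiny_next_read(m->freelist);
// After (3-arg API, class-aware offset; class_idx must be in scope):
void* p = m->freelist;
m->freelist = tiny_next_read(class_idx, p);
```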

### 3. Added Sentinel Detection Guards
- tiny_fast_push(): Block nodes with sentinel in ptr or ptr->next
- tls_list_push(): Block nodes with sentinel in ptr or ptr->next
- Defense-in-depth against remote free sentinel leakage (see the guard sketch below)
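
A minimal sketch of such a guard, assuming a designated sentinel value (the helper name and structure are assumptions, not code from this commit):

```c
// Returns 1 if the node is safe to push onto a TLS list, 0 if it
// carries the remote-free sentinel in either the pointer itself or
// its next link (hypothetical helper).
static inline int tiny_sentinel_guard_ok(int class_idx, void* ptr, const void* sentinel) {
    if (ptr == sentinel) return 0;                             // poisoned node
    if (tiny_next_read(class_idx, ptr) == sentinel) return 0;  // poisoned link
    return 1;
}
```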

## Verification (GPT5 Report)

**Test Command**: `./out/release/bench_random_mixed_hakmem --iterations=70000`

**Results**:
- ✅ Main loop completed successfully
- ✅ Drain phase completed successfully
- ✅ NO SEGV (previous crash at iteration 66151 is FIXED)
- ℹ️ Final log: "tiny_alloc(1024) failed" is the normal fallback to the Mid/ACE layers

**Analysis**:
- Class 0 immediate SEGV: ✅ RESOLVED (correct offset 0 now used)
- 66K iteration crash: ✅ RESOLVED (offset consistency fixed)
- Box API conflicts: ✅ RESOLVED (unified 3-arg API)

## Technical Details

### Offset Logic Justification
```
Class 0:  8B block → next pointer (8B) fits ONLY at offset 0
Class 1: 16B block → next pointer (8B) fits at offset 1 (after 1B header)
Class 2: 32B block → next pointer (8B) fits at offset 1
...
Class 6: 512B block → next pointer (8B) fits at offset 1
Class 7: 1024B block → offset 0 for legacy compatibility
```
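
These constraints can be checked at compile time. A hedged sketch, assuming block sizes of `8 << class_idx` (which matches the table above; the macros are illustrative, not from this commit):

```c
#define TINY_BLOCK_SIZE(ci)  ((size_t)8 << (ci))
#define TINY_NEXT_OFFSET(ci) (((ci) == 0 || (ci) == 7) ? (size_t)0 : (size_t)1)

// The next pointer must fit inside the block at its chosen offset.
_Static_assert(TINY_NEXT_OFFSET(0) + sizeof(void*) <= TINY_BLOCK_SIZE(0),
               "class 0: next pointer only fits at offset 0");
_Static_assert(TINY_NEXT_OFFSET(1) + sizeof(void*) <= TINY_BLOCK_SIZE(1),
               "class 1: next pointer fits after the 1B header");
_Static_assert(TINY_NEXT_OFFSET(7) + sizeof(void*) <= TINY_BLOCK_SIZE(7),
               "class 7: offset 0 (legacy layout)");
```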

### Files Modified (Summary)
- Core API: `box/tiny_next_ptr_box.h`
- Hot paths: `hakmem_tiny_hot_pop*.inc.h`, `tiny_fastcache.h`
- TLS layers: `hakmem_tiny_tls_list.h`, `hakmem_tiny_tls_ops.h`
- SuperSlab: `superslab_inline.h`, `tiny_superslab_*.inc.h`
- Refill: `hakmem_tiny_refill.inc.h`, `tiny_refill_opt.h`
- Free paths: `tiny_free_magazine.inc.h`, `tiny_superslab_free.inc.h`
- Documentation: Multiple Phase E3 reports

## Remaining Work

None for Box API offset bugs - all structural issues resolved.

Future enhancements (non-critical; a test sketch follows the list):
- Periodic `grep -R '*(void**)' core/` to detect direct pointer access violations
- Enforce Box API usage via static analysis
- Document offset rationale in architecture docs
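
As a starting point for such enforcement, a round-trip test of the Box API (test-only sketch; assumes the 8 tiny classes described above, names beyond `tiny_next_write`/`tiny_next_read` are hypothetical):

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void test_next_ptr_roundtrip(void) {
    unsigned char block[1024]; // large enough for every tiny class
    for (int ci = 0; ci < 8; ci++) {
        void* sentinel = (void*)(uintptr_t)0xDEADBEEF;
        memset(block, 0, sizeof(block));
        tiny_next_write(ci, block, sentinel);
        assert(tiny_next_read(ci, block) == sentinel);
    }
}
```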

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 06:50:20 +09:00

## File: hakmem/core/hakmem_tiny_refill_p0.inc.h (417 lines, 18 KiB, C)

// hakmem_tiny_refill_p0.inc.h
// ChatGPT Pro P0: Complete Batch Refill (for the SLL path)
//
// Purpose: Optimize sll_refill_small_from_ss with batch carving
// Based on: tls_refill_from_tls_slab (hakmem_tiny_tls_ops.h:115-126)
//
// Key optimization: ss_active_inc × 64 → ss_active_add × 1
//
// Maintains: Existing g_tls_sll_head fast path (no changes to hot path!)
//
// P0 batch refill toggle (currently defaults to 0 = disabled; set to 1 to enable)
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 0
#endif
#ifndef HAKMEM_TINY_REFILL_P0_INC_H
#define HAKMEM_TINY_REFILL_P0_INC_H
#include "tiny_box_geometry.h" // Box 3: Geometry & Capacity Calculator
// Debug counters (compile-time gated)
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_hit_slab[];
// Diagnostic counters for refill early returns
extern unsigned long long g_rf_early_no_ss[]; // Line 27: !g_use_superslab
extern unsigned long long g_rf_early_no_meta[]; // Line 35: !meta
extern unsigned long long g_rf_early_no_room[]; // Line 40: room <= 0
extern unsigned long long g_rf_early_want_zero[]; // Line 55: want == 0
#endif
// Refill TLS SLL from SuperSlab with batch carving (P0 optimization)
#include "tiny_refill_opt.h"
#include "tiny_fc_api.h"
#include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe()
#include "box/integrity_box.h" // Box I: Integrity verification (Priority ALPHA)
#include "box/tiny_next_ptr_box.h" // Box API: Next pointer read/write
// Optional P0 diagnostic logging helper
static inline int p0_should_log(void) {
    static int en = -1;
    if (__builtin_expect(en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_P0_LOG");
        en = (e && *e && *e != '0') ? 1 : 0;
    }
    return en;
}
static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
    // Phase E1-CORRECT: C7 now has headers, can use P0 batch refill
    // Runtime A/B kill switch (defensive). Set HAKMEM_TINY_P0_DISABLE=1 to bypass P0 path.
    do {
        static int g_p0_disable = -1;
        if (__builtin_expect(g_p0_disable == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DISABLE");
            g_p0_disable = (e && *e && *e != '0') ? 1 : 0;
        }
        if (__builtin_expect(g_p0_disable, 0)) {
            return 0;
        }
    } while (0);

    if (!g_use_superslab || max_take <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        if (!g_use_superslab) g_rf_early_no_ss[class_idx]++;
#endif
        return 0;
    }

    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    uint32_t active_before = 0;
    if (tls->ss) {
        active_before = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
    }

    // CRITICAL DEBUG: Log class 7 pre-warm
    if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
        fprintf(stderr, "[P0_DEBUG_C7] Entry: tls->ss=%p tls->meta=%p max_take=%d\n",
                (void*)tls->ss, (void*)tls->meta, max_take);
    }

    if (!tls->ss) {
        // Try to obtain a SuperSlab for this class
        if (superslab_refill(class_idx) == NULL) {
            if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
                fprintf(stderr, "[P0_DEBUG_C7] superslab_refill() returned NULL\n");
            }
            return 0;
        }
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] After superslab_refill(): tls->ss=%p tls->meta=%p\n",
                    (void*)tls->ss, (void*)tls->meta);
        }
    }

    TinySlabMeta* meta = tls->meta;
    if (!meta) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_meta[class_idx]++;
#endif
        return 0;
    }

    /* BOX_BOUNDARY: Box 2 (Refill) → Box I (Integrity Check) */
#if HAKMEM_INTEGRITY_LEVEL >= 4
    uint8_t* initial_slab_base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
    SlabMetadataState meta_initial = integrity_capture_slab_metadata(meta, initial_slab_base, class_idx);
    INTEGRITY_CHECK_SLAB_METADATA(meta_initial, "P0 refill entry");
#endif
    /* BOX_BOUNDARY: Box I → Box 2 (Integrity Verified) */

    if (!meta) { // defensive re-check (meta was already validated above)
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] meta is NULL after superslab_refill, returning 0\n");
        }
        return 0;
    }
    // Optional: Direct-FC fast path for class 5 (256B) / class 7 (1024B)
    // env:
    //   - HAKMEM_TINY_P0_DIRECT_FC    (default ON for class 5)
    //   - HAKMEM_TINY_P0_DIRECT_FC_C7 (default OFF for class 7)
    do {
        static int g_direct_fc = -1;
        static int g_direct_fc_c7 = -1;
        if (__builtin_expect(g_direct_fc == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DIRECT_FC");
            // Default ON when unset
            g_direct_fc = (e && *e && *e == '0') ? 0 : 1;
        }
        if (__builtin_expect(g_direct_fc_c7 == -1, 0)) {
            const char* e7 = getenv("HAKMEM_TINY_P0_DIRECT_FC_C7");
            // Default OFF for class 7 (1KB) until stability is fully verified; opt-in via env
            g_direct_fc_c7 = (e7 && *e7) ? ((*e7 == '0') ? 0 : 1) : 0;
        }
        if (__builtin_expect((g_direct_fc && class_idx == 5) || (g_direct_fc_c7 && class_idx == 7), 0)) {
            int room = tiny_fc_room(class_idx);
            if (room <= 0) return 0;
            // Drain only if above threshold
            uint32_t rmt = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            static int g_drain_th = -1;
            if (__builtin_expect(g_drain_th == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_P0_DRAIN_THRESH");
                g_drain_th = (e && *e) ? atoi(e) : 64;
                if (g_drain_th < 0) g_drain_th = 0;
            }
            if (rmt >= (uint32_t)g_drain_th) {
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, tls->meta);
                }
            }
            // Gather pointers without writing into objects
            void* out[128];
            int produced = 0;
            TinySlabMeta* m = tls->meta;
            // Box 3: Get stride (block size + header, except C7 which is headerless)
            size_t bs = tiny_stride_for_class(class_idx);
            uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
            while (produced < room) {
                if (__builtin_expect(m->freelist != NULL, 0)) {
                    // Phase E1-CORRECT: Use Box API for freelist next pointer read
                    void* p = m->freelist;
                    m->freelist = tiny_next_read(class_idx, p);
                    m->used++;
                    out[produced++] = p;
                    continue;
                }
                if (__builtin_expect(m->carved < m->capacity, 1)) {
                    void* p = (void*)(base + ((size_t)m->carved * bs));
                    m->carved++;
                    m->used++;
                    out[produced++] = p;
                    continue;
                }
                // Need to move to another slab with space
                if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break;
                // Rebind
                tls = &g_tls_slabs[class_idx];
                m = tls->meta;
                base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
            }
            if (produced > 0) {
                ss_active_add(tls->ss, (uint32_t)produced);
                int pushed = tiny_fc_push_bulk(class_idx, out, produced);
                (void)pushed; // sized to room above, so the push should take everything
                if (p0_should_log()) {
                    static _Atomic int g_logged = 0;
                    int exp = 0;
                    if (atomic_compare_exchange_strong(&g_logged, &exp, 1)) {
                        fprintf(stderr, "[P0_DIRECT_FC_TAKE] cls=%d take=%d room=%d drain_th=%d remote_cnt=%u\n",
                                class_idx, produced, room, g_drain_th, rmt);
                    }
                }
                return produced;
            }
            // fallthrough to regular path
        }
    } while (0);
    // Compute how many we can actually push into SLL without overflow
    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
    if (room <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_room[class_idx]++;
#endif
        return 0;
    }

    // For hot tiny classes (0..3), allow an env override to increase batch size
    uint32_t want = (uint32_t)max_take;
    if (class_idx <= 3) {
        static int g_hot_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_hot_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256; // clamp
            g_hot_override = v;
        }
        if (g_hot_override > 0) want = (uint32_t)g_hot_override;
    } else {
        // Mid classes (>=4): optional override for batch size
        static int g_mid_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_mid_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256; // clamp
            g_mid_override = v;
        }
        if (g_mid_override > 0) want = (uint32_t)g_mid_override;
    }
    if (want > (uint32_t)room) want = (uint32_t)room;
    if (want == 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_want_zero[class_idx]++;
#endif
        return 0;
    }
    // Box 3: Get stride (block size + header, except C7 which is headerless)
    size_t bs = tiny_stride_for_class(class_idx);
    int total_taken = 0;

    // === P0 Batch Carving Loop ===
    while (want > 0) {
        // Calculate slab base for validation (accounts for 2048 offset in slab 0)
        uintptr_t ss_base = 0;
        uintptr_t ss_limit = 0;
        if (tls->ss && tls->slab_idx >= 0) {
            // Box 3: Get slab base (handles Slab 0 offset)
            uint8_t* slab_base = tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
            ss_base = (uintptr_t)slab_base;
            // Box 3: Get usable bytes for limit calculation
            ss_limit = ss_base + tiny_usable_bytes_for_slab(tls->slab_idx);
        }
        // CRITICAL FIX: Drain remote queue BEFORE popping from freelist.
        // Without this, blocks in both freelist and remote queue can be double-allocated
        // (Thread A pops from freelist, Thread B adds to remote queue, Thread A drains remote → overwrites user data).
        // OPTIMIZATION: Only drain if remote queue is non-empty (check atomic counter).
        if (tls->ss && tls->slab_idx >= 0) {
            uint32_t remote_count = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            if (remote_count > 0) {
                // Runtime A/B: allow skipping remote drain for fault isolation
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, meta);
                }
            }
        }
        // Handle freelist items first (usually 0)
        TinyRefillChain chain;
        uint32_t from_freelist = trc_pop_from_freelist(
            meta, class_idx, ss_base, ss_limit, bs, want, &chain);
        if (from_freelist > 0) {
            trc_splice_to_sll(class_idx, &chain, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
            // FIX: Blocks from freelist were decremented when freed, must increment when allocated
            ss_active_add(tls->ss, from_freelist);
            // FIX: Keep TinySlabMeta::used consistent with non-P0 path
            meta->used = (uint16_t)((uint32_t)meta->used + from_freelist);
            /* BOX_BOUNDARY: Box 2 → Box I (Verify metadata after freelist pop) */
#if HAKMEM_INTEGRITY_LEVEL >= 4
            SlabMetadataState meta_after_freelist = integrity_capture_slab_metadata(
                meta, ss_base, class_idx);
            INTEGRITY_CHECK_SLAB_METADATA(meta_after_freelist, "P0 after freelist pop");
#endif
            /* BOX_BOUNDARY: Box I → Box 2 */
            extern unsigned long long g_rf_freelist_items[];
            g_rf_freelist_items[class_idx] += from_freelist;
            total_taken += from_freelist;
            want -= from_freelist;
            if (want == 0) break;
        }

        // === Linear Carve (P0 Key Optimization!) ===
        // Use monotonic 'carved' to track linear progression (used can decrement on free)
        if (meta->carved >= meta->capacity) {
            // Slab exhausted, try to get another
            if (superslab_refill(class_idx) == NULL) break;
            // CRITICAL FIX: Reload tls pointer after superslab_refill() binds new slab
            tls = &g_tls_slabs[class_idx];
            meta = tls->meta;
            if (!meta) break;
            /* BOX_BOUNDARY: Box 2 → Box I (Verify new slab after superslab_refill) */
#if HAKMEM_INTEGRITY_LEVEL >= 4
            uint8_t* new_slab_base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
            SlabMetadataState meta_after_refill = integrity_capture_slab_metadata(
                meta, new_slab_base, class_idx);
            INTEGRITY_CHECK_SLAB_METADATA(meta_after_refill, "P0 after superslab_refill");
#endif
            /* BOX_BOUNDARY: Box I → Box 2 */
            continue;
        }
        uint32_t available = meta->capacity - meta->carved;
        uint32_t batch = want;
        if (batch > available) batch = available;
        if (batch == 0) break;
        // Get slab base
        uint8_t* slab_base = tls->slab_base ? tls->slab_base
                                            : tiny_slab_base_for(tls->ss, tls->slab_idx);
        // Diagnostic log (one-shot)
#if !HAKMEM_BUILD_RELEASE
        static _Atomic int g_carve_log_printed = 0;
        if (atomic_load(&g_carve_log_printed) == 0 &&
            atomic_exchange(&g_carve_log_printed, 1) == 0) {
            fprintf(stderr, "[BATCH_CARVE] cls=%d slab=%d used=%u cap=%u batch=%u base=%p bs=%zu\n",
                    class_idx, tls->slab_idx, meta->used, meta->capacity, batch,
                    (void*)slab_base, bs);
            fflush(stderr);
        }
#endif
        TinyRefillChain carve;
        trc_linear_carve(slab_base, bs, meta, batch, class_idx, &carve);
        // One-shot sanity: validate first few nodes are within the slab and stride-aligned
#if !HAKMEM_BUILD_RELEASE
        do {
            static _Atomic int g_once = 0;
            int exp = 0;
            if (atomic_compare_exchange_strong(&g_once, &exp, 1)) {
                uintptr_t base_chk = (uintptr_t)(tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx));
                uintptr_t limit_chk = base_chk + tiny_usable_bytes_for_slab(tls->slab_idx);
                void* node = carve.head;
                for (int i = 0; i < 3 && node; i++) {
                    uintptr_t a = (uintptr_t)node;
                    if (!(a >= base_chk && a < limit_chk)) {
                        fprintf(stderr, "[P0_SANITY_FAIL] out_of_range cls=%d node=%p base=%p limit=%p bs=%zu\n",
                                class_idx, node, (void*)base_chk, (void*)limit_chk, bs);
                        abort();
                    }
                    size_t off = (size_t)(a - base_chk);
                    if ((off % bs) != 0) {
                        fprintf(stderr, "[P0_SANITY_FAIL] misaligned cls=%d node=%p off=%zu bs=%zu base=%p\n",
                                class_idx, node, off, bs, (void*)base_chk);
                        abort();
                    }
                    node = tiny_next_read(class_idx, node);
                }
            }
        } while (0);
#endif
        trc_splice_to_sll(class_idx, &carve, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
        // FIX: Update SuperSlab active counter (was missing!)
        ss_active_add(tls->ss, batch);
        /* BOX_BOUNDARY: Box 2 → Box I (Verify metadata after linear carve) */
#if HAKMEM_INTEGRITY_LEVEL >= 4
        SlabMetadataState meta_after_carve = integrity_capture_slab_metadata(
            meta, slab_base, class_idx);
        INTEGRITY_CHECK_SLAB_METADATA(meta_after_carve, "P0 after linear carve");
#endif
        /* BOX_BOUNDARY: Box I → Box 2 */
        extern unsigned long long g_rf_carve_items[];
        g_rf_carve_items[class_idx] += batch;
        total_taken += batch;
        want -= batch;
    }
#if HAKMEM_DEBUG_COUNTERS
    // Track successful SLL refills from SuperSlab (compile-time gated)
    // NOTE: Increment unconditionally to verify counter is working
    g_rf_hit_slab[class_idx]++;
#endif
    if (tls->ss && p0_should_log()) {
        uint32_t active_after = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
        int32_t delta = (int32_t)active_after - (int32_t)active_before;
        if ((int32_t)total_taken != delta) {
            fprintf(stderr,
                    "[P0_COUNTER_MISMATCH] cls=%d slab=%d taken=%d active_delta=%d used=%u carved=%u cap=%u freelist=%p\n",
                    class_idx, tls->slab_idx, total_taken, delta,
                    (unsigned)meta->used, (unsigned)meta->carved, (unsigned)meta->capacity,
                    meta->freelist);
        } else {
            fprintf(stderr,
                    "[P0_COUNTER_OK] cls=%d slab=%d taken=%d active_delta=%d\n",
                    class_idx, tls->slab_idx, total_taken, delta);
        }
    }
    return total_taken;
}
#endif // HAKMEM_TINY_REFILL_P0_INC_H