Files
hakmem/core/tiny_region_id.h
Moe Charm (CI) 6154e7656c 根治修正: unified_cache_refill SEGFAULT + コンパイラ最適化対策
問題:
  - リリース版sh8benchでunified_cache_refill+0x46fでSEGFAULT
  - コンパイラ最適化により、ヘッダー書き込みとtiny_next_read()の
    順序が入れ替わり、破損したポインタをout[]に格納

根本原因:
  - ヘッダー書き込みがtiny_next_read()の後にあった
  - volatile barrierがなく、コンパイラが自由に順序を変更
  - ASan版では最適化が制限されるため問題が隠蔽されていた

修正内容(P1-P4):

P1: unified_cache_refill SEGFAULT修正 (core/front/tiny_unified_cache.c:341-350)
  - ヘッダー書き込みをtiny_next_read()の前に移動
  - __atomic_thread_fence(__ATOMIC_RELEASE)追加
  - コンパイラ最適化による順序入れ替えを防止

P2: 二重書き込み削除 (core/box/tiny_front_cold_box.h:75-82)
  - tiny_region_id_write_header()削除
  - unified_cache_refillが既にヘッダー書き込み済み
  - 不要なメモリ操作を削除して効率化

P3: tiny_next_read()安全性強化 (core/tiny_nextptr.h:73-86)
  - __atomic_thread_fence(__ATOMIC_ACQUIRE)追加
  - メモリ操作の順序を保証

P4: ヘッダー書き込みデフォルトON (core/tiny_region_id.h - ChatGPT修正)
  - g_write_headerのデフォルトを1に変更
  - HAKMEM_TINY_WRITE_HEADER=0で旧挙動に戻せる

テスト結果:
   unified_cache_refill SEGFAULT: 解消(sh8bench実行可能に)
   TLS_SLL_HDR_RESET: まだ発生中(別の根本原因、調査継続)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 09:57:12 +09:00

418 lines
15 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// tiny_region_id.h - Region-ID Direct Lookup API (Phase 7)
// Purpose: O(1) class_idx lookup from pointer (eliminates SuperSlab lookup)
// Design: Smart Headers - 1-byte class_idx embedded before each block
// Performance: 2-3 cycles (vs 100+ cycles for SuperSlab lookup)
//
// Expected Impact: 1.2M → 40-60M ops/s (30-50x improvement)
#ifndef TINY_REGION_ID_H
#define TINY_REGION_ID_H
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <execinfo.h>
#include <dlfcn.h>
#include "hakmem_build_flags.h"
#include "tiny_box_geometry.h"
#include "ptr_track.h"
#include "hakmem_super_registry.h"
#include "superslab/superslab_inline.h"
#include "hakmem_tiny.h" // For TinyTLSSLL type
#include "tiny_debug_api.h" // Guard/failfast declarations
// Feature flag: Enable header-based class_idx lookup
#ifndef HAKMEM_TINY_HEADER_CLASSIDX
#define HAKMEM_TINY_HEADER_CLASSIDX 0
#endif
#if HAKMEM_TINY_HEADER_CLASSIDX
// ========== Header Layout ==========
//
// Memory layout:
// [Header: 1 byte] [User block: N bytes]
// ^ ^
// ptr-1 ptr (returned to user)
//
// Header format (1 byte):
// - Bits 0-3: class_idx (0-15, only 0-7 used for Tiny)
// - Bits 4-7: magic (0xA; checked on every header read, release builds included)
//
// Example:
// class_idx = 3 → header = 0xA3 (HEADER_MAGIC | class_idx, in all build modes)
#define HEADER_MAGIC 0xA0
#define HEADER_CLASS_MASK 0x0F
// ========== Address Watcher (Debug Only) ==========
#if !HAKMEM_BUILD_RELEASE
// Helper: Get current thread ID (watcher-local version to avoid redefinition)
static inline uint32_t watcher_self_u32(void) {
    // Reinterpret the pthread handle as an integer, then keep only the low 32 bits.
    // NOTE(review): assumes pthread_t is convertible to uintptr_t on this platform.
    const uintptr_t handle_bits = (uintptr_t)pthread_self();
    return (uint32_t)handle_bits;
}
// Address watcher: Tracks when a specific address is allocated or freed
// Usage: HAKMEM_WATCH_ADDR=0x7f1234567890 ./program
static inline uintptr_t get_watch_addr(void) {
    // Returns the address named by HAKMEM_WATCH_ADDR (parsed as hex), or 0 when
    // the variable is unset, empty, or parses to zero. The environment is read
    // once; later calls return the cached value.
    //
    // NOTE(review): the cache is a pair of plain statics with no synchronization,
    // so concurrent first calls may each parse the variable.
    static uintptr_t watch_addr = 0;
    static int initialized = 0;

    if (!initialized) {
        const char* env = getenv("HAKMEM_WATCH_ADDR");
        if (env && *env) {
            // With base 16, strtoull itself accepts an optional "0x"/"0X" prefix,
            // so no manual prefix stripping is needed.
            watch_addr = (uintptr_t)strtoull(env, NULL, 16);
            if (watch_addr != 0) {
                fprintf(stderr, "[WATCH_INIT] Watching address: %p\n", (void*)watch_addr);
                fflush(stderr);
            }
        }
        initialized = 1;
    }
    return watch_addr;
}
// Allocation source tracking
// Where a tiny block came from; the numeric values are spelled out explicitly.
typedef enum {
    ALLOC_SOURCE_UNKNOWN  = 0,  // nothing recorded
    ALLOC_SOURCE_TLS_SLL  = 1,  // TLS freelist pop
    ALLOC_SOURCE_FREELIST = 2,  // Slab freelist pop
    ALLOC_SOURCE_CARVE    = 3,  // Linear carve from slab
    ALLOC_SOURCE_NEW_SLAB = 4,  // Newly allocated slab
} AllocSource;

// Per-thread record of the last value handed to set_alloc_source().
static __thread AllocSource g_last_alloc_source = ALLOC_SOURCE_UNKNOWN;

// Records `source` for the calling thread.
// The parameter is int (not AllocSource) to match extern declarations in other files.
static inline void set_alloc_source(int source) {
    const AllocSource tagged = (AllocSource)source;
    g_last_alloc_source = tagged;
}

// Maps an AllocSource to its log label; ALLOC_SOURCE_UNKNOWN and any value
// outside the enum both yield "UNKNOWN".
static inline const char* alloc_source_name(AllocSource source) {
    if (source == ALLOC_SOURCE_TLS_SLL) {
        return "TLS_SLL";
    }
    if (source == ALLOC_SOURCE_FREELIST) {
        return "FREELIST";
    }
    if (source == ALLOC_SOURCE_CARVE) {
        return "CARVE";
    }
    if (source == ALLOC_SOURCE_NEW_SLAB) {
        return "NEW_SLAB";
    }
    return "UNKNOWN";
}
// Watch trigger: Called when watch address is allocated
static inline void watch_alloc_trigger(void* base, int class_idx, AllocSource source) {
    // Dumps a diagnostic report for `base` to stderr (operation counter, class,
    // allocation source, TLS list state, slab metadata when the lookup succeeds,
    // and a backtrace), then calls abort(). Does not return.
    extern __thread TinyTLSSLL g_tls_sll[];
    extern _Atomic uint64_t g_debug_op_count;

    // Snapshot the counters before any output is produced.
    uint64_t op_seq = atomic_load(&g_debug_op_count);
    uint32_t sll_count = g_tls_sll[class_idx].count;
    void* sll_head = g_tls_sll[class_idx].head;

    fputs("\n", stderr);
    fputs("========================================\n", stderr);
    fprintf(stderr, "[WATCH_ALLOC_HIT] Address %p allocated!\n", base);
    fputs("========================================\n", stderr);
    fprintf(stderr, " Operation: #%lu\n", (unsigned long)op_seq);
    fprintf(stderr, " Class: %d (%zu bytes)\n", class_idx, tiny_stride_for_class(class_idx));
    fprintf(stderr, " Source: %s\n", alloc_source_name(source));
    fprintf(stderr, " TLS count: %u\n", sll_count);
    fprintf(stderr, " TLS head: %p\n", sll_head);
    fprintf(stderr, " Thread: %u\n", (unsigned)watcher_self_u32());

    // Slab metadata is printed only when the registry lookup yields a SuperSlab
    // with the expected magic and an in-range slab index.
    struct SuperSlab* owner = hak_super_lookup(base);
    if (owner && owner->magic == SUPERSLAB_MAGIC) {
        int idx = slab_index_for(owner, base);
        if (idx >= 0 && idx < ss_slabs_capacity(owner)) {
            TinySlabMeta* slab = &owner->slabs[idx];
            fputs(" Slab metadata:\n", stderr);
            fprintf(stderr, " SuperSlab: %p\n", (void*)owner);
            fprintf(stderr, " Slab index: %d\n", idx);
            fprintf(stderr, " Slab class: %u\n", (unsigned)slab->class_idx);
            fprintf(stderr, " Used: %u\n", (unsigned)slab->used);
            fprintf(stderr, " Capacity: %u\n", (unsigned)slab->capacity);
            fprintf(stderr, " Freelist: %p\n", slab->freelist);
            fprintf(stderr, " Owner TID: %u\n", (unsigned)slab->owner_tid_low);
        }
    }
    fputs("========================================\n", stderr);
    fputs("\n", stderr);
    fflush(stderr);

    // Call-stack dump (up to 16 frames) written straight to stderr's descriptor.
    void* stack[16];
    int depth = backtrace(stack, 16);
    fprintf(stderr, "[WATCH_BACKTRACE] %d frames:\n", depth);
    backtrace_symbols_fd(stack, depth, fileno(stderr));
    fputs("\n", stderr);
    fflush(stderr);

    // Stop right here so the process state at the hit is preserved.
    fputs("[WATCH_ABORT] Aborting to preserve state...\n", stderr);
    fflush(stderr);
    abort();
}
#endif // !HAKMEM_BUILD_RELEASE
// ========== Write Header (Allocation) ==========
// Write class_idx to header (called after allocation)
// Input: base (block start from SuperSlab)
// Returns: user pointer (base + 1, skipping header)
// Stamps the 1-byte class header at `base` and returns the user pointer.
//
// Parameters:
//   base      - block start; the header byte lives at base[0]. NULL is returned as-is.
//   class_idx - size class; only its low 4 bits (HEADER_CLASS_MASK) are stored.
// Returns: base + 1 (the header slot is skipped even when the write is disabled).
//
// Runtime switch: the write is ON by default; it is turned off only when the
// HAKMEM_TINY_WRITE_HEADER environment variable starts with '0'. The variable is
// read once and cached in a function-local static.
// NOTE(review): that cache is a plain int with no synchronization; confirm this
// is acceptable for concurrent first calls.
//
// Debug builds (!HAKMEM_BUILD_RELEASE) additionally:
//   - fire the address watcher when `base` equals the watched address,
//   - compare class_idx with the slab metadata and log mismatches,
//   - log the first 2000 operations to stderr.
static inline void* tiny_region_id_write_header(void* base, int class_idx) {
if (!base) return base;
#if !HAKMEM_BUILD_RELEASE
// Address watcher: Check if this is the watched address
uintptr_t watch = get_watch_addr();
if (watch != 0 && (uintptr_t)base == watch) {
watch_alloc_trigger(base, class_idx, g_last_alloc_source);
}
#endif
// Phase E1-CORRECT: ALL classes (C0-C7) have 1-byte header (no exceptions)
// Rationale: Unified box structure enables:
// - O(1) class identification (no registry lookup)
// - All classes use same fast path
// - Zero special cases across all layers
// Cost: 0.1% memory overhead for C7 (1024B → 1023B usable)
// Benefit: 100% safety, architectural simplicity, maximum performance
// Write header at block start (ALL classes including C7)
uint8_t* header_ptr = (uint8_t*)base;
// Phase 6-A: Debug validation (disabled in release builds for performance)
// perf profiling showed hak_super_lookup() costs 15.84% CPU on hot path
// Expected gain: +12-15% throughput by removing this in release builds
#if !HAKMEM_BUILD_RELEASE
// Debug: detect header writes with class_idx that disagrees with slab metadata.
// Only the first 8 mismatches are logged; the first 4 also get a backtrace.
do {
static _Atomic uint32_t g_hdr_meta_mis = 0;
struct SuperSlab* ss = hak_super_lookup(base);
if (ss && ss->magic == SUPERSLAB_MAGIC) {
int slab_idx = slab_index_for(ss, base);
if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
uint8_t meta_cls = ss->slabs[slab_idx].class_idx;
// Metadata classes >= TINY_NUM_CLASSES are skipped rather than reported.
if (meta_cls < TINY_NUM_CLASSES && meta_cls != (uint8_t)class_idx) {
uint32_t n = atomic_fetch_add_explicit(&g_hdr_meta_mis, 1, memory_order_relaxed);
if (n < 8) {
void* ra = __builtin_return_address(0);
const char* sym = "(unknown)";
#ifdef __GLIBC__
// Resolve the caller's symbol name when dladdr can find one.
Dl_info info;
if (dladdr(ra, &info) && info.dli_sname) {
sym = info.dli_sname;
}
#endif
fprintf(stderr,
"[HDR_META_MISMATCH] cls=%d meta_cls=%u base=%p slab_idx=%d ss=%p ra=%p fn=%s\n",
class_idx,
(unsigned)meta_cls,
base,
slab_idx,
(void*)ss,
ra,
sym);
if (n < 4) {
void* bt[8];
int frames = backtrace(bt, 8);
backtrace_symbols_fd(bt, frames, fileno(stderr));
}
fflush(stderr);
}
}
}
}
} while (0);
#endif // !HAKMEM_BUILD_RELEASE
// P3: Header write is ON by default (always restored for the TLS SLL; can be
// turned OFF via the environment).
// Even with class_map, the header is still needed at the TLS SLL boundary, so
// the A/B switch goes OFF (the old default) only with HAKMEM_TINY_WRITE_HEADER=0.
// Memory layout preserved: user = base + 1 (the header slot is always reserved).
static int g_write_header = -1;
if (__builtin_expect(g_write_header == -1, 0)) {
// First call: -1 means "not yet read from the environment".
const char* e = getenv("HAKMEM_TINY_WRITE_HEADER");
g_write_header = (e && *e && *e == '0') ? 0 : 1;
}
if (__builtin_expect(g_write_header, 1)) {
// Default path: store magic nibble | low 4 bits of class_idx in the header byte.
*header_ptr = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
PTR_TRACK_HEADER_WRITE(base, HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
}
void* user = header_ptr + 1; // skip header for user pointer (layout preserved)
PTR_TRACK_MALLOC(base, 0, class_idx); // Track at BASE (where header is)
// ========== ALLOCATION LOGGING (Debug builds only) ==========
#if !HAKMEM_BUILD_RELEASE
{
extern _Atomic uint64_t g_debug_op_count;
extern __thread TinyTLSSLL g_tls_sll[];
// Every call bumps the global op counter; only the first 2000 are printed.
uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
if (op < 2000) { // ALL classes for comprehensive tracing
fprintf(stderr, "[OP#%04lu ALLOC] cls=%d ptr=%p base=%p from=write_header tls_count=%u\n",
(unsigned long)op, class_idx, user, base,
g_tls_sll[class_idx].count);
fflush(stderr);
}
}
#endif
// ========== END ALLOCATION LOGGING ==========
// Optional guard: log stride/base/user for targeted class
if (tiny_guard_is_enabled()) {
size_t stride = tiny_stride_for_class(class_idx);
tiny_guard_on_alloc(class_idx, base, user, stride);
}
return user;
}
// ========== Read Header (Free) ==========
// Read class_idx from header (called during free)
// Returns: class_idx (0-7), or -1 if invalid
// Fallback class count, used only when no earlier header has defined it.
#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8
#endif
// Decodes the header byte stored immediately before `ptr`.
// Returns the class index (0 .. TINY_NUM_CLASSES-1), or -1 when ptr is NULL or
// below 4096, the magic nibble does not match, or the class is out of range.
static inline int tiny_region_id_read_header(void* ptr) {
    if (ptr == NULL || (uintptr_t)ptr < 4096) {
        return -1;  // reject NULL and tiny bogus values
    }

    const uint8_t hdr_byte = ((const uint8_t*)ptr)[-1];
    // The magic nibble is checked in every build mode. Per the original notes,
    // this keeps blocks with a different magic (e.g. Pool TLS, 0xb0) and
    // header-less mmap-zero / mid / large frees from being routed as class 0.
    const uint8_t magic_nibble = hdr_byte & 0xF0;

#if HAKMEM_DEBUG_VERBOSE
    static int debug_count = 0;
    if (debug_count < 5) {
        fprintf(stderr, "[TINY_READ_HEADER] ptr=%p header=0x%02x magic=0x%02x expected=0x%02x\n",
                ptr, hdr_byte, magic_nibble, HEADER_MAGIC);
        debug_count++;
    }
#endif

    if (magic_nibble == HEADER_MAGIC) {
        // Range check stays on in release builds: a corrupted header must never
        // become an out-of-bounds class index.
        const int decoded_class = (int)(hdr_byte & HEADER_CLASS_MASK);
        if (decoded_class < 0 || decoded_class >= TINY_NUM_CLASSES) {
            return -1;
        }
        return decoded_class;
    }

    // Magic mismatch: not a tiny header.
#if !HAKMEM_BUILD_RELEASE
    static int invalid_count = 0;
    if (invalid_count < 5) {
        fprintf(stderr, "[HEADER_INVALID] ptr=%p, header=%02x, magic=%02x (expected %02x)\n",
                ptr, hdr_byte, magic_nibble, HEADER_MAGIC);
        invalid_count++;
    }
#endif
    // Optional guard hook for invalid header
    if (tiny_guard_is_enabled()) {
        tiny_guard_on_invalid(ptr, hdr_byte);
    }
    return -1;
}
// ========== Header Validation ==========
// Check if pointer has valid header (debug mode)
// Reports whether the byte before `ptr` carries the tiny header magic.
// Debug builds inspect the byte; release builds return 1 without reading it.
static inline int tiny_region_id_has_header(void* ptr) {
#if HAKMEM_BUILD_RELEASE
    // Release: Assume all allocations have headers
    (void)ptr;
    return 1;
#else
    if (ptr == NULL || (uintptr_t)ptr < 4096) {
        return 0;
    }
    const uint8_t hdr_byte = ((const uint8_t*)ptr)[-1];
    return (hdr_byte & 0xF0) == HEADER_MAGIC;
#endif
}
// ========== Allocation Size Adjustment ==========
// Calculate allocation size including header (1 byte)
// Returns the block size needed for `user_size` payload bytes plus the 1-byte header.
// Saturates at SIZE_MAX instead of wrapping to 0, so an oversized request can
// never look like a tiny one to a caller's size check.
static inline size_t tiny_region_id_alloc_size(size_t user_size) {
    if (user_size == SIZE_MAX) {
        return SIZE_MAX;
    }
    return user_size + 1; // Add 1 byte for header
}
// Calculate user size from allocation size
// Returns the payload bytes available in a block of `alloc_size` bytes once the
// 1-byte header is taken out. A zero-sized block has no payload, so the result
// is clamped to 0 rather than wrapping around to SIZE_MAX.
static inline size_t tiny_region_id_user_size(size_t alloc_size) {
    return (alloc_size > 0) ? alloc_size - 1 : 0;
}
// ========== Performance Notes ==========
//
// Header Read Performance:
// - Best case: 2 cycles (L1 hit, no validation)
// - Average: 3 cycles (with class_idx extraction)
// - Worst case: 5 cycles (debug validation)
// - vs SuperSlab lookup: 100+ cycles (50x faster!)
//
// Memory Overhead:
// - Per block: 1 byte
// - 8-byte blocks: 12.5% overhead
// - 128-byte blocks: 0.8% overhead
// - Average (typical workload): ~1.5%
// - Slab[0]: 0% (reuses 960B wasted padding)
//
// Cache Impact:
// - Excellent: Header is inline with user data
// - Prefetch: Header loaded with first user data access
// - No additional cache lines required
#else // !HAKMEM_TINY_HEADER_CLASSIDX
// Disabled: No-op implementations
// Headers compiled out: nothing is written and the block pointer is returned as-is.
static inline void* tiny_region_id_write_header(void* ptr, int class_idx) {
    void* const unchanged = ptr;
    (void)class_idx;  // no header byte to stamp
    return unchanged;
}
// Headers compiled out: there is no class byte to read, so every lookup yields -1.
static inline int tiny_region_id_read_header(void* ptr) {
    const int not_supported = -1;
    (void)ptr;
    return not_supported;
}
// Headers compiled out: no pointer ever has a header, so the answer is always 0.
static inline int tiny_region_id_has_header(void* ptr) {
    const int no_header = 0;
    (void)ptr;
    return no_header;
}
// Headers compiled out: the block size equals the requested payload size.
static inline size_t tiny_region_id_alloc_size(size_t user_size) {
    const size_t block_bytes = user_size;  // no header byte added
    return block_bytes;
}
// Headers compiled out: the whole block is payload.
static inline size_t tiny_region_id_user_size(size_t alloc_size) {
    const size_t payload_bytes = alloc_size;  // no header byte removed
    return payload_bytes;
}
#endif // HAKMEM_TINY_HEADER_CLASSIDX
#endif // TINY_REGION_ID_H