Files
hakmem/core/ptr_trace.h
Moe Charm (CI) 984cca41ef P0 Optimization: Shared Pool fast path with O(1) metadata lookup
Performance Results:
- Throughput: 2.66M ops/s → 3.8M ops/s (+43% improvement)
- sp_meta_find_or_create: O(N) linear scan → O(1) direct pointer
- Stage 2 metadata scan: 100% → 10-20% (80-90% reduction via hints)

Core Optimizations:

1. O(1) Metadata Lookup (superslab_types.h)
   - Added `shared_meta` pointer field to SuperSlab struct
   - Eliminates O(N) linear search through ss_metadata[] array
   - First access: O(N) scan + cache | Subsequent: O(1) direct return

2. sp_meta_find_or_create Fast Path (hakmem_shared_pool.c)
   - Check cached ss->shared_meta first before linear scan
   - Cache pointer after successful linear scan for future lookups
   - Reduces 7.8% CPU hotspot to near-zero for hot paths

3. Stage 2 Class Hints Fast Path (hakmem_shared_pool_acquire.c)
   - Try class_hints[class_idx] FIRST before full metadata scan
   - Uses O(1) ss->shared_meta lookup for hint validation
   - __builtin_expect() for branch prediction optimization
   - 80-90% of acquire calls now skip full metadata scan

4. Proper Initialization (ss_allocation_box.c)
   - Initialize shared_meta = NULL in superslab_allocate()
   - Ensures correct NULL-check semantics for new SuperSlabs

Additional Improvements:
- Updated ptr_trace and debug ring for release build efficiency
- Enhanced ENV variable documentation and analysis
- Added learner_env_box.h for configuration management
- Various Box optimizations for reduced overhead

Thread Safety:
- All atomic operations use correct memory ordering
- shared_meta cached under mutex protection
- Lock-free Stage 2 uses proper CAS with acquire/release semantics

Testing:
- Benchmark: 1M iterations, 3.8M ops/s stable
- Build: Clean compile RELEASE=0 and RELEASE=1
- No crashes, memory leaks, or correctness issues

Next Optimization Candidates:
- P1: Per-SuperSlab free slot bitmap for O(1) slot claiming
- P2: Reduce Stage 2 critical section size
- P3: Page pre-faulting (MAP_POPULATE)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 16:21:54 +09:00

162 lines
6.8 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ptr_trace.h - Pointer link read/write tracing (Debug-only, macro-replaceable)
//
// Purpose:
// - Centrally instrument single-linked list next pointer reads/writes
// - Zero overhead in release builds (macros devolve to raw ops)
// - Compact TLS ring for triage; optional dump at exit via env
//
// Usage:
// - Replace direct next ops with PTR_NEXT_WRITE / PTR_NEXT_READ
// - Tag with a short literal (e.g., "tls_push", "tls_pop", "splice")
// - Offsets should be header-aware (C0C6: +1, C7: +0)
//
// Control:
// - Compile-time: HAKMEM_PTR_TRACE (default: 1 for debug, 0 for release)
// - Runtime dump: HAKMEM_PTR_TRACE_DUMP=1 (prints ring at exit)
//
// Phase E1-CORRECT: Uses Box API (tiny_next_ptr_box.h) for offset calculation
#pragma once
#include <stdint.h>
#include <stddef.h>
#include "hakmem_trace_master.h" // Phase 4c: Master trace control
#include "hakmem_stats_master.h" // Phase 4d: Master stats/dump control
#include "box/tiny_next_ptr_box.h" // Box API: tiny_next_read/write
#ifndef HAKMEM_PTR_TRACE
# if !HAKMEM_BUILD_RELEASE
# define HAKMEM_PTR_TRACE 1
# else
# define HAKMEM_PTR_TRACE 0
# endif
#endif
#if HAKMEM_PTR_TRACE
#include <stdio.h>
typedef struct {
const char* tag; // literal tag
int class_idx; // tiny class (0..7)
void* node; // node address (base)
void* val; // written/read value
size_t off; // offset used for access
} ptr_trace_ev_t;
#ifndef PTR_TRACE_CAP
# define PTR_TRACE_CAP 256
#endif
static __thread ptr_trace_ev_t g_ptr_trace_ring[PTR_TRACE_CAP];
static __thread uint32_t g_ptr_trace_idx = 0;
static __thread int g_ptr_trace_dump_registered = 0;
static inline void ptr_trace_record(const char* tag, int cls, void* node, void* val, size_t off) {
uint32_t i = g_ptr_trace_idx++;
g_ptr_trace_ring[i & (PTR_TRACE_CAP - 1)] = (ptr_trace_ev_t){ tag, cls, node, val, off };
}
static inline void ptr_trace_dump(void) {
fprintf(stderr, "\n[PTR_TRACE_DUMP] last=%u (cap=%u)\n", g_ptr_trace_idx, (unsigned)PTR_TRACE_CAP);
uint32_t n = (g_ptr_trace_idx < PTR_TRACE_CAP) ? g_ptr_trace_idx : PTR_TRACE_CAP;
for (uint32_t k = 0; k < n; k++) {
const ptr_trace_ev_t* e = &g_ptr_trace_ring[(g_ptr_trace_idx - n + k) & (PTR_TRACE_CAP - 1)];
fprintf(stderr, "[%3u] tag=%s cls=%d node=%p val=%p off=%zu\n",
k, e->tag ? e->tag : "?", e->class_idx, e->node, e->val, e->off);
}
}
static inline void ptr_trace_try_register_dump(void) {
#if HAKMEM_BUILD_RELEASE
return;
#else
if (g_ptr_trace_dump_registered) return;
g_ptr_trace_dump_registered = 1;
static int g_dump_enabled = -1;
if (__builtin_expect(g_dump_enabled == -1, 0)) {
g_dump_enabled = hak_stats_check("HAKMEM_PTR_TRACE_DUMP", "trace");
}
if (!g_dump_enabled) return;
atexit(ptr_trace_dump);
#endif
}
// Immediate dump (Debug only) — static inline to avoid ODR/link conflicts under LTO
// Only dumps if HAKMEM_PTR_TRACE_VERBOSE=1 to avoid excessive output in debug builds
#if HAKMEM_PTR_TRACE
static inline void ptr_trace_dump_now(const char* reason) {
#if HAKMEM_BUILD_RELEASE
(void)reason;
return;
#else
static int verbose_mode = -1;
if (__builtin_expect(verbose_mode == -1, 0)) {
verbose_mode = hak_trace_check("HAKMEM_PTR_TRACE_VERBOSE", "ptr");
}
if (!verbose_mode) return; // Skip verbose logging unless explicitly enabled
fprintf(stderr, "\n[PTR_TRACE_NOW] reason=%s last=%u (cap=%u)\n",
reason ? reason : "(null)", g_ptr_trace_idx, (unsigned)PTR_TRACE_CAP);
uint32_t n = (g_ptr_trace_idx < PTR_TRACE_CAP) ? g_ptr_trace_idx : PTR_TRACE_CAP;
for (uint32_t k = 0; k < n; k++) {
const ptr_trace_ev_t* e = &g_ptr_trace_ring[(g_ptr_trace_idx - n + k) & (PTR_TRACE_CAP - 1)];
fprintf(stderr, "[%3u] tag=%s cls=%d node=%p val=%p off=%zu\n",
k, e->tag ? e->tag : "?", e->class_idx, e->node, e->val, e->off);
}
#endif
}
#else
static inline void ptr_trace_dump_now(const char* reason) { (void)reason; }
#endif
// Phase E1-CORRECT: Use Box API for all next pointer operations
// Box API handles offset calculation internally based on class_idx.
// `off` は呼び出し元互換用に受け取るが、アドレス計算には使わない(ログ専用)。
#define PTR_NEXT_WRITE(tag, cls, node, off, value) do { \
g_tiny_next_tag = (tag); \
g_tiny_next_file = __FILE__; \
g_tiny_next_line = __LINE__; \
/* Use only depth-0 return address; deeper frames are unsafe. */ \
g_tiny_next_ra0 = __builtin_return_address(0); \
g_tiny_next_ra1 = NULL; \
g_tiny_next_ra2 = NULL; \
(void)(off); \
tiny_next_write((cls), (node), (value)); \
ptr_trace_record((tag), (cls), (node), (value), (size_t)(off)); \
ptr_trace_try_register_dump(); \
} while (0)
#define PTR_NEXT_READ(tag, cls, node, off, out_var) do { \
(void)(off); \
void* _tmp = tiny_next_read((cls), (node)); \
(out_var) = _tmp; \
ptr_trace_record((tag), (cls), (node), (out_var), (size_t)(off)); \
ptr_trace_try_register_dump(); \
} while (0)
#else // HAKMEM_PTR_TRACE == 0
// Phase E1-CORRECT: Use Box API for all next pointer operations (Release mode)
// `off` は互換用のダミーで、Box API が offset を決定する。
#define PTR_NEXT_WRITE(tag, cls, node, off, value) \
do { \
g_tiny_next_tag = (tag); \
g_tiny_next_file = __FILE__; \
g_tiny_next_line = __LINE__; \
/* Depth-0 return address only; avoid unsafe frame walks. */ \
g_tiny_next_ra0 = __builtin_return_address(0); \
g_tiny_next_ra1 = NULL; \
g_tiny_next_ra2 = NULL; \
(void)(tag); (void)(off); \
tiny_next_write((cls), (node), (value)); \
} while (0)
#define PTR_NEXT_READ(tag, cls, node, off, out_var) \
do { (void)(tag); (void)(off); (out_var) = tiny_next_read((cls), (node)); } while (0)
// Always provide a stub for release builds so callers can link
static inline void ptr_trace_dump_now(const char* reason) { (void)reason; }
#endif // HAKMEM_PTR_TRACE