Major Features: - Debug counter infrastructure for Refill Stage tracking - Free Pipeline counters (ss_local, ss_remote, tls_sll) - Diagnostic counters for early return analysis - Unified larson.sh benchmark runner with profiles - Phase 6-3 regression analysis documentation Bug Fixes: - Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB) - Fix profile variable naming consistency - Add .gitignore patterns for large files Performance: - Phase 6-3: 4.79 M ops/s (has OOM risk) - With SuperSlab: 3.13 M ops/s (+19% improvement) This is a clean repository without large log files. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
198 lines
5.5 KiB
C
198 lines
5.5 KiB
C
// hakmem_debug.h - Debug Timing Infrastructure (Box Theory: measurement box)
// Purpose: Lightweight timing measurement for performance analysis
//
// License: MIT
// Date: 2025-10-21

#pragma once

#include <stdint.h>
#include <time.h>

#ifdef __cplusplus
extern "C" {
#endif

// ============================================================================
// Configuration (controllable via environment variables)
// ============================================================================

// Build-time guard: HAKMEM_DEBUG_TIMING=1 enables all timing code.
// Define it on the compiler command line (e.g. -DHAKMEM_DEBUG_TIMING=1)
// before this header is included; otherwise it defaults to 0.
#ifndef HAKMEM_DEBUG_TIMING
#define HAKMEM_DEBUG_TIMING 0  // Default: OFF (zero overhead)
#endif

// Runtime guard: HAKMEM_TIMING=1 enables timing at runtime
// Sampling rate: HAKMEM_TIMING_SAMPLE=N (default: 64, means 1/64 sampling)

// ============================================================================
// Timing Categories (measurement category IDs)
// ============================================================================

// Identifies which counter a count/cycle sample is attributed to.
// Values are assigned sequentially starting at 0; HKM_CAT_MAX is the number of
// categories. Inserting or reordering entries renumbers every later category,
// so append new ones immediately before HKM_CAT_MAX.
typedef enum {
    // Syscalls and fallbacks
    HKM_CAT_SYSCALL_MMAP = 0,
    HKM_CAT_SYSCALL_MUNMAP,
    HKM_CAT_SYSCALL_MADVISE,
    HKM_CAT_FALLBACK_MALLOC,

    // Whale Fast-Path
    HKM_CAT_WHALE_GET,
    HKM_CAT_WHALE_PUT,

    // L2.5 Pool (Phase 6.11.3: Added for profiling)
    HKM_CAT_L25_GET,
    HKM_CAT_L25_PUT,
    HKM_CAT_L25_REFILL,

    // Tiny Pool (Phase 6.11.3: Added for profiling)
    HKM_CAT_TINY_ALLOC,
    HKM_CAT_TINY_FREE,
    HKM_CAT_TINY_SLAB_SEARCH,

    // BigCache (Phase 6.11.3: Added for profiling)
    HKM_CAT_BIGCACHE_GET,
    HKM_CAT_BIGCACHE_PUT,
    HKM_CAT_BIGCACHE_EVICT_SCAN,

    // Site Rules (Phase 6.11.3: Added for profiling)
    HKM_CAT_SITE_RULES_LOOKUP,
    HKM_CAT_SITE_RULES_ADOPT,

    // ELO Learning (Phase 6.11.3: Added for profiling)
    HKM_CAT_ELO_SELECT,
    HKM_CAT_ELO_UPDATE,

    // Top-level API (Phase 6.11.3: Added for profiling)
    HKM_CAT_HAK_ALLOC,
    HKM_CAT_HAK_FREE,

    // Legacy (keep for compatibility)
    HKM_CAT_REGION_GET,
    HKM_CAT_REGION_PUT,
    HKM_CAT_HASH_FNV1A,

    // Mid (L2 Pool) fine-grained
    HKM_CAT_POOL_GET,
    HKM_CAT_POOL_LOCK,
    HKM_CAT_POOL_REFILL,
    HKM_CAT_TC_DRAIN,
    HKM_CAT_TLS_FAST,
    HKM_CAT_SHARD_STEAL,
    // Additional Mid (L2) fine-grained (NEW)
    HKM_CAT_POOL_TLS_RING_POP,
    HKM_CAT_POOL_TLS_LIFO_POP,
    HKM_CAT_POOL_REMOTE_PUSH,
    HKM_CAT_POOL_ALLOC_TLS_PAGE,

    // L2.5 (LargePool) fine-grained (NEW)
    HKM_CAT_L25_LOCK,
    HKM_CAT_L25_TLS_RING_POP,
    HKM_CAT_L25_TLS_LIFO_POP,
    HKM_CAT_L25_TC_DRAIN,
    HKM_CAT_L25_REMOTE_PUSH,
    HKM_CAT_L25_ALLOC_TLS_PAGE,
    HKM_CAT_L25_SHARD_STEAL,

    HKM_CAT_MAX  // Total number of categories
} HkmTimingCategory;

// ============================================================================
// Timing Statistics (TLS per-thread)
// ============================================================================

// Accumulator for one timing category: how many samples were recorded and the
// sum of their durations.
typedef struct {
    uint64_t count;   // Number of calls
    uint64_t cycles;  // Total cycles (TSC or nanoseconds)
} HkmTimingStat;

// ============================================================================
|
|
// Public API
|
|
// ============================================================================
|
|
|
|
// Initialize timing subsystem (called from hak_init)
|
|
void hkm_timing_init(void);
|
|
|
|
// Shutdown timing subsystem (called from hak_shutdown, dumps stats)
|
|
void hkm_timing_shutdown(void);
|
|
|
|
// Get current timestamp (TSC or clock_gettime)
|
|
static inline uint64_t hkm_tsc_now(void);
|
|
|
|
// Increment counter for category
|
|
void hkm_count_inc(HkmTimingCategory cat);
|
|
|
|
// Add cycles to category
|
|
void hkm_cycles_add(HkmTimingCategory cat, uint64_t cycles);
|
|
|
|
// Dump all statistics (called at exit or on demand)
|
|
void hkm_timing_dump(void);
|
|
|
|
// ============================================================================
// Timing Macros (inlined; compiled out when timing is disabled at build time)
// ============================================================================

#if HAKMEM_DEBUG_TIMING

// HKM_TIME_START(var) - Start timing, stores start time in var.
// Expands to a declaration of `var`, so it must appear where a declaration is
// allowed and `var` must not already exist in that scope.
#define HKM_TIME_START(var) \
    uint64_t var = hkm_tsc_now()

// HKM_TIME_END(cat, var) - End timing, add elapsed time to category.
// `cat` is evaluated exactly once; the internal temporaries carry an hkm_
// prefix so they cannot capture a caller variable passed as `var`.
#define HKM_TIME_END(cat, var) \
    do { \
        HkmTimingCategory hkm_time_cat_ = (cat); \
        uint64_t hkm_time_end_ = hkm_tsc_now(); \
        hkm_cycles_add(hkm_time_cat_, hkm_time_end_ - (var)); \
        hkm_count_inc(hkm_time_cat_); \
    } while (0)

// HKM_CNT(cat) - Increment count only (no timing)
#define HKM_CNT(cat) \
    hkm_count_inc(cat)

#else

// Build-time disabled: declare variables but no timing code (zero overhead)
#define HKM_TIME_START(var) \
    uint64_t var = 0

// Arguments are still evaluated once and cast to void so callers do not get
// unused-variable warnings when timing is compiled out.
#define HKM_TIME_END(cat, var) \
    do { (void)(cat); (void)(var); } while (0)

// A void expression, matching the syntactic form of the enabled variant.
#define HKM_CNT(cat) \
    ((void)(cat))

#endif

// ============================================================================
// Internal: TSC/Clock Implementation (inline for zero overhead)
// ============================================================================

#if HAKMEM_DEBUG_TIMING

// Return the current timestamp.
//   x86 / x86-64: the 64-bit time-stamp counter read with RDTSC.
//   elsewhere:    nanoseconds from clock_gettime(); 0 if the call fails.
// The unit therefore differs by platform; only differences between two
// readings taken on the same machine are meaningful.
static inline uint64_t hkm_tsc_now(void) {
#if defined(__x86_64__) || defined(__i386__)
    // x86/x64: Use RDTSC (fast); result is returned in EDX:EAX
    uint32_t lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)hi << 32) | lo;
#else
    // Non-x86: Use clock_gettime (slower, but portable)
    struct timespec ts;
#if defined(CLOCK_MONOTONIC_RAW)
    const clockid_t clock_id = CLOCK_MONOTONIC_RAW;
#else
    // CLOCK_MONOTONIC_RAW is not provided everywhere; use the POSIX clock.
    const clockid_t clock_id = CLOCK_MONOTONIC;
#endif
    if (clock_gettime(clock_id, &ts) != 0) {
        // Do not read an uninitialized timespec on failure.
        return 0;
    }
    return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
#endif
}

#else

// Build-time disabled: stub that always returns 0
static inline uint64_t hkm_tsc_now(void) {
    return 0;
}

#endif

#ifdef __cplusplus
}
#endif