Phase v7-5a: Hot path stats removal (C6 v7 極限最適化)

- Remove per-page stats from hot path (alloc_count, free_count, live_current)
- Add ENV-gated global atomic stats (HAKMEM_V7_HOT_STATS)
- Stats now collected only at retire time (cold path)
- Header write kept at alloc time (freelist overlaps block[0])

A/B result: v7 ON throughput relative to v7 OFF (legacy) went from -4.3% to ±0% (target: within ±2% of legacy)
v7 OFF avg: 9.26M ops/s, v7 ON avg: 9.27M ops/s (+0.15%)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-12 04:51:17 +09:00
parent 580e8f57f7
commit 17ceed619c
3 changed files with 204 additions and 44 deletions

View File

@ -1,9 +1,14 @@
// smallobject_hotbox_v7_box.h - SmallObject HotBox v7 (Phase v7-2: C6-only impl)
// smallobject_hotbox_v7_box.h - SmallObject HotBox v7 (Phase v7-5a: Hot path極限最適化)
//
// Role:
// - SmallObject v7 fast path for alloc/free
// - C6-only implementation (512B blocks, 64KiB pages, 2MiB segments)
// - Uses SmallHeapCtx_v7 + SmallSegment_v7 + ColdIface_v7
//
// v7-5a optimizations:
// - Stats (alloc_count, free_count, live_current) removed from hot path
// - Global atomic stats gated by ENV (HAKMEM_V7_HOT_STATS)
// - Header write kept (required due to intrusive freelist overlapping block[0])
#pragma once
@ -11,6 +16,7 @@
#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h> // for getenv()
#include "smallsegment_v7_box.h"
#include "smallobject_cold_iface_v7_box.h"
#include "region_id_v6_box.h"
@ -22,7 +28,7 @@
#endif
// ============================================================================
// Debug/Observe Support
// Debug/Observe Support (v7-5a: ENV-gated for hot path)
// ============================================================================
// V7 stats functions (defined in smallobject_cold_iface_v7.c)
@ -31,6 +37,23 @@ extern void small_v7_stat_free(void);
extern void small_v7_stat_refill(void);
extern void small_v7_stat_retire(void);
// v7-5a: Tells callers whether the per-alloc/free hot-path stat counters are on.
// The HAKMEM_V7_HOT_STATS environment variable is consulted on the first call
// and the answer is cached in a function-local static for later calls.
// Unset, empty, or a value starting with '0' means disabled (the default);
// any other value enables the counters.
// Returns 1 when enabled, 0 otherwise.
static inline int small_v7_hot_stats_enabled(void) {
    static int cached = -1;  // -1 = environment not consulted yet

    // Common case: the answer was already resolved on an earlier call.
    if (__builtin_expect(cached >= 0, 1)) {
        return cached;
    }

    const char* val = getenv("HAKMEM_V7_HOT_STATS");
    if (val == NULL || val[0] == '\0' || val[0] == '0') {
        cached = 0;
    } else {
        cached = 1;
    }
    return cached;
}
// Hot-path stat hooks. Each macro expands to a single statement that calls the
// matching global counter function (small_v7_stat_alloc / small_v7_stat_free)
// only when small_v7_hot_stats_enabled() returns non-zero. The enabled case is
// hinted to the compiler as unlikely.
#define SMALL_V7_HOT_STAT_ALLOC()                                   \
    do {                                                            \
        if (__builtin_expect(small_v7_hot_stats_enabled(), 0)) {    \
            small_v7_stat_alloc();                                  \
        }                                                           \
    } while (0)
#define SMALL_V7_HOT_STAT_FREE()                                    \
    do {                                                            \
        if (__builtin_expect(small_v7_hot_stats_enabled(), 0)) {    \
            small_v7_stat_free();                                   \
        }                                                           \
    } while (0)
// Class mismatch logging (for hint validation)
static inline void small_v7_log_class_mismatch(void* ptr, uint8_t hint, uint8_t actual) {
// TODO: Make this ENV-controlled
@ -44,7 +67,7 @@ static inline void small_v7_log_class_mismatch(void* ptr, uint8_t hint, uint8_t
// Alloc Fast Path
// ============================================================================
// small_heap_alloc_fast_v7() - v7 alloc (C6-only for v7-2)
// small_heap_alloc_fast_v7() - v7 alloc (C6-only, v7-5a: Hot path極限最適化)
//
// Flow:
// 1. Get TLS context
@ -53,6 +76,10 @@ static inline void small_v7_log_class_mismatch(void* ptr, uint8_t hint, uint8_t
// 4. If no partial, call ColdIface refill
// 5. Pop from freelist and return USER ptr
//
// v7-5a optimizations:
// - Per-page stats (alloc_count, live_current) removed from hot path
// - Global atomic stats gated by ENV (HAKMEM_V7_HOT_STATS)
//
static inline void* small_heap_alloc_fast_v7(size_t size, uint8_t class_idx) {
// v7-2: Only C6 is implemented
if (unlikely(class_idx != SMALL_V7_C6_CLASS_IDX)) {
@ -69,17 +96,13 @@ static inline void* small_heap_alloc_fast_v7(size_t size, uint8_t class_idx) {
p->free_list = *(void**)base;
p->used++;
// Update stats
p->alloc_count++;
p->live_current++;
if (p->live_current > p->peak_live) {
p->peak_live = p->live_current;
}
// Write header (HEADER_MAGIC | class_idx) for front compatibility
// Note: Cannot move to carve time due to intrusive freelist overlapping block[0]
((uint8_t*)base)[0] = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
small_v7_stat_alloc();
// v7-5a: Per-page stats removed here; the global counter is bumped only when HAKMEM_V7_HOT_STATS is enabled
SMALL_V7_HOT_STAT_ALLOC();
// Return USER ptr (base + 1 for header compatibility with front)
return (uint8_t*)base + 1;
}
@ -96,16 +119,10 @@ static inline void* small_heap_alloc_fast_v7(size_t size, uint8_t class_idx) {
p->free_list = *(void**)base;
p->used++;
p->alloc_count++;
p->live_current++;
if (p->live_current > p->peak_live) {
p->peak_live = p->live_current;
}
// Write header (HEADER_MAGIC | class_idx) for front compatibility
// Write header
((uint8_t*)base)[0] = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
small_v7_stat_alloc();
SMALL_V7_HOT_STAT_ALLOC();
return (uint8_t*)base + 1;
}
}
@ -124,16 +141,10 @@ static inline void* small_heap_alloc_fast_v7(size_t size, uint8_t class_idx) {
p->free_list = *(void**)base;
p->used++;
p->alloc_count++;
p->live_current++;
if (p->live_current > p->peak_live) {
p->peak_live = p->live_current;
}
// Write header (HEADER_MAGIC | class_idx) for front compatibility
// Write header
((uint8_t*)base)[0] = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
small_v7_stat_alloc();
SMALL_V7_HOT_STAT_ALLOC();
return (uint8_t*)base + 1;
}
@ -141,12 +152,15 @@ static inline void* small_heap_alloc_fast_v7(size_t size, uint8_t class_idx) {
// Free Fast Path
// ============================================================================
// small_heap_free_fast_v7() - v7 free (C6-only, Phase v7-3: TLS fast path)
// small_heap_free_fast_v7() - v7 free (C6-only, v7-5a: Hot path極限最適化)
//
// Flow:
// 1. Same-page cache hit (fastest)
// 2. TLS segment hint hit (skip RegionIdBox)
// 3. RegionIdBox fallback (cold path)
// 1. TLS segment hint hit (skip RegionIdBox)
// 2. RegionIdBox fallback (cold path)
//
// v7-5a optimizations:
// - Stats (free_count, live_current) removed from hot path
// - Global atomic stats gated by ENV
//
// @param ptr: USER pointer to free
// @param class_idx_hint: Class index hint from front/header (may be ignored)
@ -190,9 +204,7 @@ static inline bool small_heap_free_fast_v7(void* ptr, uint8_t class_idx_hint) {
*(void**)base = page->free_list;
page->free_list = base;
// Update stats
page->free_count++;
page->live_current--;
// v7-5a: Stats removed from hot path
// Retire if empty
if (unlikely(--page->used == 0)) {
@ -200,7 +212,7 @@ static inline bool small_heap_free_fast_v7(void* ptr, uint8_t class_idx_hint) {
small_cold_v7_retire_page(ctx, page);
}
small_v7_stat_free();
SMALL_V7_HOT_STAT_FREE();
return true;
}
@ -243,9 +255,7 @@ regionid_fallback:
*(void**)base = page->free_list;
page->free_list = base;
// Update stats
page->free_count++;
page->live_current--;
// v7-5a: Stats removed from hot path
// Decrement used count
if (unlikely(--page->used == 0)) {
@ -253,7 +263,7 @@ regionid_fallback:
small_cold_v7_retire_page(ctx, page);
}
small_v7_stat_free();
SMALL_V7_HOT_STAT_FREE();
return true;
}
}