Phase MID-V35-HOTPATH-OPT-1 complete: +7.3% on C6-heavy

Step 0: Geometry SSOT
  - New: core/box/smallobject_mid_v35_geom_box.h (L1/L2 consistency)
  - Fix: C6 slots/page 102→128 in L2 (smallobject_cold_iface_mid_v3.c)
  - Applied: smallobject_mid_v35.c, smallobject_segment_mid_v3.c

Step 1-3: ENV gates for hotpath optimizations
  - New: core/box/mid_v35_hotpath_env_box.h
    * HAKMEM_MID_V35_HEADER_PREFILL (default 0)
    * HAKMEM_MID_V35_HOT_COUNTS (default 1)
    * HAKMEM_MID_V35_C6_FASTPATH (default 0)
  - Implementation: smallobject_mid_v35.c
    * Header prefill at refill boundary (Step 1)
    * Gated alloc_count++ in hot path (Step 2)
    * C6 specialized fast path with constant slot_size (Step 3)

A/B Results:
  C6-heavy (257–768B): 8.75M→9.39M ops/s (+7.3%, 5-run mean) 
  Mixed (16–1024B): 9.98M→9.96M ops/s (-0.2%, within noise) ✓

Decision: FROZEN - defaults OFF, C6-heavy推奨ON, Mixed現状維持
Documentation: ENV_PROFILE_PRESETS.md updated

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-12 19:19:25 +09:00
parent e95e61f0ff
commit fe70e3baf5
9 changed files with 211 additions and 57 deletions

View File

@ -1,6 +1,27 @@
# 本線タスク(現在) # 本線タスク(現在)
## 現在地: Phase MID-V35-HOTPATH-OPT-1-DESIGN へ ## 現在地: Phase MID-V35-HOTPATH-OPT-1 完了 → 次フェーズ選定待ち
---
### Status: Phase MID-V35-HOTPATH-OPT-1 FROZEN ✅
**Summary**:
- **Design**: Step 0-3 (Geometry SSOT + Header prefill + Hot counts + C6 fastpath)
- **C6-heavy (257–768B)**: **+7.3%** improvement ✅ (8.75M → 9.39M ops/s, 5-run mean)
- **Mixed (16–1024B)**: **-0.2%** (誤差範囲, ±2%以内) ✓
- **Decision**: デフォルトOFF/FROZEN全3、C6-heavy推奨ON、Mixed現状維持
- **Key Finding**:
- Step 0: L1/L2 geometry mismatch 修正 (C6 102→128 slots)
- Step 1-3: refill 境界移動 + 分岐削減 + constant 最適化で +7.3%
- Mixed では MID_V3(C6-only) 固定なため効果微小
**Deliverables**:
- `core/box/smallobject_mid_v35_geom_box.h` (新規)
- `core/box/mid_v35_hotpath_env_box.h` (新規)
- `core/smallobject_mid_v35.c` (Step 1-3 統合)
- `core/smallobject_cold_iface_mid_v3.c` (Step 0 + Step 1)
- `docs/analysis/ENV_PROFILE_PRESETS.md` (更新)
--- ---

View File

@ -0,0 +1,52 @@
// mid_v35_hotpath_env_box.h - Phase MID-V35-HOTPATH-OPT-1 ENV gates
//
// Step 1: HAKMEM_MID_V35_HEADER_PREFILL (default 0)
// - ON: prefill headers at refill boundary, skip in hot path
// Step 2: HAKMEM_MID_V35_HOT_COUNTS (default 1)
// - OFF: skip alloc_count++ in hot path (free_count/retire kept)
// Step 3: HAKMEM_MID_V35_C6_FASTPATH (default 0)
// - ON: use specialized C6 fast path with constant slot size
//
#ifndef HAKMEM_MID_V35_HOTPATH_ENV_BOX_H
#define HAKMEM_MID_V35_HOTPATH_ENV_BOX_H
#include <stdlib.h>
#include <stdbool.h>
// Step 1 gate: header prefill at the refill boundary.
//
// Reports whether HAKMEM_MID_V35_HEADER_PREFILL is switched on. The
// environment is consulted on the first call only; the answer is kept in a
// function-local static (a plain int, no synchronization) and reused on
// every later call.
//
// Returns true only when the variable is set and its first character is
// '1'. Unset, empty, or any other leading character means OFF (the default).
static inline bool mid_v35_header_prefill_enabled(void) {
    static int cached = -1;  // -1: not read yet, 0: off, 1: on

    // Cold branch: taken only until the first lookup has been stored.
    if (__builtin_expect(cached < 0, 0)) {
        const char *val = getenv("HAKMEM_MID_V35_HEADER_PREFILL");
        cached = (val != NULL && val[0] == '1');
    }
    return cached != 0;
}
// Step 2 gate: alloc_count++ bookkeeping in the hot path.
//
// Reports whether HAKMEM_MID_V35_HOT_COUNTS is switched on. The environment
// is consulted on the first call only; the answer is kept in a
// function-local static (a plain int, no synchronization) and reused on
// every later call.
//
// Default is ON: the gate turns OFF only when the variable is set and its
// first character is '0'. Unset, empty, or any other leading character
// leaves it ON.
static inline bool mid_v35_hot_counts_enabled(void) {
    static int cached = -1;  // -1: not read yet, 0: off, 1: on

    // Cold branch: taken only until the first lookup has been stored.
    if (__builtin_expect(cached < 0, 0)) {
        const char *val = getenv("HAKMEM_MID_V35_HOT_COUNTS");
        int explicitly_off = (val != NULL && val[0] == '0');
        cached = !explicitly_off;
    }
    return cached != 0;
}
// Step 3 gate: C6 specialized fast path.
//
// Reports whether HAKMEM_MID_V35_C6_FASTPATH is switched on. The environment
// is consulted on the first call only; the answer is kept in a
// function-local static (a plain int, no synchronization) and reused on
// every later call.
//
// Returns true only when the variable is set and its first character is
// '1'. Unset, empty, or any other leading character means OFF (the default).
static inline bool mid_v35_c6_fastpath_enabled(void) {
    static int cached = -1;  // -1: not read yet, 0: off, 1: on

    // Cold branch: taken only until the first lookup has been stored.
    if (__builtin_expect(cached < 0, 0)) {
        const char *val = getenv("HAKMEM_MID_V35_C6_FASTPATH");
        cached = (val != NULL && val[0] == '1');
    }
    return cached != 0;
}
#endif // HAKMEM_MID_V35_HOTPATH_ENV_BOX_H

View File

@ -0,0 +1,45 @@
// smallobject_mid_v35_geom_box.h - Phase MID-V35-HOTPATH-OPT-1 Step 0
//
// Single Source of Truth for MID v3.5 geometry (C5-C7)
// All layers (L1 HotBox, L2 ColdIface, Segment) must use these helpers.
//
// Geometry (64KB page):
// C5: 384B slots, 170 slots/page (257-384B)
// C6: 512B slots, 128 slots/page (385-512B)
// C7: 1024B slots, 64 slots/page (513-1024B)
//
#ifndef HAKMEM_SMALLOBJECT_MID_V35_GEOM_BOX_H
#define HAKMEM_SMALLOBJECT_MID_V35_GEOM_BOX_H
#include <stdint.h>
#include <stddef.h>
// Page size for MID v3.5 (64KB)
#define MID_V35_PAGE_SIZE (64 * 1024)
// Slot size in bytes for a MID v3.5 size class.
//
//   class_idx 5 -> 384   (C5: 257-384B requests)
//   class_idx 6 -> 512   (C6: 385-512B requests)
//   class_idx 7 -> 1024  (C7: 513-1024B requests)
//
// Any other class_idx yields 0.
static inline size_t mid_v35_slot_size(uint32_t class_idx) {
    if (class_idx == 5) {
        return 384;
    }
    if (class_idx == 6) {
        return 512;
    }
    if (class_idx == 7) {
        return 1024;
    }
    return 0;  // outside C5-C7
}
// Number of slots that fit in one 64KB page for a MID v3.5 size class.
//
//   class_idx 5 -> 170  (65536 / 384, rounded down)
//   class_idx 6 -> 128  (65536 / 512)
//   class_idx 7 -> 64   (65536 / 1024)
//
// Any other class_idx yields 0.
static inline uint32_t mid_v35_slots_per_page(uint32_t class_idx) {
    // Indexed directly by class_idx; entries not listed are zero-initialized.
    static const uint32_t slots_by_class[8] = {
        [5] = 170,
        [6] = 128,
        [7] = 64,
    };

    if (class_idx >= 8) {
        return 0;  // beyond the table: not a MID v3.5 class
    }
    return slots_by_class[class_idx];
}
// Range check for MID v3.5 size classes.
// Returns 1 when class_idx is 5, 6 or 7 (C5-C7), otherwise 0.
static inline int mid_v35_class_valid(uint32_t class_idx) {
    int out_of_range = (class_idx < 5 || class_idx > 7);
    return !out_of_range;
}
#endif // HAKMEM_SMALLOBJECT_MID_V35_GEOM_BOX_H

View File

@ -5,6 +5,9 @@
#include <stdio.h> #include <stdio.h>
#include "box/smallobject_cold_iface_mid_v3_box.h" #include "box/smallobject_cold_iface_mid_v3_box.h"
#include "box/smallobject_stats_mid_v3_box.h" #include "box/smallobject_stats_mid_v3_box.h"
#include "box/smallobject_mid_v35_geom_box.h" // Phase MID-V35-HOTPATH-OPT-1: geometry SSOT
#include "box/mid_v35_hotpath_env_box.h" // Phase MID-V35-HOTPATH-OPT-1: Step 1-3 ENV gates
#include "tiny_region_id.h" // For tiny_region_id_write_header
// SmallPageMeta is defined in smallobject_segment_mid_v3_box.h // SmallPageMeta is defined in smallobject_segment_mid_v3_box.h
#include "box/smallobject_segment_mid_v3_box.h" #include "box/smallobject_segment_mid_v3_box.h"
@ -19,15 +22,9 @@ static __thread SmallSegment_MID_v3 *tls_mid_segment = NULL;
// ============================================================================ // ============================================================================
// Helper: class_idx to slots // Helper: class_idx to slots
// ============================================================================ // ============================================================================
// Phase MID-V35-HOTPATH-OPT-1: Use geom_box as Single Source of Truth
static uint32_t class_idx_to_slots(uint32_t class_idx) { // See: core/box/smallobject_mid_v35_geom_box.h
switch (class_idx) { // (Removed local class_idx_to_slots() which had wrong C6 value: 102 instead of 128)
case 5: return 170; // C5: 257-384B
case 6: return 102; // C6: 385-640B
case 7: return 64; // C7: 641-1024B
default: return 0;
}
}
// ============================================================================ // ============================================================================
// Cold Interface Implementation // Cold Interface Implementation
@ -63,10 +60,21 @@ SmallPageMeta_MID_v3* small_cold_mid_v3_refill_page(uint32_t class_idx) {
// Initialize page for allocation // Initialize page for allocation
page->class_idx = class_idx; page->class_idx = class_idx;
page->capacity = class_idx_to_slots(class_idx); page->capacity = mid_v35_slots_per_page(class_idx);
page->alloc_count = 0; page->alloc_count = 0;
page->free_count = 0; page->free_count = 0;
// Phase MID-V35-HOTPATH-OPT-1 Step 1: Header prefill at refill boundary
// When enabled, write all headers now so hot path can skip per-alloc header writes
if (mid_v35_header_prefill_enabled()) {
size_t slot_size = mid_v35_slot_size(class_idx);
uint32_t cap = page->capacity;
uint8_t *p = (uint8_t*)page_ptr;
for (uint32_t i = 0; i < cap; i++) {
tiny_region_id_write_header(p + i * slot_size, class_idx);
}
}
return page; return page;
} }

View File

@ -13,6 +13,8 @@
#include "box/smallobject_mid_v35_box.h" #include "box/smallobject_mid_v35_box.h"
#include "box/smallobject_segment_mid_v3_box.h" #include "box/smallobject_segment_mid_v3_box.h"
#include "box/smallobject_cold_iface_mid_v3_box.h" #include "box/smallobject_cold_iface_mid_v3_box.h"
#include "box/smallobject_mid_v35_geom_box.h" // Phase MID-V35-HOTPATH-OPT-1: geometry SSOT
#include "box/mid_v35_hotpath_env_box.h" // Phase MID-V35-HOTPATH-OPT-1: Step 1-3 ENV gates
#include "tiny_region_id.h" // For tiny_region_id_write_header #include "tiny_region_id.h" // For tiny_region_id_write_header
// SmallPageMeta is defined in smallobject_segment_mid_v3_box.h // SmallPageMeta is defined in smallobject_segment_mid_v3_box.h
@ -34,26 +36,8 @@ static __thread SmallMidV35TlsCtx tls_mid_v35_ctx = {0};
// ============================================================================ // ============================================================================
// Slot Configuration (C5/C6/C7) // Slot Configuration (C5/C6/C7)
// ============================================================================ // ============================================================================
// Phase MID-V35-HOTPATH-OPT-1: Use geom_box as Single Source of Truth
// Slot sizes for C5, C6, C7 // See: core/box/smallobject_mid_v35_geom_box.h
static const size_t g_slot_sizes[8] = {
0, // C0: not used
0, // C1: not used
0, // C2: not used
0, // C3: not used
0, // C4: not used (ULTRA handles this)
384, // C5: 257-384 bytes → 384 byte slots
512, // C6: 385-512 bytes → 512 byte slots
1024, // C7: 513-1024 bytes → 1024 byte slots (ULTRA handles this)
};
// Slots per 64KB page
static const uint32_t g_slots_per_page[8] = {
0, 0, 0, 0, 0,
170, // C5: 65536 / 384 = 170
128, // C6: 65536 / 512 = 128
64, // C7: 65536 / 1024 = 64
};
// ============================================================================ // ============================================================================
// Init // Init
@ -74,26 +58,57 @@ void* small_mid_v35_alloc(uint32_t class_idx, size_t size) {
SmallMidV35TlsCtx *ctx = &tls_mid_v35_ctx; SmallMidV35TlsCtx *ctx = &tls_mid_v35_ctx;
// Fast path: allocate from TLS cached page // ========================================================================
// Step 3: C6 specialized fast path (constant slot size = 512)
// ========================================================================
if (mid_v35_c6_fastpath_enabled() && class_idx == 6) {
void *page = ctx->page[6];
uint32_t off = ctx->offset[6];
if (page && off < ctx->capacity[6]) {
// C6: slot_size = 512 (constant, compiler can optimize)
void *base = (char*)page + off * 512;
ctx->offset[6] = off + 1;
// Step 2: HOT_COUNTS gate
if (mid_v35_hot_counts_enabled() && ctx->meta[6]) {
ctx->meta[6]->alloc_count++;
}
// Step 1: HEADER_PREFILL gate
if (!mid_v35_header_prefill_enabled()) {
tiny_region_id_write_header(base, 6);
}
return (char*)base + 1;
}
// Fall through to slow path
}
// ========================================================================
// Generic fast path: allocate from TLS cached page
// ========================================================================
if (ctx->page[class_idx] && ctx->offset[class_idx] < ctx->capacity[class_idx]) { if (ctx->page[class_idx] && ctx->offset[class_idx] < ctx->capacity[class_idx]) {
size_t slot_size = g_slot_sizes[class_idx]; size_t slot_size = mid_v35_slot_size(class_idx);
void *base = (char*)ctx->page[class_idx] + ctx->offset[class_idx] * slot_size; void *base = (char*)ctx->page[class_idx] + ctx->offset[class_idx] * slot_size;
ctx->offset[class_idx]++; ctx->offset[class_idx]++;
// Update page metadata // Step 2: HOT_COUNTS gate - Update page metadata
if (ctx->meta[class_idx]) { if (mid_v35_hot_counts_enabled() && ctx->meta[class_idx]) {
ctx->meta[class_idx]->alloc_count++; ctx->meta[class_idx]->alloc_count++;
} }
// Write header (1-byte Tiny header with class_idx) // Step 1: HEADER_PREFILL gate - Write header if not prefilled
// Note: Assumes HAKMEM_TINY_HEADER_CLASSIDX is enabled if (!mid_v35_header_prefill_enabled()) {
tiny_region_id_write_header(base, class_idx); tiny_region_id_write_header(base, class_idx);
}
// Return USER pointer (BASE + 1 byte header) // Return USER pointer (BASE + 1 byte header)
return (char*)base + 1; return (char*)base + 1;
} }
// ========================================================================
// Slow path: need new page via ColdIface // Slow path: need new page via ColdIface
// ========================================================================
SmallPageMeta_MID_v3 *page = small_cold_mid_v3_refill_page(class_idx); SmallPageMeta_MID_v3 *page = small_cold_mid_v3_refill_page(class_idx);
if (!page) { if (!page) {
// Fallback to legacy or return NULL // Fallback to legacy or return NULL
@ -103,14 +118,18 @@ void* small_mid_v35_alloc(uint32_t class_idx, size_t size) {
// Update TLS cache // Update TLS cache
ctx->page[class_idx] = page->ptr; ctx->page[class_idx] = page->ptr;
ctx->offset[class_idx] = 1; // First slot already allocated ctx->offset[class_idx] = 1; // First slot already allocated
ctx->capacity[class_idx] = g_slots_per_page[class_idx]; ctx->capacity[class_idx] = mid_v35_slots_per_page(class_idx);
ctx->meta[class_idx] = page; ctx->meta[class_idx] = page;
// Record first allocation in page metadata // Step 2: HOT_COUNTS gate - Record first allocation in page metadata
if (mid_v35_hot_counts_enabled()) {
page->alloc_count = 1; page->alloc_count = 1;
}
// Write header for first slot // Step 1: HEADER_PREFILL gate - Write header for first slot if not prefilled
if (!mid_v35_header_prefill_enabled()) {
tiny_region_id_write_header(page->ptr, class_idx); tiny_region_id_write_header(page->ptr, class_idx);
}
// Return first slot (USER pointer) // Return first slot (USER pointer)
return (char*)page->ptr + 1; return (char*)page->ptr + 1;

View File

@ -7,6 +7,7 @@
#include <stdio.h> #include <stdio.h>
#include "box/smallobject_segment_mid_v3_box.h" #include "box/smallobject_segment_mid_v3_box.h"
#include "box/region_id_v6_box.h" #include "box/region_id_v6_box.h"
#include "box/smallobject_mid_v35_geom_box.h" // Phase MID-V35-HOTPATH-OPT-1: geometry SSOT
// SmallPageMeta is now defined in smallobject_segment_mid_v3_box.h // SmallPageMeta is now defined in smallobject_segment_mid_v3_box.h
@ -21,20 +22,9 @@
// ============================================================================ // ============================================================================
// Helper: class_idx to slots mapping // Helper: class_idx to slots mapping
// ============================================================================ // ============================================================================
// Phase MID-V35-HOTPATH-OPT-1: Use geom_box as Single Source of Truth
static uint32_t class_idx_to_slots(uint32_t class_idx) { // See: core/box/smallobject_mid_v35_geom_box.h
// MID v3.5 targets C5-C7 (257-1024B) // (Removed local class_idx_to_slots() which had wrong C6 value: 102 instead of 128)
// C5: 257-384B → 64KiB / 384B ≈ 170 slots
// C6: 385-640B → 64KiB / 640B ≈ 102 slots
// C7: 641-1024B → 64KiB / 1024B ≈ 64 slots
switch (class_idx) {
case 5: return 170; // C5
case 6: return 102; // C6
case 7: return 64; // C7
default: return 0;
}
}
// ============================================================================ // ============================================================================
// Segment Lifecycle // Segment Lifecycle

View File

@ -48,6 +48,19 @@ HAKMEM_TINY_HEAP_STATS=1
HAKMEM_TINY_HEAP_STATS_DUMP=1 HAKMEM_TINY_HEAP_STATS_DUMP=1
HAKMEM_SMALL_HEAP_V3_STATS=1 HAKMEM_SMALL_HEAP_V3_STATS=1
``` ```
- **Phase MID-V35-HOTPATH-OPT-1** (FROZEN - research only):
```sh
HAKMEM_MID_V35_HEADER_PREFILL=1 # refill境界でheader先行書き
HAKMEM_MID_V35_HOT_COUNTS=0 # alloc_count削除
HAKMEM_MID_V35_C6_FASTPATH=1 # C6特化 fast path
```
- **Status**: Default OFF, FROZEN (all 3 knobs)
- **Actual Results** (Phase MID-V35-HOTPATH-OPT-1 Mixed A/B):
- Mixed (16–1024B, MID_V35_OFF): **-0.2%** (誤差範囲, ±2%以内) ✓
- C6-heavy (257–768B, MID_V35_ON): **+7.3%** improvement ✅
- **Finding**: Mixed は MID_V3(C6-only) 固定で効果微小C6-heavy のみ効果大
- **Recommendation**: C6_HEAVY_LEGACY_POOLV1 プリセットで推奨ON
- **NOT recommended for**: MIXED_TINYV3_C7_SAFE mainline (keep all defaults OFF)
- **Phase POLICY-FAST-PATH-V2** (FROZEN - research only): - **Phase POLICY-FAST-PATH-V2** (FROZEN - research only):
```sh ```sh
HAKMEM_TINY_FREE_POLICY_FAST_V2=1 # Fast-path free optimization HAKMEM_TINY_FREE_POLICY_FAST_V2=1 # Fast-path free optimization
@ -100,6 +113,12 @@ HAKMEM_MID_V3_ENABLED=1 # Phase MID-V3: 257-768B, C6 only
HAKMEM_MID_V3_CLASSES=0x40 # C6 only (+11% on C6-heavy) HAKMEM_MID_V3_CLASSES=0x40 # C6 only (+11% on C6-heavy)
HAKMEM_MID_V35_ENABLED=1 # Phase v11a-5: C6-heavy で +8% 改善 HAKMEM_MID_V35_ENABLED=1 # Phase v11a-5: C6-heavy で +8% 改善
HAKMEM_MID_V35_CLASSES=0x40 # C6 only (53.1M ops/s) HAKMEM_MID_V35_CLASSES=0x40 # C6 only (53.1M ops/s)
# Phase MID-V35-HOTPATH-OPT-1: C6-heavy 最速セット推奨ON
# 機能: header prefill + hot counts削除 + C6 fast path (組み合わせで +7.3%)
HAKMEM_MID_V35_HEADER_PREFILL=1 # refill境界でheader先行書き
HAKMEM_MID_V35_HOT_COUNTS=0 # alloc_count削除free_count/retire残す
HAKMEM_MID_V35_C6_FASTPATH=1 # C6特化 fast path (constant slot_size=512)
``` ```
- mid_desc_lookup TLS キャッシュを試すときだけ: `HAKMEM_MID_DESC_CACHE_ENABLED=1` を上乗せデフォルトは OFF)。 - mid_desc_lookup TLS キャッシュを試すときだけ: `HAKMEM_MID_DESC_CACHE_ENABLED=1` を上乗せデフォルトは OFF)。

BIN
perf.data.c6heavy_baseline Normal file

Binary file not shown.

BIN
perf.data.mixed_baseline Normal file

Binary file not shown.