Phase 3d-B: TLS Cache Merge - Unified g_tls_sll[] structure (+12-18% expected)

Merge separate g_tls_sll_head[] and g_tls_sll_count[] arrays into unified
TinyTLSSLL struct to improve L1D cache locality. Expected performance gain:
+12-18% from reducing cache line splits (2 loads → 1 load per operation).

Changes:
- core/hakmem_tiny.h: Add TinyTLSSLL type (16B aligned, head+count+pad)
- core/hakmem_tiny.c: Replace separate arrays with g_tls_sll[8]
- core/box/tls_sll_box.h: Update Box API (13 sites) for unified access
- Updated 32+ files: All g_tls_sll_head[i] → g_tls_sll[i].head
- Updated 32+ files: All g_tls_sll_count[i] → g_tls_sll[i].count
- core/hakmem_tiny_integrity.h: Unified canary guards
- core/box/integrity_box.c: Simplified canary validation
- Makefile: Added core/box/tiny_sizeclass_hist_box.o to link

Build:  PASS (10K ops sanity test)
Warnings: Only pre-existing LTO type mismatches (unrelated)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-11-20 07:32:30 +09:00
parent 38552c3f39
commit 9b0d746407
83 changed files with 7509 additions and 259 deletions

View File

@ -0,0 +1,203 @@
#include "tiny_ultra_heap.h"
#if HAKMEM_TINY_ULTRA_HEAP
// The TinyTLS slab array is the existing Tiny layer's "page/local slab view".
// UltraHeap only looks at it through a Box for now (behavior is not changed yet).
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES];
// Unified front removed (A/B test: OFF is faster)
// #include "../front/tiny_unified_cache.h"
#include "../tiny_region_id.h"
#include "../hakmem_tiny_unified_stats.h"
#include <stdlib.h>
#include <stdio.h>
// Per-thread UltraHeap instance (declared __thread). It is zero-initialized, so
// its `initialized` flag stays 0 until tiny_ultra_heap_init() sets it.
__thread TinyUltraHeap g_tiny_ultra_heap = {0};
// UltraHeap L0 cache switch (ENV: HAKMEM_TINY_ULTRA_L0).
//
// Returns 1 when the variable is set to a non-empty value whose first
// character is not '0'; returns 0 otherwise (unset, empty, or starting
// with '0'). The environment is consulted only on the first call; the
// answer is kept in a function-local static and reused afterwards.
static inline int tiny_ultra_l0_enabled(void)
{
    static int cached = -1;  // -1 means "environment not inspected yet"

    if (__builtin_expect(cached != -1, 1)) {
        return cached;
    }

    const char* val = getenv("HAKMEM_TINY_ULTRA_L0");
    if (val == NULL || val[0] == '\0' || val[0] == '0') {
        cached = 0;
    } else {
        cached = 1;
    }
    return cached;
}
// Take one block from the L0 cache of the given class (the original note
// says the returned pointer is the block's BASE).
//
// Returns NULL when L0 is disabled via the environment or when the
// class's slot stack is empty. class_idx is used as an index without a
// range check here, so the caller must pass a valid class.
static inline void*
tiny_ultra_heap_l0_pop(TinyUltraHeap* heap, int class_idx)
{
    if (tiny_ultra_l0_enabled()) {
        TinyUltraL0* cache = &heap->l0[class_idx];
        if (cache->count != 0) {
            // Slots are used as a stack: drop the count, then read the top.
            cache->count -= 1;
            return cache->slots[cache->count];
        }
    }
    return NULL;
}
// Originally: refill L0 from the Unified Cache by pulling several BASE
// pointers and stacking them into slots[].
// The Unified Cache has been removed (A/B test: OFF is faster), so this
// is now a deliberate no-op kept only so existing call sites still compile.
static inline void
tiny_ultra_heap_l0_refill_from_unified(TinyUltraHeap* heap, int class_idx)
{
    // Nothing to refill from; just mark the parameters as intentionally unused.
    (void)class_idx;
    (void)heap;
}
// Box UH-1: the single place where a request size is mapped to a class.
//
// Returns the class index for `size`, or -1 when the size is 0, larger
// than tiny_get_max_size(), or when hak_tiny_size_to_class() yields an
// index outside [0, TINY_NUM_CLASSES).
static inline int
tiny_ultra_heap_class_for_size(size_t size)
{
    // size == 0 is tested first so tiny_get_max_size() is skipped for it.
    int out_of_range = (size == 0) || (size > tiny_get_max_size());
    if (__builtin_expect(out_of_range, 0)) {
        return -1;
    }

    int cls = hak_tiny_size_to_class(size);
    int cls_ok = (cls >= 0) && (cls < TINY_NUM_CLASSES);
    return __builtin_expect(cls_ok, 1) ? cls : -1;
}
// Box UH-2: boundary for the Unified front integration.
// It used to contain the hit/miss decision, the statistics update and the
// header write in one place. The Unified Cache has been removed (A/B test:
// OFF is faster), so this stub ignores its arguments and always reports a
// miss by returning NULL.
static inline void*
tiny_ultra_heap_try_unified(TinyUltraHeap* heap, int class_idx)
{
    (void)class_idx;
    (void)heap;
    return NULL;
}
void tiny_ultra_heap_init(void)
{
if (__builtin_expect(g_tiny_ultra_heap.initialized, 1)) {
return;
}
// Box 1: TinyUltraHeap 自体の init
g_tiny_ultra_heap.initialized = 1;
// Box 2: PageLocal ビューの初期化g_tls_slabs を alias するだけ)
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
g_tiny_ultra_heap.page[cls].tls = &g_tls_slabs[cls];
g_tiny_ultra_heap.page[cls].cls = (uint8_t)cls;
g_tiny_ultra_heap.alloc_unified_hit[cls] = 0;
g_tiny_ultra_heap.alloc_unified_refill[cls] = 0;
g_tiny_ultra_heap.alloc_fallback_ultrafront[cls] = 0;
}
}
// Allocate `size` bytes through the UltraHeap front.
//
// Flow:
//   1. Ensure the thread's UltraHeap is initialized.
//   2. Map size -> class (Box UH-1). Sizes outside the Tiny range return
//      NULL (fail-fast); UltraHeap only handles the Tiny range.
//   3. Experimental L0 path: when L0 is enabled and the class is 2 or 3
//      (the hot classes, "C2/C3" in the original note), try to pop a block
//      from L0, with one refill attempt on a miss. On a hit the block is
//      returned, passing through tiny_region_id_write_header() when
//      HAKMEM_TINY_HEADER_CLASSIDX is enabled.
//   4. Otherwise fall back to tiny_ultrafront_malloc(); a successful
//      fallback bumps alloc_fallback_ultrafront[class].
//
// Returns the allocated pointer, or NULL on failure / out-of-range size.
void* tiny_ultra_heap_alloc(size_t size)
{
    tiny_ultra_heap_init();

    // Box UH-1: size -> class conversion.
    const int cls = tiny_ultra_heap_class_for_size(size);
    if (__builtin_expect(cls < 0, 0)) {
        return NULL;
    }

    TinyUltraHeap* uh = &g_tiny_ultra_heap;

    if (tiny_ultra_l0_enabled() && (cls == 2 || cls == 3)) {
        void* blk = tiny_ultra_heap_l0_pop(uh, cls);
        if (blk == NULL) {
            // Miss: give the refill hook a chance, then retry once.
            tiny_ultra_heap_l0_refill_from_unified(uh, cls);
            blk = tiny_ultra_heap_l0_pop(uh, cls);
        }
        if (blk != NULL) {
#if HAKMEM_TINY_HEADER_CLASSIDX
            return tiny_region_id_write_header(blk, cls);
#else
            return blk;
#endif
        }
    }

    // Unified Cache removed (A/B test: OFF is faster): always use the
    // UltraFront fallback from here.
    void* result = tiny_ultrafront_malloc(size);
    if (result != NULL) {
        uh->alloc_fallback_ultrafront[cls] += 1;
    }
    return result;
}
// Free `ptr` through the UltraHeap front.
//
// For now this is delegated entirely to tiny_ultrafront_free() (described
// in the original note as "UltraFront free (Unified push)"), and its return
// value is passed back unchanged. Future work noted by the author: hook up
// the PageLocal freelist and page return here.
int tiny_ultra_heap_free(void* ptr)
{
    tiny_ultra_heap_init();

    int rc = tiny_ultrafront_free(ptr);
    return rc;
}
void tiny_ultra_heap_stats_snapshot(uint64_t hit[TINY_NUM_CLASSES],
uint64_t refill[TINY_NUM_CLASSES],
uint64_t fallback[TINY_NUM_CLASSES],
int reset)
{
tiny_ultra_heap_init();
if (!hit || !refill || !fallback) {
return;
}
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
hit[cls] = g_tiny_ultra_heap.alloc_unified_hit[cls];
refill[cls] = g_tiny_ultra_heap.alloc_unified_refill[cls];
fallback[cls] = g_tiny_ultra_heap.alloc_fallback_ultrafront[cls];
}
if (reset) {
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
g_tiny_ultra_heap.alloc_unified_hit[cls] = 0;
g_tiny_ultra_heap.alloc_unified_refill[cls] = 0;
g_tiny_ultra_heap.alloc_fallback_ultrafront[cls] = 0;
}
}
}
// Optional: dump the UltraHeap front statistics once, from a destructor
// function, controlled by the environment.
// ENV: HAKMEM_TINY_ULTRA_HEAP_DUMP — unset, empty, or a value starting with
// '0' disables the dump (the default); anything else enables it.
//
// Only classes with at least one non-zero counter are printed, to stderr.
// NOTE(review): the counters are read from the __thread g_tiny_ultra_heap,
// so this presumably reports only the thread that runs the destructor —
// confirm whether cross-thread totals are expected.
static void tiny_ultra_heap_dump_stats(void) __attribute__((destructor));
static void tiny_ultra_heap_dump_stats(void)
{
    const char* flag = getenv("HAKMEM_TINY_ULTRA_HEAP_DUMP");
    int want_dump = (flag != NULL) && (flag[0] != '\0') && (flag[0] != '0');
    if (!want_dump) {
        return;
    }

    uint64_t hits[TINY_NUM_CLASSES] = {0};
    uint64_t refills[TINY_NUM_CLASSES] = {0};
    uint64_t fallbacks[TINY_NUM_CLASSES] = {0};

    // Snapshot without resetting the live counters.
    tiny_ultra_heap_stats_snapshot(hits, refills, fallbacks, 0);

    fprintf(stderr, "[ULTRA_HEAP_STATS] class hit refill fallback\n");

    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        if (hits[cls] == 0 && refills[cls] == 0 && fallbacks[cls] == 0) {
            continue;  // skip classes that saw no activity
        }
        unsigned long long h = (unsigned long long)hits[cls];
        unsigned long long r = (unsigned long long)refills[cls];
        unsigned long long f = (unsigned long long)fallbacks[cls];
        fprintf(stderr, " C%d: %llu %llu %llu\n", cls, h, r, f);
    }
}
#endif // HAKMEM_TINY_ULTRA_HEAP