Files
hakmem/core/tiny_ultra_fast.inc.h
Moe Charm (CI) 9b0d746407 Phase 3d-B: TLS Cache Merge - Unified g_tls_sll[] structure (+12-18% expected)
Merge separate g_tls_sll_head[] and g_tls_sll_count[] arrays into unified
TinyTLSSLL struct to improve L1D cache locality. Expected performance gain:
+12-18% from reducing cache line splits (2 loads → 1 load per operation).

Changes:
- core/hakmem_tiny.h: Add TinyTLSSLL type (16B aligned, head+count+pad)
- core/hakmem_tiny.c: Replace separate arrays with g_tls_sll[8]
- core/box/tls_sll_box.h: Update Box API (13 sites) for unified access
- Updated 32+ files: All g_tls_sll_head[i] → g_tls_sll[i].head
- Updated 32+ files: All g_tls_sll_count[i] → g_tls_sll[i].count
- core/hakmem_tiny_integrity.h: Unified canary guards
- core/box/integrity_box.c: Simplified canary validation
- Makefile: Added core/box/tiny_sizeclass_hist_box.o to link

Build:  PASS (10K ops sanity test)
Warnings: Only pre-existing LTO type mismatches (unrelated)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-20 07:32:30 +09:00

103 lines
3.8 KiB
C

#ifndef TINY_ULTRA_FAST_INC_H
#define TINY_ULTRA_FAST_INC_H
// ============================================================================
// HAKMEM Ultra Fast Path
// ============================================================================
// Phase E5: ultra-lightweight fast path on par with system malloc
//
// Goals:
// - Turn FastCache / SFC / statistics / profiling all OFF
// - Simple implementation with a single TLS SLL layer only
// - Complete alloc/free in 8-10 instructions
//
// Expectations:
// - Performance on par with system malloc (90M+ ops/s)
// - Quantify the cost of the "smart features"
// ============================================================================
#include "hakmem_tiny.h"
// Per-thread (__thread) array of TinyTLSSLL entries, indexed by tiny size
// class (TINY_NUM_CLASSES entries). This is a declaration only: per the
// original note, the definition lives in hakmem_tiny.c and the TinyTLSSLL
// type in hakmem_tiny.h.
// Phase 3d-B (TLS Cache Merge): a single unified array replaces the former
// separate head/count arrays; the code below uses each entry's .head
// (top of the free list) and .count members.
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
// ============================================================================
// Ultra-Fast Allocation (8-10 instructions)
// ============================================================================
/*
 * Pop one block from the calling thread's free list (g_tls_sll) for the
 * size class that fits `size`.
 *
 * Size classes (per the original note, mirroring g_tiny_class_sizes):
 *   C0=8B, C1=16B, C2=32B, C3=64B, C4=128B, C5=256B, C6=512B, C7=1024B
 *
 * @param size  Requested size in bytes. A request of 0 is served from
 *              class 0, exactly like a request of 1.
 * @return      Pointer one byte past the popped block's base (the byte at
 *              the base is the header slot read by the header-based free
 *              path), or NULL when size > 1024 or the class's list is
 *              empty (miss -> caller takes the slow path).
 *
 * NOTE(review): this function does not write the header byte at base[0];
 * after the pop it still holds the low byte of the link pointer that was
 * stored there. Presumably a caller or another layer rewrites the header
 * before the block reaches tiny_free_ultra_fast -- confirm.
 */
static inline void* tiny_alloc_ultra_fast(size_t size) {
    // Outside the tiny range: not handled here.
    if (size > 1024) {
        return NULL;
    }

    // 1. Size -> class by direct comparison (no lookup table).
    //    size == 0 needs no special case: it satisfies size <= 8 -> class 0.
    int class_idx;
    if (size <= 64) {
        if (size <= 16) {
            class_idx = (size <= 8) ? 0 : 1;
        } else {
            class_idx = (size <= 32) ? 2 : 3;
        }
    } else {
        if (size <= 256) {
            class_idx = (size <= 128) ? 4 : 5;
        } else {
            class_idx = (size <= 512) ? 6 : 7;  // 512 < size <= 1024
        }
    }

    // 2. Pop the list head. Phase 3d-B: head and count are members of the
    //    same g_tls_sll entry.
    void* base = g_tls_sll[class_idx].head;
    if (base == NULL) {
        return NULL;  // empty list -> miss
    }

    // The first pointer-sized word of a free block is the link to the next
    // free block; it becomes the new head.
    g_tls_sll[class_idx].head = *(void**)base;
    --g_tls_sll[class_idx].count;

    // 3. Hand out the USER pointer: BASE + 1 skips the 1-byte header slot
    //    (required by the Phase 7 header-based fast free).
    return (char*)base + 1;
}
// ============================================================================
// Ultra-Fast Free (6-8 instructions)
// ============================================================================
/*
 * Push a block back onto the calling thread's free list (g_tls_sll) for the
 * class recorded in its 1-byte header.
 *
 * @param ptr  USER pointer, i.e. block base + 1. The byte at ptr - 1 is
 *             read as the header; only its low 4 bits are used (as the
 *             class index), the upper 4 bits are ignored here.
 * @return     1 when the block was pushed; 0 when ptr is NULL or the class
 *             index from the header is >= TINY_NUM_CLASSES (the caller is
 *             expected to route such pointers to the slow path).
 *
 * NOTE(review): the link pointer is stored at the block base, which is the
 * same byte the header was just read from, so the header is overwritten
 * while the block sits on the list. Presumably it is restored when the
 * block is handed out again -- confirm.
 */
static inline int tiny_free_ultra_fast(void* ptr) {
    if (ptr == NULL) {
        return 0;
    }

    // 1-3. USER -> BASE, then decode the class from the header byte that
    //      lives at the base (Phase 7 header-based free).
    uint8_t* block = (uint8_t*)ptr - 1;
    uint8_t cls = (uint8_t)(*block & 0x0F);

    // Minimal safety check: the 4-bit field can encode values that are not
    // valid class indices.
    if (cls >= TINY_NUM_CLASSES) {
        return 0;
    }

    // 4. Push: link the block to the current head, then make it the head.
    //    Phase 3d-B: head and count are members of the same g_tls_sll entry.
    *(void**)block = g_tls_sll[cls].head;
    g_tls_sll[cls].head = (void*)block;
    ++g_tls_sll[cls].count;

    return 1;
}
// ============================================================================
// Ultra Mode Entry Point - TLS SLL Only (no fallback)
// ============================================================================
// NOTE: Ultra mode expects TLS SLL to be warm. If miss, returns NULL.
// Caller (wrapper) will fallback to full tiny_alloc_fast path.
/*
 * Ultra-mode allocation entry point: forwards to tiny_alloc_ultra_fast()
 * and returns its result unchanged. There is no fallback inside this
 * function; a NULL result is returned to the caller as-is.
 *
 * @param size  Requested size in bytes.
 * @return      Whatever tiny_alloc_ultra_fast(size) returns (NULL on miss).
 */
static inline void* tiny_alloc_fast_ultra(size_t size) {
    void* block = tiny_alloc_ultra_fast(size);
    return block;
}
/*
 * Ultra-mode free entry point: forwards to tiny_free_ultra_fast() (TLS SLL
 * push only).
 *
 * @param ptr  Pointer to release.
 *
 * NOTE(review): the status returned by tiny_free_ultra_fast() is dropped,
 * so a 0 result (block not pushed) is not visible to the caller and nothing
 * here routes the pointer elsewhere -- confirm that callers only pass
 * pointers that the fast path accepts.
 */
static inline void tiny_free_fast_ultra(void* ptr) {
    (void)tiny_free_ultra_fast(ptr);
}
#endif // TINY_ULTRA_FAST_INC_H