Merge separate g_tls_sll_head[] and g_tls_sll_count[] arrays into unified TinyTLSSLL struct to improve L1D cache locality. Expected performance gain: +12-18% from reducing cache line splits (2 loads → 1 load per operation). Changes: - core/hakmem_tiny.h: Add TinyTLSSLL type (16B aligned, head+count+pad) - core/hakmem_tiny.c: Replace separate arrays with g_tls_sll[8] - core/box/tls_sll_box.h: Update Box API (13 sites) for unified access - Updated 32+ files: All g_tls_sll_head[i] → g_tls_sll[i].head - Updated 32+ files: All g_tls_sll_count[i] → g_tls_sll[i].count - core/hakmem_tiny_integrity.h: Unified canary guards - core/box/integrity_box.c: Simplified canary validation - Makefile: Added core/box/tiny_sizeclass_hist_box.o to link Build: ✅ PASS (10K ops sanity test) Warnings: Only pre-existing LTO type mismatches (unrelated) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
103 lines
3.8 KiB
C
103 lines
3.8 KiB
C
#ifndef TINY_ULTRA_FAST_INC_H
|
|
#define TINY_ULTRA_FAST_INC_H
|
|
|
|
// ============================================================================
|
|
// HAKMEM Ultra Fast Path
|
|
// ============================================================================
|
|
// Phase E5: ultra-lightweight fast path on par with system malloc
|
|
//
|
|
// Goals:
|
|
// - FastCache/SFC/statistics/profiling are all turned OFF
|
|
// - Simple implementation with a single TLS SLL layer only
|
|
// - alloc/free complete in 8-10 instructions
|
|
//
|
|
// Expected:
|
|
// - Performance on par with system malloc (90M+ ops/s)
|
|
// - Quantify the cost of the "smart" features
|
|
// ============================================================================
|
|
|
|
#include "hakmem_tiny.h"
|
|
|
|
// External TLS arrays (defined in hakmem_tiny.c)
|
|
// Phase 3d-B: TLS Cache Merge - Unified structure (type in hakmem_tiny.h)
|
|
// Per-thread array indexed by tiny size class; the code in this header
// touches only the .head and .count members of each entry.
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
|
|
|
|
// ============================================================================
|
|
// Ultra-Fast Allocation (8-10 instructions)
|
|
// ============================================================================
|
|
static inline void* tiny_alloc_ultra_fast(size_t size) {
|
|
// 1. Size to class (direct calculation, no LUT)
|
|
// HAKMEM Tiny classes (from g_tiny_class_sizes):
|
|
// C0=8B, C1=16B, C2=32B, C3=64B, C4=128B, C5=256B, C6=512B, C7=1024B
|
|
if (size == 0) size = 1;
|
|
if (size > 1024) return NULL; // Tiny範囲外
|
|
|
|
// Direct mapping: use BSR-style or simple branching
|
|
int cl;
|
|
if (size <= 8) cl = 0;
|
|
else if (size <= 16) cl = 1;
|
|
else if (size <= 32) cl = 2;
|
|
else if (size <= 64) cl = 3;
|
|
else if (size <= 128) cl = 4;
|
|
else if (size <= 256) cl = 5;
|
|
else if (size <= 512) cl = 6;
|
|
else cl = 7; // size <= 1024
|
|
|
|
// 2. TLS SLL pop (3-4 instructions)
|
|
// Phase 3d-B: Use unified struct (head+count in same cache line)
|
|
void* ptr = g_tls_sll[cl].head; // 1 load
|
|
if (!ptr) return NULL; // 1 branch (miss → slow path)
|
|
|
|
void* next = *(void**)ptr; // 1 load (next pointer)
|
|
g_tls_sll[cl].head = next; // 1 store
|
|
g_tls_sll[cl].count--; // 1 decrement
|
|
|
|
// 3. Return USER pointer (ptr is BASE, +1 for header)
|
|
// Phase 7 header-based fast free requires this
|
|
return (char*)ptr + 1;
|
|
}
|
|
|
|
// ============================================================================
|
|
// Ultra-Fast Free (6-8 instructions)
|
|
// ============================================================================
|
|
static inline int tiny_free_ultra_fast(void* ptr) {
|
|
if (!ptr) return 0;
|
|
|
|
// 1. Read header to get class_idx (Phase 7 header-based)
|
|
uint8_t header = *((uint8_t*)ptr - 1);
|
|
uint8_t class_idx = header & 0x0F;
|
|
|
|
// 2. Bounds check (safety - minimal overhead)
|
|
if (class_idx >= TINY_NUM_CLASSES) return 0; // Route to slow path
|
|
|
|
// 3. Convert USER → BASE
|
|
void* base = (char*)ptr - 1;
|
|
|
|
// 4. TLS SLL push (3-4 instructions)
|
|
// Phase 3d-B: Use unified struct (head+count in same cache line)
|
|
void* head = g_tls_sll[class_idx].head; // 1 load
|
|
*(void**)base = head; // 1 store (link)
|
|
g_tls_sll[class_idx].head = base; // 1 store
|
|
g_tls_sll[class_idx].count++; // 1 increment
|
|
|
|
return 1; // Success
|
|
}
|
|
|
|
// ============================================================================
|
|
// Ultra Mode Entry Point - TLS SLL Only (no fallback)
|
|
// ============================================================================
|
|
// NOTE: Ultra mode expects TLS SLL to be warm. If miss, returns NULL.
|
|
// Caller (wrapper) will fallback to full tiny_alloc_fast path.
|
|
|
|
/*
 * Ultra-mode allocation entry point.
 *
 * Forwards to tiny_alloc_ultra_fast() and returns its result unchanged,
 * including NULL on a miss; no fallback is attempted here.
 */
static inline void* tiny_alloc_fast_ultra(size_t size) {
    void* user = tiny_alloc_ultra_fast(size);
    return user;
}
|
|
|
|
/*
 * Ultra-mode free entry point.
 *
 * Forwards to tiny_free_ultra_fast() and discards its status.
 *
 * NOTE(review): a 0 status (NULL pointer or out-of-range header class) is
 * dropped here, so this function does not hand such a pointer to any other
 * path. Confirm that callers only pass pointers this path can accept.
 */
static inline void tiny_free_fast_ultra(void* ptr) {
    (void)tiny_free_ultra_fast(ptr);
}
|
|
|
|
#endif // TINY_ULTRA_FAST_INC_H
|