Phase 3d-B: TLS Cache Merge - Unified g_tls_sll[] structure (+12-18% expected)

Merge the separate g_tls_sll_head[] and g_tls_sll_count[] arrays into a single
array of TinyTLSSLL structs to improve L1D cache locality. Expected performance gain:
+12-18%, from keeping each class's head and count in the same cache line (2 loads → 1 load per operation).

Changes:
- core/hakmem_tiny.h: Add TinyTLSSLL type (16B aligned, head+count+pad)
- core/hakmem_tiny.c: Replace separate arrays with g_tls_sll[8]
- core/box/tls_sll_box.h: Update Box API (13 sites) for unified access
- Updated 32+ files: All g_tls_sll_head[i] → g_tls_sll[i].head
- Updated 32+ files: All g_tls_sll_count[i] → g_tls_sll[i].count
- core/hakmem_tiny_integrity.h: Unified canary guards
- core/box/integrity_box.c: Simplified canary validation
- Makefile: Added core/box/tiny_sizeclass_hist_box.o to link

Build:  PASS (10K ops sanity test)
Warnings: Only pre-existing LTO type mismatches (unrelated)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-11-20 07:32:30 +09:00
parent 38552c3f39
commit 9b0d746407
83 changed files with 7509 additions and 259 deletions

View File

@ -73,8 +73,8 @@ extern unsigned long long g_free_via_tls_sll[];
// - Cross-thread allocation は考慮しない (Backend が処理)
// External TLS variables (defined in hakmem_tiny.c)
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL structure
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
// External backend functions
// P0 Fix: Use appropriate refill function based on P0 status
@ -185,7 +185,7 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
uint64_t pop_call = atomic_fetch_add(&g_fast_pop_count, 1);
if (0 && class_idx == 2 && pop_call > 5840 && pop_call < 5900) {
fprintf(stderr, "[FAST_POP_C2] call=%lu cls=%d head=%p count=%u\n",
pop_call, class_idx, g_tls_sll_head[class_idx], g_tls_sll_count[class_idx]);
pop_call, class_idx, g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
fflush(stderr);
}
#endif
@ -323,7 +323,7 @@ static inline int sfc_refill_from_sll(int class_idx, int target_count) {
int want = (target_count * pct) / 100;
if (want <= 0) want = target_count / 2; // safety fallback
while (transferred < want && g_tls_sll_count[class_idx] > 0) {
while (transferred < want && g_tls_sll[class_idx].count > 0) {
// Check SFC capacity before transfer
if (g_sfc_count[class_idx] >= cap) {
break; // SFC full, stop
@ -525,8 +525,8 @@ static inline void* tiny_alloc_fast(size_t size) {
}
// Phase 3c L1D Opt: Prefetch TLS cache head early
__builtin_prefetch(&g_tls_sll_head[class_idx], 0, 3);
__builtin_prefetch(&g_tls_sll_count[class_idx], 0, 3);
// Phase 3d-B: Prefetch unified TLS SLL struct (single prefetch for both head+count)
__builtin_prefetch(&g_tls_sll[class_idx], 0, 3);
// Phase 22: Lazy per-class init (on first use)
lazy_init_class(class_idx);
@ -554,7 +554,7 @@ static inline void* tiny_alloc_fast(size_t size) {
if (0 && call_num > 14250 && call_num < 14280) {
fprintf(stderr, "[TINY_ALLOC] call=%lu size=%zu class=%d sll_head[%d]=%p count=%u\n",
call_num, size, class_idx, class_idx,
g_tls_sll_head[class_idx], g_tls_sll_count[class_idx]);
g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
fflush(stderr);
}
#endif
@ -672,8 +672,8 @@ typedef struct {
static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
TinyAllocFastStats stats = {
.class_idx = class_idx,
.head = g_tls_sll_head[class_idx],
.count = g_tls_sll_count[class_idx]
.head = g_tls_sll[class_idx].head,
.count = g_tls_sll[class_idx].count
};
return stats;
}
@ -681,8 +681,8 @@ static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
// Reset TLS freelist (for testing/benchmarking)
// WARNING: This leaks memory! Only use in controlled test environments.
static inline void tiny_alloc_fast_reset(int class_idx) {
g_tls_sll_head[class_idx] = NULL;
g_tls_sll_count[class_idx] = 0;
g_tls_sll[class_idx].head = NULL;
g_tls_sll[class_idx].count = 0;
}
// ========== Performance Notes ==========