Phase 3d-B: TLS Cache Merge - Unified g_tls_sll[] structure (+12-18% expected)

Merge separate g_tls_sll_head[] and g_tls_sll_count[] arrays into unified
TinyTLSSLL struct to improve L1D cache locality. Expected performance gain:
+12-18% from fewer cache-line splits (two loads reduced to one per operation).

Changes:
- core/hakmem_tiny.h: Add TinyTLSSLL type (16B aligned, head+count+pad)
- core/hakmem_tiny.c: Replace separate arrays with g_tls_sll[8]
- core/box/tls_sll_box.h: Update Box API (13 sites) for unified access
- Updated 32+ files: All g_tls_sll_head[i] → g_tls_sll[i].head
- Updated 32+ files: All g_tls_sll_count[i] → g_tls_sll[i].count
- core/hakmem_tiny_integrity.h: Unified canary guards
- core/box/integrity_box.c: Simplified canary validation
- Makefile: Added core/box/tiny_sizeclass_hist_box.o to link

Build: PASS (10K ops sanity test)
Warnings: Only pre-existing LTO type mismatches (unrelated)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-11-20 07:32:30 +09:00
parent 38552c3f39
commit 9b0d746407
83 changed files with 7509 additions and 259 deletions

View File

@ -34,9 +34,8 @@
#include "../tiny_debug_ring.h"
#include "tiny_next_ptr_box.h"
// External TLS SLL state (defined in hakmem_tiny.c or equivalent)
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
// Phase 3d-B: Unified TLS SLL (defined in hakmem_tiny.c)
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
extern int g_tls_sll_class_mask; // bit i=1 → SLL allowed for class i
// ========== Debug guard ==========
@ -108,7 +107,7 @@ static inline bool tls_sll_push(int class_idx, void* ptr, uint32_t capacity)
#endif
// Capacity check BEFORE any writes.
uint32_t cur = g_tls_sll_count[class_idx];
uint32_t cur = g_tls_sll[class_idx].count;
if (!unlimited && cur >= capacity) {
return false;
}
@ -154,10 +153,10 @@ static inline bool tls_sll_push(int class_idx, void* ptr, uint32_t capacity)
#if !HAKMEM_BUILD_RELEASE
// Optional double-free detection: scan a bounded prefix of the list.
{
void* scan = g_tls_sll_head[class_idx];
void* scan = g_tls_sll[class_idx].head;
uint32_t scanned = 0;
const uint32_t limit = (g_tls_sll_count[class_idx] < 64)
? g_tls_sll_count[class_idx]
const uint32_t limit = (g_tls_sll[class_idx].count < 64)
? g_tls_sll[class_idx].count
: 64;
while (scan && scanned < limit) {
if (scan == ptr) {
@ -176,9 +175,9 @@ static inline bool tls_sll_push(int class_idx, void* ptr, uint32_t capacity)
#endif
// Link new node to current head via Box API (offset is handled inside tiny_nextptr).
PTR_NEXT_WRITE("tls_push", class_idx, ptr, 0, g_tls_sll_head[class_idx]);
g_tls_sll_head[class_idx] = ptr;
g_tls_sll_count[class_idx] = cur + 1;
PTR_NEXT_WRITE("tls_push", class_idx, ptr, 0, g_tls_sll[class_idx].head);
g_tls_sll[class_idx].head = ptr;
g_tls_sll[class_idx].count = cur + 1;
return true;
}
@ -197,15 +196,15 @@ static inline bool tls_sll_pop(int class_idx, void** out)
}
atomic_fetch_add(&g_integrity_check_class_bounds, 1);
void* base = g_tls_sll_head[class_idx];
void* base = g_tls_sll[class_idx].head;
if (!base) {
return false;
}
// Sentinel guard: remote sentinel must never be in TLS SLL.
if (__builtin_expect((uintptr_t)base == TINY_REMOTE_SENTINEL, 0)) {
g_tls_sll_head[class_idx] = NULL;
g_tls_sll_count[class_idx] = 0;
g_tls_sll[class_idx].head = NULL;
g_tls_sll[class_idx].count = 0;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr,
"[TLS_SLL_POP] Remote sentinel detected at head; SLL reset (cls=%d)\n",
@ -251,8 +250,8 @@ static inline bool tls_sll_pop(int class_idx, void** out)
abort();
#else
// In release, fail-safe: drop list.
g_tls_sll_head[class_idx] = NULL;
g_tls_sll_count[class_idx] = 0;
g_tls_sll[class_idx].head = NULL;
g_tls_sll[class_idx].count = 0;
{
static int g_sll_ring_en = -1;
if (__builtin_expect(g_sll_ring_en == -1, 0)) {
@ -285,9 +284,9 @@ static inline bool tls_sll_pop(int class_idx, void** out)
}
#endif
g_tls_sll_head[class_idx] = next;
if (g_tls_sll_count[class_idx] > 0) {
g_tls_sll_count[class_idx]--;
g_tls_sll[class_idx].head = next;
if (g_tls_sll[class_idx].count > 0) {
g_tls_sll[class_idx].count--;
}
// Clear next inside popped node to avoid stale-chain issues.
@ -314,7 +313,7 @@ static inline uint32_t tls_sll_splice(int class_idx,
return 0;
}
uint32_t cur = g_tls_sll_count[class_idx];
uint32_t cur = g_tls_sll[class_idx].count;
if (cur >= capacity) {
return 0;
}
@ -361,10 +360,10 @@ static inline uint32_t tls_sll_splice(int class_idx,
// Link tail to existing head and install new head.
tls_sll_debug_guard(class_idx, tail, "splice_tail");
PTR_NEXT_WRITE("tls_splice_link", class_idx, tail, 0, g_tls_sll_head[class_idx]);
PTR_NEXT_WRITE("tls_splice_link", class_idx, tail, 0, g_tls_sll[class_idx].head);
g_tls_sll_head[class_idx] = chain_head;
g_tls_sll_count[class_idx] = cur + moved;
g_tls_sll[class_idx].head = chain_head;
g_tls_sll[class_idx].count = cur + moved;
return moved;
}