Fix cross-thread ownership check: Use bits 8-15 for owner_tid_low

Problem:
- TLS_SLL_PUSH_DUP crash in Larson multi-threaded benchmark
- Cross-thread frees incorrectly routed to same-thread TLS path
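- Root cause: on glibc, pthread_t is the thread's TCB base address, which is
  256-byte aligned, so its lower 8 bits are 0x00 for every thread; the old
  (tid & 0xFF) owner tag was therefore identical across threads and could
  not tell them apart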

Fix:
- Change owner_tid_low from (tid & 0xFF) to ((tid >> 8) & 0xFF)
- Bits 8-15 actually vary between threads, enabling correct detection
- Applied consistently across all ownership check locations:
  - superslab_inline.h: ss_owner_try_acquire/release/is_mine
  - slab_handle.h: slab_try_acquire
  - tiny_free_fast.inc.h: tiny_free_is_same_thread_ss
  - tiny_free_fast_v2.inc.h: cross-thread detection
  - tiny_superslab_free.inc.h: same-thread check
  - ss_allocation_box.c: slab initialization
  - hakmem_tiny_superslab.c: ownership handling

Also added:
- Address watcher debug infrastructure (tiny_region_id.h)
- Cross-thread detection in malloc_tiny_fast.h Front Gate

Test results:
- Larson 1T/2T/4T: PASS (no TLS_SLL_PUSH_DUP crash)
- random_mixed: PASS
- Performance: ~20M ops/s (regression from 48M, needs optimization)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-11-27 11:52:11 +09:00
parent 8af9123bcc
commit d8e3971dc2
9 changed files with 286 additions and 14 deletions

View File

@ -40,9 +40,12 @@ extern int g_tls_sll_enable; // Honored for fast free: when 0, fall back to slo
extern void hak_tiny_free(void* ptr); // Fallback for non-header allocations
// Thread identity helper: yields the calling thread's pthread_self() value
// narrowed to 32 bits.
// The TINY_SELF_U32_LOCAL_DEFINED guard prevents a second definition when
// this helper has already been provided (presumably by another header —
// verify against the other includes).
#ifndef TINY_SELF_U32_LOCAL_DEFINED
#define TINY_SELF_U32_LOCAL_DEFINED
static inline uint32_t tiny_self_u32_local(void) {
    // Widen the handle to a pointer-sized integer first, then keep only
    // the low 32 bits.
    const uintptr_t self_raw = (uintptr_t)pthread_self();
    return (uint32_t)self_raw;
}
#endif
// ========== Ultra-Fast Free (Header-based) ==========
@ -198,8 +201,9 @@ static inline int hak_tiny_free_fast_v2(void* ptr) {
uint32_t self_tid = tiny_self_u32_local();
uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
// Check if this is a cross-thread free (lower 8 bits mismatch)
if (__builtin_expect((owner_tid_low & 0xFF) != (self_tid & 0xFF), 0)) {
// Check if this is a cross-thread free (compare bits 8-15; low 8 bits are 0 on glibc)
uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
// Cross-thread free → remote queue routing
TinySlabMeta* meta = &ss->slabs[slab_idx];
if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
@ -220,12 +224,50 @@ static inline int hak_tiny_free_fast_v2(void* ptr) {
// Hypothesis: Box TLS-SLL acts as verification layer, masking underlying bugs
#if !HAKMEM_BUILD_RELEASE
// Debug: Log free operations (first 50, class 1 only)
// Address watcher: Check if this is the watched address being freed
{
extern uintptr_t get_watch_addr(void);
uintptr_t watch = get_watch_addr();
if (watch != 0 && (uintptr_t)base == watch) {
extern _Atomic uint64_t g_debug_op_count;
extern __thread TinyTLSSLL g_tls_sll[];
uint64_t op = atomic_load(&g_debug_op_count);
fprintf(stderr, "\n");
fprintf(stderr, "========================================\n");
fprintf(stderr, "[WATCH_FREE_HIT] Address %p freed!\n", base);
fprintf(stderr, "========================================\n");
fprintf(stderr, " Operation: #%lu\n", (unsigned long)op);
fprintf(stderr, " Class: %d\n", class_idx);
fprintf(stderr, " User ptr: %p\n", ptr);
fprintf(stderr, " Base ptr: %p\n", base);
fprintf(stderr, " TLS count: %u (before free)\n", g_tls_sll[class_idx].count);
fprintf(stderr, " TLS head: %p\n", g_tls_sll[class_idx].head);
fprintf(stderr, "========================================\n");
fprintf(stderr, "\n");
fflush(stderr);
// Print backtrace
void* bt[16];
int frames = backtrace(bt, 16);
fprintf(stderr, "[WATCH_FREE_BACKTRACE] %d frames:\n", frames);
backtrace_symbols_fd(bt, frames, fileno(stderr));
fprintf(stderr, "\n");
fflush(stderr);
// Abort to preserve state
fprintf(stderr, "[WATCH_ABORT] Aborting on watched free...\n");
fflush(stderr);
abort();
}
}
// Debug: Log free operations (first 2000, ALL classes)
{
extern _Atomic uint64_t g_debug_op_count;
extern __thread TinyTLSSLL g_tls_sll[];
uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
if (op < 50 && class_idx == 1) {
if (op < 2000) { // ALL classes, not just class 1
fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p tls_count_before=%u\n",
(unsigned long)op, class_idx, ptr, base,
g_tls_sll[class_idx].count);