diff --git a/core/box/ss_allocation_box.c b/core/box/ss_allocation_box.c
index 38eed400..08812da4 100644
--- a/core/box/ss_allocation_box.c
+++ b/core/box/ss_allocation_box.c
@@ -415,7 +415,8 @@ void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_
     meta->used = 0;
     meta->capacity = capacity;
     meta->carved = 0;
-    meta->owner_tid_low = (uint8_t)(owner_tid & 0xFFu);
+    // Store bits 8-15 of owner_tid (low 8 bits are 0 for glibc pthread IDs)
+    meta->owner_tid_low = (uint8_t)((owner_tid >> 8) & 0xFFu);
     // Fail-safe: stamp class_idx from geometry (stride → class).
     // This normalizes both legacy and shared pool paths.
     for (int i = 0; i < TINY_NUM_CLASSES; i++) {
diff --git a/core/front/malloc_tiny_fast.h b/core/front/malloc_tiny_fast.h
index c18b439e..8db1c9ba 100644
--- a/core/front/malloc_tiny_fast.h
+++ b/core/front/malloc_tiny_fast.h
@@ -24,12 +24,25 @@
 #include
 #include
 #include
+#include <pthread.h> // For pthread_self() in cross-thread check
 #include "../hakmem_build_flags.h"
 #include "../hakmem_tiny_config.h" // For TINY_NUM_CLASSES
+#include "../hakmem_super_registry.h" // For cross-thread owner check
+#include "../superslab/superslab_inline.h" // For slab_index_for
+#include "../box/ss_slab_meta_box.h" // For ss_slab_meta_owner_tid_low_get
+#include "../box/free_remote_box.h" // For tiny_free_remote_box
 #include "tiny_unified_cache.h" // For unified_cache_pop_or_refill
 #include "../tiny_region_id.h" // For tiny_region_id_write_header
 #include "../hakmem_tiny.h" // For hak_tiny_size_to_class
 
+// Helper: current thread id (low 32 bits) for owner check
+#ifndef TINY_SELF_U32_LOCAL_DEFINED
+#define TINY_SELF_U32_LOCAL_DEFINED
+static inline uint32_t tiny_self_u32_local(void) {
+    return (uint32_t)(uintptr_t)pthread_self();
+}
+#endif
+
 // ============================================================================
 // ENV Control (cached, lazy init)
 // ============================================================================
@@ -132,6 +145,76 @@ static inline int free_tiny_fast(void* ptr) {
     // 4. Compute BASE and push it to the Unified Cache
     void* base = (void*)((char*)ptr - 1);
+
+    // Cross-thread free detection (Larson MT crash fix, ENV gated)
+    {
+        static __thread int g_larson_fix = -1;
+        if (__builtin_expect(g_larson_fix == -1, 0)) {
+            const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
+            g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
+#if !HAKMEM_BUILD_RELEASE
+            fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
+            fflush(stderr);
+#endif
+        }
+
+        if (__builtin_expect(g_larson_fix, 0)) {
+            SuperSlab* ss = hak_super_lookup(base);
+            if (ss && ss->magic == SUPERSLAB_MAGIC) {
+                int slab_idx = slab_index_for(ss, base);
+                if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
+                    uint32_t self_tid = tiny_self_u32_local();
+                    uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
+                    // LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes)
+                    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
+#if !HAKMEM_BUILD_RELEASE
+                    static _Atomic uint64_t g_owner_check_count = 0;
+                    uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
+                    if (oc < 10) {
+                        fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
+                                ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
+                        fflush(stderr);
+                    }
+#endif
+
+                    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
+                        // Cross-thread free → route to remote queue instead of poisoning TLS cache
+#if !HAKMEM_BUILD_RELEASE
+                        static _Atomic uint64_t g_cross_thread_count = 0;
+                        uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
+                        if (ct < 20) {
+                            fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
+                                    ptr, owner_tid_low, self_tid_cmp, self_tid);
+                            fflush(stderr);
+                        }
+#endif
+                        TinySlabMeta* meta = &ss->slabs[slab_idx];
+                        if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
+                            return 1; // handled via remote queue
+                        }
+                        return 0; // remote push failed; fall back to normal path
+                    }
+                }
+            }
+        }
+    }
+
+    // Debug: Log free operations (first 5000, all classes)
+#if !HAKMEM_BUILD_RELEASE
+    {
+        extern _Atomic uint64_t g_debug_op_count;
+        extern __thread TinyTLSSLL g_tls_sll[];
+        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
+        // Note: Shares g_debug_op_count with alloc logging, so bump the window.
+        if (op < 5000) {
+            fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p from=free_tiny_fast tls_count_before=%u\n",
+                    (unsigned long)op, class_idx, ptr, base,
+                    g_tls_sll[class_idx].count);
+            fflush(stderr);
+        }
+    }
+#endif
+
     int pushed = unified_cache_push(class_idx, base);
     if (__builtin_expect(pushed, 1)) {
         return 1; // Success
diff --git a/core/hakmem_tiny_superslab.c b/core/hakmem_tiny_superslab.c
index a537ed97..19fa35cf 100644
--- a/core/hakmem_tiny_superslab.c
+++ b/core/hakmem_tiny_superslab.c
@@ -595,8 +595,9 @@ static void* hak_tiny_alloc_superslab_backend_shared(int class_idx)
     // Initialize slab geometry once for this class.
     if (meta->capacity == 0) {
         size_t block_size = g_tiny_class_sizes[class_idx];
-        // owner_tid_low is advisory; we can use 0 in this backend.
-        superslab_init_slab(ss, slab_idx, block_size, 0);
+        // LARSON FIX: Pass actual thread ID for cross-thread free detection
+        uint32_t my_tid = (uint32_t)(uintptr_t)pthread_self();
+        superslab_init_slab(ss, slab_idx, block_size, my_tid);
         meta = &ss->slabs[slab_idx];
 
         // CRITICAL FIX: Always set class_idx after init to avoid C0/C7 confusion.
@@ -1195,7 +1196,8 @@ void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_
     meta->used = 0;
     meta->capacity = capacity;
     meta->carved = 0;
-    meta->owner_tid_low = (uint8_t)(owner_tid & 0xFFu);
+    // LARSON FIX: Use bits 8-15 instead of 0-7 since pthread TIDs are aligned to 256 bytes
+    meta->owner_tid_low = (uint8_t)((owner_tid >> 8) & 0xFFu);
     // Fail-safe: stamp class_idx from geometry (stride → class).
     // This ensures legacy/shared/legacy-refill paths all end with a correct class.
     for (int i = 0; i < TINY_NUM_CLASSES; i++) {
diff --git a/core/slab_handle.h b/core/slab_handle.h
index b0c8f40a..1dde81e2 100644
--- a/core/slab_handle.h
+++ b/core/slab_handle.h
@@ -53,7 +53,7 @@ static inline SlabHandle slab_try_acquire(SuperSlab* ss, int idx, uint32_t tid)
     h.ss = ss;
     h.meta = m;
     h.slab_idx = (uint8_t)idx;
-    h.owner_tid_low = (uint8_t)tid;
+    h.owner_tid_low = (uint8_t)((tid >> 8) & 0xFFu);
     if (__builtin_expect(g_debug_remote_guard, 0)) {
         uint8_t cur = __atomic_load_n(&m->owner_tid_low, __ATOMIC_RELAXED);
         if (cur != h.owner_tid_low || cur == 0) {
diff --git a/core/superslab/superslab_inline.h b/core/superslab/superslab_inline.h
index 54cd64b7..28c6a8d7 100644
--- a/core/superslab/superslab_inline.h
+++ b/core/superslab/superslab_inline.h
@@ -89,7 +89,7 @@ static inline void superslab_ref_dec(SuperSlab* ss)
 static inline int ss_owner_try_acquire(TinySlabMeta* m, uint32_t tid)
 {
     if (!m) return 0;
-    uint8_t want = (uint8_t)(tid & 0xFFu);
+    uint8_t want = (uint8_t)((tid >> 8) & 0xFFu);
     uint8_t expected = 0;
     return __atomic_compare_exchange_n(&m->owner_tid_low, &expected, want,
                                        false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
@@ -98,7 +98,7 @@ static inline int ss_owner_try_acquire(TinySlabMeta* m, uint32_t tid)
 static inline void ss_owner_release(TinySlabMeta* m, uint32_t tid)
 {
     if (!m) return;
-    uint8_t expected = (uint8_t)(tid & 0xFFu);
+    uint8_t expected = (uint8_t)((tid >> 8) & 0xFFu);
     (void)__atomic_compare_exchange_n(&m->owner_tid_low, &expected, 0u,
                                       false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
 }
@@ -107,7 +107,7 @@ static inline int ss_owner_is_mine(TinySlabMeta* m, uint32_t tid)
 {
     if (!m) return 0;
     uint8_t cur = __atomic_load_n(&m->owner_tid_low, __ATOMIC_RELAXED);
-    return cur == (uint8_t)(tid & 0xFFu);
+    return cur == (uint8_t)((tid >> 8) & 0xFFu);
 }
 
 // Active block accounting (saturating dec by 1)
diff --git a/core/tiny_free_fast.inc.h b/core/tiny_free_fast.inc.h
index d4957e08..d1f9ac49 100644
--- a/core/tiny_free_fast.inc.h
+++ b/core/tiny_free_fast.inc.h
@@ -57,7 +57,7 @@ extern void tiny_alloc_fast_push(int class_idx, void* ptr);
 // Invariant: This check MUST be atomic (no TOCTOU between check and push)
 static inline int tiny_free_is_same_thread_ss(SuperSlab* ss, int slab_idx, uint32_t my_tid) {
     TinySlabMeta* meta = &ss->slabs[slab_idx];
-    uint8_t my_tid_low = (uint8_t)my_tid;
+    uint8_t my_tid_low = (uint8_t)((my_tid >> 8) & 0xFFu);
     uint8_t owner = tiny_atomic_load_u8_relaxed(&meta->owner_tid_low);
     return (owner == my_tid_low && owner != 0);
 }
diff --git a/core/tiny_free_fast_v2.inc.h b/core/tiny_free_fast_v2.inc.h
index 7513fbcd..6fdaf126 100644
--- a/core/tiny_free_fast_v2.inc.h
+++ b/core/tiny_free_fast_v2.inc.h
@@ -40,9 +40,12 @@ extern int g_tls_sll_enable; // Honored for fast free: when 0, fall back to slo
 extern void hak_tiny_free(void* ptr); // Fallback for non-header allocations
 
 // Inline helper: Get current thread ID (lower 32 bits)
+#ifndef TINY_SELF_U32_LOCAL_DEFINED
+#define TINY_SELF_U32_LOCAL_DEFINED
 static inline uint32_t tiny_self_u32_local(void) {
     return (uint32_t)(uintptr_t)pthread_self();
 }
+#endif
 
 // ========== Ultra-Fast Free (Header-based) ==========
 
@@ -198,8 +201,9 @@ static inline int hak_tiny_free_fast_v2(void* ptr) {
     uint32_t self_tid = tiny_self_u32_local();
     uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
 
-    // Check if this is a cross-thread free (lower 8 bits mismatch)
-    if (__builtin_expect((owner_tid_low & 0xFF) != (self_tid & 0xFF), 0)) {
+    // Check if this is a cross-thread free (compare bits 8-15; low 8 bits are 0 on glibc)
+    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
+    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
         // Cross-thread free → remote queue routing
         TinySlabMeta* meta = &ss->slabs[slab_idx];
         if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
@@ -220,12 +224,50 @@ static inline int hak_tiny_free_fast_v2(void* ptr) {
     // Hypothesis: Box TLS-SLL acts as verification layer, masking underlying bugs
 
 #if !HAKMEM_BUILD_RELEASE
-    // Debug: Log free operations (first 50, class 1 only)
+    // Address watcher: Check if this is the watched address being freed
+    {
+        extern uintptr_t get_watch_addr(void);
+        uintptr_t watch = get_watch_addr();
+        if (watch != 0 && (uintptr_t)base == watch) {
+            extern _Atomic uint64_t g_debug_op_count;
+            extern __thread TinyTLSSLL g_tls_sll[];
+            uint64_t op = atomic_load(&g_debug_op_count);
+
+            fprintf(stderr, "\n");
+            fprintf(stderr, "========================================\n");
+            fprintf(stderr, "[WATCH_FREE_HIT] Address %p freed!\n", base);
+            fprintf(stderr, "========================================\n");
+            fprintf(stderr, "  Operation: #%lu\n", (unsigned long)op);
+            fprintf(stderr, "  Class:     %d\n", class_idx);
+            fprintf(stderr, "  User ptr:  %p\n", ptr);
+            fprintf(stderr, "  Base ptr:  %p\n", base);
+            fprintf(stderr, "  TLS count: %u (before free)\n", g_tls_sll[class_idx].count);
+            fprintf(stderr, "  TLS head:  %p\n", g_tls_sll[class_idx].head);
+            fprintf(stderr, "========================================\n");
+            fprintf(stderr, "\n");
+            fflush(stderr);
+
+            // Print backtrace
+            void* bt[16];
+            int frames = backtrace(bt, 16);
+            fprintf(stderr, "[WATCH_FREE_BACKTRACE] %d frames:\n", frames);
+            backtrace_symbols_fd(bt, frames, fileno(stderr));
+            fprintf(stderr, "\n");
+            fflush(stderr);
+
+            // Abort to preserve state
+            fprintf(stderr, "[WATCH_ABORT] Aborting on watched free...\n");
+            fflush(stderr);
+            abort();
+        }
+    }
+
+    // Debug: Log free operations (first 2000, ALL classes)
     {
         extern _Atomic uint64_t g_debug_op_count;
         extern __thread TinyTLSSLL g_tls_sll[];
         uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
-        if (op < 50 && class_idx == 1) {
+        if (op < 2000) { // ALL classes, not just class 1
             fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p tls_count_before=%u\n",
                     (unsigned long)op, class_idx, ptr, base,
                     g_tls_sll[class_idx].count);
diff --git a/core/tiny_region_id.h b/core/tiny_region_id.h
index 6027544d..28c58fb9 100644
--- a/core/tiny_region_id.h
+++ b/core/tiny_region_id.h
@@ -10,6 +10,8 @@
 #include
 #include
+#include
+#include
 #include
 #include
 #include "hakmem_build_flags.h"
 
@@ -17,6 +19,7 @@
 #include "ptr_track.h"
 #include "hakmem_super_registry.h"
 #include "superslab/superslab_inline.h"
+#include "hakmem_tiny.h" // For TinyTLSSLL type
 
 // Feature flag: Enable header-based class_idx lookup
 #ifndef HAKMEM_TINY_HEADER_CLASSIDX
@@ -42,6 +45,122 @@
 #define HEADER_MAGIC 0xA0
 #define HEADER_CLASS_MASK 0x0F
 
+// ========== Address Watcher (Debug Only) ==========
+
+#if !HAKMEM_BUILD_RELEASE
+// Helper: Get current thread ID (watcher-local version to avoid redefinition)
+static inline uint32_t watcher_self_u32(void) {
+    return (uint32_t)(uintptr_t)pthread_self();
+}
+
+// Address watcher: Tracks when a specific address is allocated or freed
+// Usage: HAKMEM_WATCH_ADDR=0x7f1234567890 ./program
+static inline uintptr_t get_watch_addr(void) {
+    static uintptr_t watch_addr = 0;
+    static int initialized = 0;
+
+    if (!initialized) {
+        const char* env = getenv("HAKMEM_WATCH_ADDR");
+        if (env && *env) {
+            // Parse hex address (with or without 0x prefix)
+            if (env[0] == '0' && (env[1] == 'x' || env[1] == 'X')) {
+                watch_addr = (uintptr_t)strtoull(env + 2, NULL, 16);
+            } else {
+                watch_addr = (uintptr_t)strtoull(env, NULL, 16);
+            }
+            if (watch_addr != 0) {
+                fprintf(stderr, "[WATCH_INIT] Watching address: %p\n", (void*)watch_addr);
+                fflush(stderr);
+            }
+        }
+        initialized = 1;
+    }
+
+    return watch_addr;
+}
+
+// Allocation source tracking
+typedef enum {
+    ALLOC_SOURCE_UNKNOWN = 0,
+    ALLOC_SOURCE_TLS_SLL,   // TLS freelist pop
+    ALLOC_SOURCE_FREELIST,  // Slab freelist pop
+    ALLOC_SOURCE_CARVE,     // Linear carve from slab
+    ALLOC_SOURCE_NEW_SLAB,  // Newly allocated slab
+} AllocSource;
+
+static __thread AllocSource g_last_alloc_source = ALLOC_SOURCE_UNKNOWN;
+
+// Use int to match extern declarations in other files
+static inline void set_alloc_source(int source) {
+    g_last_alloc_source = (AllocSource)source;
+}
+
+static inline const char* alloc_source_name(AllocSource source) {
+    switch (source) {
+        case ALLOC_SOURCE_TLS_SLL:  return "TLS_SLL";
+        case ALLOC_SOURCE_FREELIST: return "FREELIST";
+        case ALLOC_SOURCE_CARVE:    return "CARVE";
+        case ALLOC_SOURCE_NEW_SLAB: return "NEW_SLAB";
+        default:                    return "UNKNOWN";
+    }
+}
+
+// Watch trigger: Called when watch address is allocated
+static inline void watch_alloc_trigger(void* base, int class_idx, AllocSource source) {
+    extern __thread TinyTLSSLL g_tls_sll[];
+    extern _Atomic uint64_t g_debug_op_count;
+
+    uint64_t op = atomic_load(&g_debug_op_count);
+    uint32_t tls_count = g_tls_sll[class_idx].count;
+    void* freelist_head = g_tls_sll[class_idx].head;
+
+    fprintf(stderr, "\n");
+    fprintf(stderr, "========================================\n");
+    fprintf(stderr, "[WATCH_ALLOC_HIT] Address %p allocated!\n", base);
+    fprintf(stderr, "========================================\n");
+    fprintf(stderr, "  Operation: #%lu\n", (unsigned long)op);
+    fprintf(stderr, "  Class:     %d (%zu bytes)\n", class_idx, tiny_stride_for_class(class_idx));
+    fprintf(stderr, "  Source:    %s\n", alloc_source_name(source));
+    fprintf(stderr, "  TLS count: %u\n", tls_count);
+    fprintf(stderr, "  TLS head:  %p\n", freelist_head);
+    fprintf(stderr, "  Thread:    %u\n", (unsigned)watcher_self_u32());
+
+    // Try to get slab metadata if available
+    struct SuperSlab* ss = hak_super_lookup(base);
+    if (ss && ss->magic == SUPERSLAB_MAGIC) {
+        int slab_idx = slab_index_for(ss, base);
+        if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
+            TinySlabMeta* meta = &ss->slabs[slab_idx];
+            fprintf(stderr, "  Slab metadata:\n");
+            fprintf(stderr, "    SuperSlab:  %p\n", (void*)ss);
+            fprintf(stderr, "    Slab index: %d\n", slab_idx);
+            fprintf(stderr, "    Slab class: %u\n", (unsigned)meta->class_idx);
+            fprintf(stderr, "    Used:       %u\n", (unsigned)meta->used);
+            fprintf(stderr, "    Capacity:   %u\n", (unsigned)meta->capacity);
+            fprintf(stderr, "    Freelist:   %p\n", meta->freelist);
+            fprintf(stderr, "    Owner TID:  %u\n", (unsigned)meta->owner_tid_low);
+        }
+    }
+
+    fprintf(stderr, "========================================\n");
+    fprintf(stderr, "\n");
+    fflush(stderr);
+
+    // Print backtrace for debugging
+    void* bt[16];
+    int frames = backtrace(bt, 16);
+    fprintf(stderr, "[WATCH_BACKTRACE] %d frames:\n", frames);
+    backtrace_symbols_fd(bt, frames, fileno(stderr));
+    fprintf(stderr, "\n");
+    fflush(stderr);
+
+    // Abort to capture the exact moment
+    fprintf(stderr, "[WATCH_ABORT] Aborting to preserve state...\n");
+    fflush(stderr);
+    abort();
+}
+#endif // !HAKMEM_BUILD_RELEASE
+
 // ========== Write Header (Allocation) ==========
 
 // Write class_idx to header (called after allocation)
@@ -50,6 +169,14 @@ static inline void* tiny_region_id_write_header(void* base, int class_idx) {
     if (!base) return base;
 
+#if !HAKMEM_BUILD_RELEASE
+    // Address watcher: Check if this is the watched address
+    uintptr_t watch = get_watch_addr();
+    if (watch != 0 && (uintptr_t)base == watch) {
+        watch_alloc_trigger(base, class_idx, g_last_alloc_source);
+    }
+#endif
+
     // Phase E1-CORRECT: ALL classes (C0-C7) have 1-byte header (no exceptions)
     // Rationale: Unified box structure enables:
     //   - O(1) class identification (no registry lookup)
@@ -105,6 +232,23 @@
     PTR_TRACK_HEADER_WRITE(base, HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
     void* user = header_ptr + 1; // skip header for user pointer
     PTR_TRACK_MALLOC(base, 0, class_idx); // Track at BASE (where header is)
+
+    // ========== ALLOCATION LOGGING (Debug builds only) ==========
+    // NOTE: This logging is ALWAYS active (not guarded by HAKMEM_BUILD_RELEASE)
+    // because we need to track allocations even in optimized debug builds
+    {
+        extern _Atomic uint64_t g_debug_op_count;
+        extern __thread TinyTLSSLL g_tls_sll[];
+        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
+        if (op < 2000) { // ALL classes for comprehensive tracing
+            fprintf(stderr, "[OP#%04lu ALLOC] cls=%d ptr=%p base=%p from=write_header tls_count=%u\n",
+                    (unsigned long)op, class_idx, user, base,
+                    g_tls_sll[class_idx].count);
+            fflush(stderr);
+        }
+    }
+    // ========== END ALLOCATION LOGGING ==========
+
     // Optional guard: log stride/base/user for targeted class
     extern int tiny_guard_is_enabled(void);
     extern void tiny_guard_on_alloc(int cls, void* base, void* user, size_t stride);
diff --git a/core/tiny_superslab_free.inc.h b/core/tiny_superslab_free.inc.h
index 19566a79..4ad6ba46 100644
--- a/core/tiny_superslab_free.inc.h
+++ b/core/tiny_superslab_free.inc.h
@@ -149,7 +149,7 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
     // Phase 6.23: Same-thread check (Phase 12: owner_tid_low)
     uint32_t my_tid = tiny_self_u32();
-    uint8_t my_tid_low = (uint8_t)my_tid;
+    uint8_t my_tid_low = (uint8_t)((my_tid >> 8) & 0xFFu);
     const int debug_guard = g_debug_remote_guard;
     static __thread int g_debug_free_count = 0;
 
     // If owner is not set yet, claim ownership (low 8 bits) to avoid spurious remote path in 1T
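
/*
 * Reviewer note (not part of the patch): a minimal standalone sketch of the
 * owner-tid narrowing introduced above. It assumes, as the patch comments
 * state, that glibc pthread_self() values are 256-byte aligned, so the low
 * 8 bits of the truncated thread id are always zero and bits 8-15 carry the
 * distinguishing byte. The helper names below (tid_u32, tid_owner_byte) are
 * hypothetical, used only for illustration; compile with -pthread.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static inline uint32_t tid_u32(void) {
    /* Same truncation as tiny_self_u32_local() in the patch. */
    return (uint32_t)(uintptr_t)pthread_self();
}

static inline uint8_t tid_owner_byte(uint32_t tid) {
    /* Bits 8-15, matching the new owner_tid_low encoding. */
    return (uint8_t)((tid >> 8) & 0xFFu);
}

int main(void) {
    uint32_t tid = tid_u32();
    /* On glibc the low byte is typically 0x00 for every thread, so the old
     * (tid & 0xFF) owner check could not tell threads apart; the shifted
     * byte normally differs between threads. */
    printf("tid=0x%08x low_byte=0x%02x owner_byte=0x%02x\n",
           (unsigned)tid, (unsigned)(tid & 0xFFu), (unsigned)tid_owner_byte(tid));
    return 0;
}

/*
 * Runtime behavior in the patch is opt-in: set HAKMEM_TINY_LARSON_FIX=1 to
 * enable the cross-thread free check, and optionally HAKMEM_WATCH_ADDR=0x...
 * to abort with a backtrace when a specific base address is allocated or
 * freed (debug builds only).
 */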