// tiny_region_id.h - Region-ID Direct Lookup API (Phase 7)
// Purpose: O(1) class_idx lookup from pointer (eliminates SuperSlab lookup)
// Design: Smart Headers - 1-byte class_idx embedded before each block
// Performance: 2-3 cycles (vs 100+ cycles for SuperSlab lookup)
//
// Expected Impact: 1.2M → 40-60M ops/s (30-50x improvement)

#ifndef TINY_REGION_ID_H
#define TINY_REGION_ID_H

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <stdatomic.h>
#include <execinfo.h>
#include <dlfcn.h>

#include "hakmem_build_flags.h"
#include "tiny_box_geometry.h"
#include "ptr_track.h"
#include "hakmem_super_registry.h"
#include "superslab/superslab_inline.h"
#include "hakmem_tiny.h"                      // For TinyTLSSLL type
#include "tiny_debug_api.h"                   // Guard/failfast declarations
#include "box/tiny_header_hotfull_env_box.h"  // Phase 21: Hot/cold split ENV control

// Feature flag: Enable header-based class_idx lookup
#ifndef HAKMEM_TINY_HEADER_CLASSIDX
#define HAKMEM_TINY_HEADER_CLASSIDX 0
#endif

#if HAKMEM_TINY_HEADER_CLASSIDX

// ========== Header Layout ==========
//
// Memory layout:
//   [Header: 1 byte] [User block: N bytes]
//   ^                ^
//   ptr-1            ptr (returned to user)
//
// Header format (1 byte):
//   - Bits 0-3: class_idx (0-15, only 0-7 used for Tiny)
//   - Bits 4-7: magic (0xA for validation in debug mode)
//
// Example:
//   class_idx = 3 → header = 0xA3 (debug) or 0x03 (release)

#define HEADER_MAGIC      0xA0
#define HEADER_CLASS_MASK 0x0F

// ========== Address Watcher (Debug Only) ==========

#if !HAKMEM_BUILD_RELEASE

// Helper: Get current thread ID (watcher-local version to avoid redefinition)
static inline uint32_t watcher_self_u32(void) {
    return (uint32_t)(uintptr_t)pthread_self();
}

// Address watcher: Tracks when a specific address is allocated or freed
// Usage: HAKMEM_WATCH_ADDR=0x7f1234567890 ./program
static inline uintptr_t get_watch_addr(void) {
#if !HAKMEM_BUILD_RELEASE
    static uintptr_t watch_addr = 0;
    static int initialized = 0;
    if (!initialized) {
        const char* env = getenv("HAKMEM_WATCH_ADDR");
        if (env && *env) {
            // Parse hex address (with or without 0x prefix)
            if (env[0] == '0' && (env[1] == 'x' || env[1] == 'X')) {
                watch_addr = (uintptr_t)strtoull(env + 2, NULL, 16);
            } else {
                watch_addr = (uintptr_t)strtoull(env, NULL, 16);
            }
            if (watch_addr != 0) {
                fprintf(stderr, "[WATCH_INIT] Watching address: %p\n", (void*)watch_addr);
                fflush(stderr);
            }
        }
        initialized = 1;
    }
    return watch_addr;
#else
    return 0;
#endif
}

// Allocation source tracking
typedef enum {
    ALLOC_SOURCE_UNKNOWN = 0,
    ALLOC_SOURCE_TLS_SLL,   // TLS freelist pop
    ALLOC_SOURCE_FREELIST,  // Slab freelist pop
    ALLOC_SOURCE_CARVE,     // Linear carve from slab
    ALLOC_SOURCE_NEW_SLAB,  // Newly allocated slab
} AllocSource;

static __thread AllocSource g_last_alloc_source = ALLOC_SOURCE_UNKNOWN;

// Use int to match extern declarations in other files
static inline void set_alloc_source(int source) {
    g_last_alloc_source = (AllocSource)source;
}

static inline const char* alloc_source_name(AllocSource source) {
    switch (source) {
        case ALLOC_SOURCE_TLS_SLL:  return "TLS_SLL";
        case ALLOC_SOURCE_FREELIST: return "FREELIST";
        case ALLOC_SOURCE_CARVE:    return "CARVE";
        case ALLOC_SOURCE_NEW_SLAB: return "NEW_SLAB";
        default:                    return "UNKNOWN";
    }
}

// Watch trigger: Called when watch address is allocated
static inline void watch_alloc_trigger(void* base, int class_idx, AllocSource source) {
    extern __thread TinyTLSSLL g_tls_sll[];
    extern _Atomic uint64_t g_debug_op_count;

    uint64_t op = atomic_load(&g_debug_op_count);
    uint32_t tls_count = g_tls_sll[class_idx].count;
    void* freelist_head = g_tls_sll[class_idx].head;

    fprintf(stderr, "\n");
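    // Report everything known about the hit: op counter, class/stride, allocation
    // source, TLS freelist state, then (if the registry still maps this address)
    // the owning SuperSlab's per-slab metadata.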
fprintf(stderr, "========================================\n"); fprintf(stderr, "[WATCH_ALLOC_HIT] Address %p allocated!\n", base); fprintf(stderr, "========================================\n"); fprintf(stderr, " Operation: #%lu\n", (unsigned long)op); fprintf(stderr, " Class: %d (%zu bytes)\n", class_idx, tiny_stride_for_class(class_idx)); fprintf(stderr, " Source: %s\n", alloc_source_name(source)); fprintf(stderr, " TLS count: %u\n", tls_count); fprintf(stderr, " TLS head: %p\n", freelist_head); fprintf(stderr, " Thread: %u\n", (unsigned)watcher_self_u32()); // Try to get slab metadata if available struct SuperSlab* ss = hak_super_lookup(base); if (ss && ss->magic == SUPERSLAB_MAGIC) { int slab_idx = slab_index_for(ss, base); if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) { TinySlabMeta* meta = &ss->slabs[slab_idx]; fprintf(stderr, " Slab metadata:\n"); fprintf(stderr, " SuperSlab: %p\n", (void*)ss); fprintf(stderr, " Slab index: %d\n", slab_idx); fprintf(stderr, " Slab class: %u\n", (unsigned)meta->class_idx); fprintf(stderr, " Used: %u\n", (unsigned)meta->used); fprintf(stderr, " Capacity: %u\n", (unsigned)meta->capacity); fprintf(stderr, " Freelist: %p\n", meta->freelist); fprintf(stderr, " Owner TID: %u\n", (unsigned)meta->owner_tid_low); } } fprintf(stderr, "========================================\n"); fprintf(stderr, "\n"); fflush(stderr); // Print backtrace for debugging void* bt[16]; int frames = backtrace(bt, 16); fprintf(stderr, "[WATCH_BACKTRACE] %d frames:\n", frames); backtrace_symbols_fd(bt, frames, fileno(stderr)); fprintf(stderr, "\n"); fflush(stderr); // Abort to capture the exact moment fprintf(stderr, "[WATCH_ABORT] Aborting to preserve state...\n"); fflush(stderr); abort(); } #endif // !HAKMEM_BUILD_RELEASE // ========== Write Header (Allocation) ========== // Header write mode (bench-only switch; default FULL) enum tiny_header_mode { TINY_HEADER_MODE_FULL = 0, TINY_HEADER_MODE_LIGHT = 1, TINY_HEADER_MODE_OFF = 2, }; static inline int tiny_header_mode(void) { static int g_header_mode = -1; if (__builtin_expect(g_header_mode == -1, 0)) { const char* e = getenv("HAKMEM_TINY_HEADER_MODE"); if (e && *e) { char c = e[0]; if (c == 'l' || c == 'L' || c == '1') { g_header_mode = TINY_HEADER_MODE_LIGHT; } else if (c == 'o' || c == 'O' || c == '0') { g_header_mode = TINY_HEADER_MODE_OFF; } else { g_header_mode = TINY_HEADER_MODE_FULL; } } else { // Backward compatibility: HAKMEM_TINY_WRITE_HEADER=0 behaves like "off". const char* old = getenv("HAKMEM_TINY_WRITE_HEADER"); g_header_mode = (old && *old && *old == '0') ? TINY_HEADER_MODE_OFF : TINY_HEADER_MODE_FULL; } } return g_header_mode; } // Phase 21: Cold helper for non-FULL modes and guard-enabled cases // Handles LIGHT/OFF header write policy + guard hook __attribute__((cold, noinline)) static void* tiny_region_id_write_header_slow(void* base, int class_idx, uint8_t* header_ptr) { // Header write policy (bench-only switch, default FULL) int header_mode = tiny_header_mode(); uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK)); uint8_t existing_header = *header_ptr; if (__builtin_expect(header_mode == TINY_HEADER_MODE_FULL, 1)) { *header_ptr = desired_header; PTR_TRACK_HEADER_WRITE(base, desired_header); } else if (header_mode == TINY_HEADER_MODE_LIGHT) { // Keep header consistent but avoid redundant stores. 
        if (existing_header != desired_header) {
            *header_ptr = desired_header;
            PTR_TRACK_HEADER_WRITE(base, desired_header);
        }
    } else {
        // TINY_HEADER_MODE_OFF (bench-only)
        // Only touch the header if it is clearly invalid to keep free() workable.
        uint8_t existing_magic = existing_header & 0xF0;
        if (existing_magic != HEADER_MAGIC ||
            (existing_header & HEADER_CLASS_MASK) != (desired_header & HEADER_CLASS_MASK)) {
            *header_ptr = desired_header;
            PTR_TRACK_HEADER_WRITE(base, desired_header);
        }
    }

    void* user = header_ptr + 1;            // skip header for user pointer (layout preserved)
    PTR_TRACK_MALLOC(base, 0, class_idx);   // Track at BASE (where header is)

    // ========== ALLOCATION LOGGING (Debug builds only) ==========
#if !HAKMEM_BUILD_RELEASE
    {
        extern _Atomic uint64_t g_debug_op_count;
        extern __thread TinyTLSSLL g_tls_sll[];
        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
        if (op < 2000) {  // ALL classes for comprehensive tracing
            fprintf(stderr, "[OP#%04lu ALLOC] cls=%d ptr=%p base=%p from=write_header tls_count=%u\n",
                    (unsigned long)op, class_idx, user, base, g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif
    // ========== END ALLOCATION LOGGING ==========

    // Optional guard: log stride/base/user for targeted class
    if (header_mode != TINY_HEADER_MODE_OFF && tiny_guard_is_enabled()) {
        size_t stride = tiny_stride_for_class(class_idx);
        tiny_guard_on_alloc(class_idx, base, user, stride);
    }

    return user;
}

// Write class_idx to header (called after allocation)
// Input: base (block start from SuperSlab)
// Returns: user pointer (base + 1, skipping header)
#if HAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE
__attribute__((always_inline))
#endif
static inline void* tiny_region_id_write_header(void* base, int class_idx) {
    if (!base) return base;

#if !HAKMEM_BUILD_RELEASE
    // Address watcher: Check if this is the watched address
    uintptr_t watch = get_watch_addr();
    if (watch != 0 && (uintptr_t)base == watch) {
        watch_alloc_trigger(base, class_idx, g_last_alloc_source);
    }
#endif

    // Phase E1-CORRECT: ALL classes (C0-C7) have 1-byte header (no exceptions)
    // Rationale: Unified box structure enables:
    //   - O(1) class identification (no registry lookup)
    //   - All classes use same fast path
    //   - Zero special cases across all layers
    // Cost: 0.1% memory overhead for C7 (1024B → 1023B usable)
    // Benefit: 100% safety, architectural simplicity, maximum performance

    // Write header at block start (ALL classes including C7)
    uint8_t* header_ptr = (uint8_t*)base;

    // Phase 6-A: Debug validation (disabled in release builds for performance)
    // perf profiling showed hak_super_lookup() costs 15.84% CPU on hot path
    // Expected gain: +12-15% throughput by removing this in release builds
#if !HAKMEM_BUILD_RELEASE
    // Debug: detect header writes with class_idx that disagrees with slab metadata.
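    // The check resolves base -> SuperSlab -> slab metadata and compares the slab's
    // recorded class_idx with the one being written; only the first few mismatches
    // are reported (with return address and a short backtrace) to limit log noise.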
    do {
        static _Atomic uint32_t g_hdr_meta_mis = 0;
        struct SuperSlab* ss = hak_super_lookup(base);
        if (ss && ss->magic == SUPERSLAB_MAGIC) {
            int slab_idx = slab_index_for(ss, base);
            if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
                uint8_t meta_cls = ss->slabs[slab_idx].class_idx;
                if (meta_cls < TINY_NUM_CLASSES && meta_cls != (uint8_t)class_idx) {
                    uint32_t n = atomic_fetch_add_explicit(&g_hdr_meta_mis, 1, memory_order_relaxed);
                    if (n < 8) {
                        void* ra = __builtin_return_address(0);
                        const char* sym = "(unknown)";
#ifdef __GLIBC__
                        Dl_info info;
                        if (dladdr(ra, &info) && info.dli_sname) {
                            sym = info.dli_sname;
                        }
#endif
                        fprintf(stderr,
                                "[HDR_META_MISMATCH] cls=%d meta_cls=%u base=%p slab_idx=%d ss=%p ra=%p fn=%s\n",
                                class_idx, (unsigned)meta_cls, base, slab_idx, (void*)ss, ra, sym);
                        if (n < 4) {
                            void* bt[8];
                            int frames = backtrace(bt, 8);
                            backtrace_symbols_fd(bt, frames, fileno(stderr));
                        }
                        fflush(stderr);
                    }
                }
            }
        }
    } while (0);
#endif  // !HAKMEM_BUILD_RELEASE

    // Phase 21: Hot/cold split for FULL mode (ENV-gated)
    if (tiny_header_hotfull_enabled()) {
        int header_mode = tiny_header_mode();
        if (__builtin_expect(header_mode == TINY_HEADER_MODE_FULL, 1)) {
            // Hot path: straight-line code (no existing_header read, no guard call)
            uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
            *header_ptr = desired_header;
            PTR_TRACK_HEADER_WRITE(base, desired_header);
            void* user = header_ptr + 1;
            PTR_TRACK_MALLOC(base, 0, class_idx);
#if !HAKMEM_BUILD_RELEASE
            // Debug logging (keep minimal observability in hot path)
            {
                extern _Atomic uint64_t g_debug_op_count;
                extern __thread TinyTLSSLL g_tls_sll[];
                uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
                if (op < 2000) {
                    fprintf(stderr,
                            "[OP#%04lu ALLOC] cls=%d ptr=%p base=%p from=write_header_hot tls_count=%u\n",
                            (unsigned long)op, class_idx, user, base, g_tls_sll[class_idx].count);
                    fflush(stderr);
                }
            }
#endif
            return user;
        }
        // Non-FULL mode or guard-enabled: delegate to cold helper
        return tiny_region_id_write_header_slow(base, class_idx, header_ptr);
    }

    // Fallback: HOTFULL=0, use existing unified logic (backward compatibility)
    // Header write policy (bench-only switch, default FULL)
    int header_mode = tiny_header_mode();
    uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
    uint8_t existing_header = *header_ptr;
    if (__builtin_expect(header_mode == TINY_HEADER_MODE_FULL, 1)) {
        *header_ptr = desired_header;
        PTR_TRACK_HEADER_WRITE(base, desired_header);
    } else if (header_mode == TINY_HEADER_MODE_LIGHT) {
        // Keep header consistent but avoid redundant stores.
        if (existing_header != desired_header) {
            *header_ptr = desired_header;
            PTR_TRACK_HEADER_WRITE(base, desired_header);
        }
    } else {
        // TINY_HEADER_MODE_OFF (bench-only)
        // Only touch the header if it is clearly invalid to keep free() workable.
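        // Example (illustrative): class_idx = 5 gives desired_header = 0xA0 | 0x05 = 0xA5;
        // a zero byte from a fresh mmap page fails the magic check below and is repaired.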
        uint8_t existing_magic = existing_header & 0xF0;
        if (existing_magic != HEADER_MAGIC ||
            (existing_header & HEADER_CLASS_MASK) != (desired_header & HEADER_CLASS_MASK)) {
            *header_ptr = desired_header;
            PTR_TRACK_HEADER_WRITE(base, desired_header);
        }
    }

    void* user = header_ptr + 1;            // skip header for user pointer (layout preserved)
    PTR_TRACK_MALLOC(base, 0, class_idx);   // Track at BASE (where header is)

    // ========== ALLOCATION LOGGING (Debug builds only) ==========
#if !HAKMEM_BUILD_RELEASE
    {
        extern _Atomic uint64_t g_debug_op_count;
        extern __thread TinyTLSSLL g_tls_sll[];
        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
        if (op < 2000) {  // ALL classes for comprehensive tracing
            fprintf(stderr, "[OP#%04lu ALLOC] cls=%d ptr=%p base=%p from=write_header tls_count=%u\n",
                    (unsigned long)op, class_idx, user, base, g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif
    // ========== END ALLOCATION LOGGING ==========

    // Optional guard: log stride/base/user for targeted class
    if (header_mode != TINY_HEADER_MODE_OFF && tiny_guard_is_enabled()) {
        size_t stride = tiny_stride_for_class(class_idx);
        tiny_guard_on_alloc(class_idx, base, user, stride);
    }

    return user;
}

// ========== Read Header (Free) ==========

// Read class_idx from header (called during free)
// Returns: class_idx (0-7), or -1 if invalid
static inline int tiny_region_id_read_header(void* ptr) {
    if (!ptr) return -1;
    if ((uintptr_t)ptr < 4096) return -1;  // reject invalid tiny values

    uint8_t* header_ptr = (uint8_t*)ptr - 1;
    uint8_t header = *header_ptr;

    // CRITICAL FIX (Pool TLS Phase 1): ALWAYS validate magic when Pool TLS is enabled
    // Reason: Pool TLS uses a different magic (0xB0 vs 0xA0), so the two MUST be distinguished.
    // Without this, Pool TLS allocations are wrongly routed to the Tiny freelist → corruption.
    // Always validate the magic byte to catch non-header allocations (release included):
    // mmap-zero or mid/large frees can otherwise be misrouted as class 0.
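    // Example (illustrative): header 0xA3 → magic nibble 0xA0 (valid Tiny), class_idx 3;
    // a Pool TLS header such as 0xB3 fails the check below and returns -1.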
    uint8_t magic = header & 0xF0;

#if HAKMEM_DEBUG_VERBOSE
    static int debug_count = 0;
    if (debug_count < 5) {
        fprintf(stderr, "[TINY_READ_HEADER] ptr=%p header=0x%02x magic=0x%02x expected=0x%02x\n",
                ptr, header, magic, HEADER_MAGIC);
        debug_count++;
    }
#endif

    if (magic != HEADER_MAGIC) {
#if !HAKMEM_BUILD_RELEASE
        static int invalid_count = 0;
        if (invalid_count < 5) {
            fprintf(stderr, "[HEADER_INVALID] ptr=%p, header=%02x, magic=%02x (expected %02x)\n",
                    ptr, header, magic, HEADER_MAGIC);
            invalid_count++;
        }
#endif
        // Optional guard hook for invalid header
        if (tiny_guard_is_enabled()) tiny_guard_on_invalid(ptr, header);
        return -1;
    }

    int class_idx = (int)(header & HEADER_CLASS_MASK);

    // CRITICAL: Always validate class_idx range (even in release builds)
    // Reason: Corrupted headers could cause out-of-bounds array access
#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8
#endif
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) {
        // Corrupted header
        return -1;
    }

    return class_idx;
}

// ========== Header Validation ==========

// Check if pointer has valid header (debug mode)
static inline int tiny_region_id_has_header(void* ptr) {
#if !HAKMEM_BUILD_RELEASE
    if (!ptr) return 0;
    if ((uintptr_t)ptr < 4096) return 0;
    uint8_t* header_ptr = (uint8_t*)ptr - 1;
    uint8_t header = *header_ptr;
    uint8_t magic = header & 0xF0;
    return (magic == HEADER_MAGIC);
#else
    // Release: Assume all allocations have headers
    (void)ptr;
    return 1;
#endif
}

// ========== Allocation Size Adjustment ==========

// Calculate allocation size including header (1 byte)
static inline size_t tiny_region_id_alloc_size(size_t user_size) {
    return user_size + 1;  // Add 1 byte for header
}

// Calculate user size from allocation size
static inline size_t tiny_region_id_user_size(size_t alloc_size) {
    return alloc_size - 1;
}

// ========== Performance Notes ==========
//
// Header Read Performance:
//   - Best case: 2 cycles (L1 hit, no validation)
//   - Average: 3 cycles (with class_idx extraction)
//   - Worst case: 5 cycles (debug validation)
//   - vs SuperSlab lookup: 100+ cycles (50x faster)
//
// Memory Overhead:
//   - Per block: 1 byte
//   - 8-byte blocks: 12.5% overhead
//   - 128-byte blocks: 0.8% overhead
//   - Average (typical workload): ~1.5%
//   - Slab[0]: 0% (reuses 960B wasted padding)
//
// Cache Impact:
//   - Excellent: Header is inline with user data
//   - Prefetch: Header loaded with first user data access
//   - No additional cache lines required

#else  // !HAKMEM_TINY_HEADER_CLASSIDX

// Disabled: No-op implementations

#if HAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE
__attribute__((always_inline))
#endif
static inline void* tiny_region_id_write_header(void* ptr, int class_idx) {
    (void)class_idx;
    return ptr;
}

static inline int tiny_region_id_read_header(void* ptr) {
    (void)ptr;
    return -1;  // Not supported
}

static inline int tiny_region_id_has_header(void* ptr) {
    (void)ptr;
    return 0;  // No headers
}

static inline size_t tiny_region_id_alloc_size(size_t user_size) {
    return user_size;  // No header
}

static inline size_t tiny_region_id_user_size(size_t alloc_size) {
    return alloc_size;
}

#endif  // HAKMEM_TINY_HEADER_CLASSIDX

#endif  // TINY_REGION_ID_H
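// ========== Usage Sketch (Illustrative) ==========
//
// A minimal caller sketch, assuming a hypothetical carve_block() that returns the
// block start from a slab (not part of this API):
//
//   void* base = carve_block(class_idx);                        // block start (header byte lives here)
//   void* user = tiny_region_id_write_header(base, class_idx);  // user pointer = base + 1
//   ...
//   int cls = tiny_region_id_read_header(user);                 // 2-3 cycle class lookup on free
//   if (cls < 0) {
//       // Not a header-carrying Tiny block: fall back to SuperSlab lookup.
//   }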