#ifndef HAKMEM_TINY_H #define HAKMEM_TINY_H #include #include #include "hakmem_build_flags.h" #include "hakmem_trace.h" // Optional USDT (perf) tracepoints #include #include // Include page mini-magazine module (Phase 1: Hybrid optimization) #include "hakmem_tiny_mini_mag.h" // Forward declaration for initialization guard int hak_is_initializing(void); // Phase 6.12: Tiny Pool - Slab Allocator for ≤1KB allocations // 8 size classes: 8B, 16B, 32B, 64B, 128B, 256B, 512B, 1KB // ============================================================================ // Configuration // ============================================================================ #define TINY_NUM_CLASSES 8 #define TINY_SLAB_SIZE (64 * 1024) // 64KB per slab #define TINY_MAX_SIZE 1024 // Tiny handles up to 1024B (C7 headerless) // ============================================================================ // Size Classes // ============================================================================ // Size class table (branchless lookup) // Note: Definition is in hakmem_tiny.c to avoid multiple definition errors // Declaration is in hakmem_tiny_config.h as: extern const size_t g_tiny_class_sizes[TINY_NUM_CLASSES]; // Box 3 (tiny_box_geometry.h) uses this via hakmem_tiny_config.h // (Definition removed from header - see hakmem_tiny.c) // Full LUT (1..1024) for branchless size-to-class mapping (index by size). // Memory cost ~1KB. Zero hot-path arithmetic for all Tiny sizes. // Generate repeated values via helper macros to keep the source compact. 
// HAK_Rn(x): expands to n comma-separated copies of x. Used once, below, to
// build the 1025-entry size->class lookup table without 1024 literal lines.
#define HAK_R1(x) x
#define HAK_R2(x) HAK_R1(x), HAK_R1(x)
#define HAK_R4(x) HAK_R2(x), HAK_R2(x)
#define HAK_R8(x) HAK_R4(x), HAK_R4(x)
#define HAK_R16(x) HAK_R8(x), HAK_R8(x)
#define HAK_R32(x) HAK_R16(x), HAK_R16(x)
#define HAK_R64(x) HAK_R32(x), HAK_R32(x)
#define HAK_R128(x) HAK_R64(x), HAK_R64(x)
#define HAK_R256(x) HAK_R128(x), HAK_R128(x)
#define HAK_R512(x) HAK_R256(x), HAK_R256(x)

// Size (1..1024) -> class index (0..7); index 0 is invalid (-1).
// Total entries: 1 + 8 + 8 + 16 + 32 + 64 + 128 + 256 + 512 = 1025.
static const int8_t g_size_to_class_lut_1k[1025] = {
    -1,           // index 0: invalid
    HAK_R8(0),    // 1..8     -> class 0
    HAK_R8(1),    // 9..16    -> class 1
    HAK_R16(2),   // 17..32   -> class 2
    HAK_R32(3),   // 33..64   -> class 3
    HAK_R64(4),   // 65..128  -> class 4
    HAK_R128(5),  // 129..256 -> class 5
    HAK_R256(6),  // 257..512 -> class 6
    HAK_R512(7),  // 513..1024 -> class 7
};

// The repeat helpers are private to this table; drop them immediately.
#undef HAK_R512
#undef HAK_R256
#undef HAK_R128
#undef HAK_R64
#undef HAK_R32
#undef HAK_R16
#undef HAK_R8
#undef HAK_R4
#undef HAK_R2
#undef HAK_R1

// Blocks per slab for each class (TINY_SLAB_SIZE / class stride)
static const uint16_t g_tiny_blocks_per_slab[TINY_NUM_CLASSES] = {
    8192, // Class 0: 64KB / 8B    = 8192 blocks
    4096, // Class 1: 64KB / 16B   = 4096 blocks
    2048, // Class 2: 64KB / 32B   = 2048 blocks
    1024, // Class 3: 64KB / 64B   = 1024 blocks
    512,  // Class 4: 64KB / 128B  = 512 blocks
    256,  // Class 5: 64KB / 256B  = 256 blocks
    128,  // Class 6: 64KB / 512B  = 128 blocks
    64    // Class 7: 64KB / 1024B = 64 blocks
};

// Bitmap size (uint64_t words) for each class (= blocks_per_slab / 64)
static const uint8_t g_tiny_bitmap_words[TINY_NUM_CLASSES] = {
    128, // Class 0: 8192 blocks / 64 = 128 words
    64,  // Class 1: 4096 blocks / 64 = 64 words
    32,  // Class 2: 2048 blocks / 64 = 32 words
    16,  // Class 3: 1024 blocks / 64 = 16 words
    8,   // Class 4: 512 blocks / 64  = 8 words
    4,   // Class 5: 256 blocks / 64  = 4 words
    2,   // Class 6: 128 blocks / 64  = 2 words
    1    // Class 7: 64 blocks / 64   = 1 word
};

// ============================================================================
// Data Structures
// ============================================================================

// Forward declaration
typedef struct TinySlab TinySlab;

// Step 2: Slab Registry (Hash Table for O(1) lookup)
#define SLAB_REGISTRY_SIZE 1024
#define SLAB_REGISTRY_MASK (SLAB_REGISTRY_SIZE - 1)
#define SLAB_REGISTRY_MAX_PROBE 8

typedef struct {
    uintptr_t slab_base;       // 64KB aligned base address (0 = empty slot)
    _Atomic(TinySlab*) owner;  // Atomic pointer to TinySlab metadata (MT-safe)
} SlabRegistryEntry;

// Global registry (extern for access from multiple translation units)
extern SlabRegistryEntry g_slab_registry[SLAB_REGISTRY_SIZE];

// Tiny Pool initialization flag (extern for inline function access)
extern int g_tiny_initialized;

// Per-class locks to protect slab lists and bitmaps
// (64-byte aligned + padded to avoid false sharing between classes)
typedef struct __attribute__((aligned(64))) {
    pthread_mutex_t m;
    char _pad[64];
} PaddedLock;
extern PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];

// Slab header (one per 64KB slab)
typedef struct TinySlab {
    void* base;            // Base address (64KB aligned)
    uint64_t* bitmap;      // Free block bitmap (dynamic size; bit=1 means used)
    uint16_t free_count;   // Number of free blocks
    uint16_t total_count;  // Total blocks in slab
    uint8_t class_idx;     // Size class index (0-7)
    uint8_t _padding[3];
    struct TinySlab* next; // Next slab in list
    // MPSC remote-free stack head (lock-free). Stores user ptrs; next is embedded in block.
    atomic_uintptr_t remote_head;
    // Approximate count of pending remote frees (for drain thresholding)
    atomic_uint remote_count;
    // Targeted remote-drain queue linkage and state (for BG drain targeting)
    struct TinySlab* remote_q_next; // Intrusive next pointer for target stack
    atomic_uint remote_queued;      // 0=not enqueued, 1=enqueued (CAS guarded)
    // Owning thread (for remote detection). Allocations from this thread use TLS fast path.
    pthread_t owner_tid;
    // Hint for next scan start (reduces bitmap word scanning)
    uint16_t hint_word;
    // Summary bitmap (2nd level): per 64-word group, bit=1 if the group has any free block
    uint8_t summary_words; // number of summary words (=(bitmap_words+63)/64)
    uint8_t _pad_sum[1];
    uint64_t* summary;     // length = summary_words
    // Phase 1: Page Mini-Magazine (Hybrid bitmap+free-list optimization)
    // Fast LIFO cache (16-32 items) for O(1) allocation without bitmap scan
    // Cost: 1-2 ns (vs 5-6 ns bitmap scan)
    PageMiniMag mini_mag;  // LIFO free-list cache
} TinySlab;

// Global Tiny Pool state
typedef struct {
    TinySlab* free_slabs[TINY_NUM_CLASSES]; // Slabs with free blocks
    TinySlab* full_slabs[TINY_NUM_CLASSES]; // Full slabs (no free blocks)
    uint64_t alloc_count[TINY_NUM_CLASSES]; // Allocation count per class
    uint64_t free_count[TINY_NUM_CLASSES];  // Free count per class
    uint64_t slab_count[TINY_NUM_CLASSES];  // Total slabs per class
} TinyPool;

// Global pool instance (defined in hakmem_tiny.c)
extern TinyPool g_tiny_pool;

// ============================================================================
// API Functions
// ============================================================================

// Initialize Tiny Pool
void hak_tiny_init(void);

// Allocate from Tiny Pool (returns NULL if size > 1KB)
void* hak_tiny_alloc(size_t size);

// Free to Tiny Pool (no-op if ptr is not managed by Tiny Pool)
void hak_tiny_free(void* ptr);

// Phase 6.12.1: Free with pre-calculated slab (avoids duplicate owner_slab lookup)
void hak_tiny_free_with_slab(void* ptr, TinySlab* slab);

// Check if pointer is managed by Tiny Pool
int hak_tiny_is_managed(void* ptr);
int hak_tiny_is_managed_superslab(void* ptr);

// Return the usable size for a Tiny-managed pointer (0 if unknown/not tiny).
// For SuperSlab-backed blocks, uses size class from the owning SuperSlab.
// For TinySlab-backed blocks, uses class_idx from the owning slab.
size_t hak_tiny_usable_size(void* ptr); // Get statistics void hak_tiny_get_stats(uint64_t* alloc_count, uint64_t* free_count, uint64_t* slab_count); // Print statistics (debug) void hak_tiny_print_stats(void); // Phase 7.7: Magazine flush API (reduce memory footprint) // Flush Magazine cache to freelists, enabling empty SuperSlab detection void hak_tiny_magazine_flush(int class_idx); void hak_tiny_magazine_flush_all(void); // Trim empty Tiny slabs by releasing fully-free slabs back to the system. // Safe to call anytime; holds per-class locks while trimming. void hak_tiny_trim(void); // Optional shutdown hook for Tiny subsystem. // Stops background threads (e.g., Deferred Intelligence) and performs // any best-effort cleanup needed during process shutdown. void hak_tiny_shutdown(void); // Phase 8.2: Memory profiling (toggle with HAKMEM_DEBUG_MEMORY) // Print detailed breakdown of memory usage by component void hak_tiny_print_memory_profile(void); // Debug: dump Ultra Tiny counters (pop hits/refills/resets) void hak_tiny_ultra_debug_dump(void); void hak_tiny_path_debug_dump(void); // ============================================================================ // ACE Learning Layer: Runtime parameter adjustment // ============================================================================ // Exported for ACE controller access extern int g_remote_drain_thresh_per_class[TINY_NUM_CLASSES]; // Set remote drain threshold for a specific size class void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold); // ============================================================================ // Internal Helpers (branchless size-to-class) // ============================================================================ // Convert size to class index (branchless lookup) // Phase E1-CORRECT: ALL classes have 1-byte header // C7 max usable: 1023B (1024B total with header) // malloc(1024+) → routed to Mid allocator static inline int hak_tiny_size_to_class(size_t size) { if 
(size == 0) return -1; #if HAKMEM_TINY_HEADER_CLASSIDX // Phase E1-CORRECT: ALL classes have 1-byte header // Box: [Header 1B][Data NB] = (N+1) bytes total // g_tiny_class_sizes stores TOTAL size, so we need size+1 bytes // User requests N bytes → need (N+1) total → look up class with stride ≥ (N+1) // Max usable: 1023B (C7 stride=1024B) if (size > 1023) return -1; // 1024+ → Mid allocator // Find smallest class where stride ≥ (size + 1) // LUT maps total_size → class, so lookup (size + 1) to find class with that stride size_t needed = size + 1; // total bytes needed (data + header) if (needed > 1024) return -1; return g_size_to_class_lut_1k[needed]; #else if (size > 1024) return -1; return g_size_to_class_lut_1k[size]; // 1..1024 #endif } // ============================================================================ // Phase 6.12.1: O(1) Slab Lookup (Embedded Metadata) // ============================================================================ // Phase 6.12.1: Find slab owner by pointer // NOTE: Reverted from O(1) embedded metadata to O(N) linear search for safety // Embedded metadata requires dereferencing potentially unmapped memory // This is still faster than before because Option C eliminates duplicate calls TinySlab* hak_tiny_owner_slab(void* ptr); // ============================================================================ // Bitmap Operations (inline for speed) // ============================================================================ // Set block as used static inline void hak_tiny_set_used(TinySlab* slab, int block_idx) { int word_idx = block_idx / 64; int bit_idx = block_idx % 64; uint64_t v = slab->bitmap[word_idx] | (1ULL << bit_idx); slab->bitmap[word_idx] = v; // update summary: set to 1 if any free bit remains, else 0 int sum_word = word_idx / 64; int sum_bit = word_idx % 64; uint64_t has_free = ~v; // any zero in word means free if (has_free != 0) { slab->summary[sum_word] |= (1ULL << sum_bit); } else { slab->summary[sum_word] &= 
~(1ULL << sum_bit); } } // Set block as free static inline void hak_tiny_set_free(TinySlab* slab, int block_idx) { int word_idx = block_idx / 64; int bit_idx = block_idx % 64; uint64_t v = slab->bitmap[word_idx] & ~(1ULL << bit_idx); slab->bitmap[word_idx] = v; // update summary: this word now certainly has a free bit int sum_word = word_idx / 64; int sum_bit = word_idx % 64; slab->summary[sum_word] |= (1ULL << sum_bit); } // Check if block is used static inline int hak_tiny_is_used(TinySlab* slab, int block_idx) { int word_idx = block_idx / 64; int bit_idx = block_idx % 64; return (slab->bitmap[word_idx] & (1ULL << bit_idx)) != 0; } // Find first free block (returns -1 if none) static inline int hak_tiny_find_free_block(TinySlab* slab) { // Trace bitmap scan attempts HAK_TP1(bitmap_scan, slab->class_idx); const int bw = g_tiny_bitmap_words[slab->class_idx]; const int sw = slab->summary_words; if (bw <= 0 || sw <= 0) return -1; int start_word = slab->hint_word % bw; int start_sw = start_word / 64; int start_sb = start_word % 64; for (int k = 0; k < sw; k++) { int idx = start_sw + k; if (idx >= sw) idx -= sw; uint64_t bits = slab->summary[idx]; // mask low bits on first iteration if (k == 0) { bits &= (~0ULL) << start_sb; } // mask out-of-range bits in last summary word if (idx == sw - 1 && (bw % 64) != 0) { uint64_t mask = (bw % 64) == 64 ? ~0ULL : ((1ULL << (bw % 64)) - 1ULL); bits &= mask; } if (bits == 0) continue; int woff = __builtin_ctzll(bits); // word offset within this summary word int word_idx = idx * 64 + woff; // bitmap word index if (word_idx >= bw) continue; // safety uint64_t used = slab->bitmap[word_idx]; uint64_t free_bits = ~used; if (free_bits == 0) continue; // should not happen if summary correct int bit_idx = __builtin_ctzll(free_bits); // first free block within word slab->hint_word = (uint16_t)((word_idx + 1) % bw); return word_idx * 64 + bit_idx; } return -1; } #endif // HAKMEM_TINY_H