/** * hakmem_mid_mt.h * * Mid Range Multi-threaded Allocator (1-32KB) * mimalloc-style per-thread segment design for optimal MT performance * * Part of Hybrid Approach: * - ≤1023B: Tiny Pool (header-based, C7 usable size) * - 1-32KB: Mid MT (this module, mimalloc-style per-thread) * - ≥64KB: Large Pool (learning-based, ELO strategies) * * Created: 2025-11-01 * Goal: 46M → 100-120M ops/s (2.2-2.6x improvement) */ #ifndef HAKMEM_MID_MT_H #define HAKMEM_MID_MT_H #include #include #include #include #ifdef __cplusplus extern "C" { #endif // ============================================================================ // Size Classes // ============================================================================ #define MID_SIZE_CLASS_8K 0 // 8KB blocks #define MID_SIZE_CLASS_16K 1 // 16KB blocks #define MID_SIZE_CLASS_32K 2 // 32KB blocks #define MID_NUM_CLASSES 3 // Total number of size classes // Phase 13: Close Tiny/Mid gap. // Phase 16: Dynamic Mid min size - must start where Tiny ends // Tiny max size is configurable via HAKMEM_TINY_MAX_CLASS: // - HAKMEM_TINY_MAX_CLASS=7 (default) → Tiny up to 1023B → Mid starts at 1024B // - HAKMEM_TINY_MAX_CLASS=5 → Tiny up to 255B → Mid starts at 256B #include "hakmem_tiny.h" // For tiny_get_max_size() static inline size_t mid_get_min_size(void) { return tiny_get_max_size() + 1; // Mid starts where Tiny ends } #define MID_MIN_SIZE_STATIC (1024) // Static fallback (C7 default) #define MID_MAX_SIZE (32 * 1024) // 32KB #define MID_CHUNK_SIZE (4 * 1024 * 1024) // 4MB chunks (same as mimalloc segments) // ============================================================================ // Data Structures // ============================================================================ /** * MidThreadSegment - Per-thread segment for lock-free allocation * * Memory layout optimized for cache line alignment (64 bytes) * - Cache line 0: Fast path fields (free_list, current, end, used_count) * - Cache line 1: Metadata (chunk_base, sizes, capacity) * - Cache line 2: Statistics (optional, for debugging) */ typedef struct MidThreadSegment { // === Fast Path (Cache line 0) === void* free_list; // Free objects linked list (NULL if empty) void* current; // Bump allocation pointer void* end; // End of current chunk uint32_t used_count; // Number of allocated blocks uint32_t padding0; // Alignment padding // === Metadata (Cache line 1) === void* chunk_base; // Base address of current chunk size_t chunk_size; // Size of chunk (typically 64KB) size_t block_size; // Size of each block (8KB/16KB/32KB) uint32_t capacity; // Total blocks in chunk uint32_t padding1; // Alignment padding // === Statistics (Cache line 2) === uint64_t alloc_count; // Total allocations uint64_t free_count; // Total frees uint32_t refill_count; // Number of chunk refills uint32_t padding2; // Alignment padding } __attribute__((aligned(64))) MidThreadSegment; /** * MidSegmentRegistry - Global registry for segment lookup in free() * * Used to find the owning segment when freeing a pointer. * Entries are sorted by base address for O(log N) binary search. */ typedef struct MidSegmentRegistry { void* base; // Segment base address size_t block_size; // Block size (8KB/16KB/32KB) int class_idx; // Size class index (0-2) int padding; // Alignment padding } MidSegmentRegistry; /** * MidGlobalRegistry - Global registry manager * * Thread-safety: Protected by pthread_mutex * Performance: Lock only during registry operations (low frequency) */ typedef struct MidGlobalRegistry { MidSegmentRegistry* entries; // Dynamic array of registry entries uint32_t count; // Number of entries uint32_t capacity; // Array capacity pthread_mutex_t lock; // Registry lock } MidGlobalRegistry; // ============================================================================ // Global Variables // ============================================================================ // TLS: Each thread has its own segments (lock-free!) extern __thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES]; // Global registry (protected by lock) extern MidGlobalRegistry g_mid_registry; // ============================================================================ // API Functions // ============================================================================ /** * mid_mt_init - Initialize Mid Range MT allocator * * Call once at startup (thread-safe, idempotent) */ void mid_mt_init(void); /** * mid_mt_alloc - Allocate memory from Mid Range pool * * @param size Allocation size (must be mid_get_min_size() ≤ size ≤ MID_MAX_SIZE) * Phase 16: Range adjusts dynamically based on Tiny's max size * Default: 1024B-32KB, can expand to 256B-32KB if Tiny reduced to C0-C5 * @return Allocated pointer (aligned to block_size), or NULL on failure * * Thread-safety: Lock-free (uses TLS) * Performance: O(1) fast path, O(1) amortized * * Fast path: * 1. Check free_list (most common, ~4-5 instructions) * 2. Bump allocation if free_list empty (~6-8 instructions) * 3. Refill chunk if segment exhausted (rare, ~0.1%) */ void* mid_mt_alloc(size_t size); /** * mid_mt_free - Free memory allocated by mid_mt_alloc * * @param ptr Pointer to free (must be from mid_mt_alloc) * @param size Original allocation size (for size class lookup) * * Thread-safety: Lock-free if freeing to own thread's segment * Requires registry lock if remote free (cross-thread) * Performance: O(1) local free, O(log N) remote free (registry lookup) * * Note: Phase 1 implementation does not handle remote free (memory leak) * Phase 2 will implement per-segment atomic remote free list */ void mid_mt_free(void* ptr, size_t size); /** * mid_mt_thread_exit - Cleanup thread-local segments * * Called on thread exit to release resources * Should be registered via pthread_key_create or __attribute__((destructor)) */ void mid_mt_thread_exit(void); /** * mid_registry_lookup - Find segment containing ptr (for free() path) * * @param ptr Pointer to lookup * @param out_block_size Output: block size if found * @param out_class_idx Output: size class index if found * @return true if found in Mid MT registry, false otherwise * * Used internally by hak_free_at() to identify Mid MT allocations */ bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx); // ============================================================================ // Inline Helper Functions // ============================================================================ /** * mid_size_to_class - Convert size to size class index * * @param size Allocation size * @return Size class index (0-2), or -1 if out of range */ static inline int mid_size_to_class(size_t size) { if (size <= 8192) return MID_SIZE_CLASS_8K; if (size <= 16384) return MID_SIZE_CLASS_16K; if (size <= 32768) return MID_SIZE_CLASS_32K; return -1; // Out of range } /** * mid_class_to_size - Convert size class to block size * * @param class_idx Size class index (0-2) * @return Block size in bytes */ static inline size_t mid_class_to_size(int class_idx) { static const size_t sizes[MID_NUM_CLASSES] = { 8192, // 8KB 16384, // 16KB 32768 // 32KB }; return (class_idx >= 0 && class_idx < MID_NUM_CLASSES) ? sizes[class_idx] : 0; } /** * mid_is_in_range - Check if size is in Mid Range pool range * * @param size Allocation size * @return true if (tiny_max+1) ≤ size ≤ 32KB * * Phase 16: Dynamic range - adjusts based on Tiny's max size * PERF_OPT: Force inline to eliminate function call overhead in hot path */ __attribute__((always_inline)) static inline bool mid_is_in_range(size_t size) { return (size >= mid_get_min_size() && size <= MID_MAX_SIZE); } // ============================================================================ // Configuration (can be overridden via environment variables) // ============================================================================ // Default chunk size (64KB) #ifndef MID_DEFAULT_CHUNK_SIZE #define MID_DEFAULT_CHUNK_SIZE (64 * 1024) #endif // Initial registry capacity #ifndef MID_REGISTRY_INITIAL_CAPACITY #define MID_REGISTRY_INITIAL_CAPACITY 64 #endif // Enable/disable statistics collection #ifndef MID_ENABLE_STATS #define MID_ENABLE_STATS 0 // DISABLED for performance #endif // Enable/disable debug logging #ifndef MID_DEBUG #define MID_DEBUG 0 // DISABLE for performance testing #endif #if MID_DEBUG #include #define MID_LOG(fmt, ...) fprintf(stderr, "[MID_MT] " fmt "\n", ##__VA_ARGS__) #else #define MID_LOG(fmt, ...) ((void)0) #endif // ============================================================================ // Statistics (Debug/Profiling) // ============================================================================ #if MID_ENABLE_STATS /** * MidStats - Global statistics for profiling */ typedef struct MidStats { uint64_t total_allocs; // Total allocations uint64_t total_frees; // Total frees uint64_t total_refills; // Total chunk refills uint64_t local_frees; // Local frees (same thread) uint64_t remote_frees; // Remote frees (cross-thread) uint64_t registry_lookups; // Registry lookups } MidStats; extern MidStats g_mid_stats; void mid_mt_print_stats(void); #endif // MID_ENABLE_STATS #ifdef __cplusplus } #endif #endif // HAKMEM_MID_MT_H