/**
 * hakmem_mid_mt.c
 *
 * Mid Range Multi-threaded Allocator Implementation (8-32KB)
 * mimalloc-style per-thread segments for optimal MT performance
 *
 * Design:
 * - Per-thread segments (TLS) for lock-free allocation
 * - Header-based free() (Phase 6-B): a per-block header replaces the former global segment registry
 * - 64KB chunks with bump + free-list allocation
 * - Phase 1: Local free only (remote free = memory leak, acceptable for benchmarking)
 * - Phase 2: Will add an atomic remote free list
 */

#include "hakmem_mid_mt.h"

#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include <pthread.h>
#include <sys/mman.h>

// Use likely/unlikely hints for branch prediction
#ifndef likely
#define likely(x)   __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

// ============================================================================
// Global and TLS Variables
// ============================================================================

// TLS: Each thread has independent segments (lock-free!)
__thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES] = {0};

// Phase 6-B: Registry removed (no longer needed with header-based free)

// Statistics (if enabled)
#if MID_ENABLE_STATS
MidStats g_mid_stats = {0};
#endif

// Initialization flag
static volatile int g_mid_initialized = 0;
static pthread_mutex_t g_init_lock = PTHREAD_MUTEX_INITIALIZER;

// ============================================================================
// Forward Declarations
// ============================================================================

static bool segment_refill(MidThreadSegment* seg, int class_idx);
static void* segment_alloc(MidThreadSegment* seg, int class_idx);
static void segment_free_local(MidThreadSegment* seg, void* ptr);
static void* chunk_allocate(size_t chunk_size);
static void chunk_deallocate(void* chunk, size_t chunk_size);

// Phase 6-B: Registry functions removed (header-based free instead)
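/*
 * Note (illustrative, not authoritative): segment_alloc() below writes a
 * MidMTHeader immediately before each user pointer, and mid_mt_free() reads it
 * back. The actual definition lives in hakmem_mid_mt.h; from the stores made in
 * this file it is assumed to look roughly like:
 *
 *     typedef struct {
 *         uint32_t block_size;   // block stride as carved from the chunk (header + user area)
 *         uint16_t class_idx;    // size-class index (0 .. MID_NUM_CLASSES-1)
 *         uint16_t magic;        // MID_MT_MAGIC, validated on free()
 *     } MidMTHeader;
 *
 * Field order and exact widths (beyond the (uint32_t)/(uint16_t) casts seen
 * here) are assumptions; this file only relies on sizeof(MidMTHeader) and the
 * three field names.
 */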
// ============================================================================
// Chunk Management (mmap/munmap wrappers)
// ============================================================================

/**
 * chunk_allocate - Allocate a new chunk via mmap
 *
 * @param chunk_size Size of chunk (typically 64KB)
 * @return Chunk base address, or NULL on failure
 */
static void* chunk_allocate(size_t chunk_size) {
    void* chunk = mmap(
        NULL,
        chunk_size,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1,
        0
    );

    if (chunk == MAP_FAILED) {
        MID_LOG("ERROR: mmap failed for chunk_size=%zu", chunk_size);
        return NULL;
    }

    MID_LOG("Chunk allocated: %p, size=%zu", chunk, chunk_size);
    return chunk;
}

/**
 * chunk_deallocate - Free chunk via munmap
 *
 * @param chunk      Chunk base address
 * @param chunk_size Size of chunk
 */
static void chunk_deallocate(void* chunk, size_t chunk_size) {
    if (!chunk) return;

    int ret = munmap(chunk, chunk_size);
    if (ret != 0) {
        MID_LOG("ERROR: munmap failed for chunk=%p, size=%zu", chunk, chunk_size);
    } else {
        MID_LOG("Chunk deallocated: %p, size=%zu", chunk, chunk_size);
    }
}

// ============================================================================
// Segment Operations
// ============================================================================

/**
 * segment_refill - Allocate a new chunk and set up the segment
 *
 * Called when the segment is exhausted (rare, ~0.1% of allocations)
 *
 * Phase 6-B: No longer registers chunks (header-based free instead)
 *
 * @return true on success, false on OOM
 */
static bool segment_refill(MidThreadSegment* seg, int class_idx) {
    size_t block_size = mid_class_to_size(class_idx);
    size_t chunk_size = MID_CHUNK_SIZE;

    // Allocate new chunk via mmap
    void* chunk = chunk_allocate(chunk_size);
    if (!chunk) {
        return false;
    }

    // Phase 6-B: No registry add (header-based free doesn't need a registry)

    // Set up segment (any previous chunk is simply abandoned in Phase 1)
    seg->chunk_base = chunk;
    seg->chunk_size = chunk_size;
    seg->block_size = block_size;
    seg->current    = chunk;
    seg->end        = (uint8_t*)chunk + chunk_size;
    seg->capacity   = chunk_size / block_size;
    seg->refill_count++;

    MID_LOG("Segment refill: class=%d, block_size=%zu, capacity=%u, chunk=%p",
            class_idx, block_size, seg->capacity, chunk);

    return true;
}

/**
 * segment_alloc - Allocate from segment (fast path)
 *
 * PERFORMANCE: Force inline for maximum speed
 *
 * Fast path priority:
 * 1. Free list (most common, ~90-95% hit rate)
 * 2. Bump allocation (when the free list is empty)
 * 3. Refill (when the segment is exhausted)
 *
 * Phase 6-B: Now writes a MidMTHeader for lock-free free()
 *
 * @return Allocated pointer (after header), or NULL on OOM
 */
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) __attribute__((always_inline));

static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
    void* block;  // Block start (includes header space)
    size_t block_size = seg->block_size;

    // === Path 0: First allocation - need refill ===
    // CRITICAL FIX: TLS is zero-initialized, so chunk_base == NULL on first call
    if (unlikely(seg->chunk_base == NULL)) {
        if (!segment_refill(seg, class_idx)) {
            return NULL;  // OOM
        }
        block_size = seg->block_size;  // Update after refill
    }

    // === Path 1: Free list (fastest, ~4-5 instructions) ===
    // Note: The free list stores the next pointer at the block start
    // (it overwrites the header when the block is freed)
    block = seg->free_list;
    if (likely(block != NULL)) {
        seg->free_list = *(void**)block;  // Pop from free list
        seg->used_count++;
        seg->alloc_count++;

        // Phase 6-B: Write header before returning
        MidMTHeader* hdr = (MidMTHeader*)block;
        hdr->block_size = (uint32_t)block_size;
        hdr->class_idx  = (uint16_t)class_idx;
        hdr->magic      = MID_MT_MAGIC;

        return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
    }

    // === Path 2: Bump allocation (fast, ~6-8 instructions) ===
    block = seg->current;
    void* next = (uint8_t*)block + block_size;
    if (likely(next <= seg->end)) {
        seg->current = next;
        seg->used_count++;
        seg->alloc_count++;

        // Phase 6-B: Write header before returning
        MidMTHeader* hdr = (MidMTHeader*)block;
        hdr->block_size = (uint32_t)block_size;
        hdr->class_idx  = (uint16_t)class_idx;
        hdr->magic      = MID_MT_MAGIC;

        return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
    }

    // === Path 3: Refill (slow, called ~once per 64KB) ===
    if (!segment_refill(seg, class_idx)) {
        return NULL;  // OOM
    }

    // Retry after refill (a fresh chunk always has room for the first block)
    block = seg->current;
    block_size = seg->block_size;  // Update after refill
    seg->current = (uint8_t*)block + block_size;
    seg->used_count++;
    seg->alloc_count++;

    // Phase 6-B: Write header before returning
    MidMTHeader* hdr = (MidMTHeader*)block;
    hdr->block_size = (uint32_t)block_size;
    hdr->class_idx  = (uint16_t)class_idx;
    hdr->magic      = MID_MT_MAGIC;

    return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
}
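/*
 * Illustrative sketch (not part of the public API): the per-block header makes
 * size/class lookup an O(1) load. A hypothetical helper reading it back could
 * look like the function below; mid_mt_free() inlines the same logic directly.
 * The helper name is made up for illustration and is not referenced elsewhere
 * in the project.
 */
__attribute__((unused))
static inline uint32_t mid_mt_block_size_of(const void* user_ptr) {
    // Step back over the header written by segment_alloc()
    const MidMTHeader* hdr =
        (const MidMTHeader*)((const uint8_t*)user_ptr - sizeof(MidMTHeader));
    // Caller must pass a pointer obtained from mid_mt_alloc();
    // hdr->magic == MID_MT_MAGIC could be asserted here for debugging.
    return hdr->block_size;
}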
/**
 * segment_free_local - Free to local segment (same thread)
 *
 * @param seg Segment to free to
 * @param ptr Pointer to free (user pointer, after header)
 *
 * Phase 6-B: Adjusted for header-based allocation
 */
static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
    // Phase 6-B: Get block start (before header)
    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);

    // Push to free list (lock-free, local operation)
    // Note: overwrites the header with the next pointer (the header is no longer needed after free)
    *(void**)block = seg->free_list;
    seg->free_list = block;
    seg->used_count--;
    seg->free_count++;

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.local_frees, 1);
#endif
}

// ============================================================================
// Public API
// ============================================================================

/**
 * mid_mt_init - Initialize Mid Range MT allocator
 *
 * Thread-safe, idempotent
 *
 * Phase 6-B: Simplified (no registry initialization)
 */
void mid_mt_init(void) {
    if (g_mid_initialized) return;

    pthread_mutex_lock(&g_init_lock);
    if (!g_mid_initialized) {
        // Phase 6-B: No registry initialization (header-based free)

#if MID_ENABLE_STATS
        memset(&g_mid_stats, 0, sizeof(g_mid_stats));
#endif

        g_mid_initialized = 1;
        MID_LOG("Mid MT allocator initialized (Phase 6-B: header-based)");
    }
    pthread_mutex_unlock(&g_init_lock);
}

/**
 * mid_mt_alloc - Allocate memory from Mid Range pool (8-32KB)
 *
 * Thread-safe, lock-free (uses TLS)
 */
void* mid_mt_alloc(size_t size) {
    // Validate size range (Phase 16: dynamic min size based on Tiny's max)
    if (unlikely(size < mid_get_min_size() || size > MID_MAX_SIZE)) {
        return NULL;
    }

    // Initialize if needed (thread-safe)
    if (unlikely(!g_mid_initialized)) {
        mid_mt_init();
    }

    // Get size class
    int class_idx = mid_size_to_class(size);
    if (unlikely(class_idx < 0)) {
        return NULL;
    }

    // Get thread-local segment
    MidThreadSegment* seg = &g_mid_segments[class_idx];

    // Allocate from segment (fast path)
    void* p = segment_alloc(seg, class_idx);

#if MID_ENABLE_STATS
    if (p) {
        __sync_fetch_and_add(&g_mid_stats.total_allocs, 1);
    }
#endif

    return p;
}

/**
 * mid_mt_free - Free memory allocated by mid_mt_alloc
 *
 * Phase 6-B: Header-based free (lock-free, no registry lookup!)
 * - Reads MidMTHeader to get block metadata (O(1), ~2 cycles)
 * - Eliminates pthread_mutex_lock/unlock (13.98% CPU overhead)
 * - Expected: +17-27% throughput improvement
 *
 * Local free (same thread): Ultra-fast, lock-free
 * Remote free (cross-thread): NOT IMPLEMENTED (memory leak; Phase 2 will add an atomic remote free list)
 */
void mid_mt_free(void* ptr, size_t size) {
    if (unlikely(!ptr)) return;

    (void)size;  // Unused in Phase 6-B: metadata comes from the block header

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.total_frees, 1);
#endif

    // Phase 6-B: Read header for O(1) metadata lookup (no mutex!)
    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
    MidMTHeader* hdr = (MidMTHeader*)block;

    // Validate header magic (sanity check)
    if (unlikely(hdr->magic != MID_MT_MAGIC)) {
        MID_LOG("ERROR: Invalid Mid MT magic 0x%X (expected 0x%X) for ptr %p",
                hdr->magic, MID_MT_MAGIC, ptr);
        return;
    }

    // Get metadata from header (no registry lookup!)
    int class_idx = hdr->class_idx;

    // Validate class_idx
    if (unlikely(class_idx < 0 || class_idx >= MID_NUM_CLASSES)) {
        MID_LOG("ERROR: Invalid class_idx %d in header for ptr %p", class_idx, ptr);
        return;
    }

    // Get thread-local segment for this size class
    MidThreadSegment* seg = &g_mid_segments[class_idx];

    // === Fast path: Check if the block belongs to the current segment ===
    // Note: Check block (not ptr), since the segment tracks block addresses
    if (likely(seg->chunk_base != NULL &&
               block >= seg->chunk_base &&
               block < seg->end)) {
        // Local free (same thread, lock-free)
        segment_free_local(seg, ptr);
        return;
    }

    // === Slow path: Remote free (cross-thread) ===
    // Phase 1: NOT IMPLEMENTED
    // We would need to find the owning segment and push to its remote free list.
    //
    // For Phase 1 (benchmarking), we accept this memory leak.
    // bench_mid_mt_gap uses a single-threaded workload, so remote frees never happen.
    MID_LOG("WARNING: Remote free not implemented, leaking %p (block_size=%u, class=%d)",
            ptr, hdr->block_size, class_idx);

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
#endif

    // TODO Phase 2: Implement remote free
    // segment_free_remote(ptr, hdr->block_size, class_idx);
}
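#if 0
/*
 * Sketch only (Phase 2, not implemented): one way to realize the
 * segment_free_remote() referenced in the TODO above is a Treiber-stack push
 * onto a per-segment remote free list. Everything below is an assumption: it
 * presumes a `void* remote_free_list` field added to MidThreadSegment and a
 * way to locate the owning segment from the block (e.g. an owner pointer
 * stored alongside the header), neither of which exists in Phase 1. The
 * __atomic builtins match the __sync builtins already used in this file.
 */
static void segment_free_remote(MidThreadSegment* owner, void* ptr) {
    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);

    // Lock-free push: link the block in front of the current remote list head.
    void* head = __atomic_load_n(&owner->remote_free_list, __ATOMIC_RELAXED);
    do {
        *(void**)block = head;
    } while (!__atomic_compare_exchange_n(&owner->remote_free_list, &head, block,
                                          /*weak=*/true,
                                          __ATOMIC_RELEASE, __ATOMIC_RELAXED));

    // The owning thread would splice remote_free_list into its local free list
    // on its next refill or at a periodic drain point.
}
#endif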
/**
 * mid_mt_thread_exit - Cleanup of thread-local segments
 *
 * Called on thread exit to release resources
 *
 * Phase 6-B: No registry cleanup needed (header-based free)
 */
void mid_mt_thread_exit(void) {
    MID_LOG("Thread exit cleanup");

    // Free all chunks from this thread's segments
    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
        MidThreadSegment* seg = &g_mid_segments[class_idx];

        if (seg->chunk_base) {
            // Phase 6-B: No registry remove (no registry exists)

            // Deallocate chunk
            chunk_deallocate(seg->chunk_base, seg->chunk_size);

            // Clear segment
            memset(seg, 0, sizeof(MidThreadSegment));
        }
    }
}

// ============================================================================
// Statistics (Debug/Profiling)
// ============================================================================

#if MID_ENABLE_STATS
void mid_mt_print_stats(void) {
    printf("\n=== Mid Range MT Statistics ===\n");
    printf("Total allocations: %lu\n", g_mid_stats.total_allocs);
    printf("Total frees:       %lu\n", g_mid_stats.total_frees);
    printf("Local frees:       %lu (%.1f%%)\n", g_mid_stats.local_frees,
           100.0 * g_mid_stats.local_frees / (g_mid_stats.total_frees + 1));
    printf("Remote frees:      %lu (%.1f%%)\n", g_mid_stats.remote_frees,
           100.0 * g_mid_stats.remote_frees / (g_mid_stats.total_frees + 1));
    printf("Registry lookups:  %lu\n", g_mid_stats.registry_lookups);
    printf("\n");

    // Per-segment stats
    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
        MidThreadSegment* seg = &g_mid_segments[class_idx];
        if (seg->alloc_count > 0) {
            printf("Class %d (%zu bytes):\n", class_idx, mid_class_to_size(class_idx));
            printf("  Allocations: %lu\n", seg->alloc_count);
            printf("  Frees:       %lu\n", seg->free_count);
            printf("  Refills:     %u\n", seg->refill_count);
            printf("  Used count:  %u / %u\n", seg->used_count, seg->capacity);
        }
    }
    printf("\n");
}
#endif  // MID_ENABLE_STATS
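/*
 * Usage sketch (illustrative only): how a caller is expected to drive the
 * public API. The MID_MT_DEMO_MAIN guard is a made-up macro for this example;
 * it is not defined anywhere in the project, so the block below is normally
 * compiled out.
 */
#ifdef MID_MT_DEMO_MAIN
int main(void) {
    mid_mt_init();  // Optional: mid_mt_alloc() also initializes lazily

    // Allocate a 16KB block (inside the 8-32KB mid range) and release it.
    void* p = mid_mt_alloc(16 * 1024);
    if (p) {
        memset(p, 0xAB, 16 * 1024);   // Touch the private anonymous pages
        mid_mt_free(p, 16 * 1024);    // Same-thread free: lock-free fast path
    }

#if MID_ENABLE_STATS
    mid_mt_print_stats();             // Print counters before segments are cleared
#endif

    mid_mt_thread_exit();             // Release this thread's chunks before exit
    return 0;
}
#endif  // MID_MT_DEMO_MAIN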