/**
 * hakmem_mid_mt.c
 *
 * Mid Range Multi-threaded Allocator Implementation (8-32KB)
 * mimalloc-style per-thread segment for optimal MT performance
 *
 * Design:
 * - Per-thread segments (TLS) for lock-free allocation
 * - Global registry for segment lookup during free()
 * - 64KB chunks with bump + free list allocation
 * - Phase 1: Local free only (remote free = memory leak, acceptable for benchmarking)
 * - Phase 2: Will add atomic remote free list
 */

#include "hakmem_mid_mt.h"

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <sys/mman.h>

// Use likely/unlikely hints for branch prediction
#ifndef likely
#define likely(x)   __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

// ============================================================================
// Global and TLS Variables
// ============================================================================

// TLS: Each thread has independent segments (lock-free!)
__thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES] = {0};

// Global registry (protected by lock)
MidGlobalRegistry g_mid_registry = {
    .entries = NULL,
    .count = 0,
    .capacity = 0,
    .lock = PTHREAD_MUTEX_INITIALIZER
};

// Statistics (if enabled)
#if MID_ENABLE_STATS
MidStats g_mid_stats = {0};
#endif

// Initialization flag
static volatile int g_mid_initialized = 0;
static pthread_mutex_t g_init_lock = PTHREAD_MUTEX_INITIALIZER;

// ============================================================================
// Forward Declarations
// ============================================================================

static bool segment_refill(MidThreadSegment* seg, int class_idx);
static void* segment_alloc(MidThreadSegment* seg, int class_idx);
static void segment_free_local(MidThreadSegment* seg, void* ptr);
static void* chunk_allocate(size_t chunk_size);
static void chunk_deallocate(void* chunk, size_t chunk_size);
static void registry_add(void* base, size_t block_size, int class_idx);
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx);  // Public for hak_free_at()
static void registry_remove(void* base);

// ============================================================================
// Registry Operations (Protected by Lock)
// ============================================================================

/**
 * registry_add - Add a new segment to global registry
 *
 * Called during segment refill (rare, ~0.1% of allocations)
 */
static void registry_add(void* base, size_t block_size, int class_idx)
{
    pthread_mutex_lock(&g_mid_registry.lock);

    // Grow registry if needed
    if (g_mid_registry.count >= g_mid_registry.capacity) {
        uint32_t new_capacity = g_mid_registry.capacity == 0
            ? MID_REGISTRY_INITIAL_CAPACITY
            : g_mid_registry.capacity * 2;

        // CRITICAL: Use mmap() instead of realloc() to avoid deadlock!
        // realloc() would go through hakmem → mid_mt → registry_add → deadlock
        size_t new_size = new_capacity * sizeof(MidSegmentRegistry);
        MidSegmentRegistry* new_entries = mmap(
            NULL, new_size,
            PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS,
            -1, 0
        );
        if (new_entries == MAP_FAILED) {
            new_entries = NULL;
        } else if (g_mid_registry.entries) {
            // Copy old entries
            memcpy(new_entries, g_mid_registry.entries,
                   g_mid_registry.count * sizeof(MidSegmentRegistry));
            // Don't unmap old entries (lazy cleanup, avoids complexity)
        }

        if (!new_entries) {
            pthread_mutex_unlock(&g_mid_registry.lock);
            MID_LOG("ERROR: Registry grow (mmap) failed");
            return;
        }

        g_mid_registry.entries = new_entries;
        g_mid_registry.capacity = new_capacity;
    }

    // Add new entry
    MidSegmentRegistry* entry = &g_mid_registry.entries[g_mid_registry.count];
    entry->base = base;
    entry->block_size = block_size;
    entry->class_idx = class_idx;
    g_mid_registry.count++;

    // Keep entries sorted by base address (for binary search)
    // Simple insertion: swap with previous until in order
    for (uint32_t i = g_mid_registry.count - 1; i > 0; i--) {
        if (g_mid_registry.entries[i].base >= g_mid_registry.entries[i - 1].base) {
            break;
        }
        // Swap
        MidSegmentRegistry tmp = g_mid_registry.entries[i];
        g_mid_registry.entries[i] = g_mid_registry.entries[i - 1];
        g_mid_registry.entries[i - 1] = tmp;
    }

    pthread_mutex_unlock(&g_mid_registry.lock);

    MID_LOG("Registry add: base=%p, block_size=%zu, class=%d, count=%u",
            base, block_size, class_idx, g_mid_registry.count);
}
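// Registry invariant: entries describe non-overlapping chunks and are kept
// sorted by base address, so a pointer can be resolved with a binary search
// on base plus a range check against base + MID_CHUNK_SIZE (see below).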
/**
 * mid_registry_lookup - Find segment containing ptr via binary search
 *
 * Called during free() when ptr is not in current segment (uncommon)
 *
 * @return true if found, false otherwise
 */
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx)
{
    pthread_mutex_lock(&g_mid_registry.lock);

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.registry_lookups, 1);
#endif

    // Binary search for segment containing ptr
    int left = 0;
    int right = (int)g_mid_registry.count - 1;
    bool found = false;

    while (left <= right) {
        int mid = left + (right - left) / 2;
        MidSegmentRegistry* entry = &g_mid_registry.entries[mid];
        void* seg_end = (uint8_t*)entry->base + MID_CHUNK_SIZE;

        if (ptr < entry->base) {
            right = mid - 1;
        } else if (ptr >= seg_end) {
            left = mid + 1;
        } else {
            // Found!
            *out_block_size = entry->block_size;
            *out_class_idx = entry->class_idx;
            found = true;
            break;
        }
    }

    pthread_mutex_unlock(&g_mid_registry.lock);
    return found;
}

/**
 * registry_remove - Remove segment from registry
 *
 * Called when segment is completely freed (rare)
 */
static void registry_remove(void* base)
{
    pthread_mutex_lock(&g_mid_registry.lock);

    // Find entry with matching base
    for (uint32_t i = 0; i < g_mid_registry.count; i++) {
        if (g_mid_registry.entries[i].base == base) {
            // Remove by shifting remaining entries
            for (uint32_t j = i; j < g_mid_registry.count - 1; j++) {
                g_mid_registry.entries[j] = g_mid_registry.entries[j + 1];
            }
            g_mid_registry.count--;
            pthread_mutex_unlock(&g_mid_registry.lock);
            MID_LOG("Registry remove: base=%p, count=%u", base, g_mid_registry.count);
            return;
        }
    }

    pthread_mutex_unlock(&g_mid_registry.lock);
}

// ============================================================================
// Chunk Management (mmap/munmap wrappers)
// ============================================================================

/**
 * chunk_allocate - Allocate a new chunk via mmap
 *
 * @param chunk_size Size of chunk (typically 64KB)
 * @return Chunk base address, or NULL on failure
 */
static void* chunk_allocate(size_t chunk_size)
{
    void* chunk = mmap(
        NULL, chunk_size,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1, 0
    );
    if (chunk == MAP_FAILED) {
        MID_LOG("ERROR: mmap failed for chunk_size=%zu", chunk_size);
        return NULL;
    }

    MID_LOG("Chunk allocated: %p, size=%zu", chunk, chunk_size);
    return chunk;
}

/**
 * chunk_deallocate - Free chunk via munmap
 *
 * @param chunk      Chunk base address
 * @param chunk_size Size of chunk
 */
static void chunk_deallocate(void* chunk, size_t chunk_size)
{
    if (!chunk) return;

    int ret = munmap(chunk, chunk_size);
    if (ret != 0) {
        MID_LOG("ERROR: munmap failed for chunk=%p, size=%zu", chunk, chunk_size);
    } else {
        MID_LOG("Chunk deallocated: %p, size=%zu", chunk, chunk_size);
    }
}

// ============================================================================
// Segment Operations
// ============================================================================

/**
 * segment_refill - Allocate new chunk and setup segment
 *
 * Called when segment is exhausted (rare, ~0.1% of allocations)
 *
 * @return true on success, false on OOM
 */
static bool segment_refill(MidThreadSegment* seg, int class_idx)
{
    size_t block_size = mid_class_to_size(class_idx);
    size_t chunk_size = MID_CHUNK_SIZE;

    // Allocate new chunk via mmap
    void* chunk = chunk_allocate(chunk_size);
    if (!chunk) {
        return false;
    }

    // Register chunk in global registry (for free() lookup)
    registry_add(chunk, block_size, class_idx);

    // Setup segment
    seg->chunk_base = chunk;
    seg->chunk_size = chunk_size;
    seg->block_size = block_size;
    seg->current = chunk;
    seg->end = (uint8_t*)chunk + chunk_size;
    seg->capacity = chunk_size / block_size;
    seg->refill_count++;

    MID_LOG("Segment refill: class=%d, block_size=%zu, capacity=%u, chunk=%p",
            class_idx, block_size, seg->capacity, chunk);
    return true;
}
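// Note on refill frequency: with the default 64KB chunks and 8-32KB blocks, a
// chunk holds only 2-8 blocks, so each class refills (and takes the registry
// lock) once per `capacity` allocations that are not served by the free list.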
/**
 * segment_alloc - Allocate from segment (fast path)
 *
 * PERFORMANCE: Force inline for maximum speed
 *
 * Fast path priority:
 *   1. Free list (most common, ~90-95% hit rate)
 *   2. Bump allocation (when free list empty)
 *   3. Refill (when segment exhausted)
 *
 * @return Allocated pointer, or NULL on OOM
 */
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) __attribute__((always_inline));

static inline void* segment_alloc(MidThreadSegment* seg, int class_idx)
{
    void* p;

    // === Path 0: First allocation - need refill ===
    // CRITICAL FIX: TLS is zero-initialized, so chunk_base == NULL on first call
    if (unlikely(seg->chunk_base == NULL)) {
        if (!segment_refill(seg, class_idx)) {
            return NULL;  // OOM
        }
        // Fall through to bump allocation after refill
    }

    // === Path 1: Free list (fastest, ~4-5 instructions) ===
    p = seg->free_list;
    if (likely(p != NULL)) {
        seg->free_list = *(void**)p;  // Pop from free list
        seg->used_count++;
        seg->alloc_count++;
        return p;
    }

    // === Path 2: Bump allocation (fast, ~6-8 instructions) ===
    p = seg->current;
    void* next = (uint8_t*)p + seg->block_size;
    if (likely(next <= seg->end)) {
        seg->current = next;
        seg->used_count++;
        seg->alloc_count++;
        return p;
    }

    // === Path 3: Refill (slow, called ~once per 64KB) ===
    if (!segment_refill(seg, class_idx)) {
        return NULL;  // OOM
    }

    // Retry after refill
    p = seg->current;
    seg->current = (uint8_t*)p + seg->block_size;
    seg->used_count++;
    seg->alloc_count++;
    return p;
}

/**
 * segment_free_local - Free to local segment (same thread)
 *
 * @param seg Segment to free to
 * @param ptr Pointer to free
 */
static inline void segment_free_local(MidThreadSegment* seg, void* ptr)
{
    // Push to free list (lock-free, local operation)
    *(void**)ptr = seg->free_list;
    seg->free_list = ptr;
    seg->used_count--;
    seg->free_count++;

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.local_frees, 1);
#endif
}

// ============================================================================
// Public API
// ============================================================================

/**
 * mid_mt_init - Initialize Mid Range MT allocator
 *
 * Thread-safe, idempotent
 */
void mid_mt_init(void)
{
    if (g_mid_initialized) return;

    pthread_mutex_lock(&g_init_lock);
    if (!g_mid_initialized) {
        // Initialize registry
        g_mid_registry.entries = NULL;
        g_mid_registry.count = 0;
        g_mid_registry.capacity = 0;
        pthread_mutex_init(&g_mid_registry.lock, NULL);

#if MID_ENABLE_STATS
        memset(&g_mid_stats, 0, sizeof(g_mid_stats));
#endif

        g_mid_initialized = 1;
        MID_LOG("Mid MT allocator initialized");
    }
    pthread_mutex_unlock(&g_init_lock);
}

/**
 * mid_mt_alloc - Allocate memory from Mid Range pool (8-32KB)
 *
 * Thread-safe, lock-free (uses TLS)
 */
void* mid_mt_alloc(size_t size)
{
    // Validate size range (Phase 16: dynamic min size based on Tiny's max)
    if (unlikely(size < mid_get_min_size() || size > MID_MAX_SIZE)) {
        return NULL;
    }

    // Initialize if needed (thread-safe)
    if (unlikely(!g_mid_initialized)) {
        mid_mt_init();
    }

    // Get size class
    int class_idx = mid_size_to_class(size);
    if (unlikely(class_idx < 0)) {
        return NULL;
    }

    // Get thread-local segment
    MidThreadSegment* seg = &g_mid_segments[class_idx];

    // Allocate from segment (fast path)
    void* p = segment_alloc(seg, class_idx);

#if MID_ENABLE_STATS
    if (p) {
        __sync_fetch_and_add(&g_mid_stats.total_allocs, 1);
    }
#endif

    return p;
}
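/*
 * Usage sketch (illustrative only, kept out of the build): shows the intended
 * Phase 1 call pattern -- the caller passes the allocation size back to
 * mid_mt_free() and frees on the allocating thread. The 16KB request size is
 * just an example assumed to fall inside the configured mid range.
 */
#if 0
static void mid_mt_usage_example(void)
{
    size_t size = 16 * 1024;        // example size in the 8-32KB mid range
    void* p = mid_mt_alloc(size);   // lock-free fast path via TLS segment
    if (p) {
        mid_mt_free(p, size);       // size-aware free, same thread (Phase 1)
    }
}
#endif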
/**
 * mid_mt_free - Free memory allocated by mid_mt_alloc
 *
 * Phase 1 implementation:
 * - Local free (same thread): Fast, lock-free
 * - Remote free (cross-thread): NOT IMPLEMENTED (memory leak)
 *
 * Phase 2 will add atomic remote free list per segment
 */
void mid_mt_free(void* ptr, size_t size)
{
    if (unlikely(!ptr)) return;

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.total_frees, 1);
#endif

    // Get size class
    int class_idx = mid_size_to_class(size);
    if (unlikely(class_idx < 0)) {
        MID_LOG("ERROR: Invalid size %zu in free", size);
        return;
    }

    // Get thread-local segment
    MidThreadSegment* seg = &g_mid_segments[class_idx];

    // === Fast path: Check if ptr belongs to current segment ===
    if (likely(seg->chunk_base != NULL &&
               ptr >= seg->chunk_base && ptr < seg->end)) {
        // Local free (same thread, lock-free)
        segment_free_local(seg, ptr);
        return;
    }

    // === Slow path: Remote free (cross-thread) ===
    // Phase 1: NOT IMPLEMENTED
    // We need to find the owning segment via registry,
    // then push to that segment's remote free list.
    //
    // For Phase 1 (benchmarking), we accept this memory leak.
    // bench_mid_large_mt uses independent working sets per thread,
    // so remote frees are rare.
    size_t block_size;
    int owner_class;
    if (mid_registry_lookup(ptr, &block_size, &owner_class)) {
        // Found in registry, but we can't free it yet (no remote free list)
        MID_LOG("WARNING: Remote free not implemented, leaking %p (size=%zu)", ptr, size);
#if MID_ENABLE_STATS
        __sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
#endif
        // TODO Phase 2: Implement remote free
        // segment_free_remote(ptr, block_size, owner_class);
    } else {
        // Not found in registry - might be from a different allocator
        MID_LOG("ERROR: Pointer %p not found in registry (size=%zu)", ptr, size);
    }
}

/**
 * mid_mt_thread_exit - Cleanup thread-local segments
 *
 * Called on thread exit to release resources
 */
void mid_mt_thread_exit(void)
{
    MID_LOG("Thread exit cleanup");

    // Free all chunks from this thread's segments
    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
        MidThreadSegment* seg = &g_mid_segments[class_idx];
        if (seg->chunk_base) {
            // Remove from registry
            registry_remove(seg->chunk_base);

            // Deallocate chunk
            chunk_deallocate(seg->chunk_base, seg->chunk_size);

            // Clear segment
            memset(seg, 0, sizeof(MidThreadSegment));
        }
    }
}

// ============================================================================
// Statistics (Debug/Profiling)
// ============================================================================

#if MID_ENABLE_STATS
void mid_mt_print_stats(void)
{
    printf("\n=== Mid Range MT Statistics ===\n");
    printf("Total allocations: %lu\n", g_mid_stats.total_allocs);
    printf("Total frees: %lu\n", g_mid_stats.total_frees);
    printf("Local frees: %lu (%.1f%%)\n", g_mid_stats.local_frees,
           100.0 * g_mid_stats.local_frees / (g_mid_stats.total_frees + 1));
    printf("Remote frees: %lu (%.1f%%)\n", g_mid_stats.remote_frees,
           100.0 * g_mid_stats.remote_frees / (g_mid_stats.total_frees + 1));
    printf("Registry lookups: %lu\n", g_mid_stats.registry_lookups);
    printf("\n");

    // Per-segment stats
    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
        MidThreadSegment* seg = &g_mid_segments[class_idx];
        if (seg->alloc_count > 0) {
            printf("Class %d (%zu bytes):\n", class_idx, mid_class_to_size(class_idx));
            printf("  Allocations: %lu\n", seg->alloc_count);
            printf("  Frees: %lu\n", seg->free_count);
            printf("  Refills: %u\n", seg->refill_count);
            printf("  Used count: %u / %u\n", seg->used_count, seg->capacity);
        }
    }
    printf("\n");
}
#endif  // MID_ENABLE_STATS
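/*
 * Phase 2 sketch (illustrative only, kept out of the build): one possible shape
 * for the atomic remote free list mentioned above. It assumes C11 <stdatomic.h>
 * and two members that do NOT exist in the current structures -- a per-segment
 * `_Atomic(void*) remote_free` head, and a way to reach the owning segment from
 * a registry entry (a hypothetical `owner` pointer) -- so treat it as a design
 * note rather than a drop-in implementation.
 */
#if 0
#include <stdatomic.h>

// Producer side (any thread): push the freed block onto the owner's remote list.
static void segment_free_remote(MidThreadSegment* owner, void* ptr)
{
    void* head = atomic_load_explicit(&owner->remote_free, memory_order_relaxed);
    do {
        *(void**)ptr = head;  // link the freed block to the current head
    } while (!atomic_compare_exchange_weak_explicit(
                 &owner->remote_free, &head, ptr,
                 memory_order_release, memory_order_relaxed));
}

// Consumer side (owning thread): drain remote frees into the local free list,
// e.g. from segment_alloc() when the local free list is empty.
static void segment_drain_remote(MidThreadSegment* seg)
{
    void* batch = atomic_exchange_explicit(&seg->remote_free, NULL, memory_order_acquire);
    while (batch) {
        void* next = *(void**)batch;
        *(void**)batch = seg->free_list;  // splice into the local free list
        seg->free_list = batch;
        seg->used_count--;
        batch = next;
    }
}
#endif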