Performance Results (bench_mid_mt_gap, 1KB-8KB, ws=256):
- Before: 41.0 M ops/s (mutex-protected registry)
- After: 42.09 M ops/s (+2.65% improvement)

Expected vs Actual:
- Expected: +17-27% (based on perf showing 13.98% mutex overhead)
- Actual: +2.65% (needs investigation)

Implementation:
- Added MidMTHeader (8 bytes) to each Mid MT allocation
- Allocation: write header with block_size, class_idx, magic (0xAB42)
- Free: read header for O(1) metadata lookup (no mutex!)
- Eliminated the entire registry infrastructure (127 lines deleted)

Changes:
- core/hakmem_mid_mt.h: Added MidMTHeader, removed registry structures
- core/hakmem_mid_mt.c: Updated alloc/free, removed registry functions
- core/box/mid_free_route_box.h: Header-based detection instead of registry lookup

Code Quality:
✅ Lock-free (no pthread_mutex operations)
✅ Simpler (O(1) header read vs O(log N) binary search)
✅ Smaller binary (127 lines deleted)
✅ Positive improvement (no regression)

Next: Investigate why the improvement is smaller than expected

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
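
For reference, a sketch of the 8-byte header implied by the notes above. Field
order and exact types are assumptions; the real definition lives in
core/hakmem_mid_mt.h:

    typedef struct {
        uint32_t block_size;  // block size of the size class, in bytes
        uint16_t class_idx;   // index into the per-thread segment array
        uint16_t magic;       // MID_MT_MAGIC (0xAB42), validated on free
    } MidMTHeader;            // 8 bytes, prepended to every Mid MT block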
/**
 * hakmem_mid_mt.c
 *
 * Mid Range Multi-threaded Allocator Implementation (8-32KB)
 * mimalloc-style per-thread segments for optimal MT performance
 *
 * Design:
 * - Per-thread segments (TLS) for lock-free allocation
 * - 8-byte block header (MidMTHeader) for lock-free metadata lookup in free()
 *   (Phase 6-B; replaces the old global registry)
 * - 64KB chunks with bump + free list allocation
 * - Phase 1: Local free only (remote free = memory leak, acceptable for benchmarking)
 * - Phase 2: Will add an atomic remote free list
 */
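
/*
 * Usage sketch (sizes must fall in the 8-32KB range this allocator serves):
 *
 *   void* p = mid_mt_alloc(16 * 1024);   // lock-free, from this thread's segment
 *   ...
 *   mid_mt_free(p, 16 * 1024);           // header-based free, no registry lookup
 */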

#include "hakmem_mid_mt.h"
#include <pthread.h>  // for pthread_mutex_t / PTHREAD_MUTEX_INITIALIZER used below
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <assert.h>
#include <stdatomic.h>

// Use likely/unlikely hints for branch prediction
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

// ============================================================================
// Global and TLS Variables
// ============================================================================

// TLS: Each thread has independent segments (lock-free!)
__thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES] = {0};

// Phase 6-B: Registry removed (no longer needed with header-based free)

// Statistics (if enabled)
#if MID_ENABLE_STATS
MidStats g_mid_stats = {0};
#endif

// Initialization flag
static volatile int g_mid_initialized = 0;
static pthread_mutex_t g_init_lock = PTHREAD_MUTEX_INITIALIZER;

// ============================================================================
// Forward Declarations
// ============================================================================

static bool segment_refill(MidThreadSegment* seg, int class_idx);
static void* segment_alloc(MidThreadSegment* seg, int class_idx);
static void segment_free_local(MidThreadSegment* seg, void* ptr);
static void* chunk_allocate(size_t chunk_size);
static void chunk_deallocate(void* chunk, size_t chunk_size);
// Phase 6-B: Registry functions removed (header-based free instead)

// ============================================================================
// Chunk Management (mmap/munmap wrappers)
// ============================================================================

/**
 * chunk_allocate - Allocate a new chunk via mmap
 *
 * @param chunk_size Size of chunk (typically 64KB)
 * @return Chunk base address, or NULL on failure
 */
static void* chunk_allocate(size_t chunk_size) {
    void* chunk = mmap(
        NULL,
        chunk_size,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1,
        0
    );

    if (chunk == MAP_FAILED) {
        MID_LOG("ERROR: mmap failed for chunk_size=%zu", chunk_size);
        return NULL;
    }

    MID_LOG("Chunk allocated: %p, size=%zu", chunk, chunk_size);
    return chunk;
}
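
/*
 * Note: MAP_PRIVATE | MAP_ANONYMOUS memory arrives zero-filled and page-aligned
 * straight from the kernel, so chunk management never depends on malloc itself.
 */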

/**
 * chunk_deallocate - Free chunk via munmap
 *
 * @param chunk Chunk base address
 * @param chunk_size Size of chunk
 */
static void chunk_deallocate(void* chunk, size_t chunk_size) {
    if (!chunk) return;

    int ret = munmap(chunk, chunk_size);
    if (ret != 0) {
        MID_LOG("ERROR: munmap failed for chunk=%p, size=%zu", chunk, chunk_size);
    } else {
        MID_LOG("Chunk deallocated: %p, size=%zu", chunk, chunk_size);
    }
}

// ============================================================================
// Segment Operations
// ============================================================================

/**
 * segment_refill - Allocate a new chunk and set up the segment
 *
 * Called when the segment is exhausted (rare, ~0.1% of allocations)
 *
 * Phase 6-B: No longer registers chunks (header-based free instead)
 *
 * @return true on success, false on OOM
 */
static bool segment_refill(MidThreadSegment* seg, int class_idx) {
    size_t block_size = mid_class_to_size(class_idx);
    size_t chunk_size = MID_CHUNK_SIZE;

    // Allocate new chunk via mmap
    void* chunk = chunk_allocate(chunk_size);
    if (!chunk) {
        return false;
    }

    // Phase 6-B: No registry add (header-based free doesn't need registry)

    // Setup segment
    seg->chunk_base = chunk;
    seg->chunk_size = chunk_size;
    seg->block_size = block_size;
    seg->current = chunk;
    seg->end = (uint8_t*)chunk + chunk_size;
    seg->capacity = chunk_size / block_size;
    seg->refill_count++;

    MID_LOG("Segment refill: class=%d, block_size=%zu, capacity=%u, chunk=%p",
            class_idx, block_size, seg->capacity, chunk);

    return true;
}
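
/*
 * Capacity arithmetic (with the 64KB chunk size the docs above assume):
 * a chunk holds 65536/8192 = 8 blocks of the 8KB class but only 65536/32768 = 2
 * blocks of the 32KB class, so steady-state reuse must come from the free list.
 */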

/**
 * segment_alloc - Allocate from segment (fast path)
 *
 * PERFORMANCE: Force inline for maximum speed
 *
 * Fast path priority:
 * 1. Free list (most common, ~90-95% hit rate)
 * 2. Bump allocation (when free list empty)
 * 3. Refill (when segment exhausted)
 *
 * Phase 6-B: Now writes MidMTHeader for lock-free free()
 *
 * @return Allocated pointer (after header), or NULL on OOM
 */
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) __attribute__((always_inline));
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
    void* block;  // Block start (includes header space)
    size_t block_size = seg->block_size;

    // === Path 0: First allocation - need refill ===
    // CRITICAL FIX: TLS is zero-initialized, so chunk_base == NULL on first call
    if (unlikely(seg->chunk_base == NULL)) {
        if (!segment_refill(seg, class_idx)) {
            return NULL;  // OOM
        }
        block_size = seg->block_size;  // Update after refill
    }

    // === Path 1: Free list (fastest, ~4-5 instructions) ===
    // Note: Free list stores next pointer at block start (overwrites header when freed)
    block = seg->free_list;
    if (likely(block != NULL)) {
        seg->free_list = *(void**)block;  // Pop from free list
        seg->used_count++;
        seg->alloc_count++;

        // Phase 6-B: Write header before returning
        MidMTHeader* hdr = (MidMTHeader*)block;
        hdr->block_size = (uint32_t)block_size;
        hdr->class_idx = (uint16_t)class_idx;
        hdr->magic = MID_MT_MAGIC;

        return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
    }

    // === Path 2: Bump allocation (fast, ~6-8 instructions) ===
    block = seg->current;
    void* next = (uint8_t*)block + block_size;

    if (likely(next <= seg->end)) {
        seg->current = next;
        seg->used_count++;
        seg->alloc_count++;

        // Phase 6-B: Write header before returning
        MidMTHeader* hdr = (MidMTHeader*)block;
        hdr->block_size = (uint32_t)block_size;
        hdr->class_idx = (uint16_t)class_idx;
        hdr->magic = MID_MT_MAGIC;

        return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
    }

    // === Path 3: Refill (slow, called ~once per 64KB) ===
    if (!segment_refill(seg, class_idx)) {
        return NULL;  // OOM
    }

    // Retry after refill (a fresh chunk always has room for the first block)
    block = seg->current;
    block_size = seg->block_size;  // Update after refill
    seg->current = (uint8_t*)block + block_size;
    seg->used_count++;
    seg->alloc_count++;

    // Phase 6-B: Write header before returning
    MidMTHeader* hdr = (MidMTHeader*)block;
    hdr->block_size = (uint32_t)block_size;
    hdr->class_idx = (uint16_t)class_idx;
    hdr->magic = MID_MT_MAGIC;

    return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
}
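
/*
 * Block layout handed out above (8-byte header per the Phase 6-B notes):
 *
 *   block --> +-------------------+----------------------------+
 *             | MidMTHeader (8 B) | user data                  |
 *             +-------------------+----------------------------+
 *                                 ^-- pointer returned to the caller
 */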

/**
 * segment_free_local - Free to local segment (same thread)
 *
 * @param seg Segment to free to
 * @param ptr Pointer to free (user pointer, after header)
 *
 * Phase 6-B: Adjusted for header-based allocation
 */
static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
    // Phase 6-B: Get block start (before header)
    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);

    // Push to free list (lock-free, local operation)
    // Note: Overwrites header with next pointer (header no longer needed after free)
    *(void**)block = seg->free_list;
    seg->free_list = block;
    seg->used_count--;
    seg->free_count++;

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.local_frees, 1);
#endif
}

// ============================================================================
// Public API
// ============================================================================

/**
 * mid_mt_init - Initialize Mid Range MT allocator
 *
 * Thread-safe, idempotent
 *
 * Phase 6-B: Simplified (no registry initialization)
 */
void mid_mt_init(void) {
    if (g_mid_initialized) return;

    pthread_mutex_lock(&g_init_lock);

    if (!g_mid_initialized) {
        // Phase 6-B: No registry initialization (header-based free)

#if MID_ENABLE_STATS
        memset(&g_mid_stats, 0, sizeof(g_mid_stats));
#endif

        g_mid_initialized = 1;

        MID_LOG("Mid MT allocator initialized (Phase 6-B: header-based)");
    }

    pthread_mutex_unlock(&g_init_lock);
}
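
/*
 * Note: mid_mt_init() uses classic double-checked locking. The unlocked read of
 * g_mid_initialized is formally a data race under C11; a stricter variant would
 * use an atomic flag, e.g. (sketch, not the current code):
 *
 *   static _Atomic int g_mid_initialized = 0;
 *   ...
 *   if (atomic_load_explicit(&g_mid_initialized, memory_order_acquire)) return;
 */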

/**
 * mid_mt_alloc - Allocate memory from Mid Range pool (8-32KB)
 *
 * Thread-safe, lock-free (uses TLS)
 */
void* mid_mt_alloc(size_t size) {
    // Validate size range (Phase 16: dynamic min size based on Tiny's max)
    if (unlikely(size < mid_get_min_size() || size > MID_MAX_SIZE)) {
        return NULL;
    }

    // Initialize if needed (thread-safe)
    if (unlikely(!g_mid_initialized)) {
        mid_mt_init();
    }

    // Get size class
    int class_idx = mid_size_to_class(size);
    if (unlikely(class_idx < 0)) {
        return NULL;
    }

    // Get thread-local segment
    MidThreadSegment* seg = &g_mid_segments[class_idx];

    // Allocate from segment (fast path)
    void* p = segment_alloc(seg, class_idx);

#if MID_ENABLE_STATS
    if (p) {
        __sync_fetch_and_add(&g_mid_stats.total_allocs, 1);
    }
#endif

    return p;
}

/**
 * mid_mt_free - Free memory allocated by mid_mt_alloc
 *
 * Phase 6-B: Header-based free (lock-free, no registry lookup!)
 * - Reads the MidMTHeader for block metadata (O(1), ~2 cycles)
 * - Eliminates pthread_mutex_lock/unlock (13.98% CPU overhead in perf)
 * - Expected +17-27% throughput; measured +2.65% so far (see commit notes)
 *
 * Local free (same thread): ultra-fast, lock-free
 * Remote free (cross-thread): NOT IMPLEMENTED (memory leak; Phase 2 will add
 * an atomic remote free list)
 */
void mid_mt_free(void* ptr, size_t size) {
    (void)size;  // Metadata comes from the header, not the caller-supplied size

    if (unlikely(!ptr)) return;

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.total_frees, 1);
#endif

    // Phase 6-B: Read header for O(1) metadata lookup (no mutex!)
    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
    MidMTHeader* hdr = (MidMTHeader*)block;

    // Validate header magic (sanity check)
    if (unlikely(hdr->magic != MID_MT_MAGIC)) {
        MID_LOG("ERROR: Invalid Mid MT magic 0x%X (expected 0x%X) for ptr %p",
                hdr->magic, MID_MT_MAGIC, ptr);
        return;
    }

    // Get metadata from header (no registry lookup!)
    int class_idx = hdr->class_idx;

    // Validate class_idx
    if (unlikely(class_idx < 0 || class_idx >= MID_NUM_CLASSES)) {
        MID_LOG("ERROR: Invalid class_idx %d in header for ptr %p", class_idx, ptr);
        return;
    }

    // Get thread-local segment for this size class
    MidThreadSegment* seg = &g_mid_segments[class_idx];

    // === Fast path: Check if block belongs to current segment ===
    // Note: Check block (not ptr), since the segment tracks block addresses
    if (likely(seg->chunk_base != NULL &&
               block >= seg->chunk_base &&
               block < seg->end)) {
        // Local free (same thread, lock-free)
        segment_free_local(seg, ptr);
        return;
    }

    // === Slow path: Remote free (cross-thread) ===
    // Phase 1: NOT IMPLEMENTED
    // We would need to find the owning segment and push to its remote free list.
    //
    // For Phase 1 (benchmarking), we accept this memory leak.
    // bench_mid_mt_gap uses a single-threaded workload, so remote frees never happen.

    MID_LOG("WARNING: Remote free not implemented, leaking %p (block_size=%u, class=%d)",
            ptr, hdr->block_size, class_idx);

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
#endif

    // TODO Phase 2: Implement remote free (see the sketch below)
    // segment_free_remote(ptr, hdr->block_size, class_idx);
}
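
/*
 * Phase 2 sketch (assumptions: MidThreadSegment gains an _Atomic(void*)
 * remote_free_list field, and the owning segment can be located, e.g. via a
 * per-chunk owner pointer -- none of this exists yet):
 *
 *   static void segment_free_remote(MidThreadSegment* owner, void* block) {
 *       void* old_head = atomic_load_explicit(&owner->remote_free_list,
 *                                             memory_order_relaxed);
 *       do {
 *           *(void**)block = old_head;  // link block in front of current head
 *       } while (!atomic_compare_exchange_weak_explicit(
 *                    &owner->remote_free_list, &old_head, block,
 *                    memory_order_release, memory_order_relaxed));
 *   }
 *
 * The owning thread would then drain remote_free_list into its local free list
 * on its own allocation path (e.g. when the local free list runs empty).
 */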

/**
 * mid_mt_thread_exit - Clean up thread-local segments
 *
 * Called on thread exit to release resources
 *
 * Phase 6-B: No registry cleanup needed (header-based free)
 */
void mid_mt_thread_exit(void) {
    MID_LOG("Thread exit cleanup");

    // Free all chunks from this thread's segments
    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
        MidThreadSegment* seg = &g_mid_segments[class_idx];

        if (seg->chunk_base) {
            // Phase 6-B: No registry remove (no registry exists)

            // Deallocate chunk
            chunk_deallocate(seg->chunk_base, seg->chunk_size);

            // Clear segment
            memset(seg, 0, sizeof(MidThreadSegment));
        }
    }
}
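
/*
 * Note: mid_mt_thread_exit() unmaps chunks wholesale; any blocks from this
 * thread that are still live (including ones leaked via the unimplemented
 * remote free) become invalid at that point.
 */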

// ============================================================================
// Statistics (Debug/Profiling)
// ============================================================================

#if MID_ENABLE_STATS

void mid_mt_print_stats(void) {
    printf("\n=== Mid Range MT Statistics ===\n");
    printf("Total allocations: %lu\n", g_mid_stats.total_allocs);
    printf("Total frees: %lu\n", g_mid_stats.total_frees);
    printf("Local frees: %lu (%.1f%%)\n",
           g_mid_stats.local_frees,
           100.0 * g_mid_stats.local_frees / (g_mid_stats.total_frees + 1));
    printf("Remote frees: %lu (%.1f%%)\n",
           g_mid_stats.remote_frees,
           100.0 * g_mid_stats.remote_frees / (g_mid_stats.total_frees + 1));
    // Phase 6-B: "Registry lookups" counter removed along with the registry
    printf("\n");

    // Per-segment stats (current thread only; g_mid_segments is TLS)
    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
        MidThreadSegment* seg = &g_mid_segments[class_idx];
        if (seg->alloc_count > 0) {
            printf("Class %d (%zu bytes):\n", class_idx, mid_class_to_size(class_idx));
            printf(" Allocations: %lu\n", seg->alloc_count);
            printf(" Frees: %lu\n", seg->free_count);
            printf(" Refills: %u\n", seg->refill_count);
            printf(" Used count: %u / %u\n", seg->used_count, seg->capacity);
        }
    }
    printf("\n");
}

#endif // MID_ENABLE_STATS