hakmem/core/hakmem_mid_mt.c

/**
* hakmem_mid_mt.c
*
* Mid Range Multi-threaded Allocator Implementation (8-32KB)
* mimalloc-style per-thread segment for optimal MT performance
*
* Design:
* - Per-thread segments (TLS) for lock-free allocation
* - Global registry for segment lookup during free()
* - 64KB chunks with bump + free list allocation
* - Phase 1: Local free only (remote free = memory leak, acceptable for benchmarking)
* - Phase 2: Will add atomic remote free list
*/
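/*
 * Illustrative usage sketch (not part of this translation unit; the real call
 * sites live elsewhere in hakmem). It uses only the public API declared in
 * hakmem_mid_mt.h:
 *
 *     mid_mt_init();                       // idempotent, thread-safe
 *     void* p = mid_mt_alloc(16 * 1024);   // 16KB request, served from a TLS segment
 *     if (p) {
 *         memset(p, 0, 16 * 1024);
 *         mid_mt_free(p, 16 * 1024);       // size-keyed free, no header lookup
 *     }
 *     mid_mt_thread_exit();                // release this thread's current chunks
 */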
#include "hakmem_mid_mt.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <assert.h>
#include <stdatomic.h>
// Use likely/unlikely hints for branch prediction
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
// ============================================================================
// Global and TLS Variables
// ============================================================================
// TLS: Each thread has independent segments (lock-free!)
__thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES] = {0};
// Global registry (protected by lock)
MidGlobalRegistry g_mid_registry = {
.entries = NULL,
.count = 0,
.capacity = 0,
.lock = PTHREAD_MUTEX_INITIALIZER
};
// Statistics (if enabled)
#if MID_ENABLE_STATS
MidStats g_mid_stats = {0};
#endif
// Initialization flag
static volatile int g_mid_initialized = 0;
static pthread_mutex_t g_init_lock = PTHREAD_MUTEX_INITIALIZER;
// ============================================================================
// Forward Declarations
// ============================================================================
static bool segment_refill(MidThreadSegment* seg, int class_idx);
static void* segment_alloc(MidThreadSegment* seg, int class_idx);
static void segment_free_local(MidThreadSegment* seg, void* ptr);
static void* chunk_allocate(size_t chunk_size);
static void chunk_deallocate(void* chunk, size_t chunk_size);
static void registry_add(void* base, size_t block_size, int class_idx);
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx); // Public for hak_free_at()
static void registry_remove(void* base);
// ============================================================================
// Registry Operations (Protected by Lock)
// ============================================================================
/**
* registry_add - Add a new segment to global registry
*
* Called during segment refill (rare, ~0.1% of allocations)
*/
static void registry_add(void* base, size_t block_size, int class_idx) {
pthread_mutex_lock(&g_mid_registry.lock);
// Grow registry if needed
if (g_mid_registry.count >= g_mid_registry.capacity) {
uint32_t new_capacity = g_mid_registry.capacity == 0
? MID_REGISTRY_INITIAL_CAPACITY
: g_mid_registry.capacity * 2;
// CRITICAL: Use mmap() instead of realloc() to avoid deadlock!
// realloc() would go through hakmem → mid_mt → registry_add → deadlock
size_t new_size = new_capacity * sizeof(MidSegmentRegistry);
MidSegmentRegistry* new_entries = mmap(
NULL, new_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1, 0
);
if (new_entries == MAP_FAILED) {
new_entries = NULL;
} else if (g_mid_registry.entries) {
// Copy old entries
memcpy(new_entries, g_mid_registry.entries,
g_mid_registry.count * sizeof(MidSegmentRegistry));
// Don't unmap old entries (lazy cleanup, avoids complexity)
}
if (!new_entries) {
pthread_mutex_unlock(&g_mid_registry.lock);
MID_LOG("ERROR: Registry realloc failed");
return;
}
g_mid_registry.entries = new_entries;
g_mid_registry.capacity = new_capacity;
}
// Add new entry
MidSegmentRegistry* entry = &g_mid_registry.entries[g_mid_registry.count];
entry->base = base;
entry->block_size = block_size;
entry->class_idx = class_idx;
g_mid_registry.count++;
// Keep entries sorted by base address (for binary search)
// Simple insertion: swap with previous until in order
for (uint32_t i = g_mid_registry.count - 1; i > 0; i--) {
if (g_mid_registry.entries[i].base >= g_mid_registry.entries[i - 1].base) {
break;
}
// Swap
MidSegmentRegistry tmp = g_mid_registry.entries[i];
g_mid_registry.entries[i] = g_mid_registry.entries[i - 1];
g_mid_registry.entries[i - 1] = tmp;
}
pthread_mutex_unlock(&g_mid_registry.lock);
MID_LOG("Registry add: base=%p, block_size=%zu, class=%d, count=%u",
base, block_size, class_idx, g_mid_registry.count);
}
/**
* mid_registry_lookup - Find segment containing ptr via binary search
*
* Called during free() when ptr is not in current segment (uncommon)
*
* @return true if found, false otherwise
*/
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx) {
pthread_mutex_lock(&g_mid_registry.lock);
#if MID_ENABLE_STATS
__sync_fetch_and_add(&g_mid_stats.registry_lookups, 1);
#endif
// Binary search for segment containing ptr
int left = 0;
int right = (int)g_mid_registry.count - 1;
bool found = false;
while (left <= right) {
int mid = left + (right - left) / 2;
MidSegmentRegistry* entry = &g_mid_registry.entries[mid];
void* seg_end = (uint8_t*)entry->base + MID_CHUNK_SIZE;
if (ptr < entry->base) {
right = mid - 1;
} else if (ptr >= seg_end) {
left = mid + 1;
} else {
// Found!
*out_block_size = entry->block_size;
*out_class_idx = entry->class_idx;
found = true;
break;
}
}
pthread_mutex_unlock(&g_mid_registry.lock);
return found;
}
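/*
 * Worked example (addresses are invented for illustration): with three chunks
 * registered and kept sorted by base,
 *
 *     entries = { 0x7f0000000000, 0x7f0000010000, 0x7f0000020000 }  // 64KB each
 *
 * a lookup of ptr = 0x7f0000012a00 probes the middle entry first, finds
 * base <= ptr < base + MID_CHUNK_SIZE, and returns that entry's block_size and
 * class_idx. Pointers below the first base or past the last chunk's end exhaust
 * the loop and return false.
 */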
/**
* registry_remove - Remove segment from registry
*
* Called when segment is completely freed (rare)
*/
static void registry_remove(void* base) {
pthread_mutex_lock(&g_mid_registry.lock);
// Find entry with matching base
for (uint32_t i = 0; i < g_mid_registry.count; i++) {
if (g_mid_registry.entries[i].base == base) {
// Remove by shifting remaining entries
for (uint32_t j = i; j < g_mid_registry.count - 1; j++) {
g_mid_registry.entries[j] = g_mid_registry.entries[j + 1];
}
g_mid_registry.count--;
pthread_mutex_unlock(&g_mid_registry.lock);
MID_LOG("Registry remove: base=%p, count=%u", base, g_mid_registry.count);
return;
}
}
pthread_mutex_unlock(&g_mid_registry.lock);
}
// ============================================================================
// Chunk Management (mmap/munmap wrappers)
// ============================================================================
/**
* chunk_allocate - Allocate a new chunk via mmap
*
* @param chunk_size Size of chunk (typically 64KB)
* @return Chunk base address, or NULL on failure
*/
static void* chunk_allocate(size_t chunk_size) {
void* chunk = mmap(
NULL,
chunk_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0
);
if (chunk == MAP_FAILED) {
MID_LOG("ERROR: mmap failed for chunk_size=%zu", chunk_size);
return NULL;
}
MID_LOG("Chunk allocated: %p, size=%zu", chunk, chunk_size);
return chunk;
}
/**
* chunk_deallocate - Free chunk via munmap
*
* @param chunk Chunk base address
* @param chunk_size Size of chunk
*/
static void chunk_deallocate(void* chunk, size_t chunk_size) {
if (!chunk) return;
int ret = munmap(chunk, chunk_size);
if (ret != 0) {
MID_LOG("ERROR: munmap failed for chunk=%p, size=%zu", chunk, chunk_size);
} else {
MID_LOG("Chunk deallocated: %p, size=%zu", chunk, chunk_size);
}
}
// ============================================================================
// Segment Operations
// ============================================================================
/**
 * segment_refill - Allocate new chunk and setup segment
 *
 * Called when segment is exhausted (rare, ~0.1% of allocations)
 *
 * Phase 1 note: if the segment already had a chunk, the old chunk stays mapped
 * and registered; outstanding blocks in it remain valid, but it is never
 * reclaimed (mid_mt_thread_exit() frees only the segment's current chunk).
 *
 * @return true on success, false on OOM
 */
static bool segment_refill(MidThreadSegment* seg, int class_idx) {
size_t block_size = mid_class_to_size(class_idx);
size_t chunk_size = MID_CHUNK_SIZE;
// Allocate new chunk via mmap
void* chunk = chunk_allocate(chunk_size);
if (!chunk) {
return false;
}
// Register chunk in global registry (for free() lookup)
registry_add(chunk, block_size, class_idx);
// Setup segment
seg->chunk_base = chunk;
seg->chunk_size = chunk_size;
seg->block_size = block_size;
seg->current = chunk;
seg->end = (uint8_t*)chunk + chunk_size;
seg->capacity = chunk_size / block_size;
seg->refill_count++;
MID_LOG("Segment refill: class=%d, block_size=%zu, capacity=%u, chunk=%p",
class_idx, block_size, seg->capacity, chunk);
return true;
}
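/*
 * Sizing sketch (assuming the 64KB MID_CHUNK_SIZE from the design notes and
 * 8/16/32KB size classes): one refill yields capacity = chunk_size / block_size
 * blocks, i.e. 8 blocks for the 8KB class, 4 for 16KB, 2 for 32KB. Bump
 * allocation then walks [chunk, chunk + 64KB) in block_size steps until the
 * next refill.
 */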
/**
* segment_alloc - Allocate from segment (fast path)
*
* PERFORMANCE: Force inline for maximum speed
*
* Fast path priority:
* 1. Free list (most common, ~90-95% hit rate)
* 2. Bump allocation (when free list empty)
* 3. Refill (when segment exhausted)
*
* @return Allocated pointer, or NULL on OOM
*/
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) __attribute__((always_inline));
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
void* p;
// === Path 0: First allocation - need refill ===
// CRITICAL FIX: TLS is zero-initialized, so chunk_base == NULL on first call
if (unlikely(seg->chunk_base == NULL)) {
if (!segment_refill(seg, class_idx)) {
return NULL; // OOM
}
// Fall through to bump allocation after refill
}
// === Path 1: Free list (fastest, ~4-5 instructions) ===
p = seg->free_list;
if (likely(p != NULL)) {
seg->free_list = *(void**)p; // Pop from free list
seg->used_count++;
seg->alloc_count++;
return p;
}
// === Path 2: Bump allocation (fast, ~6-8 instructions) ===
p = seg->current;
void* next = (uint8_t*)p + seg->block_size;
if (likely(next <= seg->end)) {
seg->current = next;
seg->used_count++;
seg->alloc_count++;
return p;
}
// === Path 3: Refill (slow, called ~once per 64KB) ===
if (!segment_refill(seg, class_idx)) {
return NULL; // OOM
}
// Retry after refill
p = seg->current;
seg->current = (uint8_t*)p + seg->block_size;
seg->used_count++;
seg->alloc_count++;
return p;
}
/**
* segment_free_local - Free to local segment (same thread)
*
* @param seg Segment to free to
* @param ptr Pointer to free
*/
static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
// Push to free list (lock-free, local operation)
*(void**)ptr = seg->free_list;
seg->free_list = ptr;
seg->used_count--;
seg->free_count++;
#if MID_ENABLE_STATS
__sync_fetch_and_add(&g_mid_stats.local_frees, 1);
#endif
}
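/*
 * Layout note: the free list is intrusive. Each freed block's first word is
 * reused as the "next" pointer, so no side allocation is needed:
 *
 *     // after freeing A then B (list previously empty):
 *     // seg->free_list -> B -> A -> NULL
 *     // i.e. *(void**)B == A and *(void**)A == NULL
 *
 * Push/pop touches only thread-local state, hence no atomics or locks here.
 */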
// ============================================================================
// Public API
// ============================================================================
/**
* mid_mt_init - Initialize Mid Range MT allocator
*
* Thread-safe, idempotent
*/
void mid_mt_init(void) {
if (g_mid_initialized) return;
pthread_mutex_lock(&g_init_lock);
if (!g_mid_initialized) {
// Initialize registry
g_mid_registry.entries = NULL;
g_mid_registry.count = 0;
g_mid_registry.capacity = 0;
pthread_mutex_init(&g_mid_registry.lock, NULL);
#if MID_ENABLE_STATS
memset(&g_mid_stats, 0, sizeof(g_mid_stats));
#endif
g_mid_initialized = 1;
MID_LOG("Mid MT allocator initialized");
}
pthread_mutex_unlock(&g_init_lock);
}
/**
 * mid_mt_alloc - Allocate memory from Mid Range pool (8-32KB)
 *
 * Thread-safe, lock-free (uses TLS)
 *
 * Phase 16: the lower bound is dynamic (mid_get_min_size() tracks Tiny's
 * maximum size, configurable via HAKMEM_TINY_MAX_CLASS), so the Tiny and Mid
 * ranges stay contiguous with no size gap.
 */
void* mid_mt_alloc(size_t size) {
// Validate size range (Phase 16: dynamic min size based on Tiny's max)
if (unlikely(size < mid_get_min_size() || size > MID_MAX_SIZE)) {
return NULL;
}
// Initialize if needed (thread-safe)
if (unlikely(!g_mid_initialized)) {
mid_mt_init();
}
// Get size class
int class_idx = mid_size_to_class(size);
if (unlikely(class_idx < 0)) {
return NULL;
}
// Get thread-local segment
MidThreadSegment* seg = &g_mid_segments[class_idx];
// Allocate from segment (fast path)
void* p = segment_alloc(seg, class_idx);
#if MID_ENABLE_STATS
if (p) {
__sync_fetch_and_add(&g_mid_stats.total_allocs, 1);
}
#endif
return p;
}
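/*
 * Routing sketch (simplified; the actual size dispatch lives in the front-end
 * allocation path, not in this file, and tiny_alloc/large_alloc below are
 * hypothetical names): the caller is expected to send sizes up to
 * tiny_get_max_size() to the Tiny allocator and the next range here, which is
 * why mid_get_min_size() == tiny_get_max_size() + 1 keeps the ranges contiguous:
 *
 *     if (n <= tiny_get_max_size())  p = tiny_alloc(n);    // hypothetical
 *     else if (n <= MID_MAX_SIZE)    p = mid_mt_alloc(n);
 *     else                           p = large_alloc(n);   // hypothetical
 */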
/**
* mid_mt_free - Free memory allocated by mid_mt_alloc
*
* Phase 1 implementation:
* - Local free (same thread): Fast, lock-free
* - Remote free (cross-thread): NOT IMPLEMENTED (memory leak)
*
* Phase 2 will add atomic remote free list per segment
*/
void mid_mt_free(void* ptr, size_t size) {
if (unlikely(!ptr)) return;
#if MID_ENABLE_STATS
__sync_fetch_and_add(&g_mid_stats.total_frees, 1);
#endif
// Get size class
int class_idx = mid_size_to_class(size);
if (unlikely(class_idx < 0)) {
MID_LOG("ERROR: Invalid size %zu in free", size);
return;
}
// Get thread-local segment
MidThreadSegment* seg = &g_mid_segments[class_idx];
// === Fast path: Check if ptr belongs to current segment ===
if (likely(seg->chunk_base != NULL &&
ptr >= seg->chunk_base &&
ptr < seg->end)) {
// Local free (same thread, lock-free)
segment_free_local(seg, ptr);
return;
}
// === Slow path: Remote free (cross-thread) ===
// Phase 1: NOT IMPLEMENTED
// We need to find the owning segment via registry,
// then push to that segment's remote free list.
//
// For Phase 1 (benchmarking), we accept this memory leak.
// bench_mid_large_mt uses independent working sets per thread,
// so remote frees are rare.
size_t block_size;
int owner_class;
if (mid_registry_lookup(ptr, &block_size, &owner_class)) {
// Found in registry, but we can't free it yet (no remote free list)
MID_LOG("WARNING: Remote free not implemented, leaking %p (size=%zu)", ptr, size);
#if MID_ENABLE_STATS
__sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
#endif
// TODO Phase 2: Implement remote free
// segment_free_remote(ptr, block_size, owner_class);
} else {
// Not found in registry - might be from a different allocator
MID_LOG("ERROR: Pointer %p not found in registry (size=%zu)", ptr, size);
}
}
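/*
 * Phase 2 sketch (NOT implemented; the remote_free_list field and function name
 * are hypothetical): the planned remote free would push the block onto an
 * atomic MPSC stack owned by the segment, which the owning thread drains into
 * its local free_list on a later allocation or refill:
 *
 *     // field to add to MidThreadSegment:  _Atomic(void*) remote_free_list;
 *     static void segment_free_remote(MidThreadSegment* owner, void* ptr) {
 *         void* head = atomic_load_explicit(&owner->remote_free_list,
 *                                           memory_order_relaxed);
 *         do {
 *             *(void**)ptr = head;   // link block in front of current head
 *         } while (!atomic_compare_exchange_weak_explicit(
 *                      &owner->remote_free_list, &head, ptr,
 *                      memory_order_release, memory_order_relaxed));
 *     }
 */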
/**
* mid_mt_thread_exit - Cleanup thread-local segments
*
* Called on thread exit to release resources
*/
void mid_mt_thread_exit(void) {
MID_LOG("Thread exit cleanup");
// Free all chunks from this thread's segments
for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
MidThreadSegment* seg = &g_mid_segments[class_idx];
if (seg->chunk_base) {
// Remove from registry
registry_remove(seg->chunk_base);
// Deallocate chunk
chunk_deallocate(seg->chunk_base, seg->chunk_size);
// Clear segment
memset(seg, 0, sizeof(MidThreadSegment));
}
}
}
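/*
 * Hook sketch (how hakmem actually arranges this call is outside this file;
 * g_exit_key and mid_exit_dtor are hypothetical names): one conventional way to
 * run mid_mt_thread_exit() automatically is a pthread TLS destructor, armed
 * once per thread with a non-NULL value:
 *
 *     static pthread_key_t g_exit_key;   // pthread_key_create(&g_exit_key, mid_exit_dtor)
 *     static void mid_exit_dtor(void* unused) { (void)unused; mid_mt_thread_exit(); }
 *     // on each thread's first allocation: pthread_setspecific(g_exit_key, (void*)1);
 */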
// ============================================================================
// Statistics (Debug/Profiling)
// ============================================================================
#if MID_ENABLE_STATS
void mid_mt_print_stats(void) {
printf("\n=== Mid Range MT Statistics ===\n");
printf("Total allocations: %lu\n", g_mid_stats.total_allocs);
printf("Total frees: %lu\n", g_mid_stats.total_frees);
printf("Local frees: %lu (%.1f%%)\n",
g_mid_stats.local_frees,
100.0 * g_mid_stats.local_frees / (g_mid_stats.total_frees + 1));
printf("Remote frees: %lu (%.1f%%)\n",
g_mid_stats.remote_frees,
100.0 * g_mid_stats.remote_frees / (g_mid_stats.total_frees + 1));
printf("Registry lookups: %lu\n", g_mid_stats.registry_lookups);
printf("\n");
// Per-segment stats
for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
MidThreadSegment* seg = &g_mid_segments[class_idx];
if (seg->alloc_count > 0) {
printf("Class %d (%zu bytes):\n", class_idx, mid_class_to_size(class_idx));
printf(" Allocations: %lu\n", seg->alloc_count);
printf(" Frees: %lu\n", seg->free_count);
printf(" Refills: %u\n", seg->refill_count);
printf(" Used count: %u / %u\n", seg->used_count, seg->capacity);
}
}
printf("\n");
}
#endif // MID_ENABLE_STATS