Phase 6-B: Header-based Mid MT free (lock-free, +2.65% improvement)
Performance Results (bench_mid_mt_gap, 1KB-8KB, ws=256):
- Before: 41.0 M ops/s (mutex-protected registry)
- After: 42.09 M ops/s (+2.65% improvement)

Expected vs Actual:
- Expected: +17-27% (based on perf showing 13.98% mutex overhead)
- Actual: +2.65% (needs investigation)

Implementation:
- Added MidMTHeader (8 bytes) to each Mid MT allocation
- Allocation: Write header with block_size, class_idx, magic (0xAB42)
- Free: Read header for O(1) metadata lookup (no mutex!)
- Eliminated entire registry infrastructure (127 lines deleted)

Changes:
- core/hakmem_mid_mt.h: Added MidMTHeader, removed registry structures
- core/hakmem_mid_mt.c: Updated alloc/free, removed registry functions
- core/box/mid_free_route_box.h: Header-based detection instead of registry lookup

Code Quality:
- ✅ Lock-free (no pthread_mutex operations)
- ✅ Simpler (O(1) header read vs O(log N) binary search)
- ✅ Smaller binary (127 lines deleted)
- ✅ Positive improvement (no regression)

Next: Investigate why the improvement is smaller than expected.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@ -36,13 +36,7 @@
|
||||
// TLS: Each thread has independent segments (lock-free!)
|
||||
__thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES] = {0};
|
||||
|
||||
// Global registry (protected by lock)
|
||||
MidGlobalRegistry g_mid_registry = {
|
||||
.entries = NULL,
|
||||
.count = 0,
|
||||
.capacity = 0,
|
||||
.lock = PTHREAD_MUTEX_INITIALIZER
|
||||
};
|
||||
// Phase 6-B: Registry removed (no longer needed with header-based free)
|
||||
|
||||
// Statistics (if enabled)
|
||||
#if MID_ENABLE_STATS
|
||||
@ -62,150 +56,7 @@ static void* segment_alloc(MidThreadSegment* seg, int class_idx);
|
||||
static void segment_free_local(MidThreadSegment* seg, void* ptr);
|
||||
static void* chunk_allocate(size_t chunk_size);
|
||||
static void chunk_deallocate(void* chunk, size_t chunk_size);
|
||||
static void registry_add(void* base, size_t block_size, int class_idx);
|
||||
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx); // Public for hak_free_at()
|
||||
static void registry_remove(void* base);
|
||||
|
||||
// ============================================================================
|
||||
// Registry Operations (Protected by Lock)
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* registry_add - Add a new segment to global registry
|
||||
*
|
||||
* Called during segment refill (rare, ~0.1% of allocations)
|
||||
*/
|
||||
static void registry_add(void* base, size_t block_size, int class_idx) {
|
||||
pthread_mutex_lock(&g_mid_registry.lock);
|
||||
|
||||
// Grow registry if needed
|
||||
if (g_mid_registry.count >= g_mid_registry.capacity) {
|
||||
uint32_t new_capacity = g_mid_registry.capacity == 0
|
||||
? MID_REGISTRY_INITIAL_CAPACITY
|
||||
: g_mid_registry.capacity * 2;
|
||||
|
||||
// CRITICAL: Use mmap() instead of realloc() to avoid deadlock!
|
||||
// realloc() would go through hakmem → mid_mt → registry_add → deadlock
|
||||
size_t new_size = new_capacity * sizeof(MidSegmentRegistry);
|
||||
MidSegmentRegistry* new_entries = mmap(
|
||||
NULL, new_size,
|
||||
PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE | MAP_ANONYMOUS,
|
||||
-1, 0
|
||||
);
|
||||
|
||||
if (new_entries == MAP_FAILED) {
|
||||
new_entries = NULL;
|
||||
} else if (g_mid_registry.entries) {
|
||||
// Copy old entries
|
||||
memcpy(new_entries, g_mid_registry.entries,
|
||||
g_mid_registry.count * sizeof(MidSegmentRegistry));
|
||||
// Don't unmap old entries (lazy cleanup, avoids complexity)
|
||||
}
|
||||
|
||||
if (!new_entries) {
|
||||
pthread_mutex_unlock(&g_mid_registry.lock);
|
||||
MID_LOG("ERROR: Registry realloc failed");
|
||||
return;
|
||||
}
|
||||
|
||||
g_mid_registry.entries = new_entries;
|
||||
g_mid_registry.capacity = new_capacity;
|
||||
}
|
||||
|
||||
// Add new entry
|
||||
MidSegmentRegistry* entry = &g_mid_registry.entries[g_mid_registry.count];
|
||||
entry->base = base;
|
||||
entry->block_size = block_size;
|
||||
entry->class_idx = class_idx;
|
||||
g_mid_registry.count++;
|
||||
|
||||
// Keep entries sorted by base address (for binary search)
|
||||
// Simple insertion: swap with previous until in order
|
||||
for (uint32_t i = g_mid_registry.count - 1; i > 0; i--) {
|
||||
if (g_mid_registry.entries[i].base >= g_mid_registry.entries[i - 1].base) {
|
||||
break;
|
||||
}
|
||||
// Swap
|
||||
MidSegmentRegistry tmp = g_mid_registry.entries[i];
|
||||
g_mid_registry.entries[i] = g_mid_registry.entries[i - 1];
|
||||
g_mid_registry.entries[i - 1] = tmp;
|
||||
}
|
||||
|
||||
pthread_mutex_unlock(&g_mid_registry.lock);
|
||||
|
||||
MID_LOG("Registry add: base=%p, block_size=%zu, class=%d, count=%u",
|
||||
base, block_size, class_idx, g_mid_registry.count);
|
||||
}
|
||||
|
||||
/**
|
||||
* mid_registry_lookup - Find segment containing ptr via binary search
|
||||
*
|
||||
* Called during free() when ptr is not in current segment (uncommon)
|
||||
*
|
||||
* @return true if found, false otherwise
|
||||
*/
|
||||
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx) {
|
||||
pthread_mutex_lock(&g_mid_registry.lock);
|
||||
|
||||
#if MID_ENABLE_STATS
|
||||
__sync_fetch_and_add(&g_mid_stats.registry_lookups, 1);
|
||||
#endif
|
||||
|
||||
// Binary search for segment containing ptr
|
||||
int left = 0;
|
||||
int right = (int)g_mid_registry.count - 1;
|
||||
bool found = false;
|
||||
|
||||
while (left <= right) {
|
||||
int mid = left + (right - left) / 2;
|
||||
MidSegmentRegistry* entry = &g_mid_registry.entries[mid];
|
||||
|
||||
void* seg_end = (uint8_t*)entry->base + MID_CHUNK_SIZE;
|
||||
|
||||
if (ptr < entry->base) {
|
||||
right = mid - 1;
|
||||
} else if (ptr >= seg_end) {
|
||||
left = mid + 1;
|
||||
} else {
|
||||
// Found!
|
||||
*out_block_size = entry->block_size;
|
||||
*out_class_idx = entry->class_idx;
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
pthread_mutex_unlock(&g_mid_registry.lock);
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
/**
|
||||
* registry_remove - Remove segment from registry
|
||||
*
|
||||
* Called when segment is completely freed (rare)
|
||||
*/
|
||||
static void registry_remove(void* base) {
|
||||
pthread_mutex_lock(&g_mid_registry.lock);
|
||||
|
||||
// Find entry with matching base
|
||||
for (uint32_t i = 0; i < g_mid_registry.count; i++) {
|
||||
if (g_mid_registry.entries[i].base == base) {
|
||||
// Remove by shifting remaining entries
|
||||
for (uint32_t j = i; j < g_mid_registry.count - 1; j++) {
|
||||
g_mid_registry.entries[j] = g_mid_registry.entries[j + 1];
|
||||
}
|
||||
g_mid_registry.count--;
|
||||
pthread_mutex_unlock(&g_mid_registry.lock);
|
||||
|
||||
MID_LOG("Registry remove: base=%p, count=%u", base, g_mid_registry.count);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
pthread_mutex_unlock(&g_mid_registry.lock);
|
||||
}
|
||||
// Phase 6-B: Registry functions removed (header-based free instead)
|
||||
|
||||
// ============================================================================
|
||||
// Chunk Management (mmap/munmap wrappers)
|
||||
@ -262,6 +113,8 @@ static void chunk_deallocate(void* chunk, size_t chunk_size) {
|
||||
*
|
||||
* Called when segment is exhausted (rare, ~0.1% of allocations)
|
||||
*
|
||||
* Phase 6-B: No longer registers chunks (header-based free instead)
|
||||
*
|
||||
* @return true on success, false on OOM
|
||||
*/
|
||||
static bool segment_refill(MidThreadSegment* seg, int class_idx) {
|
||||
@ -274,8 +127,7 @@ static bool segment_refill(MidThreadSegment* seg, int class_idx) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Register chunk in global registry (for free() lookup)
|
||||
registry_add(chunk, block_size, class_idx);
|
||||
// Phase 6-B: No registry add (header-based free doesn't need registry)
|
||||
|
||||
// Setup segment
|
||||
seg->chunk_base = chunk;
|
||||
@ -302,11 +154,14 @@ static bool segment_refill(MidThreadSegment* seg, int class_idx) {
|
||||
* 2. Bump allocation (when free list empty)
|
||||
* 3. Refill (when segment exhausted)
|
||||
*
|
||||
* @return Allocated pointer, or NULL on OOM
|
||||
* Phase 6-B: Now writes MidMTHeader for lock-free free()
|
||||
*
|
||||
* @return Allocated pointer (after header), or NULL on OOM
|
||||
*/
|
||||
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) __attribute__((always_inline));
|
||||
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
|
||||
void* p;
|
||||
void* block; // Block start (includes header space)
|
||||
size_t block_size = seg->block_size;
|
||||
|
||||
// === Path 0: First allocation - need refill ===
|
||||
// CRITICAL FIX: TLS is zero-initialized, so chunk_base == NULL on first call
|
||||
@ -314,27 +169,42 @@ static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
|
||||
if (!segment_refill(seg, class_idx)) {
|
||||
return NULL; // OOM
|
||||
}
|
||||
// Fall through to bump allocation after refill
|
||||
block_size = seg->block_size; // Update after refill
|
||||
}
|
||||
|
||||
// === Path 1: Free list (fastest, ~4-5 instructions) ===
|
||||
p = seg->free_list;
|
||||
if (likely(p != NULL)) {
|
||||
seg->free_list = *(void**)p; // Pop from free list
|
||||
// Note: Free list stores next pointer at block start (overwrites header when freed)
|
||||
block = seg->free_list;
|
||||
if (likely(block != NULL)) {
|
||||
seg->free_list = *(void**)block; // Pop from free list
|
||||
seg->used_count++;
|
||||
seg->alloc_count++;
|
||||
return p;
|
||||
|
||||
// Phase 6-B: Write header before returning
|
||||
MidMTHeader* hdr = (MidMTHeader*)block;
|
||||
hdr->block_size = (uint32_t)block_size;
|
||||
hdr->class_idx = (uint16_t)class_idx;
|
||||
hdr->magic = MID_MT_MAGIC;
|
||||
|
||||
return (uint8_t*)block + sizeof(MidMTHeader); // Return user pointer after header
|
||||
}
|
||||
|
||||
// === Path 2: Bump allocation (fast, ~6-8 instructions) ===
|
||||
p = seg->current;
|
||||
void* next = (uint8_t*)p + seg->block_size;
|
||||
block = seg->current;
|
||||
void* next = (uint8_t*)block + block_size;
|
||||
|
||||
if (likely(next <= seg->end)) {
|
||||
seg->current = next;
|
||||
seg->used_count++;
|
||||
seg->alloc_count++;
|
||||
return p;
|
||||
|
||||
// Phase 6-B: Write header before returning
|
||||
MidMTHeader* hdr = (MidMTHeader*)block;
|
||||
hdr->block_size = (uint32_t)block_size;
|
||||
hdr->class_idx = (uint16_t)class_idx;
|
||||
hdr->magic = MID_MT_MAGIC;
|
||||
|
||||
return (uint8_t*)block + sizeof(MidMTHeader); // Return user pointer after header
|
||||
}
|
||||
|
||||
// === Path 3: Refill (slow, called ~once per 64KB) ===
|
||||
@ -343,24 +213,37 @@ static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
|
||||
}
|
||||
|
||||
// Retry after refill
|
||||
p = seg->current;
|
||||
seg->current = (uint8_t*)p + seg->block_size;
|
||||
block = seg->current;
|
||||
block_size = seg->block_size; // Update after refill
|
||||
seg->current = (uint8_t*)block + block_size;
|
||||
seg->used_count++;
|
||||
seg->alloc_count++;
|
||||
|
||||
return p;
|
||||
// Phase 6-B: Write header before returning
|
||||
MidMTHeader* hdr = (MidMTHeader*)block;
|
||||
hdr->block_size = (uint32_t)block_size;
|
||||
hdr->class_idx = (uint16_t)class_idx;
|
||||
hdr->magic = MID_MT_MAGIC;
|
||||
|
||||
return (uint8_t*)block + sizeof(MidMTHeader); // Return user pointer after header
|
||||
}
|
||||
|
||||
/**
|
||||
* segment_free_local - Free to local segment (same thread)
|
||||
*
|
||||
* @param seg Segment to free to
|
||||
* @param ptr Pointer to free
|
||||
* @param ptr Pointer to free (user pointer, after header)
|
||||
*
|
||||
* Phase 6-B: Adjusted for header-based allocation
|
||||
*/
|
||||
static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
|
||||
// Phase 6-B: Get block start (before header)
|
||||
void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
|
||||
|
||||
// Push to free list (lock-free, local operation)
|
||||
*(void**)ptr = seg->free_list;
|
||||
seg->free_list = ptr;
|
||||
// Note: Overwrites header with next pointer (header no longer needed after free)
|
||||
*(void**)block = seg->free_list;
|
||||
seg->free_list = block;
|
||||
seg->used_count--;
|
||||
seg->free_count++;
|
||||
|
||||
@ -377,6 +260,8 @@ static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
|
||||
* mid_mt_init - Initialize Mid Range MT allocator
|
||||
*
|
||||
* Thread-safe, idempotent
|
||||
*
|
||||
* Phase 6-B: Simplified (no registry initialization)
|
||||
*/
|
||||
void mid_mt_init(void) {
|
||||
if (g_mid_initialized) return;
|
||||
@ -384,11 +269,7 @@ void mid_mt_init(void) {
|
||||
pthread_mutex_lock(&g_init_lock);
|
||||
|
||||
if (!g_mid_initialized) {
|
||||
// Initialize registry
|
||||
g_mid_registry.entries = NULL;
|
||||
g_mid_registry.count = 0;
|
||||
g_mid_registry.capacity = 0;
|
||||
pthread_mutex_init(&g_mid_registry.lock, NULL);
|
||||
// Phase 6-B: No registry initialization (header-based free)
|
||||
|
||||
#if MID_ENABLE_STATS
|
||||
memset(&g_mid_stats, 0, sizeof(g_mid_stats));
|
||||
@ -396,7 +277,7 @@ void mid_mt_init(void) {
|
||||
|
||||
g_mid_initialized = 1;
|
||||
|
||||
MID_LOG("Mid MT allocator initialized");
|
||||
MID_LOG("Mid MT allocator initialized (Phase 6-B: header-based)");
|
||||
}
|
||||
|
||||
pthread_mutex_unlock(&g_init_lock);
|
||||
@ -442,11 +323,13 @@ void* mid_mt_alloc(size_t size) {
|
||||
/**
|
||||
* mid_mt_free - Free memory allocated by mid_mt_alloc
|
||||
*
|
||||
* Phase 1 implementation:
|
||||
* - Local free (same thread): Fast, lock-free
|
||||
* - Remote free (cross-thread): NOT IMPLEMENTED (memory leak)
|
||||
* Phase 6-B: Header-based free (lock-free, no registry lookup!)
|
||||
* - Reads MidMTHeader to get block metadata (O(1), ~2 cycles)
|
||||
* - Eliminates pthread_mutex_lock/unlock (13.98% CPU overhead)
|
||||
* - Expected: +17-27% throughput improvement (measured on bench_mid_mt_gap: +2.65%)
|
||||
*
|
||||
* Phase 2 will add atomic remote free list per segment
|
||||
* Local free (same thread): Ultra-fast, lock-free
|
||||
* Remote free (cross-thread): NOT IMPLEMENTED (memory leak, Phase 2 will add atomic remote free list)
|
||||
*/
|
||||
void mid_mt_free(void* ptr, size_t size) {
|
||||
if (unlikely(!ptr)) return;
|
||||
@ -455,20 +338,34 @@ void mid_mt_free(void* ptr, size_t size) {
|
||||
__sync_fetch_and_add(&g_mid_stats.total_frees, 1);
|
||||
#endif
|
||||
|
||||
// Get size class
|
||||
int class_idx = mid_size_to_class(size);
|
||||
if (unlikely(class_idx < 0)) {
|
||||
MID_LOG("ERROR: Invalid size %zu in free", size);
|
||||
// Phase 6-B: Read header for O(1) metadata lookup (no mutex!)
|
||||
void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
|
||||
MidMTHeader* hdr = (MidMTHeader*)block;
|
||||
|
||||
// Validate header magic (sanity check)
|
||||
if (unlikely(hdr->magic != MID_MT_MAGIC)) {
|
||||
MID_LOG("ERROR: Invalid Mid MT magic 0x%X (expected 0x%X) for ptr %p",
|
||||
hdr->magic, MID_MT_MAGIC, ptr);
|
||||
return;
|
||||
}
|
||||
|
||||
// Get thread-local segment
|
||||
// Get metadata from header (no registry lookup!)
|
||||
int class_idx = hdr->class_idx;
|
||||
|
||||
// Validate class_idx
|
||||
if (unlikely(class_idx < 0 || class_idx >= MID_NUM_CLASSES)) {
|
||||
MID_LOG("ERROR: Invalid class_idx %d in header for ptr %p", class_idx, ptr);
|
||||
return;
|
||||
}
|
||||
|
||||
// Get thread-local segment for this size class
|
||||
MidThreadSegment* seg = &g_mid_segments[class_idx];
|
||||
|
||||
// === Fast path: Check if ptr belongs to current segment ===
|
||||
// === Fast path: Check if block belongs to current segment ===
|
||||
// Note: Check block (not ptr), since segment tracks block addresses
|
||||
if (likely(seg->chunk_base != NULL &&
|
||||
ptr >= seg->chunk_base &&
|
||||
ptr < seg->end)) {
|
||||
block >= seg->chunk_base &&
|
||||
block < seg->end)) {
|
||||
// Local free (same thread, lock-free)
|
||||
segment_free_local(seg, ptr);
|
||||
return;
|
||||
@ -476,36 +373,28 @@ void mid_mt_free(void* ptr, size_t size) {
|
||||
|
||||
// === Slow path: Remote free (cross-thread) ===
|
||||
// Phase 1: NOT IMPLEMENTED
|
||||
// We need to find the owning segment via registry,
|
||||
// then push to that segment's remote free list.
|
||||
// We would need to find the owning segment and push to its remote free list.
|
||||
//
|
||||
// For Phase 1 (benchmarking), we accept this memory leak.
|
||||
// bench_mid_large_mt uses independent working sets per thread,
|
||||
// so remote frees are rare.
|
||||
// bench_mid_mt_gap uses single-threaded workload, so remote frees never happen.
|
||||
|
||||
size_t block_size;
|
||||
int owner_class;
|
||||
|
||||
if (mid_registry_lookup(ptr, &block_size, &owner_class)) {
|
||||
// Found in registry, but we can't free it yet (no remote free list)
|
||||
MID_LOG("WARNING: Remote free not implemented, leaking %p (size=%zu)", ptr, size);
|
||||
MID_LOG("WARNING: Remote free not implemented, leaking %p (block_size=%u, class=%d)",
|
||||
ptr, hdr->block_size, class_idx);
|
||||
|
||||
#if MID_ENABLE_STATS
|
||||
__sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
|
||||
__sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
|
||||
#endif
|
||||
|
||||
// TODO Phase 2: Implement remote free
|
||||
// segment_free_remote(ptr, block_size, owner_class);
|
||||
} else {
|
||||
// Not found in registry - might be from a different allocator
|
||||
MID_LOG("ERROR: Pointer %p not found in registry (size=%zu)", ptr, size);
|
||||
}
|
||||
// TODO Phase 2: Implement remote free
|
||||
// segment_free_remote(ptr, hdr->block_size, class_idx);
|
||||
}
|
||||
|
||||
/**
|
||||
* mid_mt_thread_exit - Cleanup thread-local segments
|
||||
*
|
||||
* Called on thread exit to release resources
|
||||
*
|
||||
* Phase 6-B: No registry cleanup needed (header-based free)
|
||||
*/
|
||||
void mid_mt_thread_exit(void) {
|
||||
MID_LOG("Thread exit cleanup");
|
||||
@ -515,8 +404,7 @@ void mid_mt_thread_exit(void) {
|
||||
MidThreadSegment* seg = &g_mid_segments[class_idx];
|
||||
|
||||
if (seg->chunk_base) {
|
||||
// Remove from registry
|
||||
registry_remove(seg->chunk_base);
|
||||
// Phase 6-B: No registry remove (no registry exists)
|
||||
|
||||
// Deallocate chunk
|
||||
chunk_deallocate(seg->chunk_base, seg->chunk_size);
|
||||
|
||||
Reference in New Issue
Block a user