Phase 6-B: Header-based Mid MT free (lock-free, +2.65% improvement)

Performance Results (bench_mid_mt_gap, 1KB-8KB, ws=256):
- Before: 41.0 M ops/s (mutex-protected registry)
- After:  42.09 M ops/s (+2.65% improvement)

Expected vs Actual:
- Expected: +17-27% (based on perf showing 13.98% mutex overhead)
- Actual:   +2.65% (needs investigation)

Implementation:
- Added MidMTHeader (8 bytes) to each Mid MT allocation
- Allocation: Write header with block_size, class_idx, magic (0xAB42)
- Free: Read header for O(1) metadata lookup (no mutex!)
- Eliminated entire registry infrastructure (127 lines deleted)

Changes:
- core/hakmem_mid_mt.h: Added MidMTHeader, removed registry structures
- core/hakmem_mid_mt.c: Updated alloc/free, removed registry functions
- core/box/mid_free_route_box.h: Header-based detection instead of registry lookup

Code Quality:
- Lock-free (no pthread_mutex operations)
- Simpler (O(1) header read vs O(log N) binary search)
- Smaller code footprint (127 lines deleted)
- Positive improvement (no regression)

Next: Investigate why improvement is smaller than expected

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-11-29 15:45:29 +09:00
parent c04cccf723
commit c19bb6a3bc
3 changed files with 143 additions and 259 deletions

View File

@ -36,13 +36,7 @@
// TLS: Each thread has independent segments (lock-free!)
__thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES] = {0};
// Global registry (protected by lock)
// Starts empty: no backing array (entries == NULL) and zero count/capacity,
// so the first registry_add() takes the grow path and allocates the array.
// Every access to entries/count/capacity in this file is made while holding
// .lock, which is statically initialized here.
MidGlobalRegistry g_mid_registry = {
.entries = NULL,
.count = 0,
.capacity = 0,
.lock = PTHREAD_MUTEX_INITIALIZER
};
// Phase 6-B: Registry removed (no longer needed with header-based free)
// Statistics (if enabled)
#if MID_ENABLE_STATS
@ -62,150 +56,7 @@ static void* segment_alloc(MidThreadSegment* seg, int class_idx);
static void segment_free_local(MidThreadSegment* seg, void* ptr);
static void* chunk_allocate(size_t chunk_size);
static void chunk_deallocate(void* chunk, size_t chunk_size);
static void registry_add(void* base, size_t block_size, int class_idx);
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx); // Public for hak_free_at()
static void registry_remove(void* base);
// ============================================================================
// Registry Operations (Protected by Lock)
// ============================================================================
/**
 * registry_add - Add a new segment to the global registry
 *
 * Called during segment refill (rare, ~0.1% of allocations).
 *
 * Takes g_mid_registry.lock, grows the entry array if it is full, appends
 * the new entry and then bubbles it towards the front until the array is
 * again sorted by base address (mid_registry_lookup() binary-searches it).
 *
 * @param base        Start address of the chunk being registered
 * @param block_size  Block size served from this chunk
 * @param class_idx   Size class index of this chunk
 *
 * On mmap() failure the entry is NOT added; the failure is only logged.
 */
static void registry_add(void* base, size_t block_size, int class_idx) {
    pthread_mutex_lock(&g_mid_registry.lock);

    // Grow registry if needed
    if (g_mid_registry.count >= g_mid_registry.capacity) {
        uint32_t new_capacity = g_mid_registry.capacity == 0
            ? MID_REGISTRY_INITIAL_CAPACITY
            : g_mid_registry.capacity * 2;

        // CRITICAL: Use mmap() instead of realloc() to avoid deadlock!
        // realloc() would go through hakmem → mid_mt → registry_add → deadlock
        size_t new_size = (size_t)new_capacity * sizeof(MidSegmentRegistry);
        MidSegmentRegistry* new_entries = mmap(
            NULL, new_size,
            PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS,
            -1, 0
        );

        if (new_entries == MAP_FAILED) {
            pthread_mutex_unlock(&g_mid_registry.lock);
            MID_LOG("ERROR: Registry realloc failed");
            return;
        }

        if (g_mid_registry.entries) {
            // Copy old entries
            memcpy(new_entries, g_mid_registry.entries,
                   g_mid_registry.count * sizeof(MidSegmentRegistry));
            // Don't unmap old entries (lazy cleanup, avoids complexity)
        }

        g_mid_registry.entries = new_entries;
        g_mid_registry.capacity = new_capacity;
    }

    // Add new entry at the tail
    MidSegmentRegistry* entry = &g_mid_registry.entries[g_mid_registry.count];
    entry->base = base;
    entry->block_size = block_size;
    entry->class_idx = class_idx;
    g_mid_registry.count++;

    // Keep entries sorted by base address (for binary search)
    // Simple insertion: swap with previous until in order
    for (uint32_t i = g_mid_registry.count - 1; i > 0; i--) {
        if (g_mid_registry.entries[i].base >= g_mid_registry.entries[i - 1].base) {
            break;
        }

        MidSegmentRegistry tmp = g_mid_registry.entries[i];
        g_mid_registry.entries[i] = g_mid_registry.entries[i - 1];
        g_mid_registry.entries[i - 1] = tmp;
    }

    // Snapshot the count while still holding the lock: reading
    // g_mid_registry.count after unlock would race with other threads
    // adding or removing entries.
    uint32_t count_now = g_mid_registry.count;

    pthread_mutex_unlock(&g_mid_registry.lock);

    (void)count_now;  // Avoid unused-variable warning if MID_LOG compiles out
    MID_LOG("Registry add: base=%p, block_size=%zu, class=%d, count=%u",
            base, block_size, class_idx, count_now);
}
/**
 * mid_registry_lookup - Locate the registered segment that contains ptr
 *
 * Binary-searches the sorted registry under g_mid_registry.lock. An entry
 * matches when ptr lies in [entry->base, entry->base + MID_CHUNK_SIZE).
 *
 * Called during free() when ptr is not in current segment (uncommon).
 *
 * @param ptr             Address to look up
 * @param out_block_size  Receives the entry's block size (written only on a hit)
 * @param out_class_idx   Receives the entry's class index (written only on a hit)
 * @return true if found, false otherwise
 */
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx) {
    bool hit = false;

    pthread_mutex_lock(&g_mid_registry.lock);

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.registry_lookups, 1);
#endif

    int lo = 0;
    int hi = (int)g_mid_registry.count - 1;

    while (!hit && lo <= hi) {
        int probe = lo + (hi - lo) / 2;
        MidSegmentRegistry* cand = &g_mid_registry.entries[probe];

        // ptr lies below this segment: continue in the lower half
        if (ptr < cand->base) {
            hi = probe - 1;
            continue;
        }

        // ptr lies at or past the end of this segment: continue in the upper half
        void* cand_end = (uint8_t*)cand->base + MID_CHUNK_SIZE;
        if (ptr >= cand_end) {
            lo = probe + 1;
            continue;
        }

        // ptr is inside [base, base + MID_CHUNK_SIZE): report the metadata
        *out_block_size = cand->block_size;
        *out_class_idx = cand->class_idx;
        hit = true;
    }

    pthread_mutex_unlock(&g_mid_registry.lock);
    return hit;
}
/**
 * registry_remove - Remove segment from registry
 *
 * Called when segment is completely freed (rare).
 *
 * Scans the registry under g_mid_registry.lock for the entry whose base
 * equals @base and closes the gap by shifting the following entries down one
 * slot, which preserves the sorted order. Does nothing if no entry matches.
 *
 * @param base  Start address of the chunk to unregister
 */
static void registry_remove(void* base) {
    pthread_mutex_lock(&g_mid_registry.lock);

    // Find entry with matching base
    for (uint32_t i = 0; i < g_mid_registry.count; i++) {
        if (g_mid_registry.entries[i].base != base) {
            continue;
        }

        // Remove by shifting remaining entries
        for (uint32_t j = i; j < g_mid_registry.count - 1; j++) {
            g_mid_registry.entries[j] = g_mid_registry.entries[j + 1];
        }
        g_mid_registry.count--;

        // Snapshot the count while still holding the lock: reading
        // g_mid_registry.count after unlock would race with other threads
        // adding or removing entries.
        uint32_t count_now = g_mid_registry.count;

        pthread_mutex_unlock(&g_mid_registry.lock);

        (void)count_now;  // Avoid unused-variable warning if MID_LOG compiles out
        MID_LOG("Registry remove: base=%p, count=%u", base, count_now);
        return;
    }

    pthread_mutex_unlock(&g_mid_registry.lock);
}
// Phase 6-B: Registry functions removed (header-based free instead)
// ============================================================================
// Chunk Management (mmap/munmap wrappers)
@ -262,6 +113,8 @@ static void chunk_deallocate(void* chunk, size_t chunk_size) {
*
* Called when segment is exhausted (rare, ~0.1% of allocations)
*
* Phase 6-B: No longer registers chunks (header-based free instead)
*
* @return true on success, false on OOM
*/
static bool segment_refill(MidThreadSegment* seg, int class_idx) {
@ -274,8 +127,7 @@ static bool segment_refill(MidThreadSegment* seg, int class_idx) {
return false;
}
// Register chunk in global registry (for free() lookup)
registry_add(chunk, block_size, class_idx);
// Phase 6-B: No registry add (header-based free doesn't need registry)
// Setup segment
seg->chunk_base = chunk;
@ -302,11 +154,14 @@ static bool segment_refill(MidThreadSegment* seg, int class_idx) {
* 2. Bump allocation (when free list empty)
* 3. Refill (when segment exhausted)
*
* @return Allocated pointer, or NULL on OOM
* Phase 6-B: Now writes MidMTHeader for lock-free free()
*
* @return Allocated pointer (after header), or NULL on OOM
*/
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) __attribute__((always_inline));
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
void* p;
void* block; // Block start (includes header space)
size_t block_size = seg->block_size;
// === Path 0: First allocation - need refill ===
// CRITICAL FIX: TLS is zero-initialized, so chunk_base == NULL on first call
@ -314,27 +169,42 @@ static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
if (!segment_refill(seg, class_idx)) {
return NULL; // OOM
}
// Fall through to bump allocation after refill
block_size = seg->block_size; // Update after refill
}
// === Path 1: Free list (fastest, ~4-5 instructions) ===
p = seg->free_list;
if (likely(p != NULL)) {
seg->free_list = *(void**)p; // Pop from free list
// Note: Free list stores next pointer at block start (overwrites header when freed)
block = seg->free_list;
if (likely(block != NULL)) {
seg->free_list = *(void**)block; // Pop from free list
seg->used_count++;
seg->alloc_count++;
return p;
// Phase 6-B: Write header before returning
MidMTHeader* hdr = (MidMTHeader*)block;
hdr->block_size = (uint32_t)block_size;
hdr->class_idx = (uint16_t)class_idx;
hdr->magic = MID_MT_MAGIC;
return (uint8_t*)block + sizeof(MidMTHeader); // Return user pointer after header
}
// === Path 2: Bump allocation (fast, ~6-8 instructions) ===
p = seg->current;
void* next = (uint8_t*)p + seg->block_size;
block = seg->current;
void* next = (uint8_t*)block + block_size;
if (likely(next <= seg->end)) {
seg->current = next;
seg->used_count++;
seg->alloc_count++;
return p;
// Phase 6-B: Write header before returning
MidMTHeader* hdr = (MidMTHeader*)block;
hdr->block_size = (uint32_t)block_size;
hdr->class_idx = (uint16_t)class_idx;
hdr->magic = MID_MT_MAGIC;
return (uint8_t*)block + sizeof(MidMTHeader); // Return user pointer after header
}
// === Path 3: Refill (slow, called ~once per 64KB) ===
@ -343,24 +213,37 @@ static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
}
// Retry after refill
p = seg->current;
seg->current = (uint8_t*)p + seg->block_size;
block = seg->current;
block_size = seg->block_size; // Update after refill
seg->current = (uint8_t*)block + block_size;
seg->used_count++;
seg->alloc_count++;
return p;
// Phase 6-B: Write header before returning
MidMTHeader* hdr = (MidMTHeader*)block;
hdr->block_size = (uint32_t)block_size;
hdr->class_idx = (uint16_t)class_idx;
hdr->magic = MID_MT_MAGIC;
return (uint8_t*)block + sizeof(MidMTHeader); // Return user pointer after header
}
/**
* segment_free_local - Free to local segment (same thread)
*
* @param seg Segment to free to
* @param ptr Pointer to free
* @param ptr Pointer to free (user pointer, after header)
*
* Phase 6-B: Adjusted for header-based allocation
*/
static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
// Phase 6-B: Get block start (before header)
void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
// Push to free list (lock-free, local operation)
*(void**)ptr = seg->free_list;
seg->free_list = ptr;
// Note: Overwrites header with next pointer (header no longer needed after free)
*(void**)block = seg->free_list;
seg->free_list = block;
seg->used_count--;
seg->free_count++;
@ -377,6 +260,8 @@ static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
* mid_mt_init - Initialize Mid Range MT allocator
*
* Thread-safe, idempotent
*
* Phase 6-B: Simplified (no registry initialization)
*/
void mid_mt_init(void) {
if (g_mid_initialized) return;
@ -384,11 +269,7 @@ void mid_mt_init(void) {
pthread_mutex_lock(&g_init_lock);
if (!g_mid_initialized) {
// Initialize registry
g_mid_registry.entries = NULL;
g_mid_registry.count = 0;
g_mid_registry.capacity = 0;
pthread_mutex_init(&g_mid_registry.lock, NULL);
// Phase 6-B: No registry initialization (header-based free)
#if MID_ENABLE_STATS
memset(&g_mid_stats, 0, sizeof(g_mid_stats));
@ -396,7 +277,7 @@ void mid_mt_init(void) {
g_mid_initialized = 1;
MID_LOG("Mid MT allocator initialized");
MID_LOG("Mid MT allocator initialized (Phase 6-B: header-based)");
}
pthread_mutex_unlock(&g_init_lock);
@ -442,11 +323,13 @@ void* mid_mt_alloc(size_t size) {
/**
* mid_mt_free - Free memory allocated by mid_mt_alloc
*
* Phase 1 implementation:
* - Local free (same thread): Fast, lock-free
* - Remote free (cross-thread): NOT IMPLEMENTED (memory leak)
* Phase 6-B: Header-based free (lock-free, no registry lookup!)
* - Reads MidMTHeader to get block metadata (O(1), ~2 cycles)
* - Eliminates pthread_mutex_lock/unlock (13.98% CPU overhead)
* - Expected: +17-27% throughput improvement
*
* Phase 2 will add atomic remote free list per segment
* Local free (same thread): Ultra-fast, lock-free
* Remote free (cross-thread): NOT IMPLEMENTED (memory leak, Phase 2 will add atomic remote free list)
*/
void mid_mt_free(void* ptr, size_t size) {
if (unlikely(!ptr)) return;
@ -455,20 +338,34 @@ void mid_mt_free(void* ptr, size_t size) {
__sync_fetch_and_add(&g_mid_stats.total_frees, 1);
#endif
// Get size class
int class_idx = mid_size_to_class(size);
if (unlikely(class_idx < 0)) {
MID_LOG("ERROR: Invalid size %zu in free", size);
// Phase 6-B: Read header for O(1) metadata lookup (no mutex!)
void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
MidMTHeader* hdr = (MidMTHeader*)block;
// Validate header magic (sanity check)
if (unlikely(hdr->magic != MID_MT_MAGIC)) {
MID_LOG("ERROR: Invalid Mid MT magic 0x%X (expected 0x%X) for ptr %p",
hdr->magic, MID_MT_MAGIC, ptr);
return;
}
// Get thread-local segment
// Get metadata from header (no registry lookup!)
int class_idx = hdr->class_idx;
// Validate class_idx
if (unlikely(class_idx < 0 || class_idx >= MID_NUM_CLASSES)) {
MID_LOG("ERROR: Invalid class_idx %d in header for ptr %p", class_idx, ptr);
return;
}
// Get thread-local segment for this size class
MidThreadSegment* seg = &g_mid_segments[class_idx];
// === Fast path: Check if ptr belongs to current segment ===
// === Fast path: Check if block belongs to current segment ===
// Note: Check block (not ptr), since segment tracks block addresses
if (likely(seg->chunk_base != NULL &&
ptr >= seg->chunk_base &&
ptr < seg->end)) {
block >= seg->chunk_base &&
block < seg->end)) {
// Local free (same thread, lock-free)
segment_free_local(seg, ptr);
return;
@ -476,36 +373,28 @@ void mid_mt_free(void* ptr, size_t size) {
// === Slow path: Remote free (cross-thread) ===
// Phase 1: NOT IMPLEMENTED
// We need to find the owning segment via registry,
// then push to that segment's remote free list.
// We would need to find the owning segment and push to its remote free list.
//
// For Phase 1 (benchmarking), we accept this memory leak.
// bench_mid_large_mt uses independent working sets per thread,
// so remote frees are rare.
// bench_mid_mt_gap uses single-threaded workload, so remote frees never happen.
size_t block_size;
int owner_class;
if (mid_registry_lookup(ptr, &block_size, &owner_class)) {
// Found in registry, but we can't free it yet (no remote free list)
MID_LOG("WARNING: Remote free not implemented, leaking %p (size=%zu)", ptr, size);
MID_LOG("WARNING: Remote free not implemented, leaking %p (block_size=%u, class=%d)",
ptr, hdr->block_size, class_idx);
#if MID_ENABLE_STATS
__sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
__sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
#endif
// TODO Phase 2: Implement remote free
// segment_free_remote(ptr, block_size, owner_class);
} else {
// Not found in registry - might be from a different allocator
MID_LOG("ERROR: Pointer %p not found in registry (size=%zu)", ptr, size);
}
// TODO Phase 2: Implement remote free
// segment_free_remote(ptr, hdr->block_size, class_idx);
}
/**
* mid_mt_thread_exit - Cleanup thread-local segments
*
* Called on thread exit to release resources
*
* Phase 6-B: No registry cleanup needed (header-based free)
*/
void mid_mt_thread_exit(void) {
MID_LOG("Thread exit cleanup");
@ -515,8 +404,7 @@ void mid_mt_thread_exit(void) {
MidThreadSegment* seg = &g_mid_segments[class_idx];
if (seg->chunk_base) {
// Remove from registry
registry_remove(seg->chunk_base);
// Phase 6-B: No registry remove (no registry exists)
// Deallocate chunk
chunk_deallocate(seg->chunk_base, seg->chunk_size);