hakmem/core/hakmem_mid_mt.c

/**
* hakmem_mid_mt.c
*
* Mid Range Multi-threaded Allocator Implementation (8-32KB)
* mimalloc-style per-thread segment for optimal MT performance
*
* Design:
* - Per-thread segments (TLS) for lock-free allocation
* - Global registry for segment lookup during free()
* - 64KB chunks with bump + free list allocation
* - Phase 1: Local free only (remote free = memory leak, acceptable for benchmarking)
* - Phase 2: Will add atomic remote free list
*/
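/*
 * Illustrative usage sketch (not part of this translation unit; the real call
 * sites live elsewhere in hakmem). It uses only the public API declared in
 * hakmem_mid_mt.h:
 *
 *     mid_mt_init();                       // idempotent, thread-safe
 *     void* p = mid_mt_alloc(16 * 1024);   // 16KB request, served from a TLS segment
 *     if (p) {
 *         memset(p, 0, 16 * 1024);
 *         mid_mt_free(p, 16 * 1024);       // size-keyed free, no header lookup
 *     }
 *     mid_mt_thread_exit();                // release this thread's current chunks
 */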
#include "hakmem_mid_mt.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <assert.h>
#include <stdatomic.h>
// Use likely/unlikely hints for branch prediction
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
// ============================================================================
// Global and TLS Variables
// ============================================================================
// TLS: Each thread has independent segments (lock-free!)
__thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES] = {0};
// Global registry (protected by lock)
MidGlobalRegistry g_mid_registry = {
.entries = NULL,
.count = 0,
.capacity = 0,
.lock = PTHREAD_MUTEX_INITIALIZER
};
// Statistics (if enabled)
#if MID_ENABLE_STATS
MidStats g_mid_stats = {0};
#endif
// Initialization flag
static volatile int g_mid_initialized = 0;
static pthread_mutex_t g_init_lock = PTHREAD_MUTEX_INITIALIZER;
// ============================================================================
// Forward Declarations
// ============================================================================
static bool segment_refill(MidThreadSegment* seg, int class_idx);
static void* segment_alloc(MidThreadSegment* seg, int class_idx);
static void segment_free_local(MidThreadSegment* seg, void* ptr);
static void* chunk_allocate(size_t chunk_size);
static void chunk_deallocate(void* chunk, size_t chunk_size);
static void registry_add(void* base, size_t block_size, int class_idx);
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx); // Public for hak_free_at()
static void registry_remove(void* base);
// ============================================================================
// Registry Operations (Protected by Lock)
// ============================================================================
/**
* registry_add - Add a new segment to global registry
*
* Called during segment refill (rare, ~0.1% of allocations)
*/
static void registry_add(void* base, size_t block_size, int class_idx) {
pthread_mutex_lock(&g_mid_registry.lock);
// Grow registry if needed
if (g_mid_registry.count >= g_mid_registry.capacity) {
uint32_t new_capacity = g_mid_registry.capacity == 0
? MID_REGISTRY_INITIAL_CAPACITY
: g_mid_registry.capacity * 2;
// CRITICAL: Use mmap() instead of realloc() to avoid deadlock!
// realloc() would go through hakmem → mid_mt → registry_add → deadlock
size_t new_size = new_capacity * sizeof(MidSegmentRegistry);
MidSegmentRegistry* new_entries = mmap(
NULL, new_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1, 0
);
if (new_entries == MAP_FAILED) {
new_entries = NULL;
} else if (g_mid_registry.entries) {
// Copy old entries
memcpy(new_entries, g_mid_registry.entries,
g_mid_registry.count * sizeof(MidSegmentRegistry));
// Don't unmap old entries (lazy cleanup, avoids complexity)
}
if (!new_entries) {
pthread_mutex_unlock(&g_mid_registry.lock);
MID_LOG("ERROR: Registry realloc failed");
return;
}
g_mid_registry.entries = new_entries;
g_mid_registry.capacity = new_capacity;
}
// Add new entry
MidSegmentRegistry* entry = &g_mid_registry.entries[g_mid_registry.count];
entry->base = base;
entry->block_size = block_size;
entry->class_idx = class_idx;
g_mid_registry.count++;
// Keep entries sorted by base address (for binary search)
// Simple insertion: swap with previous until in order
for (uint32_t i = g_mid_registry.count - 1; i > 0; i--) {
if (g_mid_registry.entries[i].base >= g_mid_registry.entries[i - 1].base) {
break;
}
// Swap
MidSegmentRegistry tmp = g_mid_registry.entries[i];
g_mid_registry.entries[i] = g_mid_registry.entries[i - 1];
g_mid_registry.entries[i - 1] = tmp;
}
pthread_mutex_unlock(&g_mid_registry.lock);
MID_LOG("Registry add: base=%p, block_size=%zu, class=%d, count=%u",
base, block_size, class_idx, g_mid_registry.count);
}
/**
* mid_registry_lookup - Find segment containing ptr via binary search
*
* Called during free() when ptr is not in current segment (uncommon)
*
* @return true if found, false otherwise
*/
bool mid_registry_lookup(void* ptr, size_t* out_block_size, int* out_class_idx) {
pthread_mutex_lock(&g_mid_registry.lock);
#if MID_ENABLE_STATS
__sync_fetch_and_add(&g_mid_stats.registry_lookups, 1);
#endif
// Binary search for segment containing ptr
int left = 0;
int right = (int)g_mid_registry.count - 1;
bool found = false;
while (left <= right) {
int mid = left + (right - left) / 2;
MidSegmentRegistry* entry = &g_mid_registry.entries[mid];
void* seg_end = (uint8_t*)entry->base + MID_CHUNK_SIZE;
if (ptr < entry->base) {
right = mid - 1;
} else if (ptr >= seg_end) {
left = mid + 1;
} else {
// Found!
*out_block_size = entry->block_size;
*out_class_idx = entry->class_idx;
found = true;
break;
}
}
pthread_mutex_unlock(&g_mid_registry.lock);
return found;
}
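/*
 * Worked example (addresses are invented for illustration): with three chunks
 * registered and kept sorted by base,
 *
 *     entries = { 0x7f0000000000, 0x7f0000010000, 0x7f0000020000 }  // 64KB each
 *
 * a lookup of ptr = 0x7f0000012a00 probes the middle entry first, finds
 * base <= ptr < base + MID_CHUNK_SIZE, and returns that entry's block_size and
 * class_idx. Pointers below the first base or past the last chunk's end exhaust
 * the loop and return false.
 */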
/**
* registry_remove - Remove segment from registry
*
* Called when segment is completely freed (rare)
*/
static void registry_remove(void* base) {
pthread_mutex_lock(&g_mid_registry.lock);
// Find entry with matching base
for (uint32_t i = 0; i < g_mid_registry.count; i++) {
if (g_mid_registry.entries[i].base == base) {
// Remove by shifting remaining entries
for (uint32_t j = i; j < g_mid_registry.count - 1; j++) {
g_mid_registry.entries[j] = g_mid_registry.entries[j + 1];
}
g_mid_registry.count--;
pthread_mutex_unlock(&g_mid_registry.lock);
MID_LOG("Registry remove: base=%p, count=%u", base, g_mid_registry.count);
return;
}
}
pthread_mutex_unlock(&g_mid_registry.lock);
}
// ============================================================================
// Chunk Management (mmap/munmap wrappers)
// ============================================================================
/**
* chunk_allocate - Allocate a new chunk via mmap
*
* @param chunk_size Size of chunk (typically 64KB)
* @return Chunk base address, or NULL on failure
*/
static void* chunk_allocate(size_t chunk_size) {
void* chunk = mmap(
NULL,
chunk_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0
);
if (chunk == MAP_FAILED) {
MID_LOG("ERROR: mmap failed for chunk_size=%zu", chunk_size);
return NULL;
}
MID_LOG("Chunk allocated: %p, size=%zu", chunk, chunk_size);
return chunk;
}
/**
* chunk_deallocate - Free chunk via munmap
*
* @param chunk Chunk base address
* @param chunk_size Size of chunk
*/
static void chunk_deallocate(void* chunk, size_t chunk_size) {
if (!chunk) return;
int ret = munmap(chunk, chunk_size);
if (ret != 0) {
MID_LOG("ERROR: munmap failed for chunk=%p, size=%zu", chunk, chunk_size);
} else {
MID_LOG("Chunk deallocated: %p, size=%zu", chunk, chunk_size);
}
}
// ============================================================================
// Segment Operations
// ============================================================================
/**
 * segment_refill - Allocate new chunk and setup segment
 *
 * Called when segment is exhausted (rare, ~0.1% of allocations)
 *
 * Phase 1 note: if the segment already had a chunk, the old chunk stays mapped
 * and registered; outstanding blocks in it remain valid, but it is never
 * reclaimed (mid_mt_thread_exit() frees only the segment's current chunk).
 *
 * @return true on success, false on OOM
 */
static bool segment_refill(MidThreadSegment* seg, int class_idx) {
size_t block_size = mid_class_to_size(class_idx);
size_t chunk_size = MID_CHUNK_SIZE;
// Allocate new chunk via mmap
void* chunk = chunk_allocate(chunk_size);
if (!chunk) {
return false;
}
// Register chunk in global registry (for free() lookup)
registry_add(chunk, block_size, class_idx);
// Setup segment
seg->chunk_base = chunk;
seg->chunk_size = chunk_size;
seg->block_size = block_size;
seg->current = chunk;
seg->end = (uint8_t*)chunk + chunk_size;
seg->capacity = chunk_size / block_size;
seg->refill_count++;
MID_LOG("Segment refill: class=%d, block_size=%zu, capacity=%u, chunk=%p",
class_idx, block_size, seg->capacity, chunk);
return true;
}
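/*
 * Sizing sketch (assuming the 64KB MID_CHUNK_SIZE from the design notes and
 * 8/16/32KB size classes): one refill yields capacity = chunk_size / block_size
 * blocks, i.e. 8 blocks for the 8KB class, 4 for 16KB, 2 for 32KB. Bump
 * allocation then walks [chunk, chunk + 64KB) in block_size steps until the
 * next refill.
 */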
/**
* segment_alloc - Allocate from segment (fast path)
*
* PERFORMANCE: Force inline for maximum speed
*
* Fast path priority:
* 1. Free list (most common, ~90-95% hit rate)
* 2. Bump allocation (when free list empty)
* 3. Refill (when segment exhausted)
*
* @return Allocated pointer, or NULL on OOM
*/
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) __attribute__((always_inline));
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
void* p;
// === Path 0: First allocation - need refill ===
// CRITICAL FIX: TLS is zero-initialized, so chunk_base == NULL on first call
if (unlikely(seg->chunk_base == NULL)) {
if (!segment_refill(seg, class_idx)) {
return NULL; // OOM
}
// Fall through to bump allocation after refill
}
// === Path 1: Free list (fastest, ~4-5 instructions) ===
p = seg->free_list;
if (likely(p != NULL)) {
seg->free_list = *(void**)p; // Pop from free list
seg->used_count++;
seg->alloc_count++;
return p;
}
// === Path 2: Bump allocation (fast, ~6-8 instructions) ===
p = seg->current;
void* next = (uint8_t*)p + seg->block_size;
if (likely(next <= seg->end)) {
seg->current = next;
seg->used_count++;
seg->alloc_count++;
return p;
}
// === Path 3: Refill (slow, called ~once per 64KB) ===
if (!segment_refill(seg, class_idx)) {
return NULL; // OOM
}
// Retry after refill
p = seg->current;
seg->current = (uint8_t*)p + seg->block_size;
seg->used_count++;
seg->alloc_count++;
return p;
}
/**
* segment_free_local - Free to local segment (same thread)
*
* @param seg Segment to free to
* @param ptr Pointer to free
*/
static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
// Push to free list (lock-free, local operation)
*(void**)ptr = seg->free_list;
seg->free_list = ptr;
seg->used_count--;
seg->free_count++;
#if MID_ENABLE_STATS
__sync_fetch_and_add(&g_mid_stats.local_frees, 1);
#endif
}
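/*
 * Layout note: the free list is intrusive. Each freed block's first word is
 * reused as the "next" pointer, so no side allocation is needed:
 *
 *     // after freeing A then B (list previously empty):
 *     // seg->free_list -> B -> A -> NULL
 *     // i.e. *(void**)B == A and *(void**)A == NULL
 *
 * Push/pop touches only thread-local state, hence no atomics or locks here.
 */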
// ============================================================================
// Public API
// ============================================================================
/**
* mid_mt_init - Initialize Mid Range MT allocator
*
* Thread-safe, idempotent
*/
void mid_mt_init(void) {
if (g_mid_initialized) return;
pthread_mutex_lock(&g_init_lock);
if (!g_mid_initialized) {
// Initialize registry
g_mid_registry.entries = NULL;
g_mid_registry.count = 0;
g_mid_registry.capacity = 0;
pthread_mutex_init(&g_mid_registry.lock, NULL);
#if MID_ENABLE_STATS
memset(&g_mid_stats, 0, sizeof(g_mid_stats));
#endif
g_mid_initialized = 1;
MID_LOG("Mid MT allocator initialized");
}
pthread_mutex_unlock(&g_init_lock);
}
/**
 * mid_mt_alloc - Allocate memory from Mid Range pool (8-32KB)
 *
 * Thread-safe, lock-free (uses TLS)
 *
 * Phase 16: the lower bound is dynamic (mid_get_min_size() tracks Tiny's
 * maximum size, configurable via HAKMEM_TINY_MAX_CLASS), so the Tiny and Mid
 * ranges stay contiguous with no size gap.
 */
void* mid_mt_alloc(size_t size) {
// Validate size range (Phase 16: dynamic min size based on Tiny's max)
if (unlikely(size < mid_get_min_size() || size > MID_MAX_SIZE)) {
return NULL;
}
// Initialize if needed (thread-safe)
if (unlikely(!g_mid_initialized)) {
mid_mt_init();
}
// Get size class
int class_idx = mid_size_to_class(size);
if (unlikely(class_idx < 0)) {
return NULL;
}
// Get thread-local segment
MidThreadSegment* seg = &g_mid_segments[class_idx];
// Allocate from segment (fast path)
void* p = segment_alloc(seg, class_idx);
#if MID_ENABLE_STATS
if (p) {
__sync_fetch_and_add(&g_mid_stats.total_allocs, 1);
}
#endif
return p;
}
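/*
 * Routing sketch (simplified; the actual size dispatch lives in the front-end
 * allocation path, not in this file, and tiny_alloc/large_alloc below are
 * hypothetical names): the caller is expected to send sizes up to
 * tiny_get_max_size() to the Tiny allocator and the next range here, which is
 * why mid_get_min_size() == tiny_get_max_size() + 1 keeps the ranges contiguous:
 *
 *     if (n <= tiny_get_max_size())  p = tiny_alloc(n);    // hypothetical
 *     else if (n <= MID_MAX_SIZE)    p = mid_mt_alloc(n);
 *     else                           p = large_alloc(n);   // hypothetical
 */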
/**
* mid_mt_free - Free memory allocated by mid_mt_alloc
*
* Phase 1 implementation:
* - Local free (same thread): Fast, lock-free
* - Remote free (cross-thread): NOT IMPLEMENTED (memory leak)
*
* Phase 2 will add atomic remote free list per segment
*/
void mid_mt_free(void* ptr, size_t size) {
if (unlikely(!ptr)) return;
#if MID_ENABLE_STATS
__sync_fetch_and_add(&g_mid_stats.total_frees, 1);
#endif
// Get size class
int class_idx = mid_size_to_class(size);
if (unlikely(class_idx < 0)) {
MID_LOG("ERROR: Invalid size %zu in free", size);
return;
}
// Get thread-local segment
MidThreadSegment* seg = &g_mid_segments[class_idx];
// === Fast path: Check if ptr belongs to current segment ===
if (likely(seg->chunk_base != NULL &&
ptr >= seg->chunk_base &&
ptr < seg->end)) {
// Local free (same thread, lock-free)
segment_free_local(seg, ptr);
return;
}
// === Slow path: Remote free (cross-thread) ===
// Phase 1: NOT IMPLEMENTED
// We need to find the owning segment via registry,
// then push to that segment's remote free list.
//
// For Phase 1 (benchmarking), we accept this memory leak.
// bench_mid_large_mt uses independent working sets per thread,
// so remote frees are rare.
size_t block_size;
int owner_class;
if (mid_registry_lookup(ptr, &block_size, &owner_class)) {
// Found in registry, but we can't free it yet (no remote free list)
MID_LOG("WARNING: Remote free not implemented, leaking %p (size=%zu)", ptr, size);
#if MID_ENABLE_STATS
__sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
#endif
// TODO Phase 2: Implement remote free
// segment_free_remote(ptr, block_size, owner_class);
} else {
// Not found in registry - might be from a different allocator
MID_LOG("ERROR: Pointer %p not found in registry (size=%zu)", ptr, size);
}
}
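/*
 * Phase 2 sketch (NOT implemented; the remote_free_list field and function name
 * are hypothetical): the planned remote free would push the block onto an
 * atomic MPSC stack owned by the segment, which the owning thread drains into
 * its local free_list on a later allocation or refill:
 *
 *     // field to add to MidThreadSegment:  _Atomic(void*) remote_free_list;
 *     static void segment_free_remote(MidThreadSegment* owner, void* ptr) {
 *         void* head = atomic_load_explicit(&owner->remote_free_list,
 *                                           memory_order_relaxed);
 *         do {
 *             *(void**)ptr = head;   // link block in front of current head
 *         } while (!atomic_compare_exchange_weak_explicit(
 *                      &owner->remote_free_list, &head, ptr,
 *                      memory_order_release, memory_order_relaxed));
 *     }
 */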
/**
* mid_mt_thread_exit - Cleanup thread-local segments
*
* Called on thread exit to release resources
*/
void mid_mt_thread_exit(void) {
MID_LOG("Thread exit cleanup");
// Free all chunks from this thread's segments
for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
MidThreadSegment* seg = &g_mid_segments[class_idx];
if (seg->chunk_base) {
// Remove from registry
registry_remove(seg->chunk_base);
// Deallocate chunk
chunk_deallocate(seg->chunk_base, seg->chunk_size);
// Clear segment
memset(seg, 0, sizeof(MidThreadSegment));
}
}
}
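/*
 * Hook sketch (how hakmem actually arranges this call is outside this file;
 * g_exit_key and mid_exit_dtor are hypothetical names): one conventional way to
 * run mid_mt_thread_exit() automatically is a pthread TLS destructor, armed
 * once per thread with a non-NULL value:
 *
 *     static pthread_key_t g_exit_key;   // pthread_key_create(&g_exit_key, mid_exit_dtor)
 *     static void mid_exit_dtor(void* unused) { (void)unused; mid_mt_thread_exit(); }
 *     // on each thread's first allocation: pthread_setspecific(g_exit_key, (void*)1);
 */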
// ============================================================================
// Statistics (Debug/Profiling)
// ============================================================================
#if MID_ENABLE_STATS
void mid_mt_print_stats(void) {
printf("\n=== Mid Range MT Statistics ===\n");
printf("Total allocations: %lu\n", g_mid_stats.total_allocs);
printf("Total frees: %lu\n", g_mid_stats.total_frees);
printf("Local frees: %lu (%.1f%%)\n",
g_mid_stats.local_frees,
100.0 * g_mid_stats.local_frees / (g_mid_stats.total_frees + 1));
printf("Remote frees: %lu (%.1f%%)\n",
g_mid_stats.remote_frees,
100.0 * g_mid_stats.remote_frees / (g_mid_stats.total_frees + 1));
printf("Registry lookups: %lu\n", g_mid_stats.registry_lookups);
printf("\n");
// Per-segment stats
for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
MidThreadSegment* seg = &g_mid_segments[class_idx];
if (seg->alloc_count > 0) {
printf("Class %d (%zu bytes):\n", class_idx, mid_class_to_size(class_idx));
printf(" Allocations: %lu\n", seg->alloc_count);
printf(" Frees: %lu\n", seg->free_count);
printf(" Refills: %u\n", seg->refill_count);
printf(" Used count: %u / %u\n", seg->used_count, seg->capacity);
}
}
printf("\n");
}
#endif // MID_ENABLE_STATS