hakmem/core/hakmem_smallmid_superslab.c

/**
 * hakmem_smallmid_superslab.c - Small-Mid SuperSlab Backend Implementation
 *
 * Phase 17-2: Dedicated SuperSlab pool for Small-Mid allocator
 * Goal: 2-3x performance improvement via batch refills and dedicated backend
 *
 * Created: 2025-11-16
 */

#include "hakmem_smallmid_superslab.h"
#include "hakmem_smallmid.h"
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <errno.h>

// ============================================================================
// Global State
// ============================================================================

SmallMidSSHead g_smallmid_ss_pools[SMALLMID_NUM_CLASSES];

static pthread_once_t g_smallmid_ss_init_once = PTHREAD_ONCE_INIT;
static int g_smallmid_ss_initialized = 0;

#ifdef HAKMEM_SMALLMID_SS_STATS
SmallMidSSStats g_smallmid_ss_stats = {0};
#endif

// ============================================================================
// Initialization
// ============================================================================

static void smallmid_superslab_init_once(void) {
    for (int i = 0; i < SMALLMID_NUM_CLASSES; i++) {
        SmallMidSSHead* pool = &g_smallmid_ss_pools[i];

        pool->class_idx = i;
        pool->total_ss = 0;
        pool->first_ss = NULL;
        pool->current_ss = NULL;
        pool->lru_head = NULL;
        pool->lru_tail = NULL;

        pthread_mutex_init(&pool->lock, NULL);

        pool->alloc_count = 0;
        pool->refill_count = 0;
        pool->ss_alloc_count = 0;
        pool->ss_free_count = 0;
    }

    g_smallmid_ss_initialized = 1;

    #ifndef SMALLMID_DEBUG
    #define SMALLMID_DEBUG 0
    #endif

    #if SMALLMID_DEBUG
    fprintf(stderr, "[SmallMid SuperSlab] Initialized (%d classes)\n", SMALLMID_NUM_CLASSES);
    #endif
}

void smallmid_superslab_init(void) {
    pthread_once(&g_smallmid_ss_init_once, smallmid_superslab_init_once);
}

// ============================================================================
// SuperSlab Allocation/Deallocation
// ============================================================================

/**
 * smallmid_superslab_alloc - Allocate a new 1MB SuperSlab
 *
 * Strategy:
 * - mmap 1MB aligned region (PROT_READ|WRITE, MAP_PRIVATE|ANONYMOUS)
 * - Initialize header, metadata, counters
 * - Add to per-class pool chain
 * - Return SuperSlab pointer
 */
SmallMidSuperSlab* smallmid_superslab_alloc(int class_idx) {
    if (class_idx < 0 || class_idx >= SMALLMID_NUM_CLASSES) {
        return NULL;
    }

    // Allocate 1MB aligned region
    void* mem = mmap(NULL, SMALLMID_SUPERSLAB_SIZE,
                     PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS,
                     -1, 0);

    if (mem == MAP_FAILED) {
        fprintf(stderr, "[SmallMid SS] mmap failed: %s\n", strerror(errno));
        return NULL;
    }

    // Ensure alignment (mmap should return aligned address)
    uintptr_t addr = (uintptr_t)mem;
    if ((addr & (SMALLMID_SS_ALIGNMENT - 1)) != 0) {
        fprintf(stderr, "[SmallMid SS] WARNING: mmap returned unaligned address %p\n", mem);
        munmap(mem, SMALLMID_SUPERSLAB_SIZE);
        return NULL;
    }

    SmallMidSuperSlab* ss = (SmallMidSuperSlab*)mem;

    // Initialize header
    ss->magic = SMALLMID_SS_MAGIC;
    ss->num_slabs = SMALLMID_SLABS_PER_SS;
    ss->active_slabs = 0;
    ss->refcount = 1;
    ss->total_active = 0;
    ss->slab_bitmap = 0;
    ss->nonempty_mask = 0;
    ss->last_used_ns = 0;
    ss->generation = 0;
    ss->next = NULL;
    ss->lru_next = NULL;
    ss->lru_prev = NULL;

    // Initialize slab metadata (all inactive initially)
    for (int i = 0; i < SMALLMID_SLABS_PER_SS; i++) {
        SmallMidSlabMeta* meta = &ss->slabs[i];
        meta->freelist = NULL;
        meta->used = 0;
        meta->capacity = 0;
        meta->carved = 0;
        meta->class_idx = class_idx;
        meta->flags = SMALLMID_SLAB_INACTIVE;
    }

    // Update pool stats
    SmallMidSSHead* pool = &g_smallmid_ss_pools[class_idx];
    atomic_fetch_add(&pool->total_ss, 1);
    atomic_fetch_add(&pool->ss_alloc_count, 1);

    #ifdef HAKMEM_SMALLMID_SS_STATS
    atomic_fetch_add(&g_smallmid_ss_stats.total_ss_alloc, 1);
    #endif

    #if SMALLMID_DEBUG
    fprintf(stderr, "[SmallMid SS] Allocated SuperSlab %p (class=%d, size=1MB)\n",
            ss, class_idx);
    #endif

    return ss;
}

/**
 * smallmid_superslab_free - Free a SuperSlab
 *
 * Strategy:
 * - Validate refcount == 0 (all blocks freed)
 * - munmap the 1MB region
 * - Update pool stats
 */
void smallmid_superslab_free(SmallMidSuperSlab* ss) {
    if (!ss || ss->magic != SMALLMID_SS_MAGIC) {
        fprintf(stderr, "[SmallMid SS] ERROR: Invalid SuperSlab %p\n", ss);
        return;
    }

    uint32_t refcount = atomic_load(&ss->refcount);
    if (refcount > 0) {
        fprintf(stderr, "[SmallMid SS] WARNING: Freeing SuperSlab with refcount=%u\n", refcount);
    }

    uint32_t active = atomic_load(&ss->total_active);
    if (active > 0) {
        fprintf(stderr, "[SmallMid SS] WARNING: Freeing SuperSlab with active blocks=%u\n", active);
    }

    // Invalidate magic
    ss->magic = 0xDEADBEEF;

    // munmap
    if (munmap(ss, SMALLMID_SUPERSLAB_SIZE) != 0) {
        fprintf(stderr, "[SmallMid SS] munmap failed: %s\n", strerror(errno));
    }

    #ifdef HAKMEM_SMALLMID_SS_STATS
    atomic_fetch_add(&g_smallmid_ss_stats.total_ss_free, 1);
    #endif

    #if SMALLMID_DEBUG
    fprintf(stderr, "[SmallMid SS] Freed SuperSlab %p\n", ss);
    #endif
}

// ============================================================================
// Slab Initialization
// ============================================================================

/**
 * smallmid_slab_init - Initialize a slab within SuperSlab
 *
 * Strategy:
 * - Calculate slab base address (ss_base + slab_idx * 64KB)
 * - Set capacity based on size class (256/128/64 blocks)
 * - Mark slab as active
 * - Update SuperSlab bitmaps
 */
void smallmid_slab_init(SmallMidSuperSlab* ss, int slab_idx, int class_idx) {
    if (!ss || slab_idx < 0 || slab_idx >= SMALLMID_SLABS_PER_SS) {
        return;
    }

    SmallMidSlabMeta* meta = &ss->slabs[slab_idx];

    // Set capacity based on class
    const uint16_t capacities[SMALLMID_NUM_CLASSES] = {
        SMALLMID_BLOCKS_256B,
        SMALLMID_BLOCKS_512B,
        SMALLMID_BLOCKS_1KB
    };

    meta->freelist = NULL;
    meta->used = 0;
    meta->capacity = capacities[class_idx];
    meta->carved = 0;
    meta->class_idx = class_idx;
    meta->flags = SMALLMID_SLAB_ACTIVE;

    // Update SuperSlab bitmaps
    ss->slab_bitmap |= (1u << slab_idx);
    ss->nonempty_mask |= (1u << slab_idx);
    ss->active_slabs++;

    #if SMALLMID_DEBUG
    fprintf(stderr, "[SmallMid SS] Initialized slab %d in SS %p (class=%d, capacity=%u)\n",
            slab_idx, ss, class_idx, meta->capacity);
    #endif
}

// ============================================================================
// Batch Refill (Performance-Critical Path)
// ============================================================================

/**
 * smallmid_refill_batch - Batch refill TLS freelist from SuperSlab
 *
 * Performance target: 5-8 instructions per call (amortized)
 *
 * Strategy:
 * 1. Try current slab's freelist (fast path: pop batch_max blocks)
 * 2. Fall back to bump allocation if freelist empty
 * 3. Allocate new slab if current is full
 * 4. Allocate new SuperSlab if no slabs available
 *
 * Returns: Number of blocks refilled (0 on failure)
 */
int smallmid_refill_batch(int class_idx, void** batch_out, int batch_max) {
    if (class_idx < 0 || class_idx >= SMALLMID_NUM_CLASSES || !batch_out || batch_max <= 0) {
        return 0;
    }

    SmallMidSSHead* pool = &g_smallmid_ss_pools[class_idx];

    // Ensure SuperSlab pool is initialized
    if (!g_smallmid_ss_initialized) {
        smallmid_superslab_init();
    }

    // Allocate first SuperSlab if needed
    pthread_mutex_lock(&pool->lock);

    if (!pool->current_ss) {
        pool->current_ss = smallmid_superslab_alloc(class_idx);
        if (!pool->current_ss) {
            pthread_mutex_unlock(&pool->lock);
            return 0;
        }

        // Add to chain
        if (!pool->first_ss) {
            pool->first_ss = pool->current_ss;
        }

        // Initialize first slab
        smallmid_slab_init(pool->current_ss, 0, class_idx);
    }

    SmallMidSuperSlab* ss = pool->current_ss;
    pthread_mutex_unlock(&pool->lock);

    // Find active slab with available blocks
    int slab_idx = -1;
    SmallMidSlabMeta* meta = NULL;

    for (int i = 0; i < SMALLMID_SLABS_PER_SS; i++) {
        if (!(ss->slab_bitmap & (1u << i))) {
            continue;  // Slab not active
        }

        meta = &ss->slabs[i];
        if (meta->used < meta->capacity) {
            slab_idx = i;
            break;  // Found slab with space
        }
    }

    // No slab with space - try to allocate new slab
    if (slab_idx == -1) {
        pthread_mutex_lock(&pool->lock);

        // Find first inactive slab
        for (int i = 0; i < SMALLMID_SLABS_PER_SS; i++) {
            if (!(ss->slab_bitmap & (1u << i))) {
                smallmid_slab_init(ss, i, class_idx);
                slab_idx = i;
                meta = &ss->slabs[i];
                break;
            }
        }

        pthread_mutex_unlock(&pool->lock);

        // All slabs exhausted - need new SuperSlab
        if (slab_idx == -1) {
            pthread_mutex_lock(&pool->lock);

            SmallMidSuperSlab* new_ss = smallmid_superslab_alloc(class_idx);
            if (!new_ss) {
                pthread_mutex_unlock(&pool->lock);
                return 0;
            }

            // Link to chain
            new_ss->next = pool->first_ss;
            pool->first_ss = new_ss;
            pool->current_ss = new_ss;

            // Initialize first slab
            smallmid_slab_init(new_ss, 0, class_idx);

            pthread_mutex_unlock(&pool->lock);

            ss = new_ss;
            slab_idx = 0;
            meta = &ss->slabs[0];
        }
    }

    // Now we have a slab with available capacity
    // Strategy: Try freelist first, then bump allocation

    const size_t block_sizes[SMALLMID_NUM_CLASSES] = {256, 512, 1024};
    size_t block_size = block_sizes[class_idx];
    int refilled = 0;

    // Calculate slab data base address
    uintptr_t ss_base = (uintptr_t)ss;
    uintptr_t slab_base = ss_base + (slab_idx * SMALLMID_SLAB_SIZE);

    // Fast path: Pop from freelist (if available)
    void* freelist_head = meta->freelist;
    while (freelist_head && refilled < batch_max) {
        // Add 1-byte header space (Phase 7 technology)
        void* user_ptr = (uint8_t*)freelist_head + 1;
        batch_out[refilled++] = user_ptr;

        // Next block (freelist stored at offset 0 in user data)
        freelist_head = *(void**)user_ptr;
    }
    meta->freelist = freelist_head;

    // Slow path: Bump allocation
    while (refilled < batch_max && meta->carved < meta->capacity) {
        // Calculate block base address (with 1-byte header)
        uintptr_t block_base = slab_base + (meta->carved * (block_size + 1));
        void* base_ptr = (void*)block_base;
        void* user_ptr = (uint8_t*)base_ptr + 1;

        // Write header (0xb0 | class_idx)
        *(uint8_t*)base_ptr = 0xb0 | class_idx;

        batch_out[refilled++] = user_ptr;
        meta->carved++;
        meta->used++;

        // Update SuperSlab active counter
        atomic_fetch_add(&ss->total_active, 1);
    }

    // Update stats
    atomic_fetch_add(&pool->alloc_count, refilled);
    atomic_fetch_add(&pool->refill_count, 1);

    #ifdef HAKMEM_SMALLMID_SS_STATS
    atomic_fetch_add(&g_smallmid_ss_stats.total_refills, 1);
    atomic_fetch_add(&g_smallmid_ss_stats.total_blocks_carved, refilled);
    #endif

    #if SMALLMID_DEBUG
    if (refilled > 0) {
        fprintf(stderr, "[SmallMid SS] Refilled %d blocks (class=%d, slab=%d, carved=%u/%u)\n",
                refilled, class_idx, slab_idx, meta->carved, meta->capacity);
    }
    #endif

    return refilled;
}

// ============================================================================
// Statistics
// ============================================================================

#ifdef HAKMEM_SMALLMID_SS_STATS
void smallmid_ss_print_stats(void) {
    fprintf(stderr, "\n=== Small-Mid SuperSlab Statistics ===\n");
    fprintf(stderr, "Total SuperSlab allocs: %lu\n", g_smallmid_ss_stats.total_ss_alloc);
    fprintf(stderr, "Total SuperSlab frees:  %lu\n", g_smallmid_ss_stats.total_ss_free);
    fprintf(stderr, "Total refills:          %lu\n", g_smallmid_ss_stats.total_refills);
    fprintf(stderr, "Total blocks carved:    %lu\n", g_smallmid_ss_stats.total_blocks_carved);
    fprintf(stderr, "Total blocks freed:     %lu\n", g_smallmid_ss_stats.total_blocks_freed);

    fprintf(stderr, "\nPer-class statistics:\n");
    for (int i = 0; i < SMALLMID_NUM_CLASSES; i++) {
        SmallMidSSHead* pool = &g_smallmid_ss_pools[i];
        fprintf(stderr, "  Class %d (%zuB):\n", i, g_smallmid_class_sizes[i]);
        fprintf(stderr, "    Total SS: %zu\n", pool->total_ss);
        fprintf(stderr, "    Allocs:   %lu\n", pool->alloc_count);
        fprintf(stderr, "    Refills:  %lu\n", pool->refill_count);
    }

    fprintf(stderr, "=======================================\n\n");
}
#endif
Phase 17-2: Small-Mid Dedicated SuperSlab Backend (実験結果: 70% page fault, 性能改善なし) Summary: ======== Phase 17-2 implements dedicated SuperSlab backend for Small-Mid allocator (256B-1KB). Result: No performance improvement (-0.9%), worse than Phase 17-1 (+0.3%). Root cause: 70% page fault (ChatGPT + perf profiling). Conclusion: Small-Mid専用層戦略は失敗。Tiny SuperSlab最適化が必要。 Implementation: =============== 1. Dedicated Small-Mid SuperSlab pool (1MB, 16 slabs/SS) - Separate from Tiny SuperSlab (no competition) - Batch refill (8-16 blocks per TLS refill) - Direct 0xb0 header writes (no Tiny delegation) 2. Backend architecture - SmallMidSuperSlab: 1MB aligned region, fast ptr→SS lookup - SmallMidSlabMeta: per-slab metadata (capacity/used/carved/freelist) - SmallMidSSHead: per-class pool with LRU tracking 3. Batch refill implementation - smallmid_refill_batch(): 8-16 blocks/call (vs 1 in Phase 17-1) - Freelist priority → bump allocation fallback - Auto SuperSlab expansion when exhausted Files Added: ============ - core/hakmem_smallmid_superslab.h: SuperSlab metadata structures - core/hakmem_smallmid_superslab.c: Backend implementation (~450 lines) Files Modified: =============== - core/hakmem_smallmid.c: Removed Tiny delegation, added batch refill - Makefile: Added hakmem_smallmid_superslab.o to build - CURRENT_TASK.md: Phase 17 完了記録 + Phase 18 計画 A/B Benchmark Results: ====================== \| Size \| Phase 17-1 (ON) \| Phase 17-2 (ON) \| Delta \| vs Baseline \| \|--------\|-----------------\|-----------------\|----------\|-------------\| \| 256B \| 6.06M ops/s \| 5.84M ops/s \| -3.6% \| -4.1% \| \| 512B \| 5.91M ops/s \| 5.86M ops/s \| -0.8% \| +1.2% \| \| 1024B \| 5.54M ops/s \| 5.44M ops/s \| -1.8% \| +0.4% \| \| Avg \| 5.84M ops/s \| 5.71M ops/s \| -2.2% \| -0.9% \| Performance Analysis (ChatGPT + perf): ====================================== ✅ Frontend (TLS/batch refill): OK - Only 30% CPU time - Batch refill logic is efficient - Direct 0xb0 header writes work correctly ❌ Backend (SuperSlab allocation): BOTTLENECK - 70% CPU time in asm_exc_page_fault - mmap(1MB) → kernel page allocation → very slow - New SuperSlab allocation per benchmark run - No warm SuperSlab reuse (used counter never decrements) Root Cause: =========== Small-Mid allocates new SuperSlabs frequently: alloc → TLS miss → refill → new SuperSlab → mmap(1MB) → page fault (70%) Tiny reuses warm SuperSlabs: alloc → TLS miss → refill → existing warm SuperSlab → no page fault Key Finding: "70% page fault" reveals SuperSlab layer needs optimization, NOT frontend layer (TLS/batch refill design is correct). Lessons Learned: ================ 1. ❌ Small-Mid専用層戦略は失敗 (Phase 17-1: +0.3%, Phase 17-2: -0.9%) 2. ✅ Frontend実装は成功 (30% CPU, batch refill works) 3. 🔥 70% page fault = SuperSlab allocation bottleneck 4. ✅ Tiny (6.08M ops/s) is already well-optimized, hard to beat 5. ✅ Layer separation doesn't improve performance - backend optimization needed Next Steps (Phase 18): ====================== ChatGPT recommendation: Optimize Tiny SuperSlab (NOT Small-Mid specific layer) Box SS-Reuse (Priority 1): - Implement meta->freelist reuse (currently bump-only) - Detect slab empty → return to shared_pool - Reuse same SuperSlab for longer (reduce page faults) - Target: 70% page fault → 5-10%, 2-4x improvement Box SS-Prewarm (Priority 2): - Pre-allocate SuperSlabs per class (Phase 11: +6.4%) - Concentrate page faults at benchmark start - Benchmark-only optimization Small-Mid Implementation Status: ================================= - ENV=0 by default (zero overhead, branch predictor learns) - Complete separation from Tiny (no interference) - Valuable as experimental record ("why dedicated layer failed") - Can be removed later if needed (not blocking Tiny optimization) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-11-16 03:21:13 +09:00			`/**`
			`* hakmem_smallmid_superslab.c - Small-Mid SuperSlab Backend Implementation`
			`*`
			`* Phase 17-2: Dedicated SuperSlab pool for Small-Mid allocator`
			`* Goal: 2-3x performance improvement via batch refills and dedicated backend`
			`*`
			`* Created: 2025-11-16`
			`*/`

			`#include "hakmem_smallmid_superslab.h"`
			`#include "hakmem_smallmid.h"`
			`#include <sys/mman.h>`
			`#include <string.h>`
			`#include <stdio.h>`
			`#include <time.h>`
			`#include <errno.h>`

			`// ============================================================================`
			`// Global State`
			`// ============================================================================`

			`SmallMidSSHead g_smallmid_ss_pools[SMALLMID_NUM_CLASSES];`

			`static pthread_once_t g_smallmid_ss_init_once = PTHREAD_ONCE_INIT;`
			`static int g_smallmid_ss_initialized = 0;`

			`#ifdef HAKMEM_SMALLMID_SS_STATS`
			`SmallMidSSStats g_smallmid_ss_stats = {0};`
			`#endif`

			`// ============================================================================`
			`// Initialization`
			`// ============================================================================`

			`static void smallmid_superslab_init_once(void) {`
			`for (int i = 0; i < SMALLMID_NUM_CLASSES; i++) {`
			`SmallMidSSHead* pool = &g_smallmid_ss_pools[i];`

			`pool->class_idx = i;`
			`pool->total_ss = 0;`
			`pool->first_ss = NULL;`
			`pool->current_ss = NULL;`
			`pool->lru_head = NULL;`
			`pool->lru_tail = NULL;`

			`pthread_mutex_init(&pool->lock, NULL);`

			`pool->alloc_count = 0;`
			`pool->refill_count = 0;`
			`pool->ss_alloc_count = 0;`
			`pool->ss_free_count = 0;`
			`}`

			`g_smallmid_ss_initialized = 1;`

			`#ifndef SMALLMID_DEBUG`
			`#define SMALLMID_DEBUG 0`
			`#endif`

			`#if SMALLMID_DEBUG`
			`fprintf(stderr, "[SmallMid SuperSlab] Initialized (%d classes)\n", SMALLMID_NUM_CLASSES);`
			`#endif`
			`}`

			`void smallmid_superslab_init(void) {`
			`pthread_once(&g_smallmid_ss_init_once, smallmid_superslab_init_once);`
			`}`

			`// ============================================================================`
			`// SuperSlab Allocation/Deallocation`
			`// ============================================================================`

			`/**`
			`* smallmid_superslab_alloc - Allocate a new 1MB SuperSlab`
			`*`
			`* Strategy:`
			`* - mmap 1MB aligned region (PROT_READ\|WRITE, MAP_PRIVATE\|ANONYMOUS)`
			`* - Initialize header, metadata, counters`
			`* - Add to per-class pool chain`
			`* - Return SuperSlab pointer`
			`*/`
			`SmallMidSuperSlab* smallmid_superslab_alloc(int class_idx) {`
			`if (class_idx < 0 \|\| class_idx >= SMALLMID_NUM_CLASSES) {`
			`return NULL;`
			`}`

			`// Allocate 1MB aligned region`
			`void* mem = mmap(NULL, SMALLMID_SUPERSLAB_SIZE,`
			`PROT_READ \| PROT_WRITE,`
			`MAP_PRIVATE \| MAP_ANONYMOUS,`
			`-1, 0);`

			`if (mem == MAP_FAILED) {`
			`fprintf(stderr, "[SmallMid SS] mmap failed: %s\n", strerror(errno));`
			`return NULL;`
			`}`

			`// Ensure alignment (mmap should return aligned address)`
			`uintptr_t addr = (uintptr_t)mem;`
			`if ((addr & (SMALLMID_SS_ALIGNMENT - 1)) != 0) {`
			`fprintf(stderr, "[SmallMid SS] WARNING: mmap returned unaligned address %p\n", mem);`
			`munmap(mem, SMALLMID_SUPERSLAB_SIZE);`
			`return NULL;`
			`}`

			`SmallMidSuperSlab* ss = (SmallMidSuperSlab*)mem;`

			`// Initialize header`
			`ss->magic = SMALLMID_SS_MAGIC;`
			`ss->num_slabs = SMALLMID_SLABS_PER_SS;`
			`ss->active_slabs = 0;`
			`ss->refcount = 1;`
			`ss->total_active = 0;`
			`ss->slab_bitmap = 0;`
			`ss->nonempty_mask = 0;`
			`ss->last_used_ns = 0;`
			`ss->generation = 0;`
			`ss->next = NULL;`
			`ss->lru_next = NULL;`
			`ss->lru_prev = NULL;`

			`// Initialize slab metadata (all inactive initially)`
			`for (int i = 0; i < SMALLMID_SLABS_PER_SS; i++) {`
			`SmallMidSlabMeta* meta = &ss->slabs[i];`
			`meta->freelist = NULL;`
			`meta->used = 0;`
			`meta->capacity = 0;`
			`meta->carved = 0;`
			`meta->class_idx = class_idx;`
			`meta->flags = SMALLMID_SLAB_INACTIVE;`
			`}`

			`// Update pool stats`
			`SmallMidSSHead* pool = &g_smallmid_ss_pools[class_idx];`
			`atomic_fetch_add(&pool->total_ss, 1);`
			`atomic_fetch_add(&pool->ss_alloc_count, 1);`

			`#ifdef HAKMEM_SMALLMID_SS_STATS`
			`atomic_fetch_add(&g_smallmid_ss_stats.total_ss_alloc, 1);`
			`#endif`

			`#if SMALLMID_DEBUG`
			`fprintf(stderr, "[SmallMid SS] Allocated SuperSlab %p (class=%d, size=1MB)\n",`
			`ss, class_idx);`
			`#endif`

			`return ss;`
			`}`

			`/**`
			`* smallmid_superslab_free - Free a SuperSlab`
			`*`
			`* Strategy:`
			`* - Validate refcount == 0 (all blocks freed)`
			`* - munmap the 1MB region`
			`* - Update pool stats`
			`*/`
			`void smallmid_superslab_free(SmallMidSuperSlab* ss) {`
			`if (!ss \|\| ss->magic != SMALLMID_SS_MAGIC) {`
			`fprintf(stderr, "[SmallMid SS] ERROR: Invalid SuperSlab %p\n", ss);`
			`return;`
			`}`

			`uint32_t refcount = atomic_load(&ss->refcount);`
			`if (refcount > 0) {`
			`fprintf(stderr, "[SmallMid SS] WARNING: Freeing SuperSlab with refcount=%u\n", refcount);`
			`}`

			`uint32_t active = atomic_load(&ss->total_active);`
			`if (active > 0) {`
			`fprintf(stderr, "[SmallMid SS] WARNING: Freeing SuperSlab with active blocks=%u\n", active);`
			`}`

			`// Invalidate magic`
			`ss->magic = 0xDEADBEEF;`

			`// munmap`
			`if (munmap(ss, SMALLMID_SUPERSLAB_SIZE) != 0) {`
			`fprintf(stderr, "[SmallMid SS] munmap failed: %s\n", strerror(errno));`
			`}`

			`#ifdef HAKMEM_SMALLMID_SS_STATS`
			`atomic_fetch_add(&g_smallmid_ss_stats.total_ss_free, 1);`
			`#endif`

			`#if SMALLMID_DEBUG`
			`fprintf(stderr, "[SmallMid SS] Freed SuperSlab %p\n", ss);`
			`#endif`
			`}`

			`// ============================================================================`
			`// Slab Initialization`
			`// ============================================================================`

			`/**`
			`* smallmid_slab_init - Initialize a slab within SuperSlab`
			`*`
			`* Strategy:`
			`* - Calculate slab base address (ss_base + slab_idx * 64KB)`
			`* - Set capacity based on size class (256/128/64 blocks)`
			`* - Mark slab as active`
			`* - Update SuperSlab bitmaps`
			`*/`
			`void smallmid_slab_init(SmallMidSuperSlab* ss, int slab_idx, int class_idx) {`
			`if (!ss \|\| slab_idx < 0 \|\| slab_idx >= SMALLMID_SLABS_PER_SS) {`
			`return;`
			`}`

			`SmallMidSlabMeta* meta = &ss->slabs[slab_idx];`

			`// Set capacity based on class`
			`const uint16_t capacities[SMALLMID_NUM_CLASSES] = {`
			`SMALLMID_BLOCKS_256B,`
			`SMALLMID_BLOCKS_512B,`
			`SMALLMID_BLOCKS_1KB`
			`};`

			`meta->freelist = NULL;`
			`meta->used = 0;`
			`meta->capacity = capacities[class_idx];`
			`meta->carved = 0;`
			`meta->class_idx = class_idx;`
			`meta->flags = SMALLMID_SLAB_ACTIVE;`

			`// Update SuperSlab bitmaps`
			`ss->slab_bitmap \|= (1u << slab_idx);`
			`ss->nonempty_mask \|= (1u << slab_idx);`
			`ss->active_slabs++;`

			`#if SMALLMID_DEBUG`
			`fprintf(stderr, "[SmallMid SS] Initialized slab %d in SS %p (class=%d, capacity=%u)\n",`
			`slab_idx, ss, class_idx, meta->capacity);`
			`#endif`
			`}`

			`// ============================================================================`
			`// Batch Refill (Performance-Critical Path)`
			`// ============================================================================`

			`/**`
			`* smallmid_refill_batch - Batch refill TLS freelist from SuperSlab`
			`*`
			`* Performance target: 5-8 instructions per call (amortized)`
			`*`
			`* Strategy:`
			`* 1. Try current slab's freelist (fast path: pop batch_max blocks)`
			`* 2. Fall back to bump allocation if freelist empty`
			`* 3. Allocate new slab if current is full`
			`* 4. Allocate new SuperSlab if no slabs available`
			`*`
			`* Returns: Number of blocks refilled (0 on failure)`
			`*/`
			`int smallmid_refill_batch(int class_idx, void** batch_out, int batch_max) {`
			`if (class_idx < 0 \|\| class_idx >= SMALLMID_NUM_CLASSES \|\| !batch_out \|\| batch_max <= 0) {`
			`return 0;`
			`}`

			`SmallMidSSHead* pool = &g_smallmid_ss_pools[class_idx];`

			`// Ensure SuperSlab pool is initialized`
			`if (!g_smallmid_ss_initialized) {`
			`smallmid_superslab_init();`
			`}`

			`// Allocate first SuperSlab if needed`
			`pthread_mutex_lock(&pool->lock);`

			`if (!pool->current_ss) {`
			`pool->current_ss = smallmid_superslab_alloc(class_idx);`
			`if (!pool->current_ss) {`
			`pthread_mutex_unlock(&pool->lock);`
			`return 0;`
			`}`

			`// Add to chain`
			`if (!pool->first_ss) {`
			`pool->first_ss = pool->current_ss;`
			`}`

			`// Initialize first slab`
			`smallmid_slab_init(pool->current_ss, 0, class_idx);`
			`}`

			`SmallMidSuperSlab* ss = pool->current_ss;`
			`pthread_mutex_unlock(&pool->lock);`

			`// Find active slab with available blocks`
			`int slab_idx = -1;`
			`SmallMidSlabMeta* meta = NULL;`

			`for (int i = 0; i < SMALLMID_SLABS_PER_SS; i++) {`
			`if (!(ss->slab_bitmap & (1u << i))) {`
			`continue; // Slab not active`
			`}`

			`meta = &ss->slabs[i];`
			`if (meta->used < meta->capacity) {`
			`slab_idx = i;`
			`break; // Found slab with space`
			`}`
			`}`

			`// No slab with space - try to allocate new slab`
			`if (slab_idx == -1) {`
			`pthread_mutex_lock(&pool->lock);`

			`// Find first inactive slab`
			`for (int i = 0; i < SMALLMID_SLABS_PER_SS; i++) {`
			`if (!(ss->slab_bitmap & (1u << i))) {`
			`smallmid_slab_init(ss, i, class_idx);`
			`slab_idx = i;`
			`meta = &ss->slabs[i];`
			`break;`
			`}`
			`}`

			`pthread_mutex_unlock(&pool->lock);`

			`// All slabs exhausted - need new SuperSlab`
			`if (slab_idx == -1) {`
			`pthread_mutex_lock(&pool->lock);`

			`SmallMidSuperSlab* new_ss = smallmid_superslab_alloc(class_idx);`
			`if (!new_ss) {`
			`pthread_mutex_unlock(&pool->lock);`
			`return 0;`
			`}`

			`// Link to chain`
			`new_ss->next = pool->first_ss;`
			`pool->first_ss = new_ss;`
			`pool->current_ss = new_ss;`

			`// Initialize first slab`
			`smallmid_slab_init(new_ss, 0, class_idx);`

			`pthread_mutex_unlock(&pool->lock);`

			`ss = new_ss;`
			`slab_idx = 0;`
			`meta = &ss->slabs[0];`
			`}`
			`}`

			`// Now we have a slab with available capacity`
			`// Strategy: Try freelist first, then bump allocation`

			`const size_t block_sizes[SMALLMID_NUM_CLASSES] = {256, 512, 1024};`
			`size_t block_size = block_sizes[class_idx];`
			`int refilled = 0;`

			`// Calculate slab data base address`
			`uintptr_t ss_base = (uintptr_t)ss;`
			`uintptr_t slab_base = ss_base + (slab_idx * SMALLMID_SLAB_SIZE);`

			`// Fast path: Pop from freelist (if available)`
			`void* freelist_head = meta->freelist;`
			`while (freelist_head && refilled < batch_max) {`
			`// Add 1-byte header space (Phase 7 technology)`
			`void* user_ptr = (uint8_t*)freelist_head + 1;`
			`batch_out[refilled++] = user_ptr;`

			`// Next block (freelist stored at offset 0 in user data)`
			`freelist_head = (void*)user_ptr;`
			`}`
			`meta->freelist = freelist_head;`

			`// Slow path: Bump allocation`
			`while (refilled < batch_max && meta->carved < meta->capacity) {`
			`// Calculate block base address (with 1-byte header)`
			`uintptr_t block_base = slab_base + (meta->carved * (block_size + 1));`
			`void* base_ptr = (void*)block_base;`
			`void* user_ptr = (uint8_t*)base_ptr + 1;`

			`// Write header (0xb0 \| class_idx)`
			`(uint8_t)base_ptr = 0xb0 \| class_idx;`

			`batch_out[refilled++] = user_ptr;`
			`meta->carved++;`
			`meta->used++;`

			`// Update SuperSlab active counter`
			`atomic_fetch_add(&ss->total_active, 1);`
			`}`

			`// Update stats`
			`atomic_fetch_add(&pool->alloc_count, refilled);`
			`atomic_fetch_add(&pool->refill_count, 1);`

			`#ifdef HAKMEM_SMALLMID_SS_STATS`
			`atomic_fetch_add(&g_smallmid_ss_stats.total_refills, 1);`
			`atomic_fetch_add(&g_smallmid_ss_stats.total_blocks_carved, refilled);`
			`#endif`

			`#if SMALLMID_DEBUG`
			`if (refilled > 0) {`
			`fprintf(stderr, "[SmallMid SS] Refilled %d blocks (class=%d, slab=%d, carved=%u/%u)\n",`
			`refilled, class_idx, slab_idx, meta->carved, meta->capacity);`
			`}`
			`#endif`

			`return refilled;`
			`}`

			`// ============================================================================`
			`// Statistics`
			`// ============================================================================`

			`#ifdef HAKMEM_SMALLMID_SS_STATS`
			`void smallmid_ss_print_stats(void) {`
			`fprintf(stderr, "\n=== Small-Mid SuperSlab Statistics ===\n");`
			`fprintf(stderr, "Total SuperSlab allocs: %lu\n", g_smallmid_ss_stats.total_ss_alloc);`
			`fprintf(stderr, "Total SuperSlab frees: %lu\n", g_smallmid_ss_stats.total_ss_free);`
			`fprintf(stderr, "Total refills: %lu\n", g_smallmid_ss_stats.total_refills);`
			`fprintf(stderr, "Total blocks carved: %lu\n", g_smallmid_ss_stats.total_blocks_carved);`
			`fprintf(stderr, "Total blocks freed: %lu\n", g_smallmid_ss_stats.total_blocks_freed);`

			`fprintf(stderr, "\nPer-class statistics:\n");`
			`for (int i = 0; i < SMALLMID_NUM_CLASSES; i++) {`
			`SmallMidSSHead* pool = &g_smallmid_ss_pools[i];`
			`fprintf(stderr, " Class %d (%zuB):\n", i, g_smallmid_class_sizes[i]);`
			`fprintf(stderr, " Total SS: %zu\n", pool->total_ss);`
			`fprintf(stderr, " Allocs: %lu\n", pool->alloc_count);`
			`fprintf(stderr, " Refills: %lu\n", pool->refill_count);`
			`}`

			`fprintf(stderr, "=======================================\n\n");`
			`}`
			`#endif`