From 8789542a9f4ecdc7d396688ca19268958a4c687b Mon Sep 17 00:00:00 2001
From: "Moe Charm (CI)"
Date: Thu, 11 Dec 2025 13:32:46 +0900
Subject: [PATCH] Phase v5-7: C6 ULTRA pattern (research mode, 32-slot TLS
 freelist)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implementation (fast-path idea sketched below):
- ENV: HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED=0|1 (default: 0)
- SmallHeapCtxV5: added c6_tls_freelist[32], c6_tls_count, ultra_c6_enabled
- small_segment_v5_owns_ptr_fast(): lightweight segment check for the free path
- small_alloc_slow_v5_c6_refill(): batch TLS fill from the page freelist
- small_free_slow_v5_c6_drain(): drain half of the TLS slots to the page on overflow
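
Illustration (not part of the patch): a minimal, self-contained sketch of the
ULTRA pattern, a fixed 32-slot per-thread LIFO array where alloc pops, free
pushes, and only over/underflow falls back to a slow path. All demo_* names are
hypothetical; malloc()/free() stand in for the page-backed refill/drain.

    #include <stdlib.h>

    #define DEMO_CAP 32            /* mirrors SMALL_V5_ULTRA_C6_CAP */
    #define DEMO_BLOCK_SIZE 768    /* mirrors the C6 class upper bound */

    static __thread void* demo_tls[DEMO_CAP];  /* per-thread slot array */
    static __thread unsigned demo_cnt;         /* slots currently filled */

    static void* demo_alloc(void) {
        if (demo_cnt > 0)                   /* fast path: pop LIFO slot */
            return demo_tls[--demo_cnt];
        return malloc(DEMO_BLOCK_SIZE);     /* slow path: refill source */
    }

    static void demo_free(void* p) {
        if (demo_cnt < DEMO_CAP) {          /* fast path: push LIFO slot */
            demo_tls[demo_cnt++] = p;
            return;
        }
        free(p);                            /* slow path: drain target */
    }

    int main(void) {
        void* p = demo_alloc();
        demo_free(p);                       /* next alloc now hits the TLS array */
        return demo_alloc() == p ? 0 : 1;
    }

The real ULTRA path differs in that refill/drain talk to SmallPageMetaV5
freelists instead of malloc/free, and the array stores BASE pointers while the
caller receives base+1.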

Performance (C6-heavy 257-768B, 2M iters, ws=400):
- v5 OFF baseline: 47M ops/s
- v5 ULTRA: 37-38M ops/s (-20% vs the v5 OFF baseline)
- vs v5 base (no optimizations): +3-5% improvement

Design limitations identified (the header issue is sketched below):
- Header write required on every alloc (the freelist link overwrites the header byte)
- Segment validation required on every free
- page->used tracking required for retirement
- These prevent matching baseline pool v1 performance
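
Illustration (not part of the patch): why the header rewrite is unavoidable.
The 1-byte header sits at block offset 0 and the user pointer is base+1, while
a freed block reuses offset 0 to store its freelist next link, so every refill
that pops a block from the page freelist must restore the header byte. The
value 0xA6 below is a made-up stand-in for HEADER_MAGIC | class bits.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        uint8_t block[768];
        block[0] = 0xA6;                    /* live block: header at offset 0 */

        void* next = &block[256];           /* freed: next link stored at offset 0 */
        memcpy(block, &next, sizeof next);  /* ...which clobbers the header byte */
        printf("header byte after linking: 0x%02X\n", block[0]);

        block[0] = 0xA6;                    /* so every refill rewrites it */
        return 0;
    }

This byte restore, together with the page->used accounting on refill/drain, is
the per-operation cost that keeps ULTRA below the pool v1 baseline.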

πŸ€– Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 core/box/smallobject_hotbox_v5_box.h |   7 ++
 core/box/smallobject_v5_env_box.h    |  17 ++++
 core/smallobject_hotbox_v5.c         | 127 ++++++++++++++++++++++++++-
 core/smallsegment_v5.c               |  10 +++
 4 files changed, 159 insertions(+), 2 deletions(-)

diff --git a/core/box/smallobject_hotbox_v5_box.h b/core/box/smallobject_hotbox_v5_box.h
index d8d1d3e0..70eecdeb 100644
--- a/core/box/smallobject_hotbox_v5_box.h
+++ b/core/box/smallobject_hotbox_v5_box.h
@@ -44,6 +44,9 @@ typedef struct SmallV5Batch {
     uint8_t count;
 } SmallV5Batch;
 
+// Phase v5-7: C6 ULTRA TLS freelist capacity
+#define SMALL_V5_ULTRA_C6_CAP 32
+
 // SmallHeapCtxV5: per-thread hot heap context
 typedef struct SmallHeapCtxV5 {
     SmallClassHeapV5 cls[NUM_SMALL_CLASSES_V5];
@@ -52,6 +55,10 @@ typedef struct SmallHeapCtxV5 {
     void* c6_cached_block;   // Phase v5-5: C6 TLS cache (1-slot cache)
     bool batch_enabled;      // Phase v5-6: Batch enabled flag (cached from ENV)
     SmallV5Batch c6_batch;   // Phase v5-6: C6 TLS batch (4-slot buffer)
+    // Phase v5-7: C6 ULTRA TLS freelist
+    bool ultra_c6_enabled;   // cached from ENV
+    void* c6_tls_freelist[SMALL_V5_ULTRA_C6_CAP];  // 32-slot TLS freelist
+    uint8_t c6_tls_count;    // current slot count
 } SmallHeapCtxV5;
 
 // ============================================================================
diff --git a/core/box/smallobject_v5_env_box.h b/core/box/smallobject_v5_env_box.h
index c9076f6d..8f56fb22 100644
--- a/core/box/smallobject_v5_env_box.h
+++ b/core/box/smallobject_v5_env_box.h
@@ -171,4 +171,21 @@ static inline int small_heap_v5_batch_enabled(void) {
     return (g_batch_enabled == ENV_ENABLED);
 }
 
+// ============================================================================
+// Phase v5-7: C6 ULTRA mode configuration (research mode)
+// ============================================================================
+
+// small_heap_v5_ultra_c6_enabled() - C6 ULTRA mode enable check (default: disabled)
+// ENV: HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED={0|1}, default: 0
+// - 0: disabled (use existing v5 cache/batch path)
+// - 1: enabled (C6 uses 32-slot TLS freelist, ULTRA pattern)
+static inline int small_heap_v5_ultra_c6_enabled(void) {
+    static int g_ultra_c6_enabled = ENV_UNINIT;
+    if (__builtin_expect(g_ultra_c6_enabled == ENV_UNINIT, 0)) {
+        const char* e = getenv("HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED");
+        g_ultra_c6_enabled = (e && *e && *e != '0') ? ENV_ENABLED : ENV_DISABLED;
+    }
+    return (g_ultra_c6_enabled == ENV_ENABLED);
+}
+
 #endif // HAKMEM_SMALLOBJECT_V5_ENV_BOX_H
diff --git a/core/smallobject_hotbox_v5.c b/core/smallobject_hotbox_v5.c
index 01f2d220..d25a7710 100644
--- a/core/smallobject_hotbox_v5.c
+++ b/core/smallobject_hotbox_v5.c
@@ -21,7 +21,7 @@ static __thread SmallHeapCtxV5 g_small_heap_ctx_v5;
 static __thread int g_small_heap_ctx_v5_init = 0;
 
 SmallHeapCtxV5* small_heap_ctx_v5(void) {
-    // Phase v5-4/v5-5/v5-6: Lazy initialization of cached ENV flags
+    // Phase v5-4/v5-5/v5-6/v5-7: Lazy initialization of cached ENV flags
     if (unlikely(!g_small_heap_ctx_v5_init)) {
         g_small_heap_ctx_v5.header_mode = (uint8_t)small_heap_v5_header_mode();
         g_small_heap_ctx_v5.tls_cache_enabled = small_heap_v5_tls_cache_enabled();
@@ -31,6 +31,12 @@ SmallHeapCtxV5* small_heap_ctx_v5(void) {
         for (int i = 0; i < SMALL_V5_BATCH_CAP; i++) {
             g_small_heap_ctx_v5.c6_batch.slots[i] = NULL;
         }
+        // Phase v5-7: ULTRA C6 initialization
+        g_small_heap_ctx_v5.ultra_c6_enabled = small_heap_v5_ultra_c6_enabled();
+        g_small_heap_ctx_v5.c6_tls_count = 0;
+        for (int i = 0; i < SMALL_V5_ULTRA_C6_CAP; i++) {
+            g_small_heap_ctx_v5.c6_tls_freelist[i] = NULL;
+        }
         g_small_heap_ctx_v5_init = 1;
     }
     return &g_small_heap_ctx_v5;
 }
@@ -70,6 +76,81 @@ static SmallPageMetaV5* alloc_slow_v5(SmallHeapCtxV5* ctx, uint32_t class_idx) {
     return page;
 }
 
+// ============================================================================
+// Phase v5-7: C6 ULTRA slow path helpers
+// ============================================================================
+
+// ULTRA refill: Get blocks from segment and fill TLS freelist
+// Optimized: Batch page->used update, minimal loop overhead
+static void* small_alloc_slow_v5_c6_refill(SmallHeapCtxV5* ctx, uint32_t class_idx) {
+    // Get page from existing slow path
+    SmallPageMetaV5* page = alloc_slow_v5(ctx, class_idx);
+    if (unlikely(!page || !page->free_list)) {
+        // Cold refill failed, fallback to pool v1
+        extern void* hak_pool_try_alloc(size_t size, uintptr_t site_id);
+        return hak_pool_try_alloc(SMALL_HEAP_V5_C6_BLOCK_SIZE, 0);
+    }
+
+    // Pre-compute header value
+    const uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
+
+    // Fill TLS freelist from page freelist (up to ULTRA_CAP)
+    // Optimized: count filled blocks, batch update page->used at end
+    int filled = 0;
+    const int max_fill = SMALL_V5_ULTRA_C6_CAP - ctx->c6_tls_count;
+
+    while (page->free_list && filled < max_fill) {
+        void* blk = page->free_list;
+        void* next;
+        memcpy(&next, blk, sizeof(void*));
+        page->free_list = next;
+
+        // Write header (required because freelist overwrites it)
+        *((uint8_t*)blk) = desired_header;
+
+        ctx->c6_tls_freelist[ctx->c6_tls_count++] = blk;
+        filled++;
+    }
+
+    // Batch update page->used (one write instead of N)
+    page->used += (uint16_t)filled;
+
+    if (unlikely(filled == 0)) {
+        extern void* hak_pool_try_alloc(size_t size, uintptr_t site_id);
+        return hak_pool_try_alloc(SMALL_HEAP_V5_C6_BLOCK_SIZE, 0);
+    }
+
+    // Pop one and return (already has header written)
+    void* ret = ctx->c6_tls_freelist[--ctx->c6_tls_count];
+    return (uint8_t*)ret + 1;  // Return USER pointer
+}
+
+// ULTRA drain: Push TLS freelist back to page freelist
+// Note: each drained block is pushed back to its own page, looked up per block
+static void small_free_slow_v5_c6_drain(void* base_ptr, SmallHeapCtxV5* ctx, SmallPageMetaV5* page) {
+    // Drain half of TLS freelist to make room
+    int drain_count = ctx->c6_tls_count / 2;
+    if (drain_count < 1) drain_count = 1;
+
+    // Drain blocks back to their pages
+    // Note: All blocks in TLS likely belong to the same page (common case)
+    for (int i = 0; i < drain_count; i++) {
+        void* blk = ctx->c6_tls_freelist[--ctx->c6_tls_count];
+        // blk is BASE pointer, look up its page
+        SmallPageMetaV5* blk_page = small_segment_v5_page_meta_of((uint8_t*)blk + 1);
+        if (likely(blk_page)) {
+            // Push as BASE pointer (next at offset 0)
+            void* head = blk_page->free_list;
+            memcpy(blk, &head, sizeof(void*));
+            blk_page->free_list = blk;
+            blk_page->used--;  // Decrement used (no underflow check for speed)
+        }
+    }
+
+    // Push the current block to TLS freelist
+    ctx->c6_tls_freelist[ctx->c6_tls_count++] = base_ptr;
+}
+
 // ============================================================================
 // Phase v5-2: Fast alloc (C6-only full implementation)
 // ============================================================================
@@ -83,6 +164,18 @@ void* small_alloc_fast_v5(size_t size, uint32_t class_idx, SmallHeapCtxV5* ctx)
         return hak_pool_try_alloc(size, 0);
     }
 
+    // Phase v5-7: ULTRA fast path (C6 only, minimal branches)
+    if (ctx->ultra_c6_enabled) {
+        uint8_t cnt = ctx->c6_tls_count;
+        if (likely(cnt > 0)) {
+            // ULTRA fast: pop from TLS freelist (header already written at refill)
+            ctx->c6_tls_count = cnt - 1;
+            return (uint8_t*)ctx->c6_tls_freelist[cnt - 1] + 1;  // Return USER pointer
+        }
+        // ULTRA slow: refill TLS freelist from page
+        return small_alloc_slow_v5_c6_refill(ctx, class_idx);
+    }
+
     // Phase v5-5: TLS cache hit path (C6 only)
     if (unlikely(ctx->tls_cache_enabled)) {
         void* cached = ctx->c6_cached_block;
@@ -221,6 +314,13 @@ static inline page_loc_t get_page_location(SmallClassHeapV5* h, SmallPageMetaV5*
     return LOC_NONE;
 }
 
+// ============================================================================
+// Phase v5-7: Lightweight segment check (faster than page_meta_of)
+// ============================================================================
+
+// Import from smallsegment_v5.c
+extern int small_segment_v5_owns_ptr_fast(void* ptr);
+
 // ============================================================================
 // Phase v5-3: Fast free (C6-only O(1) implementation)
 // ============================================================================
@@ -236,7 +336,30 @@ void small_free_fast_v5(void* ptr, uint32_t class_idx, SmallHeapCtxV5* ctx) {
         return;
     }
 
-    // Phase v5-3: O(1) segment lookup (no list search)
+    // Phase v5-7: ULTRA free path - skip page_meta_of for fast path
+    if (ctx->ultra_c6_enabled) {
+        // Quick segment ownership check (no page_meta access)
+        if (likely(small_segment_v5_owns_ptr_fast(ptr))) {
+            uint8_t cnt = ctx->c6_tls_count;
+            if (likely(cnt < SMALL_V5_ULTRA_C6_CAP)) {
+                // ULTRA fast: push to TLS freelist (no page_meta touch)
+                ctx->c6_tls_freelist[cnt] = (uint8_t*)ptr - 1;  // Store BASE
+                ctx->c6_tls_count = cnt + 1;
+                return;
+            }
+            // ULTRA slow: need page_meta for drain
+            SmallPageMetaV5* page = small_segment_v5_page_meta_of(ptr);
+            if (page) {
+                small_free_slow_v5_c6_drain((uint8_t*)ptr - 1, ctx, page);
+                return;
+            }
+        }
+        // Not in v5 segment, fallback to pool v1
+        hak_pool_free(ptr, 0, 0);
+        return;
+    }
+
+    // Non-ULTRA path: need page_meta_of
     SmallPageMetaV5* page = small_segment_v5_page_meta_of(ptr);
     if (unlikely(!page)) {
         // Not in v5 segment, fallback to pool v1
diff --git a/core/smallsegment_v5.c b/core/smallsegment_v5.c
index 8d5ab7a1..7f00e85e 100644
--- a/core/smallsegment_v5.c
+++ b/core/smallsegment_v5.c
@@ -185,6 +185,16 @@ void small_segment_v5_free_page(SmallPageMetaV5* page) {
 // O(1) Page Metadata Lookup (Phase v5-3: Single segment, no search)
 // ============================================================================
 
+// Phase v5-7: Fast segment ownership check (no page_meta access)
+int small_segment_v5_owns_ptr_fast(void* ptr) {
+    if (unlikely(!ptr)) return 0;
+    TLSSegmentSlot* slot = &g_tls_segment_v5;
+    if (unlikely(!slot->in_use)) return 0;
+    uintptr_t addr = (uintptr_t)ptr;
+    uintptr_t seg_base = slot->seg.base;
+    return (addr >= seg_base && addr < seg_base + SMALL_SEGMENT_V5_SIZE);
+}
+
 SmallPageMetaV5* small_segment_v5_page_meta_of(void* ptr) {
     if (unlikely(!ptr)) {
         return NULL;