Phase v5-7: C6 ULTRA pattern (research mode, 32-slot TLS freelist)

Implementation:
- ENV: HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED=0|1 (default: 0)
- SmallHeapCtxV5: added c6_tls_freelist[32], c6_tls_count, ultra_c6_enabled
- small_segment_v5_owns_ptr_fast(): lightweight segment check for free path
- small_alloc_slow_v5_c6_refill(): batch TLS fill from page freelist
- small_free_slow_v5_c6_drain(): drain half of TLS to page on overflow (overall pattern sketched below)
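For orientation before the diff, here is a minimal standalone sketch of the ULTRA pattern. The names (`ultra_alloc`, `ultra_free`, and the stubbed `refill`/`drain_half` helpers) are illustrative stand-ins, not the commit's actual symbols:

```c
#include <stdint.h>

#define ULTRA_CAP 32  /* mirrors SMALL_V5_ULTRA_C6_CAP */

/* Per-thread cache of BASE pointers whose 1-byte header is already written. */
static __thread void*   tls_slots[ULTRA_CAP];
static __thread uint8_t tls_count;

/* Stand-ins for small_alloc_slow_v5_c6_refill() / small_free_slow_v5_c6_drain(). */
static void* refill(void)     { return (void*)0; /* real code batch-fills tls_slots from the page freelist */ }
static void  drain_half(void) { /* real code pushes half of tls_slots back to their owning pages */ }

/* Alloc hot path: pop one cached block, no header write, no page metadata touch. */
static inline void* ultra_alloc(void) {
    if (tls_count > 0)
        return (uint8_t*)tls_slots[--tls_count] + 1;  /* USER pointer = BASE + 1-byte header */
    return refill();                                  /* cold path: batch refill, then pop */
}

/* Free hot path: push the block back; drain half of the slots only on overflow. */
static inline void ultra_free(void* user_ptr) {
    void* base = (uint8_t*)user_ptr - 1;              /* back to BASE pointer */
    if (tls_count == ULTRA_CAP)
        drain_half();
    tls_slots[tls_count++] = base;
}
```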

Performance (C6-heavy workload, 257-768B allocations, 2M iterations, ws=400):
- v5 OFF baseline: 47M ops/s
- v5 ULTRA: 37-38M ops/s (-20% vs the OFF baseline)
- vs v5 base (no optimizations): +3-5% improvement

Design limitations identified (the header/freelist byte overlap is sketched after this list):
- Header write required on every alloc (freelist overwrites header byte)
- Segment validation required on every free
- page->used tracking required for retirement
- These prevent matching baseline pool v1 performance
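To illustrate the first point: byte 0 of a block holds the 1-byte header while the block is live, but the same bytes hold the freelist `next` link while the block is free, so every allocation has to rewrite the header. A hypothetical layout sketch follows; the magic/mask values are placeholders, not the commit's HEADER_MAGIC / HEADER_CLASS_MASK:

```c
#include <stdint.h>
#include <string.h>

/* Placeholder values; the real HEADER_MAGIC / HEADER_CLASS_MASK live elsewhere. */
#define EX_HEADER_MAGIC      0xA0u
#define EX_HEADER_CLASS_MASK 0x0Fu

/* Free: bytes [0, sizeof(void*)) of the block store the freelist 'next' link,
 * which clobbers the header byte at offset 0. */
static inline void push_free(void* base, void** freelist_head) {
    void* next = *freelist_head;
    memcpy(base, &next, sizeof next);
    *freelist_head = base;
}

/* Alloc: the header byte must therefore be rewritten before handing out the block. */
static inline void* pop_live(void** freelist_head, uint32_t class_idx) {
    void* base = *freelist_head;
    memcpy(freelist_head, base, sizeof(void*));   /* advance the freelist */
    *(uint8_t*)base = (uint8_t)(EX_HEADER_MAGIC | (class_idx & EX_HEADER_CLASS_MASK));
    return (uint8_t*)base + 1;                    /* USER pointer begins after the header */
}
```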

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Moe Charm (CI)
2025-12-11 13:32:46 +09:00
parent f191774c1e
commit 8789542a9f
4 changed files with 159 additions and 2 deletions


@@ -44,6 +44,9 @@ typedef struct SmallV5Batch {
    uint8_t count;
} SmallV5Batch;
// Phase v5-7: C6 ULTRA TLS freelist capacity
#define SMALL_V5_ULTRA_C6_CAP 32
// SmallHeapCtxV5: per-thread hot heap context
typedef struct SmallHeapCtxV5 {
    SmallClassHeapV5 cls[NUM_SMALL_CLASSES_V5];
@@ -52,6 +55,10 @@ typedef struct SmallHeapCtxV5 {
    void* c6_cached_block; // Phase v5-5: C6 TLS cache (1-slot cache)
    bool batch_enabled; // Phase v5-6: Batch enabled flag (cached from ENV)
    SmallV5Batch c6_batch; // Phase v5-6: C6 TLS batch (4-slot buffer)
    // Phase v5-7: C6 ULTRA TLS freelist
    bool ultra_c6_enabled; // cached from ENV
    void* c6_tls_freelist[SMALL_V5_ULTRA_C6_CAP]; // 32-slot TLS freelist
    uint8_t c6_tls_count; // current slot count
} SmallHeapCtxV5;
// ============================================================================


@@ -171,4 +171,21 @@ static inline int small_heap_v5_batch_enabled(void) {
    return (g_batch_enabled == ENV_ENABLED);
}
// ============================================================================
// Phase v5-7: C6 ULTRA mode configuration (research mode)
// ============================================================================
// small_heap_v5_ultra_c6_enabled() - C6 ULTRA mode enable check (default: disabled)
// ENV: HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED={0|1}, default: 0
// - 0: disabled (use existing v5 cache/batch path)
// - 1: enabled (C6 uses 32-slot TLS freelist, ULTRA pattern)
static inline int small_heap_v5_ultra_c6_enabled(void) {
    static int g_ultra_c6_enabled = ENV_UNINIT;
    if (__builtin_expect(g_ultra_c6_enabled == ENV_UNINIT, 0)) {
        const char* e = getenv("HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED");
        g_ultra_c6_enabled = (e && *e && *e != '0') ? ENV_ENABLED : ENV_DISABLED;
    }
    return (g_ultra_c6_enabled == ENV_ENABLED);
}
#endif // HAKMEM_SMALLOBJECT_V5_ENV_BOX_H


@@ -21,7 +21,7 @@ static __thread SmallHeapCtxV5 g_small_heap_ctx_v5;
static __thread int g_small_heap_ctx_v5_init = 0;
SmallHeapCtxV5* small_heap_ctx_v5(void) {
    // Phase v5-4/v5-5/v5-6/v5-7: Lazy initialization of cached ENV flags
    if (unlikely(!g_small_heap_ctx_v5_init)) {
        g_small_heap_ctx_v5.header_mode = (uint8_t)small_heap_v5_header_mode();
        g_small_heap_ctx_v5.tls_cache_enabled = small_heap_v5_tls_cache_enabled();
@@ -31,6 +31,12 @@ SmallHeapCtxV5* small_heap_ctx_v5(void) {
        for (int i = 0; i < SMALL_V5_BATCH_CAP; i++) {
            g_small_heap_ctx_v5.c6_batch.slots[i] = NULL;
        }
        // Phase v5-7: ULTRA C6 initialization
        g_small_heap_ctx_v5.ultra_c6_enabled = small_heap_v5_ultra_c6_enabled();
        g_small_heap_ctx_v5.c6_tls_count = 0;
        for (int i = 0; i < SMALL_V5_ULTRA_C6_CAP; i++) {
            g_small_heap_ctx_v5.c6_tls_freelist[i] = NULL;
        }
        g_small_heap_ctx_v5_init = 1;
    }
    return &g_small_heap_ctx_v5;
@@ -70,6 +76,81 @@ static SmallPageMetaV5* alloc_slow_v5(SmallHeapCtxV5* ctx, uint32_t class_idx) {
    return page;
}
// ============================================================================
// Phase v5-7: C6 ULTRA slow path helpers
// ============================================================================
// ULTRA refill: Get blocks from segment and fill TLS freelist
// Optimized: Batch page->used update, minimal loop overhead
static void* small_alloc_slow_v5_c6_refill(SmallHeapCtxV5* ctx, uint32_t class_idx) {
    // Get page from existing slow path
    SmallPageMetaV5* page = alloc_slow_v5(ctx, class_idx);
    if (unlikely(!page || !page->free_list)) {
        // Cold refill failed, fallback to pool v1
        extern void* hak_pool_try_alloc(size_t size, uintptr_t site_id);
        return hak_pool_try_alloc(SMALL_HEAP_V5_C6_BLOCK_SIZE, 0);
    }
    // Pre-compute header value
    const uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
    // Fill TLS freelist from page freelist (up to ULTRA_CAP)
    // Optimized: count filled blocks, batch update page->used at end
    int filled = 0;
    const int max_fill = SMALL_V5_ULTRA_C6_CAP - ctx->c6_tls_count;
    while (page->free_list && filled < max_fill) {
        void* blk = page->free_list;
        void* next;
        memcpy(&next, blk, sizeof(void*));
        page->free_list = next;
        // Write header (required because freelist overwrites it)
        *((uint8_t*)blk) = desired_header;
        ctx->c6_tls_freelist[ctx->c6_tls_count++] = blk;
        filled++;
    }
    // Batch update page->used (one write instead of N)
    page->used += (uint16_t)filled;
    if (unlikely(filled == 0)) {
        extern void* hak_pool_try_alloc(size_t size, uintptr_t site_id);
        return hak_pool_try_alloc(SMALL_HEAP_V5_C6_BLOCK_SIZE, 0);
    }
    // Pop one and return (already has header written)
    void* ret = ctx->c6_tls_freelist[--ctx->c6_tls_count];
    return (uint8_t*)ret + 1; // Return USER pointer
}
// ULTRA drain: Push TLS freelist back to page freelist
// Optimized: batch page->used update, use page from arg when possible
static void small_free_slow_v5_c6_drain(void* base_ptr, SmallHeapCtxV5* ctx, SmallPageMetaV5* page) {
    // Drain half of TLS freelist to make room
    int drain_count = ctx->c6_tls_count / 2;
    if (drain_count < 1) drain_count = 1;
    // Drain blocks back to their pages
    // Note: All blocks in TLS likely belong to the same page (common case)
    for (int i = 0; i < drain_count; i++) {
        void* blk = ctx->c6_tls_freelist[--ctx->c6_tls_count];
        // blk is BASE pointer, look up its page
        SmallPageMetaV5* blk_page = small_segment_v5_page_meta_of((uint8_t*)blk + 1);
        if (likely(blk_page)) {
            // Push as BASE pointer (next at offset 0)
            void* head = blk_page->free_list;
            memcpy(blk, &head, sizeof(void*));
            blk_page->free_list = blk;
            blk_page->used--; // Decrement used (no underflow check for speed)
        }
    }
    // Push the current block to TLS freelist
    ctx->c6_tls_freelist[ctx->c6_tls_count++] = base_ptr;
}
// ============================================================================
// Phase v5-2: Fast alloc (C6-only full implementation)
// ============================================================================
@@ -83,6 +164,18 @@ void* small_alloc_fast_v5(size_t size, uint32_t class_idx, SmallHeapCtxV5* ctx)
        return hak_pool_try_alloc(size, 0);
    }
    // Phase v5-7: ULTRA fast path (C6 only, minimal branches)
    if (ctx->ultra_c6_enabled) {
        uint8_t cnt = ctx->c6_tls_count;
        if (likely(cnt > 0)) {
            // ULTRA fast: pop from TLS freelist (header already written at refill)
            ctx->c6_tls_count = cnt - 1;
            return (uint8_t*)ctx->c6_tls_freelist[cnt - 1] + 1; // Return USER pointer
        }
        // ULTRA slow: refill TLS freelist from page
        return small_alloc_slow_v5_c6_refill(ctx, class_idx);
    }
    // Phase v5-5: TLS cache hit path (C6 only)
    if (unlikely(ctx->tls_cache_enabled)) {
        void* cached = ctx->c6_cached_block;
@@ -221,6 +314,13 @@ static inline page_loc_t get_page_location(SmallClassHeapV5* h, SmallPageMetaV5*
    return LOC_NONE;
}
// ============================================================================
// Phase v5-7: Lightweight segment check (faster than page_meta_of)
// ============================================================================
// Import from smallsegment_v5.c
extern int small_segment_v5_owns_ptr_fast(void* ptr);
// ============================================================================
// Phase v5-3: Fast free (C6-only O(1) implementation)
// ============================================================================
@@ -236,7 +336,30 @@ void small_free_fast_v5(void* ptr, uint32_t class_idx, SmallHeapCtxV5* ctx) {
        return;
    }
    // Phase v5-7: ULTRA free path - skip page_meta_of for fast path
    if (ctx->ultra_c6_enabled) {
        // Quick segment ownership check (no page_meta access)
        if (likely(small_segment_v5_owns_ptr_fast(ptr))) {
            uint8_t cnt = ctx->c6_tls_count;
            if (likely(cnt < SMALL_V5_ULTRA_C6_CAP)) {
                // ULTRA fast: push to TLS freelist (no page_meta touch)
                ctx->c6_tls_freelist[cnt] = (uint8_t*)ptr - 1; // Store BASE
                ctx->c6_tls_count = cnt + 1;
                return;
            }
            // ULTRA slow: need page_meta for drain
            SmallPageMetaV5* page = small_segment_v5_page_meta_of(ptr);
            if (page) {
                small_free_slow_v5_c6_drain((uint8_t*)ptr - 1, ctx, page);
                return;
            }
        }
        // Not in v5 segment, fallback to pool v1
        hak_pool_free(ptr, 0, 0);
        return;
    }
    // Non-ULTRA path: need page_meta_of
    SmallPageMetaV5* page = small_segment_v5_page_meta_of(ptr);
    if (unlikely(!page)) {
        // Not in v5 segment, fallback to pool v1


@@ -185,6 +185,16 @@ void small_segment_v5_free_page(SmallPageMetaV5* page) {
// O(1) Page Metadata Lookup (Phase v5-3: Single segment, no search)
// ============================================================================
// Phase v5-7: Fast segment ownership check (no page_meta access)
int small_segment_v5_owns_ptr_fast(void* ptr) {
    if (unlikely(!ptr)) return 0;
    TLSSegmentSlot* slot = &g_tls_segment_v5;
    if (unlikely(!slot->in_use)) return 0;
    uintptr_t addr = (uintptr_t)ptr;
    uintptr_t seg_base = slot->seg.base;
    return (addr >= seg_base && addr < seg_base + SMALL_SEGMENT_V5_SIZE);
}
SmallPageMetaV5* small_segment_v5_page_meta_of(void* ptr) {
    if (unlikely(!ptr)) {
        return NULL;