Phase v5-7: C6 ULTRA pattern (research mode, 32-slot TLS freelist)
Implementation:
- ENV: HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED=0|1 (default: 0)
- SmallHeapCtxV5: added c6_tls_freelist[32], c6_tls_count, ultra_c6_enabled
- small_segment_v5_owns_ptr_fast(): lightweight segment check for the free path
- small_alloc_slow_v5_c6_refill(): batch TLS fill from the page freelist
- small_free_slow_v5_c6_drain(): drain half of the TLS freelist back to the page on overflow

Performance (C6-heavy 257-768B, 2M iters, ws=400):
- v5 OFF baseline: 47M ops/s
- v5 ULTRA: 37-38M ops/s (-20% vs the OFF baseline)
- vs v5 base (no opts): +3-5% improvement

Design limitations identified:
- A header write is required on every alloc (the freelist next pointer overwrites the header byte)
- Segment validation is required on every free
- page->used tracking is required for page retirement
- Together these prevent matching baseline pool v1 performance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
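The header-write limitation comes from the block layout on the v5 C6 path: the 1-byte class header lives at offset 0 of the block's BASE pointer, and the page freelist stores its next pointer at that same offset, so every refill has to rewrite the header after unlinking a block (see small_alloc_slow_v5_c6_refill() in the diff below). A minimal sketch of that pop-and-restore step; the HEADER_MAGIC / HEADER_CLASS_MASK values here are placeholders, not the real hakmem definitions:

```c
#include <stdint.h>
#include <string.h>

/* Placeholder values for illustration only; the real HEADER_MAGIC and
 * HEADER_CLASS_MASK are defined elsewhere in the hakmem sources. */
#define HEADER_MAGIC      0xA0u
#define HEADER_CLASS_MASK 0x0Fu

/* Unlink one block from a singly linked page freelist and restore its header.
 * The freelist "next" pointer and the 1-byte header share offset 0 of the
 * BASE pointer, so the header byte must be rewritten on every refill; the
 * USER pointer handed out starts at offset 1. */
static void* pop_and_restore_header(void** free_list, uint32_t class_idx) {
    void* blk = *free_list;
    if (!blk) return NULL;

    void* next;
    memcpy(&next, blk, sizeof(void*));  /* next pointer was stored at offset 0 */
    *free_list = next;

    /* Rewriting the header clobbers the first byte of the stored next pointer,
     * which is fine now that the block is off the freelist. */
    *(uint8_t*)blk = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));

    return (uint8_t*)blk + 1;  /* USER pointer */
}
```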
@@ -44,6 +44,9 @@ typedef struct SmallV5Batch {
    uint8_t count;
} SmallV5Batch;

// Phase v5-7: C6 ULTRA TLS freelist capacity
#define SMALL_V5_ULTRA_C6_CAP 32

// SmallHeapCtxV5: per-thread hot heap context
typedef struct SmallHeapCtxV5 {
    SmallClassHeapV5 cls[NUM_SMALL_CLASSES_V5];
@@ -52,6 +55,10 @@ typedef struct SmallHeapCtxV5 {
    void* c6_cached_block; // Phase v5-5: C6 TLS cache (1-slot cache)
    bool batch_enabled; // Phase v5-6: Batch enabled flag (cached from ENV)
    SmallV5Batch c6_batch; // Phase v5-6: C6 TLS batch (4-slot buffer)
    // Phase v5-7: C6 ULTRA TLS freelist
    bool ultra_c6_enabled; // cached from ENV
    void* c6_tls_freelist[SMALL_V5_ULTRA_C6_CAP]; // 32-slot TLS freelist
    uint8_t c6_tls_count; // current slot count
} SmallHeapCtxV5;

// ============================================================================
@@ -171,4 +171,21 @@ static inline int small_heap_v5_batch_enabled(void) {
    return (g_batch_enabled == ENV_ENABLED);
}

// ============================================================================
// Phase v5-7: C6 ULTRA mode configuration (research mode)
// ============================================================================

// small_heap_v5_ultra_c6_enabled() - C6 ULTRA mode enable check (default: disabled)
// ENV: HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED={0|1}, default: 0
// - 0: disabled (use existing v5 cache/batch path)
// - 1: enabled (C6 uses 32-slot TLS freelist, ULTRA pattern)
static inline int small_heap_v5_ultra_c6_enabled(void) {
    static int g_ultra_c6_enabled = ENV_UNINIT;
    if (__builtin_expect(g_ultra_c6_enabled == ENV_UNINIT, 0)) {
        const char* e = getenv("HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED");
        g_ultra_c6_enabled = (e && *e && *e != '0') ? ENV_ENABLED : ENV_DISABLED;
    }
    return (g_ultra_c6_enabled == ENV_ENABLED);
}

#endif // HAKMEM_SMALLOBJECT_V5_ENV_BOX_H
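small_heap_v5_ultra_c6_enabled() reads the environment once and caches the result, and small_heap_ctx_v5() caches it again per thread, so the flag must be set before the first small allocation that consults it. A rough usage sketch, assuming the hakmem allocator actually services malloc/free in the run (for example linked in or preloaded; that wiring is outside this diff), with a loop loosely mirroring the commit's C6-heavy benchmark shape (257-768B, ws=400, 2M iterations):

```c
#define _POSIX_C_SOURCE 200112L  /* for setenv() */
#include <stdlib.h>

int main(void) {
    /* Must happen before the first allocation that reads the flag: the value
     * is fetched via getenv() once and then cached. */
    setenv("HAKMEM_SMALL_HEAP_V5_ULTRA_C6_ENABLED", "1", 1);

    /* Loosely mirrors the benchmark in the commit message: 257-768B blocks,
     * a 400-slot working set, 2M alloc/free iterations. */
    enum { WS = 400 };
    static void* slots[WS];
    for (long i = 0; i < 2000000; i++) {
        long idx = i % WS;
        free(slots[idx]);                               /* free(NULL) is a no-op */
        slots[idx] = malloc((size_t)(257 + (i % 512))); /* 257..768 bytes */
    }
    for (long idx = 0; idx < WS; idx++) free(slots[idx]);
    return 0;
}
```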
@@ -21,7 +21,7 @@ static __thread SmallHeapCtxV5 g_small_heap_ctx_v5;
static __thread int g_small_heap_ctx_v5_init = 0;

SmallHeapCtxV5* small_heap_ctx_v5(void) {
    // Phase v5-4/v5-5/v5-6: Lazy initialization of cached ENV flags
    // Phase v5-4/v5-5/v5-6/v5-7: Lazy initialization of cached ENV flags
    if (unlikely(!g_small_heap_ctx_v5_init)) {
        g_small_heap_ctx_v5.header_mode = (uint8_t)small_heap_v5_header_mode();
        g_small_heap_ctx_v5.tls_cache_enabled = small_heap_v5_tls_cache_enabled();

@@ -31,6 +31,12 @@ SmallHeapCtxV5* small_heap_ctx_v5(void) {
        for (int i = 0; i < SMALL_V5_BATCH_CAP; i++) {
            g_small_heap_ctx_v5.c6_batch.slots[i] = NULL;
        }
        // Phase v5-7: ULTRA C6 initialization
        g_small_heap_ctx_v5.ultra_c6_enabled = small_heap_v5_ultra_c6_enabled();
        g_small_heap_ctx_v5.c6_tls_count = 0;
        for (int i = 0; i < SMALL_V5_ULTRA_C6_CAP; i++) {
            g_small_heap_ctx_v5.c6_tls_freelist[i] = NULL;
        }
        g_small_heap_ctx_v5_init = 1;
    }
    return &g_small_heap_ctx_v5;
@@ -70,6 +76,81 @@ static SmallPageMetaV5* alloc_slow_v5(SmallHeapCtxV5* ctx, uint32_t class_idx) {
    return page;
}

// ============================================================================
// Phase v5-7: C6 ULTRA slow path helpers
// ============================================================================

// ULTRA refill: Get blocks from segment and fill TLS freelist
// Optimized: Batch page->used update, minimal loop overhead
static void* small_alloc_slow_v5_c6_refill(SmallHeapCtxV5* ctx, uint32_t class_idx) {
    // Get page from existing slow path
    SmallPageMetaV5* page = alloc_slow_v5(ctx, class_idx);
    if (unlikely(!page || !page->free_list)) {
        // Cold refill failed, fallback to pool v1
        extern void* hak_pool_try_alloc(size_t size, uintptr_t site_id);
        return hak_pool_try_alloc(SMALL_HEAP_V5_C6_BLOCK_SIZE, 0);
    }

    // Pre-compute header value
    const uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));

    // Fill TLS freelist from page freelist (up to ULTRA_CAP)
    // Optimized: count filled blocks, batch update page->used at end
    int filled = 0;
    const int max_fill = SMALL_V5_ULTRA_C6_CAP - ctx->c6_tls_count;

    while (page->free_list && filled < max_fill) {
        void* blk = page->free_list;
        void* next;
        memcpy(&next, blk, sizeof(void*));
        page->free_list = next;

        // Write header (required because freelist overwrites it)
        *((uint8_t*)blk) = desired_header;

        ctx->c6_tls_freelist[ctx->c6_tls_count++] = blk;
        filled++;
    }

    // Batch update page->used (one write instead of N)
    page->used += (uint16_t)filled;

    if (unlikely(filled == 0)) {
        extern void* hak_pool_try_alloc(size_t size, uintptr_t site_id);
        return hak_pool_try_alloc(SMALL_HEAP_V5_C6_BLOCK_SIZE, 0);
    }

    // Pop one and return (already has header written)
    void* ret = ctx->c6_tls_freelist[--ctx->c6_tls_count];
    return (uint8_t*)ret + 1; // Return USER pointer
}

// ULTRA drain: Push TLS freelist back to page freelist
// Optimized: batch page->used update, use page from arg when possible
static void small_free_slow_v5_c6_drain(void* base_ptr, SmallHeapCtxV5* ctx, SmallPageMetaV5* page) {
    // Drain half of TLS freelist to make room
    int drain_count = ctx->c6_tls_count / 2;
    if (drain_count < 1) drain_count = 1;

    // Drain blocks back to their pages
    // Note: All blocks in TLS likely belong to the same page (common case)
    for (int i = 0; i < drain_count; i++) {
        void* blk = ctx->c6_tls_freelist[--ctx->c6_tls_count];
        // blk is BASE pointer, look up its page
        SmallPageMetaV5* blk_page = small_segment_v5_page_meta_of((uint8_t*)blk + 1);
        if (likely(blk_page)) {
            // Push as BASE pointer (next at offset 0)
            void* head = blk_page->free_list;
            memcpy(blk, &head, sizeof(void*));
            blk_page->free_list = blk;
            blk_page->used--; // Decrement used (no underflow check for speed)
        }
    }

    // Push the current block to TLS freelist
    ctx->c6_tls_freelist[ctx->c6_tls_count++] = base_ptr;
}

// ============================================================================
// Phase v5-2: Fast alloc (C6-only full implementation)
// ============================================================================
@@ -83,6 +164,18 @@ void* small_alloc_fast_v5(size_t size, uint32_t class_idx, SmallHeapCtxV5* ctx)
        return hak_pool_try_alloc(size, 0);
    }

    // Phase v5-7: ULTRA fast path (C6 only, minimal branches)
    if (ctx->ultra_c6_enabled) {
        uint8_t cnt = ctx->c6_tls_count;
        if (likely(cnt > 0)) {
            // ULTRA fast: pop from TLS freelist (header already written at refill)
            ctx->c6_tls_count = cnt - 1;
            return (uint8_t*)ctx->c6_tls_freelist[cnt - 1] + 1; // Return USER pointer
        }
        // ULTRA slow: refill TLS freelist from page
        return small_alloc_slow_v5_c6_refill(ctx, class_idx);
    }

    // Phase v5-5: TLS cache hit path (C6 only)
    if (unlikely(ctx->tls_cache_enabled)) {
        void* cached = ctx->c6_cached_block;
@@ -221,6 +314,13 @@ static inline page_loc_t get_page_location(SmallClassHeapV5* h, SmallPageMetaV5*
    return LOC_NONE;
}

// ============================================================================
// Phase v5-7: Lightweight segment check (faster than page_meta_of)
// ============================================================================

// Import from smallsegment_v5.c
extern int small_segment_v5_owns_ptr_fast(void* ptr);

// ============================================================================
// Phase v5-3: Fast free (C6-only O(1) implementation)
// ============================================================================
@@ -236,7 +336,30 @@ void small_free_fast_v5(void* ptr, uint32_t class_idx, SmallHeapCtxV5* ctx) {
        return;
    }

    // Phase v5-3: O(1) segment lookup (no list search)
    // Phase v5-7: ULTRA free path - skip page_meta_of for fast path
    if (ctx->ultra_c6_enabled) {
        // Quick segment ownership check (no page_meta access)
        if (likely(small_segment_v5_owns_ptr_fast(ptr))) {
            uint8_t cnt = ctx->c6_tls_count;
            if (likely(cnt < SMALL_V5_ULTRA_C6_CAP)) {
                // ULTRA fast: push to TLS freelist (no page_meta touch)
                ctx->c6_tls_freelist[cnt] = (uint8_t*)ptr - 1; // Store BASE
                ctx->c6_tls_count = cnt + 1;
                return;
            }
            // ULTRA slow: need page_meta for drain
            SmallPageMetaV5* page = small_segment_v5_page_meta_of(ptr);
            if (page) {
                small_free_slow_v5_c6_drain((uint8_t*)ptr - 1, ctx, page);
                return;
            }
        }
        // Not in v5 segment, fallback to pool v1
        hak_pool_free(ptr, 0, 0);
        return;
    }

    // Non-ULTRA path: need page_meta_of
    SmallPageMetaV5* page = small_segment_v5_page_meta_of(ptr);
    if (unlikely(!page)) {
        // Not in v5 segment, fallback to pool v1
@@ -185,6 +185,16 @@ void small_segment_v5_free_page(SmallPageMetaV5* page) {
// O(1) Page Metadata Lookup (Phase v5-3: Single segment, no search)
// ============================================================================

// Phase v5-7: Fast segment ownership check (no page_meta access)
int small_segment_v5_owns_ptr_fast(void* ptr) {
    if (unlikely(!ptr)) return 0;
    TLSSegmentSlot* slot = &g_tls_segment_v5;
    if (unlikely(!slot->in_use)) return 0;
    uintptr_t addr = (uintptr_t)ptr;
    uintptr_t seg_base = slot->seg.base;
    return (addr >= seg_base && addr < seg_base + SMALL_SEGMENT_V5_SIZE);
}

SmallPageMetaV5* small_segment_v5_page_meta_of(void* ptr) {
    if (unlikely(!ptr)) {
        return NULL;