Phase v6-6: Inline hot path optimization for SmallObject Core v6

Optimize v6 alloc/free by eliminating redundant route checks and adding
inline hot path functions:

- smallobject_core_v6_box.h: Add inline hot path functions:
  - small_alloc_c6_hot_v6() / small_alloc_c5_hot_v6(): Direct TLS pop
  - small_free_c6_hot_v6() / small_free_c5_hot_v6(): Direct TLS push
  - No route check needed (caller already validated via the switch case; see the hot-path sketch after this list)

- smallobject_core_v6.c: Add cold path functions:
  - small_alloc_cold_v6(): Handle TLS refill from page
  - small_free_cold_v6(): Handle page freelist push (TLS full/cross-thread)

- malloc_tiny_fast.h: Update front gate to use inline hot path:
  - Alloc: hot path first, cold path fallback on TLS miss
  - Free: hot path first, cold path fallback on TLS full (front-gate sketch below)
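
The hot-path helpers are one-liners over the TLS arrays. A minimal sketch of their likely shape, assuming the SmallHeapCtxV6 fields and SMALL_V6_* macros used by the cold path below (exact signatures in smallobject_core_v6_box.h may differ):

```c
// Sketch only: inferred from the cold-path code in this diff.
static inline void* small_alloc_c6_hot_v6(SmallHeapCtxV6* ctx) {
    if (ctx->tls_count_c6 == 0)
        return NULL;                          // TLS miss -> cold path refill
    void* blk = ctx->tls_freelist_c6[--ctx->tls_count_c6];
    return SMALL_V6_USER_FROM_BASE(blk);      // header byte written at refill
}

static inline int small_free_c6_hot_v6(void* ptr, SmallHeapCtxV6* ctx) {
    if (ctx->tls_count_c6 >= SMALL_V6_TLS_CAP)
        return 0;                             // TLS full -> cold path
    ctx->tls_freelist_c6[ctx->tls_count_c6++] = SMALL_V6_BASE_FROM_USER(ptr);
    return 1;
}
```

The C5 variants mirror these with the tls_freelist_c5 / tls_count_c5 fields.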
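
The front gate then becomes try-hot-else-cold. A sketch using the hypothetical helpers above (tiny_alloc_c6_v6, tiny_free_c6_v6, and small_heap_ctx_v6 are illustrative names, not from this commit):

```c
// Sketch only: the size switch has already routed this request to C6.
static inline void* tiny_alloc_c6_v6(void) {
    SmallHeapCtxV6* ctx = small_heap_ctx_v6();   // assumed TLS-context accessor
    void* p = small_alloc_c6_hot_v6(ctx);        // hot: direct TLS pop
    return p ? p : small_alloc_cold_v6(SMALL_V6_C6_CLASS_IDX, ctx);
}

static inline void tiny_free_c6_v6(void* ptr) {
    SmallHeapCtxV6* ctx = small_heap_ctx_v6();
    if (!small_free_c6_hot_v6(ptr, ctx))         // hot: direct TLS push
        small_free_cold_v6(ptr, SMALL_V6_C6_CLASS_IDX);  // TLS full / cross-thread
}
```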

Performance results:
- C5-heavy: v6 ON 42.2M ≈ baseline (parity restored)
- C6-heavy: v6 ON 34.5M ≈ baseline (parity restored)
- Mixed 16-1024B: ~26.5M (v3-only: ~28.1M; the remaining gap is routing overhead)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Moe Charm (CI)
2025-12-11 15:59:29 +09:00
parent 1e04debb1b
commit e2ca52d59d
3 changed files with 190 additions and 6 deletions

@@ -243,3 +243,99 @@ void small_free_fast_v6(void* ptr,
        small_cold_v6_retire_page(page);
    }
}

// ============================================================================
// Cold Path Implementation (Phase v6-6)
// ============================================================================
/// Cold path: alloc with refill - called when TLS is empty
/// @param class_idx: C5 or C6
/// @param ctx: TLS context
/// @return: USER pointer or NULL
void* small_alloc_cold_v6(uint32_t class_idx, SmallHeapCtxV6* ctx) {
    // Refill TLS from page
    SmallPageMetaV6* page = small_cold_v6_refill_page(class_idx);
    if (!page || !page->free_list) {
        return hak_pool_try_alloc(class_idx == SMALL_V6_C6_CLASS_IDX ? 512 : 256, 0);
    }
    uint8_t header_byte = SMALL_V6_HEADER_FROM_CLASS(class_idx);
    if (class_idx == SMALL_V6_C6_CLASS_IDX) {
        int max_fill = SMALL_V6_TLS_CAP - ctx->tls_count_c6;
        int filled = 0;
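        // Batch-move up to max_fill - 1 blocks into TLS, reserving one
        // block to return directly to the caller below.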
        while (page->free_list && filled < max_fill - 1) {
            void* blk = page->free_list;
            page->free_list = *(void**)blk;
            ((uint8_t*)blk)[0] = header_byte;
            ctx->tls_freelist_c6[ctx->tls_count_c6++] = blk;
            filled++;
        }
        page->used += filled;
        if (page->free_list) {
            void* blk = page->free_list;
            page->free_list = *(void**)blk;
            page->used++;
            ((uint8_t*)blk)[0] = header_byte;
            return SMALL_V6_USER_FROM_BASE(blk);
        }
        if (ctx->tls_count_c6 > 0) {
            void* blk = ctx->tls_freelist_c6[--ctx->tls_count_c6];
            return SMALL_V6_USER_FROM_BASE(blk);
        }
    }
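    // C5 branch mirrors the C6 branch above with the tls_*_c5 fields.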
    else if (class_idx == SMALL_V6_C5_CLASS_IDX) {
        int max_fill = SMALL_V6_TLS_CAP - ctx->tls_count_c5;
        int filled = 0;
        while (page->free_list && filled < max_fill - 1) {
            void* blk = page->free_list;
            page->free_list = *(void**)blk;
            ((uint8_t*)blk)[0] = header_byte;
            ctx->tls_freelist_c5[ctx->tls_count_c5++] = blk;
            filled++;
        }
        page->used += filled;
        if (page->free_list) {
            void* blk = page->free_list;
            page->free_list = *(void**)blk;
            page->used++;
            ((uint8_t*)blk)[0] = header_byte;
            return SMALL_V6_USER_FROM_BASE(blk);
        }
        if (ctx->tls_count_c5 > 0) {
            void* blk = ctx->tls_freelist_c5[--ctx->tls_count_c5];
            return SMALL_V6_USER_FROM_BASE(blk);
        }
    }
    return hak_pool_try_alloc(class_idx == SMALL_V6_C6_CLASS_IDX ? 512 : 256, 0);
}

/// Cold path: free to page freelist - called when TLS full or cross-thread
/// @param ptr: USER pointer
/// @param class_idx: C5 or C6
void small_free_cold_v6(void* ptr, uint32_t class_idx) {
    (void)class_idx;  // Not needed for page lookup
    void* base = SMALL_V6_BASE_FROM_USER(ptr);
    SmallPageMetaV6* page = small_page_meta_v6_of(ptr);
    if (!page) {
        hak_pool_free(ptr, 0, 0);
        return;
    }
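    // Push the block's BASE pointer onto the page's intrusive freelist.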
    *(void**)base = page->free_list;
    page->free_list = base;
    if (page->used > 0) page->used--;
    if (page->used == 0) {
        small_cold_v6_retire_page(page);
    }
}