Files
hakmem/core/smallobject_hotbox_v5.c

494 lines
18 KiB
C
Raw Normal View History

// smallobject_hotbox_v5.c - SmallObject HotBox v5 Full Implementation (Phase v5-2)
//
// Phase v5-2: C6-only full implementation with segment-based allocation
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include "box/smallsegment_v5_box.h"
#include "box/smallobject_hotbox_v5_box.h"
#include "box/smallobject_cold_iface_v5.h"
#include "box/smallobject_v5_env_box.h"
#include "tiny_region_id.h" // For HEADER_MAGIC and HEADER_CLASS_MASK
// Branch-prediction hints for hot/cold paths.
// Each macro carries its own guard: if an earlier header already defines only
// one of the pair, the other one is still provided here.
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
// TLS context: one SmallObject v5 heap context per thread.
static __thread SmallHeapCtxV5 g_small_heap_ctx_v5;
// Per-thread flag: non-zero once the cached ENV flags and the block caches in
// g_small_heap_ctx_v5 have been set up by small_heap_ctx_v5().
static __thread int g_small_heap_ctx_v5_init = 0;
SmallHeapCtxV5* small_heap_ctx_v5(void) {
// Phase v5-4/v5-5/v5-6/v5-7: Lazy initialization of cached ENV flags
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
if (unlikely(!g_small_heap_ctx_v5_init)) {
g_small_heap_ctx_v5.header_mode = (uint8_t)small_heap_v5_header_mode();
g_small_heap_ctx_v5.tls_cache_enabled = small_heap_v5_tls_cache_enabled();
g_small_heap_ctx_v5.c6_cached_block = NULL; // Initialize cache to empty
g_small_heap_ctx_v5.batch_enabled = small_heap_v5_batch_enabled();
g_small_heap_ctx_v5.c6_batch.count = 0; // Initialize batch to empty
for (int i = 0; i < SMALL_V5_BATCH_CAP; i++) {
g_small_heap_ctx_v5.c6_batch.slots[i] = NULL;
}
// Phase v5-7: ULTRA C6 initialization
g_small_heap_ctx_v5.ultra_c6_enabled = small_heap_v5_ultra_c6_enabled();
g_small_heap_ctx_v5.c6_tls_count = 0;
for (int i = 0; i < SMALL_V5_ULTRA_C6_CAP; i++) {
g_small_heap_ctx_v5.c6_tls_freelist[i] = NULL;
}
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
g_small_heap_ctx_v5_init = 1;
}
return &g_small_heap_ctx_v5;
}
// Forward declarations for pool v1 fallback.
// This file hands requests to the pool when the class is not C6, when a page
// refill fails, or when a freed pointer is not found in a v5 segment.
// Every call site in this file passes 0 for site_id, and the free calls also
// pass 0 for size.
extern void* hak_pool_try_alloc(size_t size, uintptr_t site_id);
extern void hak_pool_free(void* ptr, size_t size, uintptr_t site_id);
// ============================================================================
// Helper: Slow path (refill from partial or cold)
// ============================================================================
// Selects the page the next allocation of `class_idx` should take blocks from
// and installs it as h->current.
//
// Order of preference:
//   1. the current page, if it still has free blocks;
//   2. a page popped from the partial list;
//   3. a fresh page from the cold interface.
// An exhausted current page (free_list == NULL) is moved to the full list
// before a replacement is chosen.
//
// Returns the chosen page, or NULL when the cold interface cannot supply one
// (h->current is left NULL in that case if the old current was exhausted).
//
// Why step 1 exists: the ULTRA refill calls this whenever its TLS freelist is
// empty, and it only takes a bounded number of blocks per call, so the current
// page frequently still has free blocks at that point. Without the early
// return, h->current was overwritten and the old page was left on no list at
// all, stranding its remaining free blocks.
static SmallPageMetaV5* alloc_slow_v5(SmallHeapCtxV5* ctx, uint32_t class_idx) {
    SmallClassHeapV5* h = &ctx->cls[class_idx];
    SmallPageMetaV5* cur = h->current;

    if (cur) {
        if (cur->free_list) {
            // Still usable: keep serving from it instead of orphaning it.
            return cur;
        }
        // Exhausted pages are fully allocated, so they belong on the full list.
        SMALL_PAGE_V5_PUSH_FULL(h, cur);
        h->current = NULL;
    }

    // Prefer a page that already has some free blocks.
    SmallPageMetaV5* page = SMALL_PAGE_V5_POP_PARTIAL(h);
    if (!page) {
        // Nothing partial: ask the cold interface for a new page.
        page = small_cold_v5_refill_page(ctx, class_idx);
        if (!page) {
            return NULL;
        }
    }

    h->current = page;
    return page;
}
// ============================================================================
// Phase v5-7: C6 ULTRA slow path helpers
// ============================================================================
// ULTRA refill: move blocks from a page freelist into the per-thread ULTRA
// freelist, then hand one of them out.
//
// Steps:
//   - obtain a source page through alloc_slow_v5();
//   - unlink blocks from page->free_list until the TLS array is full or the
//     page runs dry, stamping the header byte on each block (the freelist link
//     occupied the same byte while the block was free);
//   - account for all moved blocks in page->used with a single update;
//   - pop the most recently stored block and return its USER pointer (BASE + 1).
//
// Falls back to hak_pool_try_alloc(SMALL_HEAP_V5_C6_BLOCK_SIZE, 0) when no page
// with free blocks is available or when nothing could be moved.
static void* small_alloc_slow_v5_c6_refill(SmallHeapCtxV5* ctx, uint32_t class_idx) {
    extern void* hak_pool_try_alloc(size_t size, uintptr_t site_id);

    SmallPageMetaV5* src = alloc_slow_v5(ctx, class_idx);
    if (unlikely(src == NULL || src->free_list == NULL)) {
        // Cold refill failed: serve this request from pool v1.
        return hak_pool_try_alloc(SMALL_HEAP_V5_C6_BLOCK_SIZE, 0);
    }

    // Same header value for every block of this class; compute it once.
    const uint8_t hdr = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));

    // Free slots left in the TLS array at entry.
    const int room = SMALL_V5_ULTRA_C6_CAP - ctx->c6_tls_count;

    int moved = 0;
    for (; moved < room && src->free_list != NULL; moved++) {
        void* base = src->free_list;

        // The link to the next free block is stored at offset 0 of the block.
        void* succ;
        memcpy(&succ, base, sizeof(void*));
        src->free_list = succ;

        // Restore the header that the link overwrote.
        *(uint8_t*)base = hdr;

        ctx->c6_tls_freelist[ctx->c6_tls_count++] = base;
    }

    // One metadata write for the whole batch instead of one per block.
    src->used += (uint16_t)moved;

    if (unlikely(moved == 0)) {
        return hak_pool_try_alloc(SMALL_HEAP_V5_C6_BLOCK_SIZE, 0);
    }

    // Hand out the block stored last; its header is already in place.
    ctx->c6_tls_count--;
    void* picked = ctx->c6_tls_freelist[ctx->c6_tls_count];
    return (uint8_t*)picked + 1;
}
// ULTRA drain: return part of the per-thread ULTRA freelist to the owning
// pages so that `base_ptr` can be stored in the TLS array.
//
// Half of the TLS entries (at least one) are popped from the top of the array.
// For each one the owning page is looked up from its USER address (BASE + 1);
// the block is then linked at the head of that page's free_list (link stored at
// offset 0 of the block) and page->used is decremented without an underflow
// check. A block whose page lookup fails is dropped from the TLS array without
// being linked anywhere.
//
// base_ptr: BASE pointer of the block being freed; stored in the TLS array last.
// page:     page of the block being freed; currently not consulted, because
//           every drained block gets its own lookup.
static void small_free_slow_v5_c6_drain(void* base_ptr, SmallHeapCtxV5* ctx, SmallPageMetaV5* page) {
    (void)page;

    int remaining = ctx->c6_tls_count / 2;
    if (remaining < 1) {
        remaining = 1;
    }

    while (remaining-- > 0) {
        ctx->c6_tls_count--;
        void* victim = ctx->c6_tls_freelist[ctx->c6_tls_count];

        // TLS entries are BASE pointers; the lookup is done on the USER address.
        SmallPageMetaV5* owner = small_segment_v5_page_meta_of((uint8_t*)victim + 1);
        if (unlikely(!owner)) {
            continue;
        }

        void* old_head = owner->free_list;
        memcpy(victim, &old_head, sizeof(void*));
        owner->free_list = victim;
        owner->used--;  // no underflow check (kept cheap on purpose)
    }

    // Room is available now: keep the freed block in TLS.
    ctx->c6_tls_freelist[ctx->c6_tls_count++] = base_ptr;
}
// ============================================================================
// Phase v5-2: Fast alloc (C6-only full implementation)
// ============================================================================
void* small_alloc_fast_v5(size_t size, uint32_t class_idx, SmallHeapCtxV5* ctx) {
(void)size; // Not used in fast path
// C6-only check
if (unlikely(class_idx != SMALL_HEAP_V5_C6_CLASS_IDX)) {
// Fallback to pool v1 for non-C6 classes
return hak_pool_try_alloc(size, 0);
}
// Phase v5-7: ULTRA fast path (C6 only, minimal branches)
if (ctx->ultra_c6_enabled) {
uint8_t cnt = ctx->c6_tls_count;
if (likely(cnt > 0)) {
// ULTRA fast: pop from TLS freelist (header already written at refill)
ctx->c6_tls_count = cnt - 1;
return (uint8_t*)ctx->c6_tls_freelist[cnt - 1] + 1; // Return USER pointer
}
// ULTRA slow: refill TLS freelist from page
return small_alloc_slow_v5_c6_refill(ctx, class_idx);
}
// Phase v5-5: TLS cache hit path (C6 only)
if (unlikely(ctx->tls_cache_enabled)) {
void* cached = ctx->c6_cached_block;
if (likely(cached != NULL)) {
ctx->c6_cached_block = NULL; // Consume cache slot
// NOTE: cached is BASE pointer (same as freelist format), convert to USER pointer
// This is consistent with the free path which stores (ptr - 1) as BASE
// Header mode handling (same logic as freelist path)
uint8_t* header_ptr = (uint8_t*)cached;
uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
if (ctx->header_mode == SMALL_HEAP_V5_HEADER_MODE_LIGHT) {
// light mode: only write if invalid
uint8_t existing = *header_ptr;
if (existing != desired_header) {
*header_ptr = desired_header;
}
} else {
// full mode: always write header
*header_ptr = desired_header;
}
return header_ptr + 1;
}
}
// Phase v5-6: Batch alloc path (C6 only, after cache)
if (ctx->batch_enabled && class_idx == SMALL_HEAP_V5_C6_CLASS_IDX && ctx->c6_batch.count > 0) {
uint8_t idx = --ctx->c6_batch.count;
void* b = ctx->c6_batch.slots[idx];
ctx->c6_batch.slots[idx] = NULL;
// b is BASE pointer, return based on header mode
if (ctx->header_mode == SMALL_HEAP_V5_HEADER_MODE_LIGHT) {
return (uint8_t*)b + 1;
} else {
// full mode: write header
uint8_t* header_ptr = (uint8_t*)b;
uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
*header_ptr = desired_header;
return header_ptr + 1;
}
}
// Cache miss - proceed to existing page_meta path
SmallClassHeapV5* h = &ctx->cls[SMALL_HEAP_V5_C6_CLASS_IDX];
SmallPageMetaV5* page = h->current;
// Fast path: Try current page freelist
if (likely(page && page->free_list)) {
void* blk = page->free_list;
void* next = NULL;
memcpy(&next, blk, sizeof(void*));
page->free_list = next;
page->used++;
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
// Phase v5-4: Header mode handling
uint8_t* header_ptr = (uint8_t*)blk;
uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
if (ctx->header_mode == SMALL_HEAP_V5_HEADER_MODE_LIGHT) {
// light mode: only write header if it's invalid/incorrect
// This saves redundant writes when blocks are recycled
uint8_t existing = *header_ptr;
if (existing != desired_header) {
*header_ptr = desired_header;
}
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
} else {
// full mode: always write header (safety first)
*header_ptr = desired_header;
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
}
return header_ptr + 1;
}
// Slow path: Current exhausted or NULL
page = alloc_slow_v5(ctx, class_idx);
if (unlikely(!page || !page->free_list)) {
// Cold refill failed, fallback to pool v1
return hak_pool_try_alloc(size, 0);
}
// Allocate from newly acquired page
void* blk = page->free_list;
void* next = NULL;
memcpy(&next, blk, sizeof(void*));
page->free_list = next;
page->used++;
// Phase v5-4: Header mode handling (same logic as fast path)
uint8_t* header_ptr = (uint8_t*)blk;
uint8_t desired_header = (uint8_t)(HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
if (ctx->header_mode == SMALL_HEAP_V5_HEADER_MODE_LIGHT) {
// light mode: only write if invalid
uint8_t existing = *header_ptr;
if (existing != desired_header) {
*header_ptr = desired_header;
}
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
} else {
// full mode: always write header
*header_ptr = desired_header;
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
}
return header_ptr + 1;
}
// ============================================================================
// Helper: Determine page location in heap lists (Phase v5-3)
// ============================================================================
// Reports which heap list `page` is currently linked on.
//
// h:        class heap to search; NULL yields LOC_NONE.
// page:     page to look for; NULL yields LOC_NONE.
// prev_out: optional. Always cleared to NULL on entry. When the page is found
//           on the partial or full list it receives the list predecessor
//           (NULL if the page is the list head), which is what an unlink needs.
//
// Returns LOC_CURRENT if the page is h->current (checked first, no list walk),
// LOC_PARTIAL / LOC_FULL if it is found by walking the respective list via
// ->next, and LOC_NONE otherwise. The partial list is searched before the
// full list.
static inline page_loc_t get_page_location(SmallClassHeapV5* h, SmallPageMetaV5* page,
                                           SmallPageMetaV5** prev_out) {
    if (prev_out) {
        *prev_out = NULL;
    }
    if (!h || !page) {
        return LOC_NONE;
    }
    if (page == h->current) {
        return LOC_CURRENT;
    }

    SmallPageMetaV5* const heads[2] = { h->partial_head, h->full_head };
    const page_loc_t where[2] = { LOC_PARTIAL, LOC_FULL };

    for (int list = 0; list < 2; list++) {
        SmallPageMetaV5* before = NULL;
        SmallPageMetaV5* node = heads[list];
        while (node) {
            if (node == page) {
                if (prev_out) {
                    *prev_out = before;
                }
                return where[list];
            }
            before = node;
            node = node->next;
        }
    }
    return LOC_NONE;
}
// ============================================================================
// Phase v5-7: Lightweight segment check (faster than page_meta_of)
// ============================================================================
// Import from smallsegment_v5.c
// The ULTRA free path calls this with the USER pointer before touching any
// page metadata; a non-zero result is treated as "ptr lies in a v5 segment",
// zero sends the pointer to the pool v1 free.
extern int small_segment_v5_owns_ptr_fast(void* ptr);
// ============================================================================
// Phase v5-3: Fast free (C6-only O(1) implementation)
// ============================================================================
void small_free_fast_v5(void* ptr, uint32_t class_idx, SmallHeapCtxV5* ctx) {
if (unlikely(!ptr)) {
return;
}
// C6-only check
if (unlikely(class_idx != SMALL_HEAP_V5_C6_CLASS_IDX)) {
hak_pool_free(ptr, 0, 0);
return;
}
// Phase v5-7: ULTRA free path - skip page_meta_of for fast path
if (ctx->ultra_c6_enabled) {
// Quick segment ownership check (no page_meta access)
if (likely(small_segment_v5_owns_ptr_fast(ptr))) {
uint8_t cnt = ctx->c6_tls_count;
if (likely(cnt < SMALL_V5_ULTRA_C6_CAP)) {
// ULTRA fast: push to TLS freelist (no page_meta touch)
ctx->c6_tls_freelist[cnt] = (uint8_t*)ptr - 1; // Store BASE
ctx->c6_tls_count = cnt + 1;
return;
}
// ULTRA slow: need page_meta for drain
SmallPageMetaV5* page = small_segment_v5_page_meta_of(ptr);
if (page) {
small_free_slow_v5_c6_drain((uint8_t*)ptr - 1, ctx, page);
return;
}
}
// Not in v5 segment, fallback to pool v1
hak_pool_free(ptr, 0, 0);
return;
}
// Non-ULTRA path: need page_meta_of
SmallPageMetaV5* page = small_segment_v5_page_meta_of(ptr);
if (unlikely(!page)) {
// Not in v5 segment, fallback to pool v1
hak_pool_free(ptr, 0, 0);
return;
}
SmallClassHeapV5* h = &ctx->cls[SMALL_HEAP_V5_C6_CLASS_IDX];
// Phase v5-5: TLS cache refill path (before pushing to freelist)
if (unlikely(ctx->tls_cache_enabled)) {
if (ctx->c6_cached_block == NULL) {
// Cache is empty, refill it with this block
// NOTE: ptr is USER pointer, convert to BASE pointer for cache storage
// (consistent with freelist storage format)
void* base = (uint8_t*)ptr - 1;
ctx->c6_cached_block = base;
// IMPORTANT: Do NOT decrement page->used here!
// The cached block is still logically "allocated" until it's:
// - consumed during alloc (at which point it becomes allocated again)
// - evicted to freelist (at which point page->used is decremented)
// This prevents premature page retirement while holding a cached reference
return;
}
// Cache full - evict cached block to freelist first, then cache this one
else {
void* evicted = ctx->c6_cached_block;
// Evicted block is BASE pointer, convert to USER pointer for freelist push
void* evicted_user = (uint8_t*)evicted + 1;
// Look up the page for the evicted block (might be different from current page)
SmallPageMetaV5* evicted_page = small_segment_v5_page_meta_of(evicted_user);
if (evicted_page) {
// Push evicted block to its page's freelist
void* evicted_head = evicted_page->free_list;
memcpy(evicted_user, &evicted_head, sizeof(void*));
evicted_page->free_list = evicted_user;
if (evicted_page->used > 0) {
evicted_page->used--;
}
// Note: We don't handle empty page transition here for evicted page
// to keep this path fast. Empty pages will be handled on next alloc/free.
}
// Now cache the new block
void* base = (uint8_t*)ptr - 1;
ctx->c6_cached_block = base;
return;
}
}
// Phase v5-6: Batch free path (C6 only, after cache, before freelist)
SmallV5Batch* batch = &ctx->c6_batch;
if (ctx->batch_enabled && class_idx == SMALL_HEAP_V5_C6_CLASS_IDX && batch->count < SMALL_V5_BATCH_CAP) {
// ptr is USER pointer, convert to BASE pointer for batch storage
void* base = (uint8_t*)ptr - 1;
batch->slots[batch->count++] = base;
return;
}
// Cache disabled or batch full - push to freelist (standard path)
void* head = page->free_list;
memcpy(ptr, &head, sizeof(void*));
page->free_list = ptr;
if (page->used > 0) {
page->used--;
}
// Handle empty page (used == 0)
if (page->used == 0) {
// Fast path: if this is current, just keep it
if (h->current == page) {
return;
}
// Determine location and unlink (rare path)
SmallPageMetaV5* prev = NULL;
page_loc_t loc = get_page_location(h, page, &prev);
if (loc != LOC_NONE && loc != LOC_CURRENT) {
SMALL_PAGE_V5_UNLINK(h, loc, prev, page);
}
// Promote to current if empty
if (!h->current) {
h->current = page;
page->next = NULL;
return;
}
// Try partial (limit 1)
if (h->partial_count < SMALL_HEAP_V5_C6_PARTIAL_LIMIT) {
SMALL_PAGE_V5_PUSH_PARTIAL(h, page);
return;
}
// Retire to cold
small_cold_v5_retire_page(ctx, page);
return;
}
// Page not empty - handle full→partial transition
if (h->current != page) {
SmallPageMetaV5* prev = NULL;
page_loc_t loc = get_page_location(h, page, &prev);
if (loc == LOC_FULL && page->free_list) {
// Move from full to partial
SMALL_PAGE_V5_UNLINK(h, loc, prev, page);
if (h->partial_count < SMALL_HEAP_V5_C6_PARTIAL_LIMIT) {
SMALL_PAGE_V5_PUSH_PARTIAL(h, page);
} else {
SMALL_PAGE_V5_PUSH_FULL(h, page);
}
} else if (!h->current) {
// No current, promote this
if (loc != LOC_NONE) {
SMALL_PAGE_V5_UNLINK(h, loc, prev, page);
}
h->current = page;
page->next = NULL;
}
}
}
// ============================================================================
// Helper: C6 block size query
// ============================================================================
// Exposes SMALL_HEAP_V5_C6_BLOCK_SIZE, the block size used for the C6 class
// (the same value this file passes to the pool fallback in the ULTRA refill),
// as a function so callers do not need the macro.
uint32_t small_heap_v5_c6_block_size(void) {
    const uint32_t block_size = SMALL_HEAP_V5_C6_BLOCK_SIZE;
    return block_size;
}