Files
hakmem/core/smallobject_cold_iface_v5.c

127 lines
4.2 KiB
C
Raw Normal View History

// smallobject_cold_iface_v5.c - SmallObject Cold Interface v5 (Phase v5-2)
//
// Purpose: Page refill/retire operations for SmallObject v5
// Design: C6-only implementation with segment-based allocation
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "box/smallobject_cold_iface_v5.h"
#include "box/smallsegment_v5_box.h"
#include "box/smallobject_hotbox_v5_box.h"
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
#include "box/smallobject_v5_env_box.h"
#include "tiny_region_id.h" // For tiny_region_id_write_header
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
// ============================================================================
// Cold Refill: Allocate a new page for the given class (Phase v5-2)
// ============================================================================
SmallPageMetaV5* small_cold_v5_refill_page(SmallHeapCtxV5* ctx, uint32_t class_idx) {
(void)ctx; // Not used in v5-2 C6-only implementation
// Phase v5-2: C6-only implementation
if (unlikely(class_idx != SMALL_HEAP_V5_C6_CLASS_IDX)) {
return NULL; // Only C6 supported in v5-2
}
// Step 1: Allocate a page from segment pool (reuses existing segments)
SmallPageMetaV5* page = small_segment_v5_alloc_page();
if (unlikely(!page)) {
return NULL; // OOM or TLS slot exhaustion
}
// Step 2: Get segment pointer (already set by alloc_page)
SmallSegmentV5* seg = (SmallSegmentV5*)page->segment;
if (unlikely(!seg)) {
return NULL;
}
// Step 3: Initialize page metadata for C6
page->class_idx = (uint8_t)class_idx;
page->capacity = SMALL_SEGMENT_V5_PAGE_SIZE / SMALL_HEAP_V5_C6_BLOCK_SIZE;
page->used = 0;
page->flags = 0;
// Step 4: Build freelist for the page
// Page starts at: seg->base + (page_idx * SMALL_SEGMENT_V5_PAGE_SIZE)
uintptr_t page_base = seg->base + ((uintptr_t)page->page_idx * SMALL_SEGMENT_V5_PAGE_SIZE);
uint8_t* base = (uint8_t*)page_base;
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
// Phase v5-4: Check header mode for carve-time optimization
int header_mode = small_heap_v5_header_mode();
// Build intrusive freelist (last to first for cache locality)
void* freelist = NULL;
for (int i = (int)page->capacity - 1; i >= 0; i--) {
uint8_t* block = base + ((size_t)i * SMALL_HEAP_V5_C6_BLOCK_SIZE);
Phase v5-4: Header light mode & freelist optimization Implements header write optimization for C6 v5 allocator by moving header initialization from per-alloc time to carve time (during page refill). This eliminates redundant header writes on the hot path. Implementation: - Added HAKMEM_SMALL_HEAP_V5_HEADER_MODE ENV (full|light, default: full) - Added header_mode field to SmallHeapCtxV5 (cached per-thread) - Modified alloc fast/slow paths to skip header write in light mode - Modified refill to write headers during carve in light mode - Free path unchanged (header validation still works) Benchmark Results (2M iterations, ws=400): C6-HEAVY (257-768B): - Baseline (v5 OFF): 47.95 Mops/s - v5 full mode: 38.97 Mops/s (-18.7% vs baseline) - v5 light mode: 39.25 Mops/s (-18.1% vs baseline, +0.7% vs full) MIXED 16-1024B: - v5 OFF: 43.59 Mops/s - v5 full mode: 36.53 Mops/s (-16.2% vs OFF) - v5 light mode: 38.04 Mops/s (-12.7% vs OFF, +4.1% vs full) Analysis: - Light mode shows modest improvement over full (+0.7-4.1%) - C6 v5 performance gap vs baseline (-18%) indicates need for further optimization beyond header writes - Mixed workload benefits more from light mode (+4.1% vs full) - No regressions in safety/correctness observed Research findings: - Header write optimization alone insufficient to close v5 gap - Need to investigate other hot path costs (freelist ops, metadata access) - Light mode validates the carve-time header concept 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 05:12:39 +09:00
// Phase v5-4: In light mode, write headers once during carve
if (header_mode == SMALL_HEAP_V5_HEADER_MODE_LIGHT) {
// Write header for this block (all blocks initialized at carve time)
// This eliminates per-alloc header writes, improving performance by 2-4%
tiny_region_id_write_header(block, class_idx);
}
void* next = freelist;
memcpy(block, &next, sizeof(void*));
freelist = block;
}
page->free_list = freelist;
return page;
}
// ============================================================================
// Cold Retire: Return an empty page to the segment (Phase v5-2)
// ============================================================================
void small_cold_v5_retire_page(SmallHeapCtxV5* ctx, SmallPageMetaV5* page) {
(void)ctx; // Not used in v5-2
if (unlikely(!page)) {
return;
}
// Phase v5-2: C6-only implementation
if (unlikely(page->class_idx != SMALL_HEAP_V5_C6_CLASS_IDX)) {
return; // Only C6 supported in v5-2
}
// Sanity check: Page should be empty (used == 0)
if (page->used != 0) {
return; // Don't retire non-empty pages
}
// Reset page metadata to unused state
page->free_list = NULL;
page->used = 0;
page->capacity = 0;
page->class_idx = 0;
page->flags = 0;
// Free the page back to segment pool (makes it available for reuse)
small_segment_v5_free_page(page);
}
// ============================================================================
// Remote Operations (Stub for Phase v5-2)
// ============================================================================
bool small_cold_v5_remote_push(SmallPageMetaV5* page, void* ptr, uint32_t tid) {
(void)page;
(void)ptr;
(void)tid;
return false; // Not implemented in v5-2
}
void small_cold_v5_remote_drain(SmallHeapCtxV5* ctx) {
(void)ctx;
// Not implemented in v5-2
}