Phase v5-3: O(1) path optimization for C6-only v5

- Single TLS segment (eliminates slot search loop)
- O(1) page_meta_of() (direct segment range check, no iteration)
- __builtin_ctz for O(1) free page finding in bitmap
- Simplified free path using page_meta_of() only (no find_page)
- Partial limit 1 (minimal list traversal)

Performance:
- Before (v5-2): 14.7M ops/s
- After (v5-3): 38.5M ops/s (+162%)
- vs baseline: 44.9M ops/s (-14%)
- SEGV: None, stable at ws=800

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-11 04:33:16 +09:00
parent 4c2869397f
commit 7b5ee8cee2
2 changed files with 101 additions and 186 deletions

View File

@ -19,29 +19,22 @@
// Segment Allocation (Phase v5-2)
// ============================================================================
// Thread-local segment list (static allocation to avoid malloc recursion)
#define MAX_SEGMENTS_PER_THREAD 4
// Thread-local segment (Phase v5-3: single segment per thread for O(1) lookup)
// C6-only v5 uses at most 1 segment per thread - this eliminates slot search
typedef struct {
SmallSegmentV5 seg;
int in_use;
uint32_t used_pages; // Bitmap: which pages are currently in use
} TLSSegmentSlot;
static __thread TLSSegmentSlot g_segment_slots_v5[MAX_SEGMENTS_PER_THREAD];
static __thread int g_last_alloc_slot_v5 = -1; // Last slot we allocated from
static __thread TLSSegmentSlot g_tls_segment_v5; // Single TLS segment
SmallSegmentV5* small_segment_v5_acquire(void) {
// Find free slot in TLS (avoid malloc to prevent recursion)
TLSSegmentSlot* slot = NULL;
for (int i = 0; i < MAX_SEGMENTS_PER_THREAD; i++) {
if (!g_segment_slots_v5[i].in_use) {
slot = &g_segment_slots_v5[i];
break;
}
}
// Phase v5-3: Single segment per thread - no slot search needed
TLSSegmentSlot* slot = &g_tls_segment_v5;
if (!slot) {
return NULL; // Out of TLS segment slots
if (slot->in_use) {
return NULL; // Already have a segment (reuse pages instead)
}
// Allocate 2MiB aligned segment via mmap
@ -138,17 +131,9 @@ void small_segment_v5_release(SmallSegmentV5* seg) {
// Release the 2MiB backing memory
munmap((void*)seg->base, SMALL_SEGMENT_V5_SIZE);
// Mark slot as free (TLS memory is never freed, just reused)
for (int i = 0; i < MAX_SEGMENTS_PER_THREAD; i++) {
if (&g_segment_slots_v5[i].seg == seg) {
g_segment_slots_v5[i].in_use = 0;
g_segment_slots_v5[i].used_pages = 0;
if (g_last_alloc_slot_v5 == i) {
g_last_alloc_slot_v5 = -1;
}
break;
}
}
// Phase v5-3: Single segment - direct reset
g_tls_segment_v5.in_use = 0;
g_tls_segment_v5.used_pages = 0;
}
// ============================================================================
@ -156,55 +141,31 @@ void small_segment_v5_release(SmallSegmentV5* seg) {
// ============================================================================
SmallPageMetaV5* small_segment_v5_alloc_page(void) {
// Try to reuse existing segment with free pages
if (g_last_alloc_slot_v5 >= 0 && g_last_alloc_slot_v5 < MAX_SEGMENTS_PER_THREAD) {
TLSSegmentSlot* slot = &g_segment_slots_v5[g_last_alloc_slot_v5];
// Check if not all pages are used (used_pages != 0xFFFFFFFF for 32 pages)
if (slot->in_use && slot->used_pages != 0xFFFFFFFF) {
// This segment has free pages
SmallSegmentV5* seg = &slot->seg;
for (uint32_t i = 0; i < seg->num_pages; i++) {
if ((slot->used_pages & (1U << i)) == 0) {
// Found free page
slot->used_pages |= (1U << i);
return &seg->page_meta[i];
}
}
TLSSegmentSlot* slot = &g_tls_segment_v5;
// Phase v5-3: Single segment - direct access, no slot search
if (slot->in_use && slot->used_pages != 0xFFFFFFFF) {
// Segment has free pages - use __builtin_ctz for O(1) free page find
uint32_t free_mask = ~slot->used_pages;
if (free_mask) {
uint32_t page_idx = (uint32_t)__builtin_ctz(free_mask);
slot->used_pages |= (1U << page_idx);
return &slot->seg.page_meta[page_idx];
}
}
// Search all slots for a segment with free pages
for (int s = 0; s < MAX_SEGMENTS_PER_THREAD; s++) {
TLSSegmentSlot* slot = &g_segment_slots_v5[s];
if (slot->in_use && slot->used_pages != 0xFFFFFFFF) {
SmallSegmentV5* seg = &slot->seg;
for (uint32_t i = 0; i < seg->num_pages; i++) {
if ((slot->used_pages & (1U << i)) == 0) {
// Found free page
slot->used_pages |= (1U << i);
g_last_alloc_slot_v5 = s;
return &seg->page_meta[i];
}
}
// Need new segment
if (!slot->in_use) {
SmallSegmentV5* seg = small_segment_v5_acquire();
if (!seg) {
return NULL;
}
// First page
slot->used_pages |= 1U;
return &seg->page_meta[0];
}
// No free pages in existing segments, allocate new segment
SmallSegmentV5* seg = small_segment_v5_acquire();
if (!seg) {
return NULL;
}
// Mark first page as used
for (int s = 0; s < MAX_SEGMENTS_PER_THREAD; s++) {
if (&g_segment_slots_v5[s].seg == seg) {
g_segment_slots_v5[s].used_pages |= 1U; // Mark page 0 as used
g_last_alloc_slot_v5 = s;
break;
}
}
return &seg->page_meta[0];
return NULL; // Segment full (shouldn't happen with page recycling)
}
void small_segment_v5_free_page(SmallPageMetaV5* page) {
@ -212,22 +173,16 @@ void small_segment_v5_free_page(SmallPageMetaV5* page) {
return;
}
SmallSegmentV5* seg = (SmallSegmentV5*)page->segment;
// Find the slot and clear the used bit
for (int s = 0; s < MAX_SEGMENTS_PER_THREAD; s++) {
if (&g_segment_slots_v5[s].seg == seg) {
g_segment_slots_v5[s].used_pages &= ~(1U << page->page_idx);
// If segment is now empty, we could release it
// For now, keep it for reuse
break;
}
// Phase v5-3: Single segment - direct bitmap clear
TLSSegmentSlot* slot = &g_tls_segment_v5;
if (slot->in_use && page->segment == &slot->seg) {
slot->used_pages &= ~(1U << page->page_idx);
// Keep segment for reuse even if empty
}
}
// ============================================================================
// O(1) Page Metadata Lookup (Phase v5-2)
// O(1) Page Metadata Lookup (Phase v5-3: Single segment, no search)
// ============================================================================
SmallPageMetaV5* small_segment_v5_page_meta_of(void* ptr) {
@ -235,46 +190,35 @@ SmallPageMetaV5* small_segment_v5_page_meta_of(void* ptr) {
return NULL;
}
// Phase v5-3: Single segment - direct TLS access, no loop
TLSSegmentSlot* slot = &g_tls_segment_v5;
if (unlikely(!slot->in_use)) {
return NULL;
}
SmallSegmentV5* seg = &slot->seg;
uintptr_t addr = (uintptr_t)ptr;
uintptr_t seg_base = addr & ~(SMALL_SEGMENT_V5_SIZE - 1);
uintptr_t seg_base = seg->base;
// Search for segment in TLS slots
SmallSegmentV5* seg = NULL;
for (int i = 0; i < MAX_SEGMENTS_PER_THREAD; i++) {
if (g_segment_slots_v5[i].in_use) {
SmallSegmentV5* candidate = &g_segment_slots_v5[i].seg;
if (candidate->base == seg_base) {
seg = candidate;
break;
}
}
}
if (unlikely(!seg)) {
// Check if ptr is within this segment's range
if (unlikely(addr < seg_base || addr >= seg_base + SMALL_SEGMENT_V5_SIZE)) {
return NULL;
}
// Verify magic number (Fail-Fast validation)
if (unlikely(seg->magic != SMALL_SEGMENT_V5_MAGIC)) {
return NULL;
}
// Compute page index via shift
// Compute page index via shift (O(1))
size_t page_idx = (addr - seg_base) >> SMALL_SEGMENT_V5_PAGE_SHIFT;
// Bounds check
// Bounds check (should always pass if within segment)
if (unlikely(page_idx >= seg->num_pages)) {
return NULL;
}
SmallPageMetaV5* page = &seg->page_meta[page_idx];
// Validate that this page is actually in use (has been allocated)
// Unallocated pages have capacity == 0
// Validate that this page is actually in use
if (unlikely(page->capacity == 0)) {
return NULL;
}
// Return page metadata
return page;
}