2025-12-11 20:39:46 +09:00
|
|
|
// tiny_c7_ultra.c - Phase PERF-ULTRA-ALLOC-OPT-1: Optimized array-based TLS cache for C7 ULTRA
|
2025-12-10 22:19:32 +09:00
|
|
|
|
|
|
|
|
#include <stddef.h>
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
#include <stdbool.h>
|
2025-12-10 22:57:26 +09:00
|
|
|
#include <string.h>
|
2025-12-10 22:19:32 +09:00
|
|
|
#include "box/tiny_c7_ultra_box.h"
|
|
|
|
|
#include "box/smallobject_hotbox_v3_box.h"
|
|
|
|
|
#include "box/tiny_geometry_box.h"
|
|
|
|
|
#include "tiny_region_id.h"
|
|
|
|
|
#include "box/tiny_c7_ultra_segment_box.h"
|
2025-12-10 22:57:26 +09:00
|
|
|
#include "box/tiny_front_v3_env_box.h"
|
Phase PERF-ULTRA-FREE-OPT-1: C4-C7 ULTRA free 薄型化
- C4-C7 ULTRA free を pure TLS push + cold segment learning に統一
- C7 ULTRA free を同じパターンに整列(likely/unlikely + FREE_PATH_STAT_INC)
- C4/C5/C6 ULTRA は既に最適化済み(統一 legacy fallback 経由)
- base/user 変換を tiny_ptr_convert_box.h マクロで統一
実測値 (Mixed 16-1024B, 1M iter, ws=400):
- Baseline (C7 のみ): 42.0M ops/s, legacy=266,943 (49.2%)
- Optimized (C4-C7): 46.5M ops/s, legacy=26,025 (4.8%)
- 改善: +9.3% (+4M ops/s)
FREE_PATH_STATS:
- C6 ULTRA: 137,319 free + 137,241 alloc (100% カバー)
- C5 ULTRA: 68,871 free + 68,827 alloc (100% カバー)
- C4 ULTRA: 34,727 free + 34,696 alloc (100% カバー)
- Legacy: 266,943 → 26,025 (−90.2%, C2/C3 のみ)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 20:49:39 +09:00
|
|
|
#include "box/free_path_stats_box.h"
|
|
|
|
|
|
|
|
|
|
// Branch-prediction hints.
// Each macro carries its own guard: a header that defines only one of them
// must neither leave the other undefined nor cause a redefinition here.
#ifndef likely
#define likely(x)   __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
|
2025-12-10 22:19:32 +09:00
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
// TLS context
|
|
|
|
|
// One instance per thread, zero-initialized and private to this file;
// tiny_c7_ultra_tls_get() hands out its address.
static __thread tiny_c7_ultra_tls_t g_tiny_c7_ultra_tls = {0};
|
2025-12-10 22:19:32 +09:00
|
|
|
|
|
|
|
|
tiny_c7_ultra_tls_t* tiny_c7_ultra_tls_get(void) {
|
|
|
|
|
return &g_tiny_c7_ultra_tls;
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
// ============================================================================
|
|
|
|
|
// Phase PERF-ULTRA-ALLOC-OPT-1: Pure TLS pop alloc (hot path)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
void* tiny_c7_ultra_alloc(size_t size) {
|
|
|
|
|
(void)size; // C7 dedicated, size unused
|
|
|
|
|
tiny_c7_ultra_tls_t* tls = &g_tiny_c7_ultra_tls;
|
|
|
|
|
const bool header_light = tiny_front_v3_c7_ultra_header_light_enabled();
|
|
|
|
|
|
|
|
|
|
// Hot path: TLS cache hit (single branch)
|
|
|
|
|
uint16_t n = tls->count;
|
|
|
|
|
if (__builtin_expect(n > 0, 1)) {
|
|
|
|
|
void* base = tls->freelist[n - 1];
|
|
|
|
|
tls->count = n - 1;
|
|
|
|
|
|
|
|
|
|
// Convert BASE -> USER pointer
|
|
|
|
|
if (header_light) {
|
|
|
|
|
return (uint8_t*)base + 1; // Header already written
|
|
|
|
|
}
|
|
|
|
|
return tiny_region_id_write_header(base, 7);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Cold path: Refill TLS cache from segment
|
|
|
|
|
if (!tiny_c7_ultra_refill(tls)) {
|
|
|
|
|
return so_alloc(7); // Fallback to v3
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Retry after refill
|
|
|
|
|
n = tls->count;
|
|
|
|
|
if (__builtin_expect(n > 0, 1)) {
|
|
|
|
|
void* base = tls->freelist[n - 1];
|
|
|
|
|
tls->count = n - 1;
|
2025-12-10 22:57:26 +09:00
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
if (header_light) {
|
|
|
|
|
return (uint8_t*)base + 1;
|
|
|
|
|
}
|
|
|
|
|
return tiny_region_id_write_header(base, 7);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return so_alloc(7); // Final fallback
|
2025-12-10 22:57:26 +09:00
|
|
|
}
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
// ============================================================================
|
|
|
|
|
// Cold path: Refill TLS cache from segment
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
__attribute__((noinline))
|
|
|
|
|
bool tiny_c7_ultra_refill(tiny_c7_ultra_tls_t* tls) {
|
2025-12-10 22:19:32 +09:00
|
|
|
tiny_c7_ultra_segment_t* seg = tls->seg;
|
|
|
|
|
if (!seg) {
|
|
|
|
|
seg = tiny_c7_ultra_segment_acquire();
|
|
|
|
|
if (!seg) return false;
|
|
|
|
|
tls->seg = seg;
|
2025-12-11 20:39:46 +09:00
|
|
|
tls->seg_base = (uintptr_t)seg->base;
|
|
|
|
|
tls->seg_end = tls->seg_base + ((size_t)seg->num_pages * seg->page_size);
|
2025-12-10 22:19:32 +09:00
|
|
|
}
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
size_t block_sz = tls->block_size;
|
|
|
|
|
if (block_sz == 0) {
|
|
|
|
|
block_sz = (size_t)tiny_stride_for_class(7);
|
|
|
|
|
tls->block_size = block_sz;
|
|
|
|
|
}
|
2025-12-10 22:19:32 +09:00
|
|
|
if (block_sz == 0) return false;
|
2025-12-11 20:39:46 +09:00
|
|
|
|
2025-12-10 22:19:32 +09:00
|
|
|
uint32_t capacity = (uint32_t)(seg->page_size / block_sz);
|
|
|
|
|
if (capacity == 0) return false;
|
|
|
|
|
|
2025-12-10 22:57:26 +09:00
|
|
|
const bool header_light = tiny_front_v3_c7_ultra_header_light_enabled();
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
// Find an empty or partially used page
|
2025-12-10 22:19:32 +09:00
|
|
|
uint32_t chosen = seg->num_pages;
|
|
|
|
|
for (uint32_t i = 0; i < seg->num_pages; i++) {
|
|
|
|
|
tiny_c7_ultra_page_meta_t* pm = &seg->pages[i];
|
2025-12-11 20:39:46 +09:00
|
|
|
if (pm->capacity == 0 || pm->used < pm->capacity) {
|
2025-12-10 22:19:32 +09:00
|
|
|
chosen = i;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (chosen == seg->num_pages) {
|
2025-12-11 20:39:46 +09:00
|
|
|
return false; // No available pages
|
2025-12-10 22:19:32 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tiny_c7_ultra_page_meta_t* page = &seg->pages[chosen];
|
|
|
|
|
uint8_t* base = (uint8_t*)seg->base + ((size_t)chosen * seg->page_size);
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
// If page is uninitialized, carve it
|
|
|
|
|
if (page->capacity == 0) {
|
|
|
|
|
page->capacity = capacity;
|
|
|
|
|
page->used = 0;
|
|
|
|
|
page->freelist = NULL;
|
|
|
|
|
|
|
|
|
|
// Carve blocks into TLS cache (fill from end to preserve order)
|
|
|
|
|
uint16_t n = 0;
|
|
|
|
|
for (uint32_t i = 0; i < capacity && n < TINY_C7_ULTRA_CAP; i++) {
|
|
|
|
|
uint8_t* blk = base + ((size_t)i * block_sz);
|
|
|
|
|
if (header_light) {
|
|
|
|
|
tiny_region_id_write_header(blk, 7); // Write header once
|
|
|
|
|
}
|
|
|
|
|
tls->freelist[n++] = blk;
|
2025-12-10 22:57:26 +09:00
|
|
|
}
|
2025-12-11 20:39:46 +09:00
|
|
|
tls->count = n;
|
|
|
|
|
tls->page_base = base;
|
|
|
|
|
tls->page_idx = chosen;
|
|
|
|
|
tls->page_meta = page;
|
|
|
|
|
tls->headers_initialized = header_light;
|
|
|
|
|
page->used = n;
|
|
|
|
|
return (n > 0);
|
2025-12-10 22:19:32 +09:00
|
|
|
}
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
// Page already initialized - collect available blocks into TLS cache
|
|
|
|
|
uint16_t n = 0;
|
|
|
|
|
for (uint32_t i = 0; i < capacity && n < TINY_C7_ULTRA_CAP; i++) {
|
|
|
|
|
if (page->used >= capacity) break;
|
2025-12-10 22:19:32 +09:00
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
uint8_t* blk = base + ((size_t)i * block_sz);
|
|
|
|
|
// Simple heuristic: if used < capacity, try to allocate next block
|
|
|
|
|
// (Real implementation would track per-block state or use a bitmap)
|
|
|
|
|
tls->freelist[n++] = blk;
|
|
|
|
|
page->used++;
|
2025-12-10 22:19:32 +09:00
|
|
|
}
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
if (n > 0) {
|
|
|
|
|
tls->count = n;
|
|
|
|
|
tls->page_base = base;
|
|
|
|
|
tls->page_idx = chosen;
|
|
|
|
|
tls->page_meta = page;
|
|
|
|
|
tls->headers_initialized = header_light;
|
|
|
|
|
return true;
|
2025-12-10 22:19:32 +09:00
|
|
|
}
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
return false;
|
2025-12-10 22:19:32 +09:00
|
|
|
}
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
// ============================================================================
|
|
|
|
|
// Free path: UF-3 segment learning + TLS cache push
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
2025-12-10 22:19:32 +09:00
|
|
|
void tiny_c7_ultra_free(void* ptr) {
|
|
|
|
|
if (!ptr) {
|
|
|
|
|
so_free(7, ptr);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-11 20:39:46 +09:00
|
|
|
tiny_c7_ultra_tls_t* tls = &g_tiny_c7_ultra_tls;
|
|
|
|
|
void* base = (uint8_t*)ptr - 1; // Convert USER -> BASE pointer
|
2025-12-10 22:19:32 +09:00
|
|
|
|
Phase PERF-ULTRA-FREE-OPT-1: C4-C7 ULTRA free 薄型化
- C4-C7 ULTRA free を pure TLS push + cold segment learning に統一
- C7 ULTRA free を同じパターンに整列(likely/unlikely + FREE_PATH_STAT_INC)
- C4/C5/C6 ULTRA は既に最適化済み(統一 legacy fallback 経由)
- base/user 変換を tiny_ptr_convert_box.h マクロで統一
実測値 (Mixed 16-1024B, 1M iter, ws=400):
- Baseline (C7 のみ): 42.0M ops/s, legacy=266,943 (49.2%)
- Optimized (C4-C7): 46.5M ops/s, legacy=26,025 (4.8%)
- 改善: +9.3% (+4M ops/s)
FREE_PATH_STATS:
- C6 ULTRA: 137,319 free + 137,241 alloc (100% カバー)
- C5 ULTRA: 68,871 free + 68,827 alloc (100% カバー)
- C4 ULTRA: 34,727 free + 34,696 alloc (100% カバー)
- Legacy: 266,943 → 26,025 (−90.2%, C2/C3 のみ)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 20:49:39 +09:00
|
|
|
// 1) Initial segment learning (cold path, once per thread)
|
|
|
|
|
if (unlikely(tls->seg_base == 0)) {
|
2025-12-11 20:39:46 +09:00
|
|
|
tiny_c7_ultra_segment_t* seg = tiny_c7_ultra_segment_from_ptr(ptr);
|
Phase PERF-ULTRA-FREE-OPT-1: C4-C7 ULTRA free 薄型化
- C4-C7 ULTRA free を pure TLS push + cold segment learning に統一
- C7 ULTRA free を同じパターンに整列(likely/unlikely + FREE_PATH_STAT_INC)
- C4/C5/C6 ULTRA は既に最適化済み(統一 legacy fallback 経由)
- base/user 変換を tiny_ptr_convert_box.h マクロで統一
実測値 (Mixed 16-1024B, 1M iter, ws=400):
- Baseline (C7 のみ): 42.0M ops/s, legacy=266,943 (49.2%)
- Optimized (C4-C7): 46.5M ops/s, legacy=26,025 (4.8%)
- 改善: +9.3% (+4M ops/s)
FREE_PATH_STATS:
- C6 ULTRA: 137,319 free + 137,241 alloc (100% カバー)
- C5 ULTRA: 68,871 free + 68,827 alloc (100% カバー)
- C4 ULTRA: 34,727 free + 34,696 alloc (100% カバー)
- Legacy: 266,943 → 26,025 (−90.2%, C2/C3 のみ)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 20:49:39 +09:00
|
|
|
if (seg != NULL) {
|
|
|
|
|
tls->seg = seg;
|
|
|
|
|
tls->seg_base = (uintptr_t)seg->base;
|
|
|
|
|
tls->seg_end = tls->seg_base + ((size_t)seg->num_pages * seg->page_size);
|
2025-12-11 20:39:46 +09:00
|
|
|
}
|
2025-12-10 22:19:32 +09:00
|
|
|
}
|
|
|
|
|
|
Phase PERF-ULTRA-FREE-OPT-1: C4-C7 ULTRA free 薄型化
- C4-C7 ULTRA free を pure TLS push + cold segment learning に統一
- C7 ULTRA free を同じパターンに整列(likely/unlikely + FREE_PATH_STAT_INC)
- C4/C5/C6 ULTRA は既に最適化済み(統一 legacy fallback 経由)
- base/user 変換を tiny_ptr_convert_box.h マクロで統一
実測値 (Mixed 16-1024B, 1M iter, ws=400):
- Baseline (C7 のみ): 42.0M ops/s, legacy=266,943 (49.2%)
- Optimized (C4-C7): 46.5M ops/s, legacy=26,025 (4.8%)
- 改善: +9.3% (+4M ops/s)
FREE_PATH_STATS:
- C6 ULTRA: 137,319 free + 137,241 alloc (100% カバー)
- C5 ULTRA: 68,871 free + 68,827 alloc (100% カバー)
- C4 ULTRA: 34,727 free + 34,696 alloc (100% カバー)
- Legacy: 266,943 → 26,025 (−90.2%, C2/C3 のみ)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 20:49:39 +09:00
|
|
|
// 2) Fast path: range check + TLS push
|
2025-12-11 20:39:46 +09:00
|
|
|
uintptr_t addr = (uintptr_t)base;
|
Phase PERF-ULTRA-FREE-OPT-1: C4-C7 ULTRA free 薄型化
- C4-C7 ULTRA free を pure TLS push + cold segment learning に統一
- C7 ULTRA free を同じパターンに整列(likely/unlikely + FREE_PATH_STAT_INC)
- C4/C5/C6 ULTRA は既に最適化済み(統一 legacy fallback 経由)
- base/user 変換を tiny_ptr_convert_box.h マクロで統一
実測値 (Mixed 16-1024B, 1M iter, ws=400):
- Baseline (C7 のみ): 42.0M ops/s, legacy=266,943 (49.2%)
- Optimized (C4-C7): 46.5M ops/s, legacy=26,025 (4.8%)
- 改善: +9.3% (+4M ops/s)
FREE_PATH_STATS:
- C6 ULTRA: 137,319 free + 137,241 alloc (100% カバー)
- C5 ULTRA: 68,871 free + 68,827 alloc (100% カバー)
- C4 ULTRA: 34,727 free + 34,696 alloc (100% カバー)
- Legacy: 266,943 → 26,025 (−90.2%, C2/C3 のみ)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 20:49:39 +09:00
|
|
|
if (likely(tls->seg_base != 0 &&
|
|
|
|
|
addr >= tls->seg_base &&
|
|
|
|
|
addr < tls->seg_end &&
|
|
|
|
|
tls->count < TINY_C7_ULTRA_CAP)) {
|
|
|
|
|
tls->freelist[tls->count++] = base;
|
|
|
|
|
FREE_PATH_STAT_INC(c7_ultra_fast);
|
|
|
|
|
return;
|
2025-12-10 22:19:32 +09:00
|
|
|
}
|
|
|
|
|
|
Phase PERF-ULTRA-FREE-OPT-1: C4-C7 ULTRA free 薄型化
- C4-C7 ULTRA free を pure TLS push + cold segment learning に統一
- C7 ULTRA free を同じパターンに整列(likely/unlikely + FREE_PATH_STAT_INC)
- C4/C5/C6 ULTRA は既に最適化済み(統一 legacy fallback 経由)
- base/user 変換を tiny_ptr_convert_box.h マクロで統一
実測値 (Mixed 16-1024B, 1M iter, ws=400):
- Baseline (C7 のみ): 42.0M ops/s, legacy=266,943 (49.2%)
- Optimized (C4-C7): 46.5M ops/s, legacy=26,025 (4.8%)
- 改善: +9.3% (+4M ops/s)
FREE_PATH_STATS:
- C6 ULTRA: 137,319 free + 137,241 alloc (100% カバー)
- C5 ULTRA: 68,871 free + 68,827 alloc (100% カバー)
- C4 ULTRA: 34,727 free + 34,696 alloc (100% カバー)
- Legacy: 266,943 → 26,025 (−90.2%, C2/C3 のみ)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 20:49:39 +09:00
|
|
|
// 3) Slow path: fallback to v3 (out of segment or cache full)
|
2025-12-11 20:39:46 +09:00
|
|
|
so_free(7, ptr);
|
2025-12-10 22:19:32 +09:00
|
|
|
}
|