// tiny_c7_ultra.c - Phase PERF-ULTRA-ALLOC-OPT-1: Optimized array-based TLS cache for C7 ULTRA

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

#include "box/tiny_c7_ultra_box.h"
#include "box/smallobject_hotbox_v3_box.h"
#include "box/tiny_geometry_box.h"
#include "tiny_region_id.h"
#include "box/tiny_c7_ultra_segment_box.h"
#include "box/tiny_front_v3_env_box.h"
#include "box/free_path_stats_box.h"

// Phase PERF-ULTRA-REFILL-OPT-1a: Import page size shift macro
// (defined in tiny_c7_ultra_segment.c for consistency)
// We'll define it locally here as well for convenience
#define TINY_C7_ULTRA_PAGE_SHIFT 16  // 64KiB = 2^16

#ifndef likely
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

// TLS context
static __thread tiny_c7_ultra_tls_t g_tiny_c7_ultra_tls = {0};

tiny_c7_ultra_tls_t* tiny_c7_ultra_tls_get(void) {
    return &g_tiny_c7_ultra_tls;
}

// ============================================================================
// Phase PERF-ULTRA-ALLOC-OPT-1: Pure TLS pop alloc (hot path)
// ============================================================================
void* tiny_c7_ultra_alloc(size_t size) {
    (void)size;  // C7 dedicated, size unused
    tiny_c7_ultra_tls_t* tls = &g_tiny_c7_ultra_tls;
    const bool header_light = tiny_front_v3_c7_ultra_header_light_enabled();

    // Hot path: TLS cache hit (single branch)
    uint16_t n = tls->count;
    if (__builtin_expect(n > 0, 1)) {
        void* base = tls->freelist[n - 1];
        tls->count = n - 1;
        // Convert BASE -> USER pointer
        if (header_light) {
            return (uint8_t*)base + 1;  // Header already written
        }
        return tiny_region_id_write_header(base, 7);
    }

    // Cold path: Refill TLS cache from segment
    if (!tiny_c7_ultra_refill(tls)) {
        return so_alloc(7);  // Fallback to v3
    }

    // Retry after refill
    n = tls->count;
    if (__builtin_expect(n > 0, 1)) {
        void* base = tls->freelist[n - 1];
        tls->count = n - 1;
        if (header_light) {
            return (uint8_t*)base + 1;
        }
        return tiny_region_id_write_header(base, 7);
    }
    return so_alloc(7);  // Final fallback
}

// ============================================================================
// Cold path: Refill TLS cache from segment
// ============================================================================
__attribute__((noinline))
bool tiny_c7_ultra_refill(tiny_c7_ultra_tls_t* tls) {
    tiny_c7_ultra_segment_t* seg = tls->seg;
    if (!seg) {
        seg = tiny_c7_ultra_segment_acquire();
        if (!seg) return false;
        tls->seg = seg;
        tls->seg_base = (uintptr_t)seg->base;
        // Phase PERF-ULTRA-REFILL-OPT-1a: Use bit shift instead of multiplication
        tls->seg_end = tls->seg_base + ((size_t)seg->num_pages << TINY_C7_ULTRA_PAGE_SHIFT);
    }

    size_t block_sz = tls->block_size;
    if (block_sz == 0) {
        block_sz = (size_t)tiny_stride_for_class(7);
        tls->block_size = block_sz;
    }
    if (block_sz == 0) return false;

    uint32_t capacity = (uint32_t)(seg->page_size / block_sz);
    if (capacity == 0) return false;

    const bool header_light = tiny_front_v3_c7_ultra_header_light_enabled();

    // Find an empty or partially used page
    uint32_t chosen = seg->num_pages;
    for (uint32_t i = 0; i < seg->num_pages; i++) {
        tiny_c7_ultra_page_meta_t* pm = &seg->pages[i];
        if (pm->capacity == 0 || pm->used < pm->capacity) {
            chosen = i;
            break;
        }
    }
    if (chosen == seg->num_pages) {
        return false;  // No available pages
    }

    tiny_c7_ultra_page_meta_t* page = &seg->pages[chosen];
    // Phase PERF-ULTRA-REFILL-OPT-1a: Use bit shift instead of multiplication
    uint8_t* base = (uint8_t*)seg->base + ((size_t)chosen << TINY_C7_ULTRA_PAGE_SHIFT);

    // If page is uninitialized, carve it
    if (page->capacity == 0) {
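        // Fresh page: record its geometry, then carve blocks straight into the
        // TLS array (bounded by TINY_C7_ULTRA_CAP). In header-light mode the
        // region-id header is written once per block here so the alloc hot path
        // can return base + 1 without touching the header again.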
        page->capacity = capacity;
        page->used = 0;
        page->freelist = NULL;

        // Carve blocks into TLS cache (fill from end to preserve order)
        uint16_t n = 0;
        for (uint32_t i = 0; i < capacity && n < TINY_C7_ULTRA_CAP; i++) {
            uint8_t* blk = base + ((size_t)i * block_sz);
            if (header_light) {
                tiny_region_id_write_header(blk, 7);  // Write header once
            }
            tls->freelist[n++] = blk;
        }
        tls->count = n;
        tls->page_base = base;
        tls->page_idx = chosen;
        tls->page_meta = page;
        tls->headers_initialized = header_light;
        page->used = n;
        return (n > 0);
    }

    // Page already initialized - collect available blocks into TLS cache
    uint16_t n = 0;
    for (uint32_t i = 0; i < capacity && n < TINY_C7_ULTRA_CAP; i++) {
        if (page->used >= capacity) break;
        uint8_t* blk = base + ((size_t)i * block_sz);
        // Simple heuristic: if used < capacity, try to allocate next block
        // (Real implementation would track per-block state or use a bitmap)
        tls->freelist[n++] = blk;
        page->used++;
    }
    if (n > 0) {
        tls->count = n;
        tls->page_base = base;
        tls->page_idx = chosen;
        tls->page_meta = page;
        tls->headers_initialized = header_light;
        return true;
    }
    return false;
}

// ============================================================================
// Free path: UF-3 segment learning + TLS cache push
// ============================================================================
void tiny_c7_ultra_free(void* ptr) {
    if (!ptr) {
        so_free(7, ptr);
        return;
    }
    tiny_c7_ultra_tls_t* tls = &g_tiny_c7_ultra_tls;
    void* base = (uint8_t*)ptr - 1;  // Convert USER -> BASE pointer

    // Phase PERF-ULTRA-REFILL-OPT-1b: Segment learning moved to refill (alloc cold path).
    // In normal allocation patterns, alloc is always called before free on each thread,
    // so seg_base/seg_end are guaranteed to be initialized by refill's
    // tiny_c7_ultra_segment_acquire() call (line 82-87).
    //
    // This optimization removes the per-free segment learning overhead.
    // Risk: if a thread calls free() before any alloc(), it falls back to so_free().
    // This is acceptable because it is an unusual pattern.

    // Fast path: assume the segment was already learned by refill.
    // No unlikely() guard is needed because refill always runs first in normal patterns.
    uintptr_t addr = (uintptr_t)base;
    if (likely(tls->seg_base != 0 &&
               addr >= tls->seg_base && addr < tls->seg_end &&
               tls->count < TINY_C7_ULTRA_CAP)) {
        tls->freelist[tls->count++] = base;
        FREE_PATH_STAT_INC(c7_ultra_fast);
        return;
    }

    // Slow path: fallback to v3 (out of segment or cache full)
    so_free(7, ptr);
}
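
// ----------------------------------------------------------------------------
// Usage sketch (illustrative only, kept out of the build with #if 0): a minimal
// alloc/free round trip against the TLS cache. The function name
// example_c7_ultra_roundtrip is hypothetical; only tiny_c7_ultra_alloc,
// tiny_c7_ultra_free, and tiny_stride_for_class appear elsewhere in this
// translation unit.
// ----------------------------------------------------------------------------
#if 0
static void example_c7_ultra_roundtrip(void) {
    // The first alloc on a thread takes the cold path (refill + segment acquire);
    // subsequent allocs pop straight from the TLS freelist. The size argument is
    // ignored by the C7-dedicated path, so the class-7 stride is passed here.
    void* p = tiny_c7_ultra_alloc((size_t)tiny_stride_for_class(7));
    if (p) {
        // Free pushes the block back onto the same thread's cache when the
        // pointer lies inside the learned segment and the cache has room;
        // otherwise it falls back to so_free().
        tiny_c7_ultra_free(p);
    }
}
#endif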