Phase 83-1 + Allocator Comparison: Switch dispatch fixed (NO-GO +0.32%), PROFILE correction, SCORECARD update
Key changes: - Phase 83-1: Switch dispatch fixed mode (tiny_inline_slots_switch_dispatch_fixed_box) - NO-GO (marginal +0.32%, branch reduction negligible) Reason: lazy-init pattern already optimal, Phase 78-1 pattern shows diminishing returns - Allocator comparison baseline update (10-run SSOT, WS=400, ITERS=20M): tcmalloc: 115.26M (92.33% of mimalloc) jemalloc: 97.39M (77.96% of mimalloc) system: 85.20M (68.24% of mimalloc) mimalloc: 124.82M (baseline) - hakmem PROFILE correction: scripts/run_mixed_10_cleanenv.sh + run_allocator_quick_matrix.sh PROFILE explicitly set to MIXED_TINYV3_C7_SAFE for hakmem measurements Result: baseline stabilized to 55.53M (44.46% of mimalloc) Previous unstable measurement (35.57M) was due to profile leak - Documentation: * PERFORMANCE_TARGETS_SCORECARD.md: Reference allocators + M1/M2 milestone status * PHASE83_1_SWITCH_DISPATCH_FIXED_RESULTS.md: Phase 83-1 analysis (NO-GO) * ALLOCATOR_COMPARISON_QUICK_RUNBOOK.md: Quick comparison procedure * ALLOCATOR_COMPARISON_SSOT.md: Detailed SSOT methodology - M2 milestone status: 44.46% (target 55%, gap -10.54pp) - structural improvements needed 🤖 Generated with Claude Code Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@ -16,6 +16,7 @@
|
||||
#include "box/front_fastlane_alloc_legacy_direct_env_box.h" // front_fastlane_alloc_legacy_direct_env_refresh_from_env (Phase 16 v1)
|
||||
#include "box/fastlane_direct_env_box.h" // fastlane_direct_env_refresh_from_env (Phase 19-1)
|
||||
#include "box/tiny_header_hotfull_env_box.h" // tiny_header_hotfull_env_refresh_from_env (Phase 21)
|
||||
#include "box/tiny_inline_slots_fixed_mode_box.h" // tiny_inline_slots_fixed_mode_refresh_from_env (Phase 78-1)
|
||||
#endif
|
||||
|
||||
// env が未設定のときだけ既定値を入れる
|
||||
@ -108,6 +109,12 @@ static inline void bench_apply_mixed_tinyv3_c7_common(void) {
|
||||
// Phase 75-3: C5+C6 Inline Slots (GO +5.41% proven, 4-point matrix A/B)
|
||||
bench_setenv_default("HAKMEM_TINY_C5_INLINE_SLOTS", "1");
|
||||
bench_setenv_default("HAKMEM_TINY_C6_INLINE_SLOTS", "1");
|
||||
// Phase 76-1: C4 Inline Slots (GO +1.73%, 10-run A/B)
|
||||
bench_setenv_default("HAKMEM_TINY_C4_INLINE_SLOTS", "1");
|
||||
// Phase 78-1: Inline Slots Fixed Mode (GO, removes per-op ENV gate overhead)
|
||||
bench_setenv_default("HAKMEM_TINY_INLINE_SLOTS_FIXED", "1");
|
||||
// Phase 80-1: Inline Slots Switch Dispatch (GO +1.65%, removes if-chain comparisons)
|
||||
bench_setenv_default("HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH", "1");
|
||||
}
|
||||
|
||||
static inline void bench_apply_profile(void) {
|
||||
@ -222,9 +229,11 @@ static inline void bench_apply_profile(void) {
|
||||
tiny_unified_lifo_env_refresh_from_env();
|
||||
// Phase 16 v1: Sync LEGACY direct ENV cache after bench_profile putenv defaults.
|
||||
front_fastlane_alloc_legacy_direct_env_refresh_from_env();
|
||||
// Phase 19-1: Sync FastLane Direct ENV cache after bench_profile putenv defaults.
|
||||
// Phase 19-1: Sync FastLane Direct ENV cache after bench_profile putenv defaults.
|
||||
fastlane_direct_env_refresh_from_env();
|
||||
// Phase 21: Sync Tiny Header HotFull ENV cache after bench_profile putenv defaults.
|
||||
tiny_header_hotfull_env_refresh_from_env();
|
||||
// Phase 78-1: Optionally pin C3/C4/C5/C6 inline-slots modes (avoid per-op ENV gates).
|
||||
tiny_inline_slots_fixed_mode_refresh_from_env();
|
||||
#endif
|
||||
}
|
||||
|
||||
41
core/box/tiny_c2_local_cache_env_box.h
Normal file
41
core/box/tiny_c2_local_cache_env_box.h
Normal file
@ -0,0 +1,41 @@
|
||||
// tiny_c2_local_cache_env_box.h - Phase 79-1: C2 Local Cache ENV Gate
|
||||
//
|
||||
// Goal: Gate C2 local cache feature via environment variable
|
||||
// Scope: C2 class only (32-64B allocations)
|
||||
// Design: Lazy-init cached decision pattern (zero overhead when disabled)
|
||||
//
|
||||
// ENV Variable: HAKMEM_TINY_C2_LOCAL_CACHE
|
||||
// - Value 0, unset, or empty: disabled (default OFF in Phase 79-1)
|
||||
// - Non-zero (e.g., 1): enabled
|
||||
// - Decision cached at first call
|
||||
//
|
||||
// Rationale:
|
||||
// - Separation of concerns (policy from mechanism)
|
||||
// - A/B testing support (enable/disable without recompile)
|
||||
// - Safe default: disabled until Phase 79-1 A/B test validates +1.0% GO threshold
|
||||
// - Phase 79-0 analysis: C2 hits Stage3 backend lock (contention signal)
|
||||
|
||||
#ifndef HAK_BOX_TINY_C2_LOCAL_CACHE_ENV_BOX_H
|
||||
#define HAK_BOX_TINY_C2_LOCAL_CACHE_ENV_BOX_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// ============================================================================
|
||||
// C2 Local Cache: Environment Decision Gate
|
||||
// ============================================================================
|
||||
|
||||
// Runtime gate for the C2 local cache (ENV: HAKMEM_TINY_C2_LOCAL_CACHE).
//
// The environment is read on the first call only; the verdict is stored in a
// function-local static and returned directly on every later call.
//
// Returns: 1 if the variable is set to a non-empty string whose first
//          character is not '0'; 0 if it is unset, empty, or starts with '0'.
//
// NOTE(review): the cached flag is a plain static int without synchronization;
// confirm the first call cannot race with other threads.
static inline int tiny_c2_local_cache_enabled(void) {
    static int cached_decision = -1;  // -1 = not decided yet

    if (__builtin_expect(cached_decision != -1, 1)) {
        return cached_decision;
    }

    // Slow path (first call): consult the environment once.
    const char *env = getenv("HAKMEM_TINY_C2_LOCAL_CACHE");
    if (env == NULL || env[0] == '\0' || env[0] == '0') {
        cached_decision = 0;
    } else {
        cached_decision = 1;
    }
    return cached_decision;
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C2_LOCAL_CACHE_ENV_BOX_H
|
||||
99
core/box/tiny_c2_local_cache_tls_box.h
Normal file
99
core/box/tiny_c2_local_cache_tls_box.h
Normal file
@ -0,0 +1,99 @@
|
||||
// tiny_c2_local_cache_tls_box.h - Phase 79-1: C2 Local Cache TLS Extension
|
||||
//
|
||||
// Goal: Extend TLS struct with C2-only local cache ring buffer
|
||||
// Scope: C2 class only (capacity 64, 8-byte slots = 512B per thread)
|
||||
// Design: Simple FIFO ring (head/tail indices, modulo 64)
|
||||
//
|
||||
// Ring Buffer Strategy:
|
||||
// - head: next pop position (consumer)
|
||||
// - tail: next push position (producer)
|
||||
// - Empty: head == tail
|
||||
// - Full: (tail + 1) % 64 == head
|
||||
// - Count: (tail - head + 64) % 64
|
||||
//
|
||||
// TLS Layout Impact:
|
||||
// - Size: 64 slots × 8 bytes = 512B per thread (lightweight, Phase 79-0 spec)
|
||||
// - Alignment: 64-byte cache line aligned (NUMA-friendly)
|
||||
// - Lifetime: Zero-initialized at TLS init, valid for thread lifetime
|
||||
//
|
||||
// Rationale for cap=64:
|
||||
// - Phase 79-0 analysis: C2 hits Stage3 backend lock (cache miss pattern)
|
||||
// - Conservative cap (512B) to intercept C2 frees locally
|
||||
// - Capacity > max concurrent C2 allocations in WS=400
|
||||
// - Smaller than C3's 256 (Phase 77-1 precedent) to manage TLS bloat
|
||||
// - 64 = 2^6 (efficient modulo arithmetic)
|
||||
//
|
||||
// Conditional Compilation:
|
||||
// - Only compiled if HAKMEM_TINY_C2_LOCAL_CACHE enabled
|
||||
// - Default OFF: zero overhead when disabled
|
||||
|
||||
#ifndef HAK_BOX_TINY_C2_LOCAL_CACHE_TLS_BOX_H
|
||||
#define HAK_BOX_TINY_C2_LOCAL_CACHE_TLS_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "tiny_c2_local_cache_env_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C2 Local Cache: TLS Structure
|
||||
// ============================================================================
|
||||
|
||||
// Number of ring slots in the C2 local cache (64 = 2^6).
// head/tail are uint8_t, so this value must not exceed 256.
#define TINY_C2_LOCAL_CACHE_CAPACITY 64

// Per-thread ring buffer for the C2 local cache.
//
// Layout: the slot array comes first (512B with 8-byte pointers), followed by
// the two one-byte indices and explicit padding that rounds the index area up
// to 64 bytes. The whole struct is 64-byte aligned.
//
// Empty is head == tail. The "full" helper in this header reports full when
// advancing tail would reach head, which leaves one slot unused.
typedef struct __attribute__((aligned(64))) {
    void   *slots[TINY_C2_LOCAL_CACHE_CAPACITY];  // BASE pointers
    uint8_t head;                                 // next pop position (consumer)
    uint8_t tail;                                 // next push position (producer)
    uint8_t _pad[64 - 2 * sizeof(uint8_t)];       // 62 bytes: fill the index area to 64B
} TinyC2LocalCache;

// Thread-local instance, one per thread. Only declared here; per the original
// note the definition lives in tiny_c2_local_cache.c (not visible in this
// header). The declaration itself is unconditional.
extern __thread TinyC2LocalCache g_tiny_c2_local_cache;
|
||||
|
||||
// ============================================================================
|
||||
// Initialization
|
||||
// ============================================================================
|
||||
|
||||
// Initialize C2 local cache for current thread
|
||||
// Called once at TLS init time (hakmem_tiny_init_thread or equivalent)
|
||||
// Returns: 1 if initialized, 0 if disabled
|
||||
static inline int tiny_c2_local_cache_init(TinyC2LocalCache* cache) {
|
||||
if (!tiny_c2_local_cache_enabled()) {
|
||||
return 0; // Disabled, no init needed
|
||||
}
|
||||
|
||||
// Zero-initialize all slots
|
||||
memset(cache->slots, 0, sizeof(cache->slots));
|
||||
cache->head = 0;
|
||||
cache->tail = 0;
|
||||
|
||||
return 1; // Initialized
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Ring Buffer Helpers (inline for zero overhead)
|
||||
// ============================================================================
|
||||
|
||||
// Check if ring is empty
|
||||
static inline int c2_local_cache_empty(const TinyC2LocalCache* cache) {
|
||||
return cache->head == cache->tail;
|
||||
}
|
||||
|
||||
// Check if ring is full
|
||||
static inline int c2_local_cache_full(const TinyC2LocalCache* cache) {
|
||||
return ((cache->tail + 1) % TINY_C2_LOCAL_CACHE_CAPACITY) == cache->head;
|
||||
}
|
||||
|
||||
// Get current count (number of items in ring)
|
||||
static inline int c2_local_cache_count(const TinyC2LocalCache* cache) {
|
||||
return (cache->tail - cache->head + TINY_C2_LOCAL_CACHE_CAPACITY) % TINY_C2_LOCAL_CACHE_CAPACITY;
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C2_LOCAL_CACHE_TLS_BOX_H
|
||||
40
core/box/tiny_c3_inline_slots_env_box.h
Normal file
40
core/box/tiny_c3_inline_slots_env_box.h
Normal file
@ -0,0 +1,40 @@
|
||||
// tiny_c3_inline_slots_env_box.h - Phase 77-1: C3 Inline Slots ENV Gate
|
||||
//
|
||||
// Goal: Gate C3 inline slots feature via environment variable
|
||||
// Scope: C3 class only (64-128B allocations)
|
||||
// Design: Lazy-init cached decision pattern (zero overhead when disabled)
|
||||
//
|
||||
// ENV Variable: HAKMEM_TINY_C3_INLINE_SLOTS
|
||||
// - Value 0, unset, or empty: disabled (default OFF in Phase 77-1)
|
||||
// - Non-zero (e.g., 1): enabled
|
||||
// - Decision cached at first call
|
||||
//
|
||||
// Rationale:
|
||||
// - Separation of concerns (policy from mechanism)
|
||||
// - A/B testing support (enable/disable without recompile)
|
||||
// - Safe default: disabled until promoted to SSOT
|
||||
|
||||
#ifndef HAK_BOX_TINY_C3_INLINE_SLOTS_ENV_BOX_H
|
||||
#define HAK_BOX_TINY_C3_INLINE_SLOTS_ENV_BOX_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// ============================================================================
|
||||
// C3 Inline Slots: Environment Decision Gate
|
||||
// ============================================================================
|
||||
|
||||
// Runtime gate for C3 inline slots (ENV: HAKMEM_TINY_C3_INLINE_SLOTS).
//
// The environment is read on the first call only; the verdict is stored in a
// function-local static and returned directly on every later call.
//
// Returns: 1 if the variable is set to a non-empty string whose first
//          character is not '0'; 0 if it is unset, empty, or starts with '0'.
//
// NOTE(review): the cached flag is a plain static int without synchronization;
// confirm the first call cannot race with other threads.
static inline int tiny_c3_inline_slots_enabled(void) {
    static int cached_decision = -1;  // -1 = not decided yet

    if (__builtin_expect(cached_decision != -1, 1)) {
        return cached_decision;
    }

    // Slow path (first call): consult the environment once.
    const char *env = getenv("HAKMEM_TINY_C3_INLINE_SLOTS");
    if (env == NULL || env[0] == '\0' || env[0] == '0') {
        cached_decision = 0;
    } else {
        cached_decision = 1;
    }
    return cached_decision;
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C3_INLINE_SLOTS_ENV_BOX_H
|
||||
98
core/box/tiny_c3_inline_slots_tls_box.h
Normal file
98
core/box/tiny_c3_inline_slots_tls_box.h
Normal file
@ -0,0 +1,98 @@
|
||||
// tiny_c3_inline_slots_tls_box.h - Phase 77-1: C3 Inline Slots TLS Extension
|
||||
//
|
||||
// Goal: Extend TLS struct with C3-only inline slot ring buffer
|
||||
// Scope: C3 class only (capacity 256, 8-byte slots = 2KB per thread)
|
||||
// Design: Simple FIFO ring (head/tail indices, modulo 256)
|
||||
//
|
||||
// Ring Buffer Strategy:
|
||||
// - head: next pop position (consumer)
|
||||
// - tail: next push position (producer)
|
||||
// - Empty: head == tail
|
||||
// - Full: (tail + 1) % 256 == head
|
||||
// - Count: (tail - head + 256) % 256
|
||||
//
|
||||
// TLS Layout Impact:
|
||||
// - Size: 256 slots × 8 bytes = 2KB per thread (conservative cap, avoid cache-miss bloat)
|
||||
// - Alignment: 64-byte cache line aligned (NUMA-friendly)
|
||||
// - Lifetime: Zero-initialized at TLS init, valid for thread lifetime
|
||||
//
|
||||
// Rationale for cap=256:
|
||||
// - Phase 77-0 observation: unified_cache shows C3 has low traffic (1 miss in 20M ops)
|
||||
// - Conservative cap (2KB) to avoid Phase 74-2 cache-miss explosion
|
||||
// - Ring capacity > estimated max concurrent allocs in WS=400
|
||||
// - Larger than C4's 512B ring (64 slots), but the same power-of-two modulo math (256 = 2^8)
|
||||
//
|
||||
// Conditional Compilation:
|
||||
// - Only compiled if HAKMEM_TINY_C3_INLINE_SLOTS enabled
|
||||
// - Default OFF: zero overhead when disabled
|
||||
|
||||
#ifndef HAK_BOX_TINY_C3_INLINE_SLOTS_TLS_BOX_H
|
||||
#define HAK_BOX_TINY_C3_INLINE_SLOTS_TLS_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "tiny_c3_inline_slots_env_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C3 Inline Slots: TLS Structure
|
||||
// ============================================================================
|
||||
|
||||
// Number of ring slots for C3 inline slots (256 = 2^8).
// head/tail are uint8_t, so 256 is the largest capacity they can index.
#define TINY_C3_INLINE_CAPACITY 256

// Per-thread ring buffer for C3 inline slots.
//
// Layout: the slot array comes first (2KB with 8-byte pointers), followed by
// the two one-byte indices and explicit padding that rounds the index area up
// to 64 bytes. The whole struct is 64-byte aligned.
//
// Empty is head == tail. The "full" helper in this header reports full when
// advancing tail would reach head, which leaves one slot unused.
typedef struct __attribute__((aligned(64))) {
    void   *slots[TINY_C3_INLINE_CAPACITY];  // BASE pointers
    uint8_t head;                            // next pop position (consumer)
    uint8_t tail;                            // next push position (producer)
    uint8_t _pad[64 - 2 * sizeof(uint8_t)];  // 62 bytes: fill the index area to 64B
} TinyC3InlineSlots;

// Thread-local instance, one per thread. Only declared here; per the original
// note the definition lives in tiny_c3_inline_slots.c (not visible in this
// header). The declaration itself is unconditional.
extern __thread TinyC3InlineSlots g_tiny_c3_inline_slots;
|
||||
|
||||
// ============================================================================
|
||||
// Initialization
|
||||
// ============================================================================
|
||||
|
||||
// Initialize C3 inline slots for current thread
|
||||
// Called once at TLS init time (hakmem_tiny_init_thread or equivalent)
|
||||
// Returns: 1 if initialized, 0 if disabled
|
||||
static inline int tiny_c3_inline_slots_init(TinyC3InlineSlots* slots) {
|
||||
if (!tiny_c3_inline_slots_enabled()) {
|
||||
return 0; // Disabled, no init needed
|
||||
}
|
||||
|
||||
// Zero-initialize all slots
|
||||
memset(slots->slots, 0, sizeof(slots->slots));
|
||||
slots->head = 0;
|
||||
slots->tail = 0;
|
||||
|
||||
return 1; // Initialized
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Ring Buffer Helpers (inline for zero overhead)
|
||||
// ============================================================================
|
||||
|
||||
// Check if ring is empty
|
||||
static inline int c3_inline_empty(const TinyC3InlineSlots* slots) {
|
||||
return slots->head == slots->tail;
|
||||
}
|
||||
|
||||
// Check if ring is full
|
||||
static inline int c3_inline_full(const TinyC3InlineSlots* slots) {
|
||||
return ((slots->tail + 1) % TINY_C3_INLINE_CAPACITY) == slots->head;
|
||||
}
|
||||
|
||||
// Get current count (number of items in ring)
|
||||
static inline int c3_inline_count(const TinyC3InlineSlots* slots) {
|
||||
return (slots->tail - slots->head + TINY_C3_INLINE_CAPACITY) % TINY_C3_INLINE_CAPACITY;
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C3_INLINE_SLOTS_TLS_BOX_H
|
||||
61
core/box/tiny_c4_inline_slots_env_box.h
Normal file
61
core/box/tiny_c4_inline_slots_env_box.h
Normal file
@ -0,0 +1,61 @@
|
||||
// tiny_c4_inline_slots_env_box.h - Phase 76-1: C4 Inline Slots ENV Gate
|
||||
//
|
||||
// Goal: Runtime ENV gate for C4-only inline slots optimization
|
||||
// Scope: C4 class only (capacity 64, 8-byte slots)
|
||||
// Default: OFF (research box, ENV=0)
|
||||
//
|
||||
// ENV Variable:
|
||||
// HAKMEM_TINY_C4_INLINE_SLOTS=0/1 (default: 0, OFF)
|
||||
//
|
||||
// Design:
|
||||
// - Lazy-init pattern (single decision per TLS init)
|
||||
// - No TLS struct changes (pure gate)
|
||||
// - Idempotent initialization (every first call derives the same value from the ENV; no locking is used)
|
||||
//
|
||||
// Phase 76-1: C4-only implementation (extends C5+C6 pattern)
|
||||
// Phase 76-2: Measure C4 contribution to full optimization stack
|
||||
|
||||
#ifndef HAK_BOX_TINY_C4_INLINE_SLOTS_ENV_BOX_H
|
||||
#define HAK_BOX_TINY_C4_INLINE_SLOTS_ENV_BOX_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "../hakmem_build_flags.h"
|
||||
|
||||
// ============================================================================
|
||||
// ENV Gate: C4 Inline Slots
|
||||
// ============================================================================
|
||||
|
||||
// Runtime gate for C4 inline slots (ENV: HAKMEM_TINY_C4_INLINE_SLOTS).
//
// The environment is read on the first call only; the verdict is stored in a
// function-local static and returned directly on every later call. In
// non-release builds (HAKMEM_BUILD_RELEASE is 0 or undefined) the first call
// also logs the decision and the raw ENV value to stderr.
//
// Returns: 1 if the variable is set to a non-empty string whose first
//          character is not '0'; 0 if it is unset, empty, or starts with '0'.
//
// NOTE(review): the cached flag is a plain static int without synchronization;
// confirm the first call cannot race with other threads.
static inline int tiny_c4_inline_slots_enabled(void) {
    static int cached_decision = -1;  // -1 = not decided yet

    if (__builtin_expect(cached_decision != -1, 1)) {
        return cached_decision;
    }

    // Slow path (first call): consult the environment once.
    const char *env = getenv("HAKMEM_TINY_C4_INLINE_SLOTS");
    if (env == NULL || env[0] == '\0' || env[0] == '0') {
        cached_decision = 0;
    } else {
        cached_decision = 1;
    }

#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "[C4-INLINE-INIT] tiny_c4_inline_slots_enabled() = %d (env=%s)\n",
            cached_decision, env ? env : "NULL");
    fflush(stderr);
#endif

    return cached_decision;
}
|
||||
|
||||
// ============================================================================
|
||||
// Optional: Compile-time gate for Phase 76-2+ (future)
|
||||
// ============================================================================
|
||||
// When transitioning from research box (ENV-only) to production,
|
||||
// add compile-time flag to eliminate runtime branch overhead:
|
||||
//
|
||||
// #ifdef HAKMEM_TINY_C4_INLINE_SLOTS_COMPILED
|
||||
// return 1; // Compile-time ON
|
||||
// #else
|
||||
// return tiny_c4_inline_slots_enabled(); // Runtime ENV gate
|
||||
// #endif
|
||||
//
|
||||
// For Phase 76-1: Keep ENV-only (research box, default OFF)
|
||||
|
||||
#endif // HAK_BOX_TINY_C4_INLINE_SLOTS_ENV_BOX_H
|
||||
92
core/box/tiny_c4_inline_slots_tls_box.h
Normal file
92
core/box/tiny_c4_inline_slots_tls_box.h
Normal file
@ -0,0 +1,92 @@
|
||||
// tiny_c4_inline_slots_tls_box.h - Phase 76-1: C4 Inline Slots TLS Extension
|
||||
//
|
||||
// Goal: Extend TLS struct with C4-only inline slot ring buffer
|
||||
// Scope: C4 class only (capacity 64, 8-byte slots = 512B per thread)
|
||||
// Design: Simple FIFO ring (head/tail indices, modulo 64)
|
||||
//
|
||||
// Ring Buffer Strategy:
|
||||
// - head: next pop position (consumer)
|
||||
// - tail: next push position (producer)
|
||||
// - Empty: head == tail
|
||||
// - Full: (tail + 1) % 64 == head
|
||||
// - Count: (tail - head + 64) % 64
|
||||
//
|
||||
// TLS Layout Impact:
|
||||
// - Size: 64 slots × 8 bytes = 512B per thread (lighter than C5/C6's 1KB)
|
||||
// - Alignment: 64-byte cache line aligned (optional, for performance)
|
||||
// - Lifetime: Zero-initialized at TLS init, valid for thread lifetime
|
||||
//
|
||||
// Conditional Compilation:
|
||||
// - Only compiled if HAKMEM_TINY_C4_INLINE_SLOTS enabled
|
||||
// - Default OFF: zero overhead when disabled
|
||||
|
||||
#ifndef HAK_BOX_TINY_C4_INLINE_SLOTS_TLS_BOX_H
|
||||
#define HAK_BOX_TINY_C4_INLINE_SLOTS_TLS_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "tiny_c4_inline_slots_env_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C4 Inline Slots: TLS Structure
|
||||
// ============================================================================
|
||||
|
||||
// Number of ring slots for C4 inline slots (from Unified-STATS analysis).
// head/tail are uint8_t, so this value must not exceed 256.
#define TINY_C4_INLINE_CAPACITY 64

// Per-thread ring buffer for C4 inline slots.
//
// Layout: the slot array comes first (512B with 8-byte pointers), followed by
// the two one-byte indices and explicit padding that rounds the index area up
// to 64 bytes. The whole struct is 64-byte aligned.
//
// Empty is head == tail. The "full" helper in this header reports full when
// advancing tail would reach head, which leaves one slot unused.
typedef struct __attribute__((aligned(64))) {
    void   *slots[TINY_C4_INLINE_CAPACITY];  // BASE pointers
    uint8_t head;                            // next pop position (consumer)
    uint8_t tail;                            // next push position (producer)
    uint8_t _pad[64 - 2 * sizeof(uint8_t)];  // 62 bytes: fill the index area to 64B
} TinyC4InlineSlots;

// Thread-local instance, one per thread. Only declared here; per the original
// note the definition lives in tiny_c4_inline_slots.c (not visible in this
// header). The declaration itself is unconditional.
extern __thread TinyC4InlineSlots g_tiny_c4_inline_slots;
|
||||
|
||||
// ============================================================================
|
||||
// Initialization
|
||||
// ============================================================================
|
||||
|
||||
// Initialize C4 inline slots for current thread
|
||||
// Called once at TLS init time (hakmem_tiny_init_thread or equivalent)
|
||||
// Returns: 1 if initialized, 0 if disabled
|
||||
static inline int tiny_c4_inline_slots_init(TinyC4InlineSlots* slots) {
|
||||
if (!tiny_c4_inline_slots_enabled()) {
|
||||
return 0; // Disabled, no init needed
|
||||
}
|
||||
|
||||
// Zero-initialize all slots
|
||||
memset(slots->slots, 0, sizeof(slots->slots));
|
||||
slots->head = 0;
|
||||
slots->tail = 0;
|
||||
|
||||
return 1; // Initialized
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Ring Buffer Helpers (inline for zero overhead)
|
||||
// ============================================================================
|
||||
|
||||
// Check if ring is empty
|
||||
static inline int c4_inline_empty(const TinyC4InlineSlots* slots) {
|
||||
return slots->head == slots->tail;
|
||||
}
|
||||
|
||||
// Check if ring is full
|
||||
static inline int c4_inline_full(const TinyC4InlineSlots* slots) {
|
||||
return ((slots->tail + 1) % TINY_C4_INLINE_CAPACITY) == slots->head;
|
||||
}
|
||||
|
||||
// Get current count (number of items in ring)
|
||||
static inline int c4_inline_count(const TinyC4InlineSlots* slots) {
|
||||
return (slots->tail - slots->head + TINY_C4_INLINE_CAPACITY) % TINY_C4_INLINE_CAPACITY;
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C4_INLINE_SLOTS_TLS_BOX_H
|
||||
@ -35,6 +35,15 @@
|
||||
#include "../front/tiny_c6_inline_slots.h" // Phase 75-1: C6 inline slots API
|
||||
#include "tiny_c5_inline_slots_env_box.h" // Phase 75-2: C5 inline slots ENV gate
|
||||
#include "../front/tiny_c5_inline_slots.h" // Phase 75-2: C5 inline slots API
|
||||
#include "tiny_c4_inline_slots_env_box.h" // Phase 76-1: C4 inline slots ENV gate
|
||||
#include "../front/tiny_c4_inline_slots.h" // Phase 76-1: C4 inline slots API
|
||||
#include "tiny_c2_local_cache_env_box.h" // Phase 79-1: C2 local cache ENV gate
|
||||
#include "../front/tiny_c2_local_cache.h" // Phase 79-1: C2 local cache API
|
||||
#include "tiny_c3_inline_slots_env_box.h" // Phase 77-1: C3 inline slots ENV gate
|
||||
#include "../front/tiny_c3_inline_slots.h" // Phase 77-1: C3 inline slots API
|
||||
#include "tiny_inline_slots_fixed_mode_box.h" // Phase 78-1: Optional fixed-mode gating
|
||||
#include "tiny_inline_slots_switch_dispatch_box.h" // Phase 80-1: Switch dispatch for C4/C5/C6
|
||||
#include "tiny_inline_slots_switch_dispatch_fixed_box.h" // Phase 83-1: Switch dispatch fixed mode
|
||||
|
||||
// ============================================================================
|
||||
// Branch Prediction Macros (Pointer Safety - Prediction Hints)
|
||||
@ -114,9 +123,93 @@ __attribute__((always_inline))
|
||||
static inline void* tiny_hot_alloc_fast(int class_idx) {
|
||||
extern __thread TinyUnifiedCache g_unified_cache[];
|
||||
|
||||
// Phase 80-1: Switch dispatch for C4/C5/C6 (branch reduction optimization)
|
||||
// Phase 83-1: Per-op branch removed via fixed-mode caching
|
||||
// C2/C3 excluded (NO-GO from Phase 77-1/79-1)
|
||||
if (tiny_inline_slots_switch_dispatch_enabled_fast()) {
|
||||
// Switch mode: Direct jump to case (zero comparison overhead for C4/C5/C6)
|
||||
switch (class_idx) {
|
||||
case 4:
|
||||
if (tiny_c4_inline_slots_enabled_fast()) {
|
||||
void* base = c4_inline_pop(c4_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
if (tiny_c5_inline_slots_enabled_fast()) {
|
||||
void* base = c5_inline_pop(c5_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
if (tiny_c6_inline_slots_enabled_fast()) {
|
||||
void* base = c6_inline_pop(c6_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// C0-C3, C7: fall through to unified_cache
|
||||
break;
|
||||
}
|
||||
// Switch mode: fall through to unified_cache after miss
|
||||
} else {
|
||||
// If-chain mode (Phase 80-1 baseline): C3/C4/C5/C6 sequential checks
|
||||
// NOTE: C2 local cache (Phase 79-1 NO-GO) removed from hot path
|
||||
|
||||
// Phase 77-1: C3 Inline Slots early-exit (ENV gated)
|
||||
// Try C3 inline slots FIRST (before C4/C5/C6/unified cache) for class 3
|
||||
if (class_idx == 3 && tiny_c3_inline_slots_enabled_fast()) {
|
||||
void* base = c3_inline_pop(c3_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
// C3 inline miss → fall through to C4/C5/C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 76-1: C4 Inline Slots early-exit (ENV gated)
|
||||
// Try C4 inline slots SECOND (before C5/C6/unified cache) for class 4
|
||||
if (class_idx == 4 && tiny_c4_inline_slots_enabled_fast()) {
|
||||
void* base = c4_inline_pop(c4_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
// C4 inline miss → fall through to C5/C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 75-2: C5 Inline Slots early-exit (ENV gated)
|
||||
// Try C5 inline slots FIRST (before C6 and unified cache) for class 5
|
||||
if (class_idx == 5 && tiny_c5_inline_slots_enabled()) {
|
||||
// Try C5 inline slots SECOND (before C6 and unified cache) for class 5
|
||||
if (class_idx == 5 && tiny_c5_inline_slots_enabled_fast()) {
|
||||
void* base = c5_inline_pop(c5_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
@ -129,20 +222,21 @@ static inline void* tiny_hot_alloc_fast(int class_idx) {
|
||||
// C5 inline miss → fall through to C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
|
||||
// Try C6 inline slots SECOND (before unified cache) for class 6
|
||||
if (class_idx == 6 && tiny_c6_inline_slots_enabled()) {
|
||||
void* base = c6_inline_pop(c6_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
|
||||
// Try C6 inline slots THIRD (before unified cache) for class 6
|
||||
if (class_idx == 6 && tiny_c6_inline_slots_enabled_fast()) {
|
||||
void* base = c6_inline_pop(c6_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
// C6 inline miss → fall through to unified cache
|
||||
}
|
||||
// C6 inline miss → fall through to unified cache
|
||||
}
|
||||
} // End of if-chain mode
|
||||
|
||||
// TLS cache access (1 cache miss)
|
||||
// NOTE: Range check removed - caller (hak_tiny_size_to_class) guarantees valid class_idx
|
||||
|
||||
29
core/box/tiny_inline_slots_fixed_mode_box.c
Normal file
29
core/box/tiny_inline_slots_fixed_mode_box.c
Normal file
@ -0,0 +1,29 @@
|
||||
// tiny_inline_slots_fixed_mode_box.c - Phase 78-1: Inline Slots Fixed Mode Gate
|
||||
|
||||
#include "tiny_inline_slots_fixed_mode_box.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// Cached fixed-mode state, written by
// tiny_inline_slots_fixed_mode_refresh_from_env() and read by the
// tiny_c{3,4,5,6}_inline_slots_enabled_fast() helpers in the header.
// All flags are 0 or 1.
uint8_t g_tiny_inline_slots_fixed_enabled = 0;  // master switch (HAKMEM_TINY_INLINE_SLOTS_FIXED)
uint8_t g_tiny_c3_inline_slots_fixed = 0;       // snapshot of HAKMEM_TINY_C3_INLINE_SLOTS
uint8_t g_tiny_c4_inline_slots_fixed = 0;       // snapshot of HAKMEM_TINY_C4_INLINE_SLOTS
uint8_t g_tiny_c5_inline_slots_fixed = 0;       // snapshot of HAKMEM_TINY_C5_INLINE_SLOTS
uint8_t g_tiny_c6_inline_slots_fixed = 0;       // snapshot of HAKMEM_TINY_C6_INLINE_SLOTS

// Read a boolean ENV variable with default 0.
// Returns 1 if `key` is set to a non-empty value whose first character is not
// '0'; returns 0 if it is unset, empty, or starts with '0'.
static inline uint8_t hak_env_bool0(const char* key) {
    const char* v = getenv(key);
    return (v && *v && *v != '0') ? 1 : 0;
}

// Re-read the fixed-mode ENV configuration into the cached globals.
//
// When HAKMEM_TINY_INLINE_SLOTS_FIXED is on, each per-class flag is loaded
// from its own ENV variable. When it is off, the per-class flags are reset to
// 0 so that toggling fixed mode off never leaves values from an earlier
// refresh behind (previously the function returned early and kept them).
// The per-class variables are not read while fixed mode is off.
void tiny_inline_slots_fixed_mode_refresh_from_env(void) {
    const uint8_t fixed = hak_env_bool0("HAKMEM_TINY_INLINE_SLOTS_FIXED");

    g_tiny_inline_slots_fixed_enabled = fixed;

    if (!fixed) {
        // Fixed mode off: clear any snapshot taken by a previous refresh.
        g_tiny_c3_inline_slots_fixed = 0;
        g_tiny_c4_inline_slots_fixed = 0;
        g_tiny_c5_inline_slots_fixed = 0;
        g_tiny_c6_inline_slots_fixed = 0;
        return;
    }

    g_tiny_c3_inline_slots_fixed = hak_env_bool0("HAKMEM_TINY_C3_INLINE_SLOTS");
    g_tiny_c4_inline_slots_fixed = hak_env_bool0("HAKMEM_TINY_C4_INLINE_SLOTS");
    g_tiny_c5_inline_slots_fixed = hak_env_bool0("HAKMEM_TINY_C5_INLINE_SLOTS");
    g_tiny_c6_inline_slots_fixed = hak_env_bool0("HAKMEM_TINY_C6_INLINE_SLOTS");
}
|
||||
|
||||
78
core/box/tiny_inline_slots_fixed_mode_box.h
Normal file
78
core/box/tiny_inline_slots_fixed_mode_box.h
Normal file
@ -0,0 +1,78 @@
|
||||
// tiny_inline_slots_fixed_mode_box.h - Phase 78-1: Inline Slots Fixed Mode Gate
|
||||
//
|
||||
// Goal: Remove per-operation ENV gate overhead for C3/C4/C5/C6 inline slots.
|
||||
//
|
||||
// Design (Box Theory):
|
||||
// - Single boundary: bench_profile calls tiny_inline_slots_fixed_mode_refresh_from_env()
|
||||
// after applying presets (putenv defaults).
|
||||
// - Hot path: tiny_c{3,4,5,6}_inline_slots_enabled_fast() reads cached globals when
|
||||
// HAKMEM_TINY_INLINE_SLOTS_FIXED=1, otherwise falls back to the legacy ENV gates.
|
||||
// - Reversible: toggle HAKMEM_TINY_INLINE_SLOTS_FIXED=0/1.
|
||||
//
|
||||
// ENV:
|
||||
// - HAKMEM_TINY_INLINE_SLOTS_FIXED=0/1 (default 0)
|
||||
// - Uses existing per-class ENVs when fixed:
|
||||
// - HAKMEM_TINY_C3_INLINE_SLOTS
|
||||
// - HAKMEM_TINY_C4_INLINE_SLOTS
|
||||
// - HAKMEM_TINY_C5_INLINE_SLOTS
|
||||
// - HAKMEM_TINY_C6_INLINE_SLOTS
|
||||
|
||||
#ifndef HAK_BOX_TINY_INLINE_SLOTS_FIXED_MODE_BOX_H
|
||||
#define HAK_BOX_TINY_INLINE_SLOTS_FIXED_MODE_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "tiny_c3_inline_slots_env_box.h"
|
||||
#include "tiny_c4_inline_slots_env_box.h"
|
||||
#include "tiny_c5_inline_slots_env_box.h"
|
||||
#include "tiny_c6_inline_slots_env_box.h"
|
||||
|
||||
// Refresh (single boundary): bench_profile calls this after putenv defaults.
|
||||
void tiny_inline_slots_fixed_mode_refresh_from_env(void);
|
||||
|
||||
// Cached state (read in hot path).
|
||||
extern uint8_t g_tiny_inline_slots_fixed_enabled;
|
||||
extern uint8_t g_tiny_c3_inline_slots_fixed;
|
||||
extern uint8_t g_tiny_c4_inline_slots_fixed;
|
||||
extern uint8_t g_tiny_c5_inline_slots_fixed;
|
||||
extern uint8_t g_tiny_c6_inline_slots_fixed;
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_inline_slots_fixed_mode_enabled_fast(void) {
|
||||
return (int)g_tiny_inline_slots_fixed_enabled;
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_c3_inline_slots_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_c3_inline_slots_fixed;
|
||||
}
|
||||
return tiny_c3_inline_slots_enabled();
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_c4_inline_slots_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_c4_inline_slots_fixed;
|
||||
}
|
||||
return tiny_c4_inline_slots_enabled();
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_c5_inline_slots_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_c5_inline_slots_fixed;
|
||||
}
|
||||
return tiny_c5_inline_slots_enabled();
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_c6_inline_slots_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_c6_inline_slots_fixed;
|
||||
}
|
||||
return tiny_c6_inline_slots_enabled();
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_INLINE_SLOTS_FIXED_MODE_BOX_H
|
||||
|
||||
45
core/box/tiny_inline_slots_switch_dispatch_box.h
Normal file
45
core/box/tiny_inline_slots_switch_dispatch_box.h
Normal file
@ -0,0 +1,45 @@
|
||||
// tiny_inline_slots_switch_dispatch_box.h - Phase 80-1: Switch Dispatch for C4/C5/C6
|
||||
//
|
||||
// Goal: Eliminate multi-if comparison overhead for C4/C5/C6 inline slots
|
||||
// Scope: C4/C5/C6 only (C2/C3 are NO-GO, excluded from switch)
|
||||
// Design: Switch-case dispatch instead of if-chain
|
||||
//
|
||||
// Rationale:
|
||||
// - Current if-chain: C6 requires 4 failed comparisons (C2→C3→C4→C5→C6)
|
||||
// - Switch dispatch: Direct jump to case 4/5/6 (zero comparison overhead)
|
||||
// - C4-C6 are hot (SSOT from Phase 76-2), branch reduction has high ROI
|
||||
//
|
||||
// ENV Variable: HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH
|
||||
// - Value 0, unset, or empty: disabled (use if-chain, Phase 79-1 baseline)
|
||||
// - Non-zero (e.g., 1): enabled (use switch dispatch)
|
||||
// - Decision cached at first call
|
||||
//
|
||||
// Phase 80-0 Analysis:
|
||||
// - Baseline (if-chain): 1.35B branches, 4.84B instructions, 2.29 IPC
|
||||
// - Expected reduction: ~10-20% branch count for C4-C6 traffic
|
||||
// - Expected gain: +1-3% throughput (based on instruction/branch reduction)
|
||||
|
||||
#ifndef HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_BOX_H
|
||||
#define HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_BOX_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// ============================================================================
|
||||
// Switch Dispatch: Environment Decision Gate
|
||||
// ============================================================================
|
||||
|
||||
// Report whether switch dispatch was requested through the environment.
// The first call reads HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH and stores the
// outcome in a function-local static; subsequent calls return the stored
// outcome without reading the environment again.
// Returns: 1 if the variable is set, non-empty, and does not begin with '0';
//          0 otherwise.
static inline int tiny_inline_slots_switch_dispatch_enabled(void) {
    static int cached_decision = -1;  // -1 = environment not read yet

    int decision = cached_decision;
    if (__builtin_expect(decision < 0, 0)) {
        const char* raw = getenv("HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH");
        decision = (raw != NULL && raw[0] != '\0' && raw[0] != '0');
        cached_decision = decision;
    }
    return decision;
}
|
||||
|
||||
#endif // HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_BOX_H
|
||||
22
core/box/tiny_inline_slots_switch_dispatch_fixed_box.c
Normal file
22
core/box/tiny_inline_slots_switch_dispatch_fixed_box.c
Normal file
@ -0,0 +1,22 @@
|
||||
// tiny_inline_slots_switch_dispatch_fixed_box.c - Phase 83-1: Switch Dispatch Fixed Mode Gate
|
||||
|
||||
#include "tiny_inline_slots_switch_dispatch_fixed_box.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// Cached switch-dispatch fixed-mode state, both starting at 0.
// - ..._fixed_enabled: master flag, assigned from
//   HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED by
//   tiny_inline_slots_switch_dispatch_fixed_refresh_from_env().
// - ..._fixed: cached HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH decision, assigned
//   by the same refresh only when the master flag came out non-zero.
uint8_t g_tiny_inline_slots_switch_dispatch_fixed_enabled = 0;
|
||||
uint8_t g_tiny_inline_slots_switch_dispatch_fixed = 0;
|
||||
|
||||
// Interpret an environment variable as a boolean that defaults to 0.
// Returns 1 when `key` is set, non-empty, and its first character is not '0';
// returns 0 when it is unset, empty, or begins with '0'.
static inline uint8_t hak_env_bool0(const char* key) {
    const char* value = getenv(key);
    if (value == NULL) {
        return 0;
    }
    if (value[0] == '\0' || value[0] == '0') {
        return 0;
    }
    return 1;
}
|
||||
|
||||
void tiny_inline_slots_switch_dispatch_fixed_refresh_from_env(void) {
|
||||
g_tiny_inline_slots_switch_dispatch_fixed_enabled = hak_env_bool0("HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED");
|
||||
if (!g_tiny_inline_slots_switch_dispatch_fixed_enabled) {
|
||||
return;
|
||||
}
|
||||
|
||||
g_tiny_inline_slots_switch_dispatch_fixed = hak_env_bool0("HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH");
|
||||
}
|
||||
48
core/box/tiny_inline_slots_switch_dispatch_fixed_box.h
Normal file
48
core/box/tiny_inline_slots_switch_dispatch_fixed_box.h
Normal file
@ -0,0 +1,48 @@
|
||||
// tiny_inline_slots_switch_dispatch_fixed_box.h - Phase 83-1: Switch Dispatch Fixed Mode Gate
|
||||
//
|
||||
// Goal: Remove per-operation ENV gate overhead for switch dispatch check.
|
||||
//
|
||||
// Design (Box Theory):
|
||||
// - Single boundary: bench_profile calls tiny_inline_slots_switch_dispatch_fixed_refresh_from_env()
|
||||
// after applying presets (putenv defaults).
|
||||
// - Hot path: tiny_inline_slots_switch_dispatch_enabled_fast() reads cached global when
|
||||
// HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED=1, otherwise falls back to the legacy ENV gate.
|
||||
// - Reversible: toggle HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED=0/1.
|
||||
//
|
||||
// ENV:
|
||||
// - HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED=0/1 (default 0 for A/B testing)
|
||||
// - Uses existing HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH when fixed
|
||||
//
|
||||
// Rationale:
|
||||
// - Phase 80-1: switch dispatch gives +1.65% by eliminating if-chain comparisons
|
||||
// - Current: per-op ENV gate check `tiny_inline_slots_switch_dispatch_enabled()` adds 1 branch
|
||||
// - Phase 83-1: Pre-compute decision at startup, eliminate per-op branch
|
||||
// - Expected gain: +0.3-1.0% (similar to Phase 78-1 pattern)
|
||||
|
||||
#ifndef HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_FIXED_BOX_H
|
||||
#define HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_FIXED_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "tiny_inline_slots_switch_dispatch_box.h"
|
||||
|
||||
// Refresh (single boundary): bench_profile calls this after putenv defaults.
|
||||
void tiny_inline_slots_switch_dispatch_fixed_refresh_from_env(void);
|
||||
|
||||
// Cached state (read in hot path).
|
||||
extern uint8_t g_tiny_inline_slots_switch_dispatch_fixed_enabled;
|
||||
extern uint8_t g_tiny_inline_slots_switch_dispatch_fixed;
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_inline_slots_switch_dispatch_fixed_mode_enabled_fast(void) {
|
||||
return (int)g_tiny_inline_slots_switch_dispatch_fixed_enabled;
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_inline_slots_switch_dispatch_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_switch_dispatch_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_inline_slots_switch_dispatch_fixed;
|
||||
}
|
||||
return tiny_inline_slots_switch_dispatch_enabled();
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_FIXED_BOX_H
|
||||
@ -16,6 +16,15 @@
|
||||
#include "../front/tiny_c6_inline_slots.h" // Phase 75-1: C6 inline slots API
|
||||
#include "tiny_c5_inline_slots_env_box.h" // Phase 75-2: C5 inline slots ENV gate
|
||||
#include "../front/tiny_c5_inline_slots.h" // Phase 75-2: C5 inline slots API
|
||||
#include "tiny_c4_inline_slots_env_box.h" // Phase 76-1: C4 inline slots ENV gate
|
||||
#include "../front/tiny_c4_inline_slots.h" // Phase 76-1: C4 inline slots API
|
||||
#include "tiny_c2_local_cache_env_box.h" // Phase 79-1: C2 local cache ENV gate
|
||||
#include "../front/tiny_c2_local_cache.h" // Phase 79-1: C2 local cache API
|
||||
#include "tiny_c3_inline_slots_env_box.h" // Phase 77-1: C3 inline slots ENV gate
|
||||
#include "../front/tiny_c3_inline_slots.h" // Phase 77-1: C3 inline slots API
|
||||
#include "tiny_inline_slots_fixed_mode_box.h" // Phase 78-1: Optional fixed-mode gating
|
||||
#include "tiny_inline_slots_switch_dispatch_box.h" // Phase 80-1: Switch dispatch for C4/C5/C6
|
||||
#include "tiny_inline_slots_switch_dispatch_fixed_box.h" // Phase 83-1: Switch dispatch fixed mode
|
||||
|
||||
// Purpose: Encapsulate legacy free logic (shared by multiple paths)
|
||||
// Called by: malloc_tiny_fast.h (free path) + tiny_c6_ultra_free_box.c (C6 fallback)
|
||||
@ -27,9 +36,85 @@
|
||||
//
|
||||
__attribute__((always_inline))
|
||||
static inline void tiny_legacy_fallback_free_base_with_env(void* base, uint32_t class_idx, const HakmemEnvSnapshot* env) {
|
||||
// Phase 80-1: Switch dispatch for C4/C5/C6 (branch reduction optimization)
|
||||
// Phase 83-1: Per-op branch removed via fixed-mode caching
|
||||
// C2/C3 excluded (NO-GO from Phase 77-1/79-1)
|
||||
if (tiny_inline_slots_switch_dispatch_enabled_fast()) {
|
||||
// Switch mode: Direct jump to case (zero comparison overhead for C4/C5/C6)
|
||||
switch (class_idx) {
|
||||
case 4:
|
||||
if (tiny_c4_inline_slots_enabled_fast()) {
|
||||
if (c4_inline_push(c4_inline_tls(), base)) {
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
if (tiny_c5_inline_slots_enabled_fast()) {
|
||||
if (c5_inline_push(c5_inline_tls(), base)) {
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
if (tiny_c6_inline_slots_enabled_fast()) {
|
||||
if (c6_inline_push(c6_inline_tls(), base)) {
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// C0-C3, C7: fall through to unified_cache push
|
||||
break;
|
||||
}
|
||||
// Switch mode: fall through to unified_cache push after miss
|
||||
} else {
|
||||
// If-chain mode (Phase 80-1 baseline): C3/C4/C5/C6 sequential checks
|
||||
// NOTE: C2 local cache (Phase 79-1 NO-GO) removed from hot path
|
||||
|
||||
// Phase 77-1: C3 Inline Slots early-exit (ENV gated)
|
||||
// Try C3 inline slots SECOND (before C4/C5/C6/unified cache) for class 3
|
||||
if (class_idx == 3 && tiny_c3_inline_slots_enabled_fast()) {
|
||||
if (c3_inline_push(c3_inline_tls(), base)) {
|
||||
// Success: pushed to C3 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
// FULL → fall through to C4/C5/C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 76-1: C4 Inline Slots early-exit (ENV gated)
|
||||
// Try C4 inline slots SECOND (before C5/C6/unified cache) for class 4
|
||||
if (class_idx == 4 && tiny_c4_inline_slots_enabled_fast()) {
|
||||
if (c4_inline_push(c4_inline_tls(), base)) {
|
||||
// Success: pushed to C4 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
// FULL → fall through to C5/C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 75-2: C5 Inline Slots early-exit (ENV gated)
|
||||
// Try C5 inline slots FIRST (before C6 and unified cache) for class 5
|
||||
if (class_idx == 5 && tiny_c5_inline_slots_enabled()) {
|
||||
// Try C5 inline slots SECOND (before C6 and unified cache) for class 5
|
||||
if (class_idx == 5 && tiny_c5_inline_slots_enabled_fast()) {
|
||||
if (c5_inline_push(c5_inline_tls(), base)) {
|
||||
// Success: pushed to C5 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
@ -41,19 +126,20 @@ static inline void tiny_legacy_fallback_free_base_with_env(void* base, uint32_t
|
||||
// FULL → fall through to C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
|
||||
// Try C6 inline slots SECOND (before unified cache) for class 6
|
||||
if (class_idx == 6 && tiny_c6_inline_slots_enabled()) {
|
||||
if (c6_inline_push(c6_inline_tls(), base)) {
|
||||
// Success: pushed to C6 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
|
||||
// Try C6 inline slots THIRD (before unified cache) for class 6
|
||||
if (class_idx == 6 && tiny_c6_inline_slots_enabled_fast()) {
|
||||
if (c6_inline_push(c6_inline_tls(), base)) {
|
||||
// Success: pushed to C6 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
return;
|
||||
// FULL → fall through to unified cache
|
||||
}
|
||||
// FULL → fall through to unified cache
|
||||
}
|
||||
} // End of if-chain mode
|
||||
|
||||
const TinyFrontV3Snapshot* front_snap =
|
||||
env ? (env->tiny_front_v3_enabled ? tiny_front_v3_snapshot_get() : NULL)
|
||||
|
||||
73
core/front/tiny_c2_local_cache.h
Normal file
73
core/front/tiny_c2_local_cache.h
Normal file
@ -0,0 +1,73 @@
|
||||
// tiny_c2_local_cache.h - Phase 79-1: C2 Local Cache Fast-Path API
|
||||
//
|
||||
// Goal: Zero-overhead always-inline push/pop for C2 FIFO ring buffer
|
||||
// Scope: C2 allocations (32-64B)
|
||||
// Design: Fail-fast to unified_cache on full/empty
|
||||
//
|
||||
// Fast-Path Strategy:
|
||||
// - Always-inline push/pop for zero-call-overhead
|
||||
// - Modulo arithmetic inlined (tail/head)
|
||||
// - Return NULL on empty, 0 on full (caller handles fallback)
|
||||
// - No bounds checking (ring size fixed at compile time)
|
||||
//
|
||||
// Integration Points:
|
||||
// - Alloc: Call c2_local_cache_pop() in tiny_front_hot_box BEFORE unified_cache
|
||||
// - Free: Call c2_local_cache_push() in tiny_legacy_fallback BEFORE unified_cache
|
||||
//
|
||||
// Rationale:
|
||||
// - Same pattern as C3/C4/C5/C6 inline slots (proven +7.05% C4-C6 cumulative)
|
||||
// - Phase 79-0 analysis: C2 Stage3 backend lock contention (not well-served by TLS)
|
||||
// - Lightweight cap (64) = 512B/thread (Phase 79-0 specification)
|
||||
// - Fail-fast design = no performance cliff if full/empty
|
||||
|
||||
#ifndef HAK_FRONT_TINY_C2_LOCAL_CACHE_H
|
||||
#define HAK_FRONT_TINY_C2_LOCAL_CACHE_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "../box/tiny_c2_local_cache_tls_box.h"
|
||||
#include "../box/tiny_c2_local_cache_env_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C2 Local Cache: Fast-Path Push/Pop (Always-Inline)
|
||||
// ============================================================================
|
||||
|
||||
// Get TLS pointer for C2 local cache
|
||||
// Inline for zero overhead
|
||||
static inline TinyC2LocalCache* c2_local_cache_tls(void) {
|
||||
extern __thread TinyC2LocalCache g_tiny_c2_local_cache;
|
||||
return &g_tiny_c2_local_cache;
|
||||
}
|
||||
|
||||
// Push pointer to C2 local cache ring
|
||||
// Returns: 1 if success, 0 if full (caller must fallback to unified_cache)
|
||||
__attribute__((always_inline))
|
||||
static inline int c2_local_cache_push(TinyC2LocalCache* cache, void* ptr) {
|
||||
// Check if ring is full
|
||||
if (__builtin_expect(c2_local_cache_full(cache), 0)) {
|
||||
return 0; // Full, caller must use unified_cache
|
||||
}
|
||||
|
||||
// Enqueue at tail
|
||||
cache->slots[cache->tail] = ptr;
|
||||
cache->tail = (cache->tail + 1) % TINY_C2_LOCAL_CACHE_CAPACITY;
|
||||
|
||||
return 1; // Success
|
||||
}
|
||||
|
||||
// Pop pointer from C2 local cache ring
|
||||
// Returns: non-NULL if success, NULL if empty (caller must fallback to unified_cache)
|
||||
__attribute__((always_inline))
|
||||
static inline void* c2_local_cache_pop(TinyC2LocalCache* cache) {
|
||||
// Check if ring is empty
|
||||
if (__builtin_expect(c2_local_cache_empty(cache), 0)) {
|
||||
return NULL; // Empty, caller must use unified_cache
|
||||
}
|
||||
|
||||
// Dequeue from head
|
||||
void* ptr = cache->slots[cache->head];
|
||||
cache->head = (cache->head + 1) % TINY_C2_LOCAL_CACHE_CAPACITY;
|
||||
|
||||
return ptr; // Success
|
||||
}
|
||||
|
||||
#endif // HAK_FRONT_TINY_C2_LOCAL_CACHE_H
|
||||
73
core/front/tiny_c3_inline_slots.h
Normal file
73
core/front/tiny_c3_inline_slots.h
Normal file
@ -0,0 +1,73 @@
|
||||
// tiny_c3_inline_slots.h - Phase 77-1: C3 Inline Slots Fast-Path API
|
||||
//
|
||||
// Goal: Zero-overhead always-inline push/pop for C3 FIFO ring buffer
|
||||
// Scope: C3 allocations (64-128B)
|
||||
// Design: Fail-fast to unified_cache on full/empty
|
||||
//
|
||||
// Fast-Path Strategy:
|
||||
// - Always-inline push/pop for zero-call-overhead
|
||||
// - Modulo arithmetic inlined (tail/head)
|
||||
// - Return NULL on empty, 0 on full (caller handles fallback)
|
||||
// - No bounds checking (ring size fixed at compile time)
|
||||
//
|
||||
// Integration Points:
|
||||
// - Alloc: Call c3_inline_pop() in tiny_front_hot_box BEFORE unified_cache
|
||||
// - Free: Call c3_inline_push() in tiny_legacy_fallback BEFORE unified_cache
|
||||
//
|
||||
// Rationale:
|
||||
// - Same pattern as C4/C5/C6 inline slots (proven +7.05% cumulative)
|
||||
// - Conservative cap (256) = 2KB/thread (Phase 77-0 recommendation)
|
||||
// - Fail-fast design = no performance cliff if full/empty
|
||||
|
||||
#ifndef HAK_FRONT_TINY_C3_INLINE_SLOTS_H
|
||||
#define HAK_FRONT_TINY_C3_INLINE_SLOTS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "../box/tiny_c3_inline_slots_tls_box.h"
|
||||
#include "../box/tiny_c3_inline_slots_env_box.h"
|
||||
#include "../box/tiny_inline_slots_fixed_mode_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C3 Inline Slots: Fast-Path Push/Pop (Always-Inline)
|
||||
// ============================================================================
|
||||
|
||||
// Get TLS pointer for C3 inline slots
|
||||
// Inline for zero overhead
|
||||
static inline TinyC3InlineSlots* c3_inline_tls(void) {
|
||||
extern __thread TinyC3InlineSlots g_tiny_c3_inline_slots;
|
||||
return &g_tiny_c3_inline_slots;
|
||||
}
|
||||
|
||||
// Push pointer to C3 inline ring
|
||||
// Returns: 1 if success, 0 if full (caller must fallback to unified_cache)
|
||||
__attribute__((always_inline))
|
||||
static inline int c3_inline_push(TinyC3InlineSlots* slots, void* ptr) {
|
||||
// Check if ring is full
|
||||
if (__builtin_expect(c3_inline_full(slots), 0)) {
|
||||
return 0; // Full, caller must use unified_cache
|
||||
}
|
||||
|
||||
// Enqueue at tail
|
||||
slots->slots[slots->tail] = ptr;
|
||||
slots->tail = (slots->tail + 1) % TINY_C3_INLINE_CAPACITY;
|
||||
|
||||
return 1; // Success
|
||||
}
|
||||
|
||||
// Pop pointer from C3 inline ring
|
||||
// Returns: non-NULL if success, NULL if empty (caller must fallback to unified_cache)
|
||||
__attribute__((always_inline))
|
||||
static inline void* c3_inline_pop(TinyC3InlineSlots* slots) {
|
||||
// Check if ring is empty
|
||||
if (__builtin_expect(c3_inline_empty(slots), 0)) {
|
||||
return NULL; // Empty, caller must use unified_cache
|
||||
}
|
||||
|
||||
// Dequeue from head
|
||||
void* ptr = slots->slots[slots->head];
|
||||
slots->head = (slots->head + 1) % TINY_C3_INLINE_CAPACITY;
|
||||
|
||||
return ptr; // Success
|
||||
}
|
||||
|
||||
#endif // HAK_FRONT_TINY_C3_INLINE_SLOTS_H
|
||||
89
core/front/tiny_c4_inline_slots.h
Normal file
89
core/front/tiny_c4_inline_slots.h
Normal file
@ -0,0 +1,89 @@
|
||||
// tiny_c4_inline_slots.h - Phase 76-1: C4 Inline Slots Fast-Path API
|
||||
//
|
||||
// Goal: Zero-overhead fast-path API for C4 inline slot operations
|
||||
// Scope: C4 class only (separate from C5/C6, tested independently)
|
||||
// Design: Always-inline, fail-fast to unified_cache on FULL/empty
|
||||
//
|
||||
// Performance Target:
|
||||
// - Push: 1-2 cycles (ring index update, no bounds check)
|
||||
// - Pop: 1-2 cycles (ring index update, null check)
|
||||
// - Fallback: Silent delegation to unified_cache (existing path)
|
||||
//
|
||||
// Integration Points:
|
||||
// - Alloc: Try c4_inline_pop() first, fallback to C5→C6→unified_cache
|
||||
// - Free: Try c4_inline_push() first, fallback to C5→C6→unified_cache
|
||||
//
|
||||
// Safety:
|
||||
// - Caller must check c4_inline_enabled() before calling
|
||||
// - Caller must handle NULL return (pop) or full condition (push)
|
||||
// - No internal checks (fail-fast design)
|
||||
|
||||
#ifndef HAK_FRONT_TINY_C4_INLINE_SLOTS_H
|
||||
#define HAK_FRONT_TINY_C4_INLINE_SLOTS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "../box/tiny_c4_inline_slots_env_box.h"
|
||||
#include "../box/tiny_c4_inline_slots_tls_box.h"
|
||||
#include "../box/tiny_inline_slots_fixed_mode_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// Fast-Path API (always_inline for zero branch overhead)
|
||||
// ============================================================================
|
||||
|
||||
// Push to C4 inline slots (free path)
|
||||
// Returns: 1 on success, 0 if full (caller must fallback to unified_cache)
|
||||
// Precondition: ptr is valid BASE pointer for C4 class
|
||||
__attribute__((always_inline))
|
||||
static inline int c4_inline_push(TinyC4InlineSlots* slots, void* ptr) {
|
||||
// Full check (single branch, likely taken in steady state)
|
||||
if (__builtin_expect(c4_inline_full(slots), 0)) {
|
||||
return 0; // Full, caller must fallback
|
||||
}
|
||||
|
||||
// Push to tail (FIFO producer)
|
||||
slots->slots[slots->tail] = ptr;
|
||||
slots->tail = (slots->tail + 1) % TINY_C4_INLINE_CAPACITY;
|
||||
|
||||
return 1; // Success
|
||||
}
|
||||
|
||||
// Pop from C4 inline slots (alloc path)
|
||||
// Returns: BASE pointer on success, NULL if empty (caller must fallback to unified_cache)
|
||||
// Precondition: slots is initialized and enabled
|
||||
__attribute__((always_inline))
|
||||
static inline void* c4_inline_pop(TinyC4InlineSlots* slots) {
|
||||
// Empty check (single branch, likely NOT taken in steady state)
|
||||
if (__builtin_expect(c4_inline_empty(slots), 0)) {
|
||||
return NULL; // Empty, caller must fallback
|
||||
}
|
||||
|
||||
// Pop from head (FIFO consumer)
|
||||
void* ptr = slots->slots[slots->head];
|
||||
slots->head = (slots->head + 1) % TINY_C4_INLINE_CAPACITY;
|
||||
|
||||
return ptr; // BASE pointer (caller converts to USER)
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Integration Helpers (for malloc_tiny_fast.h integration)
|
||||
// ============================================================================
|
||||
|
||||
// Get TLS instance (wraps extern TLS variable)
|
||||
static inline TinyC4InlineSlots* c4_inline_tls(void) {
|
||||
return &g_tiny_c4_inline_slots;
|
||||
}
|
||||
|
||||
// Check if C4 inline is enabled AND initialized (combined gate)
|
||||
// Returns: 1 if ready to use, 0 if disabled or uninitialized
|
||||
static inline int c4_inline_ready(void) {
|
||||
if (!tiny_c4_inline_slots_enabled_fast()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// TLS init check (once per thread)
|
||||
// Note: In production, this check can be eliminated if TLS init is guaranteed
|
||||
TinyC4InlineSlots* slots = c4_inline_tls();
|
||||
return (slots->slots != NULL || slots->head == 0); // Initialized if zero or non-null
|
||||
}
|
||||
|
||||
#endif // HAK_FRONT_TINY_C4_INLINE_SLOTS_H
|
||||
@ -24,6 +24,7 @@
|
||||
#include <stdint.h>
|
||||
#include "../box/tiny_c5_inline_slots_env_box.h"
|
||||
#include "../box/tiny_c5_inline_slots_tls_box.h"
|
||||
#include "../box/tiny_inline_slots_fixed_mode_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// Fast-Path API (always_inline for zero branch overhead)
|
||||
@ -75,8 +76,7 @@ static inline TinyC5InlineSlots* c5_inline_tls(void) {
|
||||
// Check if C5 inline is enabled AND initialized (combined gate)
|
||||
// Returns: 1 if ready to use, 0 if disabled or uninitialized
|
||||
static inline int c5_inline_ready(void) {
|
||||
// ENV gate first (cached, zero cost after first call)
|
||||
if (!tiny_c5_inline_slots_enabled()) {
|
||||
if (!tiny_c5_inline_slots_enabled_fast()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -24,6 +24,7 @@
|
||||
#include <stdint.h>
|
||||
#include "../box/tiny_c6_inline_slots_env_box.h"
|
||||
#include "../box/tiny_c6_inline_slots_tls_box.h"
|
||||
#include "../box/tiny_inline_slots_fixed_mode_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// Fast-Path API (always_inline for zero branch overhead)
|
||||
@ -75,8 +76,7 @@ static inline TinyC6InlineSlots* c6_inline_tls(void) {
|
||||
// Check if C6 inline is enabled AND initialized (combined gate)
|
||||
// Returns: 1 if ready to use, 0 if disabled or uninitialized
|
||||
static inline int c6_inline_ready(void) {
|
||||
// ENV gate first (cached, zero cost after first call)
|
||||
if (!tiny_c6_inline_slots_enabled()) {
|
||||
if (!tiny_c6_inline_slots_enabled_fast()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
17
core/tiny_c2_local_cache.c
Normal file
17
core/tiny_c2_local_cache.c
Normal file
@ -0,0 +1,17 @@
|
||||
// tiny_c2_local_cache.c - Phase 79-1: C2 Local Cache TLS Variable Definition
|
||||
//
|
||||
// Goal: Define TLS variable for C2 local cache ring buffer
|
||||
// Scope: C2 class only
|
||||
// Design: Zero-initialized __thread variable
|
||||
|
||||
#include "box/tiny_c2_local_cache_tls_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C2 Local Cache: TLS Variable Definition
|
||||
// ============================================================================
|
||||
|
||||
// TLS ring buffer for C2 local cache
|
||||
// Automatically zero-initialized for each thread
|
||||
// Name: g_tiny_c2_local_cache
|
||||
// Size: 512B of slot storage per thread (64 slots × 8 bytes), plus 64 bytes padding
|
||||
__thread TinyC2LocalCache g_tiny_c2_local_cache = {0};
|
||||
17
core/tiny_c3_inline_slots.c
Normal file
17
core/tiny_c3_inline_slots.c
Normal file
@ -0,0 +1,17 @@
|
||||
// tiny_c3_inline_slots.c - Phase 77-1: C3 Inline Slots TLS Variable Definition
|
||||
//
|
||||
// Goal: Define TLS variable for C3 inline ring buffer
|
||||
// Scope: C3 class only
|
||||
// Design: Zero-initialized __thread variable
|
||||
|
||||
#include "box/tiny_c3_inline_slots_tls_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C3 Inline Slots: TLS Variable Definition
|
||||
// ============================================================================
|
||||
|
||||
// TLS ring buffer for C3 inline slots
|
||||
// Automatically zero-initialized for each thread
|
||||
// Name: g_tiny_c3_inline_slots
|
||||
// Size: 2KB of slot storage per thread (256 slots × 8 bytes), plus 64 bytes padding
|
||||
__thread TinyC3InlineSlots g_tiny_c3_inline_slots = {0};
|
||||
18
core/tiny_c4_inline_slots.c
Normal file
18
core/tiny_c4_inline_slots.c
Normal file
@ -0,0 +1,18 @@
|
||||
// tiny_c4_inline_slots.c - Phase 76-1: C4 Inline Slots TLS Variable Definition
|
||||
//
|
||||
// Goal: Define TLS variable for C4 inline slots
|
||||
// Scope: C4 class only (512B per thread)
|
||||
|
||||
#include "box/tiny_c4_inline_slots_tls_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// TLS Variable Definition
|
||||
// ============================================================================
|
||||
|
||||
// TLS instance (one per thread)
|
||||
// Zero-initialized by default (all slots NULL, head=0, tail=0)
|
||||
__thread TinyC4InlineSlots g_tiny_c4_inline_slots = {
|
||||
.slots = {0}, // All NULL
|
||||
.head = 0,
|
||||
.tail = 0,
|
||||
};
|
||||
Reference in New Issue
Block a user