Phase 83-1 + Allocator Comparison: Switch dispatch fixed (NO-GO +0.32%), PROFILE correction, SCORECARD update
Key changes: - Phase 83-1: Switch dispatch fixed mode (tiny_inline_slots_switch_dispatch_fixed_box) - NO-GO (marginal +0.32%, branch reduction negligible) Reason: lazy-init pattern already optimal, Phase 78-1 pattern shows diminishing returns - Allocator comparison baseline update (10-run SSOT, WS=400, ITERS=20M): tcmalloc: 115.26M (92.33% of mimalloc) jemalloc: 97.39M (77.96% of mimalloc) system: 85.20M (68.24% of mimalloc) mimalloc: 124.82M (baseline) - hakmem PROFILE correction: scripts/run_mixed_10_cleanenv.sh + run_allocator_quick_matrix.sh PROFILE explicitly set to MIXED_TINYV3_C7_SAFE for hakmem measurements Result: baseline stabilized to 55.53M (44.46% of mimalloc) Previous unstable measurement (35.57M) was due to profile leak - Documentation: * PERFORMANCE_TARGETS_SCORECARD.md: Reference allocators + M1/M2 milestone status * PHASE83_1_SWITCH_DISPATCH_FIXED_RESULTS.md: Phase 83-1 analysis (NO-GO) * ALLOCATOR_COMPARISON_QUICK_RUNBOOK.md: Quick comparison procedure * ALLOCATOR_COMPARISON_SSOT.md: Detailed SSOT methodology - M2 milestone status: 44.46% (target 55%, gap -10.54pp) - structural improvements needed 🤖 Generated with Claude Code Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
41
core/box/tiny_c2_local_cache_env_box.h
Normal file
41
core/box/tiny_c2_local_cache_env_box.h
Normal file
@ -0,0 +1,41 @@
|
||||
// tiny_c2_local_cache_env_box.h - Phase 79-1: C2 Local Cache ENV Gate
|
||||
//
|
||||
// Goal: Gate C2 local cache feature via environment variable
|
||||
// Scope: C2 class only (32-64B allocations)
|
||||
// Design: Lazy-init cached decision pattern (zero overhead when disabled)
|
||||
//
|
||||
// ENV Variable: HAKMEM_TINY_C2_LOCAL_CACHE
|
||||
// - Value 0, unset, or empty: disabled (default OFF in Phase 79-1)
|
||||
// - Non-zero (e.g., 1): enabled
|
||||
// - Decision cached at first call
|
||||
//
|
||||
// Rationale:
|
||||
// - Separation of concerns (policy from mechanism)
|
||||
// - A/B testing support (enable/disable without recompile)
|
||||
// - Safe default: disabled until Phase 79-1 A/B test validates +1.0% GO threshold
|
||||
// - Phase 79-0 analysis: C2 hits Stage3 backend lock (contention signal)
|
||||
|
||||
#ifndef HAK_BOX_TINY_C2_LOCAL_CACHE_ENV_BOX_H
|
||||
#define HAK_BOX_TINY_C2_LOCAL_CACHE_ENV_BOX_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// ============================================================================
|
||||
// C2 Local Cache: Environment Decision Gate
|
||||
// ============================================================================
|
||||
|
||||
// Report whether the C2 local cache is switched on through the environment.
//
// The first call reads HAKMEM_TINY_C2_LOCAL_CACHE and stores the verdict in a
// function-local static; later calls return the stored value without calling
// getenv() again. Unset, empty, or a value whose first character is '0' means
// disabled; any other value means enabled.
//
// Returns: 1 if enabled, 0 if disabled.
static inline int tiny_c2_local_cache_enabled(void) {
    static int cached_decision = -1;  // -1 = environment not consulted yet

    // Common case: the decision was already made.
    if (__builtin_expect(cached_decision != -1, 1)) {
        return cached_decision;
    }

    const char* raw = getenv("HAKMEM_TINY_C2_LOCAL_CACHE");
    int on = 0;
    if (raw != NULL && raw[0] != '\0' && raw[0] != '0') {
        on = 1;
    }

    cached_decision = on;
    return cached_decision;
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C2_LOCAL_CACHE_ENV_BOX_H
|
||||
99
core/box/tiny_c2_local_cache_tls_box.h
Normal file
99
core/box/tiny_c2_local_cache_tls_box.h
Normal file
@ -0,0 +1,99 @@
|
||||
// tiny_c2_local_cache_tls_box.h - Phase 79-1: C2 Local Cache TLS Extension
|
||||
//
|
||||
// Goal: Extend TLS struct with C2-only local cache ring buffer
|
||||
// Scope: C2 class only (capacity 64, 8-byte slots = 512B per thread)
|
||||
// Design: Simple FIFO ring (head/tail indices, modulo 64)
|
||||
//
|
||||
// Ring Buffer Strategy:
|
||||
// - head: next pop position (consumer)
|
||||
// - tail: next push position (producer)
|
||||
// - Empty: head == tail
|
||||
// - Full: (tail + 1) % 64 == head
|
||||
// - Count: (tail - head + 64) % 64
|
||||
//
|
||||
// TLS Layout Impact:
|
||||
// - Size: 64 slots × 8 bytes = 512B per thread (lightweight, Phase 79-0 spec)
|
||||
// - Alignment: 64-byte cache line aligned (NUMA-friendly)
|
||||
// - Lifetime: Zero-initialized at TLS init, valid for thread lifetime
|
||||
//
|
||||
// Rationale for cap=64:
|
||||
// - Phase 79-0 analysis: C2 hits Stage3 backend lock (cache miss pattern)
|
||||
// - Conservative cap (512B) to intercept C2 frees locally
|
||||
// - Capacity > max concurrent C2 allocations in WS=400
|
||||
// - Smaller than C3's 256 (Phase 77-1 precedent) to manage TLS bloat
|
||||
// - 64 = 2^6 (efficient modulo arithmetic)
|
||||
//
|
||||
// Gating (runtime, not compile-time):
// - This header has no #if guard; HAKMEM_TINY_C2_LOCAL_CACHE is read at runtime by the ENV gate
|
||||
// - Default OFF: zero overhead when disabled
|
||||
|
||||
#ifndef HAK_BOX_TINY_C2_LOCAL_CACHE_TLS_BOX_H
|
||||
#define HAK_BOX_TINY_C2_LOCAL_CACHE_TLS_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "tiny_c2_local_cache_env_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C2 Local Cache: TLS Structure
|
||||
// ============================================================================
|
||||
|
||||
// Number of slots in the C2 ring (64 = 2^6). head/tail are uint8_t, so this
// value must never exceed 256.
#define TINY_C2_LOCAL_CACHE_CAPACITY 64

// Ring buffer used as the C2 local cache (FIFO, head/tail indices).
// The struct is aligned to 64 bytes; the two index bytes plus the padding
// occupy exactly one further 64-byte unit after the slot array.
typedef struct __attribute__((aligned(64))) {
    void*   slots[TINY_C2_LOCAL_CACHE_CAPACITY];  // BASE pointers held in the ring
    uint8_t head;                                 // index of the next pop (consumer side)
    uint8_t tail;                                 // index of the next push (producer side)
    uint8_t _pad[64 - 2 * sizeof(uint8_t)];       // 62 bytes: fills out the trailing 64-byte unit
} TinyC2LocalCache;
|
||||
|
||||
// ============================================================================
|
||||
// TLS Variable (extern, defined in tiny_c2_local_cache.c)
|
||||
// ============================================================================
|
||||
|
||||
// TLS instance (one per thread)
|
||||
// Conditionally compiled: only if C2 local cache is enabled
|
||||
extern __thread TinyC2LocalCache g_tiny_c2_local_cache;
|
||||
|
||||
// ============================================================================
|
||||
// Initialization
|
||||
// ============================================================================
|
||||
|
||||
// Initialize C2 local cache for current thread
|
||||
// Called once at TLS init time (hakmem_tiny_init_thread or equivalent)
|
||||
// Returns: 1 if initialized, 0 if disabled
|
||||
static inline int tiny_c2_local_cache_init(TinyC2LocalCache* cache) {
|
||||
if (!tiny_c2_local_cache_enabled()) {
|
||||
return 0; // Disabled, no init needed
|
||||
}
|
||||
|
||||
// Zero-initialize all slots
|
||||
memset(cache->slots, 0, sizeof(cache->slots));
|
||||
cache->head = 0;
|
||||
cache->tail = 0;
|
||||
|
||||
return 1; // Initialized
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Ring Buffer Helpers (inline for zero overhead)
|
||||
// ============================================================================
|
||||
|
||||
// Check if ring is empty
|
||||
static inline int c2_local_cache_empty(const TinyC2LocalCache* cache) {
|
||||
return cache->head == cache->tail;
|
||||
}
|
||||
|
||||
// Check if ring is full
|
||||
static inline int c2_local_cache_full(const TinyC2LocalCache* cache) {
|
||||
return ((cache->tail + 1) % TINY_C2_LOCAL_CACHE_CAPACITY) == cache->head;
|
||||
}
|
||||
|
||||
// Get current count (number of items in ring)
|
||||
static inline int c2_local_cache_count(const TinyC2LocalCache* cache) {
|
||||
return (cache->tail - cache->head + TINY_C2_LOCAL_CACHE_CAPACITY) % TINY_C2_LOCAL_CACHE_CAPACITY;
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C2_LOCAL_CACHE_TLS_BOX_H
|
||||
40
core/box/tiny_c3_inline_slots_env_box.h
Normal file
40
core/box/tiny_c3_inline_slots_env_box.h
Normal file
@ -0,0 +1,40 @@
|
||||
// tiny_c3_inline_slots_env_box.h - Phase 77-1: C3 Inline Slots ENV Gate
|
||||
//
|
||||
// Goal: Gate C3 inline slots feature via environment variable
|
||||
// Scope: C3 class only (64-128B allocations)
|
||||
// Design: Lazy-init cached decision pattern (zero overhead when disabled)
|
||||
//
|
||||
// ENV Variable: HAKMEM_TINY_C3_INLINE_SLOTS
|
||||
// - Value 0, unset, or empty: disabled (default OFF in Phase 77-1)
|
||||
// - Non-zero (e.g., 1): enabled
|
||||
// - Decision cached at first call
|
||||
//
|
||||
// Rationale:
|
||||
// - Separation of concerns (policy from mechanism)
|
||||
// - A/B testing support (enable/disable without recompile)
|
||||
// - Safe default: disabled until promoted to SSOT
|
||||
|
||||
#ifndef HAK_BOX_TINY_C3_INLINE_SLOTS_ENV_BOX_H
|
||||
#define HAK_BOX_TINY_C3_INLINE_SLOTS_ENV_BOX_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// ============================================================================
|
||||
// C3 Inline Slots: Environment Decision Gate
|
||||
// ============================================================================
|
||||
|
||||
// Report whether C3 inline slots are switched on through the environment.
//
// The first call reads HAKMEM_TINY_C3_INLINE_SLOTS and stores the verdict in a
// function-local static; later calls return the stored value without calling
// getenv() again. Unset, empty, or a value whose first character is '0' means
// disabled; any other value means enabled.
//
// Returns: 1 if enabled, 0 if disabled.
static inline int tiny_c3_inline_slots_enabled(void) {
    static int cached_decision = -1;  // -1 = environment not consulted yet

    // Common case: the decision was already made.
    if (__builtin_expect(cached_decision != -1, 1)) {
        return cached_decision;
    }

    const char* raw = getenv("HAKMEM_TINY_C3_INLINE_SLOTS");
    int on = 0;
    if (raw != NULL && raw[0] != '\0' && raw[0] != '0') {
        on = 1;
    }

    cached_decision = on;
    return cached_decision;
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C3_INLINE_SLOTS_ENV_BOX_H
|
||||
98
core/box/tiny_c3_inline_slots_tls_box.h
Normal file
98
core/box/tiny_c3_inline_slots_tls_box.h
Normal file
@ -0,0 +1,98 @@
|
||||
// tiny_c3_inline_slots_tls_box.h - Phase 77-1: C3 Inline Slots TLS Extension
|
||||
//
|
||||
// Goal: Extend TLS struct with C3-only inline slot ring buffer
|
||||
// Scope: C3 class only (capacity 256, 8-byte slots = 2KB per thread)
|
||||
// Design: Simple FIFO ring (head/tail indices, modulo 256)
|
||||
//
|
||||
// Ring Buffer Strategy:
|
||||
// - head: next pop position (consumer)
|
||||
// - tail: next push position (producer)
|
||||
// - Empty: head == tail
|
||||
// - Full: (tail + 1) % 256 == head
|
||||
// - Count: (tail - head + 256) % 256
|
||||
//
|
||||
// TLS Layout Impact:
|
||||
// - Size: 256 slots × 8 bytes = 2KB per thread (conservative cap, avoid cache-miss bloat)
|
||||
// - Alignment: 64-byte cache line aligned (NUMA-friendly)
|
||||
// - Lifetime: Zero-initialized at TLS init, valid for thread lifetime
|
||||
//
|
||||
// Rationale for cap=256:
|
||||
// - Phase 77-0 observation: unified_cache shows C3 has low traffic (1 miss in 20M ops)
|
||||
// - Conservative cap (2KB) to avoid Phase 74-2 cache-miss explosion
|
||||
// - Ring capacity > estimated max concurrent allocs in WS=400
|
||||
// - Larger than C4's 512B ring (2KB here), but the same power-of-two modulo math (256 = 2^8)
|
||||
//
|
||||
// Gating (runtime, not compile-time):
// - This header has no #if guard; HAKMEM_TINY_C3_INLINE_SLOTS is read at runtime by the ENV gate
|
||||
// - Default OFF: zero overhead when disabled
|
||||
|
||||
#ifndef HAK_BOX_TINY_C3_INLINE_SLOTS_TLS_BOX_H
|
||||
#define HAK_BOX_TINY_C3_INLINE_SLOTS_TLS_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "tiny_c3_inline_slots_env_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C3 Inline Slots: TLS Structure
|
||||
// ============================================================================
|
||||
|
||||
// Number of slots in the C3 ring (256 = 2^8). head/tail are uint8_t, so 256 is
// the largest capacity these indices can address.
#define TINY_C3_INLINE_CAPACITY 256

// Ring buffer holding the C3 inline slots (FIFO, head/tail indices).
// The struct is aligned to 64 bytes; the two index bytes plus the padding
// occupy exactly one further 64-byte unit after the slot array.
typedef struct __attribute__((aligned(64))) {
    void*   slots[TINY_C3_INLINE_CAPACITY];   // BASE pointers held in the ring
    uint8_t head;                             // index of the next pop (consumer side)
    uint8_t tail;                             // index of the next push (producer side)
    uint8_t _pad[64 - 2 * sizeof(uint8_t)];   // 62 bytes: fills out the trailing 64-byte unit
} TinyC3InlineSlots;
|
||||
|
||||
// ============================================================================
|
||||
// TLS Variable (extern, defined in tiny_c3_inline_slots.c)
|
||||
// ============================================================================
|
||||
|
||||
// TLS instance (one per thread)
|
||||
// Conditionally compiled: only if C3 inline slots are enabled
|
||||
extern __thread TinyC3InlineSlots g_tiny_c3_inline_slots;
|
||||
|
||||
// ============================================================================
|
||||
// Initialization
|
||||
// ============================================================================
|
||||
|
||||
// Initialize C3 inline slots for current thread
|
||||
// Called once at TLS init time (hakmem_tiny_init_thread or equivalent)
|
||||
// Returns: 1 if initialized, 0 if disabled
|
||||
static inline int tiny_c3_inline_slots_init(TinyC3InlineSlots* slots) {
|
||||
if (!tiny_c3_inline_slots_enabled()) {
|
||||
return 0; // Disabled, no init needed
|
||||
}
|
||||
|
||||
// Zero-initialize all slots
|
||||
memset(slots->slots, 0, sizeof(slots->slots));
|
||||
slots->head = 0;
|
||||
slots->tail = 0;
|
||||
|
||||
return 1; // Initialized
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Ring Buffer Helpers (inline for zero overhead)
|
||||
// ============================================================================
|
||||
|
||||
// Check if ring is empty
|
||||
static inline int c3_inline_empty(const TinyC3InlineSlots* slots) {
|
||||
return slots->head == slots->tail;
|
||||
}
|
||||
|
||||
// Check if ring is full
|
||||
static inline int c3_inline_full(const TinyC3InlineSlots* slots) {
|
||||
return ((slots->tail + 1) % TINY_C3_INLINE_CAPACITY) == slots->head;
|
||||
}
|
||||
|
||||
// Get current count (number of items in ring)
|
||||
static inline int c3_inline_count(const TinyC3InlineSlots* slots) {
|
||||
return (slots->tail - slots->head + TINY_C3_INLINE_CAPACITY) % TINY_C3_INLINE_CAPACITY;
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C3_INLINE_SLOTS_TLS_BOX_H
|
||||
61
core/box/tiny_c4_inline_slots_env_box.h
Normal file
61
core/box/tiny_c4_inline_slots_env_box.h
Normal file
@ -0,0 +1,61 @@
|
||||
// tiny_c4_inline_slots_env_box.h - Phase 76-1: C4 Inline Slots ENV Gate
|
||||
//
|
||||
// Goal: Runtime ENV gate for C4-only inline slots optimization
|
||||
// Scope: C4 class only (capacity 64, 8-byte slots)
|
||||
// Default: OFF (research box, ENV=0)
|
||||
//
|
||||
// ENV Variable:
|
||||
// HAKMEM_TINY_C4_INLINE_SLOTS=0/1 (default: 0, OFF)
|
||||
//
|
||||
// Design:
|
||||
// - Lazy-init pattern (single decision per TLS init)
|
||||
// - No TLS struct changes (pure gate)
|
||||
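// - Idempotent initialization (racing first calls compute the same value), but the cache is a plain static int with no atomics or locks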
|
||||
//
|
||||
// Phase 76-1: C4-only implementation (extends C5+C6 pattern)
|
||||
// Phase 76-2: Measure C4 contribution to full optimization stack
|
||||
|
||||
#ifndef HAK_BOX_TINY_C4_INLINE_SLOTS_ENV_BOX_H
|
||||
#define HAK_BOX_TINY_C4_INLINE_SLOTS_ENV_BOX_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "../hakmem_build_flags.h"
|
||||
|
||||
// ============================================================================
|
||||
// ENV Gate: C4 Inline Slots
|
||||
// ============================================================================
|
||||
|
||||
// Report whether C4 inline slots are switched on through the environment.
//
// The first call reads HAKMEM_TINY_C4_INLINE_SLOTS and stores the verdict in a
// function-local static; later calls return the stored value without calling
// getenv() again. Unset, empty, or a value whose first character is '0' means
// disabled; any other value means enabled.
//
// When HAKMEM_BUILD_RELEASE is zero or undefined, the first call also logs the
// decision and the raw variable to stderr.
//
// Returns: 1 if enabled, 0 if disabled.
static inline int tiny_c4_inline_slots_enabled(void) {
    static int cached_decision = -1;  // -1 = environment not consulted yet

    // Common case: the decision was already made.
    if (__builtin_expect(cached_decision != -1, 1)) {
        return cached_decision;
    }

    const char* raw = getenv("HAKMEM_TINY_C4_INLINE_SLOTS");
    cached_decision = (raw == NULL || raw[0] == '\0' || raw[0] == '0') ? 0 : 1;

#if !HAKMEM_BUILD_RELEASE
    // The decision is stored before logging, so a re-entrant call made while
    // printing sees the cached value.
    fprintf(stderr, "[C4-INLINE-INIT] tiny_c4_inline_slots_enabled() = %d (env=%s)\n",
            cached_decision, raw ? raw : "NULL");
    fflush(stderr);
#endif

    return cached_decision;
}
|
||||
|
||||
// ============================================================================
|
||||
// Optional: Compile-time gate for Phase 76-2+ (future)
|
||||
// ============================================================================
|
||||
// When transitioning from research box (ENV-only) to production,
|
||||
// add compile-time flag to eliminate runtime branch overhead:
|
||||
//
|
||||
// #ifdef HAKMEM_TINY_C4_INLINE_SLOTS_COMPILED
|
||||
// return 1; // Compile-time ON
|
||||
// #else
|
||||
// return tiny_c4_inline_slots_enabled(); // Runtime ENV gate
|
||||
// #endif
|
||||
//
|
||||
// For Phase 76-1: Keep ENV-only (research box, default OFF)
|
||||
|
||||
#endif // HAK_BOX_TINY_C4_INLINE_SLOTS_ENV_BOX_H
|
||||
92
core/box/tiny_c4_inline_slots_tls_box.h
Normal file
92
core/box/tiny_c4_inline_slots_tls_box.h
Normal file
@ -0,0 +1,92 @@
|
||||
// tiny_c4_inline_slots_tls_box.h - Phase 76-1: C4 Inline Slots TLS Extension
|
||||
//
|
||||
// Goal: Extend TLS struct with C4-only inline slot ring buffer
|
||||
// Scope: C4 class only (capacity 64, 8-byte slots = 512B per thread)
|
||||
// Design: Simple FIFO ring (head/tail indices, modulo 64)
|
||||
//
|
||||
// Ring Buffer Strategy:
|
||||
// - head: next pop position (consumer)
|
||||
// - tail: next push position (producer)
|
||||
// - Empty: head == tail
|
||||
// - Full: (tail + 1) % 64 == head
|
||||
// - Count: (tail - head + 64) % 64
|
||||
//
|
||||
// TLS Layout Impact:
|
||||
// - Size: 64 slots × 8 bytes = 512B per thread (lighter than C5/C6's 1KB)
|
||||
// - Alignment: 64-byte cache line aligned (optional, for performance)
|
||||
// - Lifetime: Zero-initialized at TLS init, valid for thread lifetime
|
||||
//
|
||||
// Gating (runtime, not compile-time):
// - This header has no #if guard; HAKMEM_TINY_C4_INLINE_SLOTS is read at runtime by the ENV gate
|
||||
// - Default OFF: zero overhead when disabled
|
||||
|
||||
#ifndef HAK_BOX_TINY_C4_INLINE_SLOTS_TLS_BOX_H
|
||||
#define HAK_BOX_TINY_C4_INLINE_SLOTS_TLS_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "tiny_c4_inline_slots_env_box.h"
|
||||
|
||||
// ============================================================================
|
||||
// C4 Inline Slots: TLS Structure
|
||||
// ============================================================================
|
||||
|
||||
// Number of slots in the C4 ring. head/tail are uint8_t, so this value must
// never exceed 256.
#define TINY_C4_INLINE_CAPACITY 64

// Ring buffer holding the C4 inline slots (FIFO, head/tail indices).
// The struct is aligned to 64 bytes; the two index bytes plus the padding
// occupy exactly one further 64-byte unit after the slot array.
typedef struct __attribute__((aligned(64))) {
    void*   slots[TINY_C4_INLINE_CAPACITY];   // BASE pointers held in the ring
    uint8_t head;                             // index of the next pop (consumer side)
    uint8_t tail;                             // index of the next push (producer side)
    uint8_t _pad[64 - 2 * sizeof(uint8_t)];   // 62 bytes: fills out the trailing 64-byte unit
} TinyC4InlineSlots;
|
||||
|
||||
// ============================================================================
|
||||
// TLS Variable (extern, defined in tiny_c4_inline_slots.c)
|
||||
// ============================================================================
|
||||
|
||||
// TLS instance (one per thread)
|
||||
// Conditionally compiled: only if C4 inline slots are enabled
|
||||
extern __thread TinyC4InlineSlots g_tiny_c4_inline_slots;
|
||||
|
||||
// ============================================================================
|
||||
// Initialization
|
||||
// ============================================================================
|
||||
|
||||
// Initialize C4 inline slots for current thread
|
||||
// Called once at TLS init time (hakmem_tiny_init_thread or equivalent)
|
||||
// Returns: 1 if initialized, 0 if disabled
|
||||
static inline int tiny_c4_inline_slots_init(TinyC4InlineSlots* slots) {
|
||||
if (!tiny_c4_inline_slots_enabled()) {
|
||||
return 0; // Disabled, no init needed
|
||||
}
|
||||
|
||||
// Zero-initialize all slots
|
||||
memset(slots->slots, 0, sizeof(slots->slots));
|
||||
slots->head = 0;
|
||||
slots->tail = 0;
|
||||
|
||||
return 1; // Initialized
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Ring Buffer Helpers (inline for zero overhead)
|
||||
// ============================================================================
|
||||
|
||||
// Check if ring is empty
|
||||
static inline int c4_inline_empty(const TinyC4InlineSlots* slots) {
|
||||
return slots->head == slots->tail;
|
||||
}
|
||||
|
||||
// Check if ring is full
|
||||
static inline int c4_inline_full(const TinyC4InlineSlots* slots) {
|
||||
return ((slots->tail + 1) % TINY_C4_INLINE_CAPACITY) == slots->head;
|
||||
}
|
||||
|
||||
// Get current count (number of items in ring)
|
||||
static inline int c4_inline_count(const TinyC4InlineSlots* slots) {
|
||||
return (slots->tail - slots->head + TINY_C4_INLINE_CAPACITY) % TINY_C4_INLINE_CAPACITY;
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_C4_INLINE_SLOTS_TLS_BOX_H
|
||||
@ -35,6 +35,15 @@
|
||||
#include "../front/tiny_c6_inline_slots.h" // Phase 75-1: C6 inline slots API
|
||||
#include "tiny_c5_inline_slots_env_box.h" // Phase 75-2: C5 inline slots ENV gate
|
||||
#include "../front/tiny_c5_inline_slots.h" // Phase 75-2: C5 inline slots API
|
||||
#include "tiny_c4_inline_slots_env_box.h" // Phase 76-1: C4 inline slots ENV gate
|
||||
#include "../front/tiny_c4_inline_slots.h" // Phase 76-1: C4 inline slots API
|
||||
#include "tiny_c2_local_cache_env_box.h" // Phase 79-1: C2 local cache ENV gate
|
||||
#include "../front/tiny_c2_local_cache.h" // Phase 79-1: C2 local cache API
|
||||
#include "tiny_c3_inline_slots_env_box.h" // Phase 77-1: C3 inline slots ENV gate
|
||||
#include "../front/tiny_c3_inline_slots.h" // Phase 77-1: C3 inline slots API
|
||||
#include "tiny_inline_slots_fixed_mode_box.h" // Phase 78-1: Optional fixed-mode gating
|
||||
#include "tiny_inline_slots_switch_dispatch_box.h" // Phase 80-1: Switch dispatch for C4/C5/C6
|
||||
#include "tiny_inline_slots_switch_dispatch_fixed_box.h" // Phase 83-1: Switch dispatch fixed mode
|
||||
|
||||
// ============================================================================
|
||||
// Branch Prediction Macros (Pointer Safety - Prediction Hints)
|
||||
@ -114,9 +123,93 @@ __attribute__((always_inline))
|
||||
static inline void* tiny_hot_alloc_fast(int class_idx) {
|
||||
extern __thread TinyUnifiedCache g_unified_cache[];
|
||||
|
||||
// Phase 80-1: Switch dispatch for C4/C5/C6 (branch reduction optimization)
|
||||
// Phase 83-1: Per-op branch removed via fixed-mode caching
|
||||
// C2/C3 excluded (NO-GO from Phase 77-1/79-1)
|
||||
if (tiny_inline_slots_switch_dispatch_enabled_fast()) {
|
||||
// Switch mode: Direct jump to case (zero comparison overhead for C4/C5/C6)
|
||||
switch (class_idx) {
|
||||
case 4:
|
||||
if (tiny_c4_inline_slots_enabled_fast()) {
|
||||
void* base = c4_inline_pop(c4_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
if (tiny_c5_inline_slots_enabled_fast()) {
|
||||
void* base = c5_inline_pop(c5_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
if (tiny_c6_inline_slots_enabled_fast()) {
|
||||
void* base = c6_inline_pop(c6_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// C0-C3, C7: fall through to unified_cache
|
||||
break;
|
||||
}
|
||||
// Switch mode: fall through to unified_cache after miss
|
||||
} else {
|
||||
// If-chain mode (Phase 80-1 baseline): C3/C4/C5/C6 sequential checks
|
||||
// NOTE: C2 local cache (Phase 79-1 NO-GO) removed from hot path
|
||||
|
||||
// Phase 77-1: C3 Inline Slots early-exit (ENV gated)
|
||||
// Try C3 inline slots SECOND (before C4/C5/C6/unified cache) for class 3
|
||||
if (class_idx == 3 && tiny_c3_inline_slots_enabled_fast()) {
|
||||
void* base = c3_inline_pop(c3_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
// C3 inline miss → fall through to C4/C5/C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 76-1: C4 Inline Slots early-exit (ENV gated)
|
||||
// Try C4 inline slots SECOND (before C5/C6/unified cache) for class 4
|
||||
if (class_idx == 4 && tiny_c4_inline_slots_enabled_fast()) {
|
||||
void* base = c4_inline_pop(c4_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
// C4 inline miss → fall through to C5/C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 75-2: C5 Inline Slots early-exit (ENV gated)
|
||||
// Try C5 inline slots FIRST (before C6 and unified cache) for class 5
|
||||
if (class_idx == 5 && tiny_c5_inline_slots_enabled()) {
|
||||
// Try C5 inline slots SECOND (before C6 and unified cache) for class 5
|
||||
if (class_idx == 5 && tiny_c5_inline_slots_enabled_fast()) {
|
||||
void* base = c5_inline_pop(c5_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
@ -129,20 +222,21 @@ static inline void* tiny_hot_alloc_fast(int class_idx) {
|
||||
// C5 inline miss → fall through to C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
|
||||
// Try C6 inline slots SECOND (before unified cache) for class 6
|
||||
if (class_idx == 6 && tiny_c6_inline_slots_enabled()) {
|
||||
void* base = c6_inline_pop(c6_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
|
||||
// Try C6 inline slots THIRD (before unified cache) for class 6
|
||||
if (class_idx == 6 && tiny_c6_inline_slots_enabled_fast()) {
|
||||
void* base = c6_inline_pop(c6_inline_tls());
|
||||
if (TINY_HOT_LIKELY(base != NULL)) {
|
||||
TINY_HOT_METRICS_HIT(class_idx);
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
return tiny_header_finalize_alloc(base, class_idx);
|
||||
#else
|
||||
return base;
|
||||
#endif
|
||||
}
|
||||
// C6 inline miss → fall through to unified cache
|
||||
}
|
||||
// C6 inline miss → fall through to unified cache
|
||||
}
|
||||
} // End of if-chain mode
|
||||
|
||||
// TLS cache access (1 cache miss)
|
||||
// NOTE: Range check removed - caller (hak_tiny_size_to_class) guarantees valid class_idx
|
||||
|
||||
29
core/box/tiny_inline_slots_fixed_mode_box.c
Normal file
29
core/box/tiny_inline_slots_fixed_mode_box.c
Normal file
@ -0,0 +1,29 @@
|
||||
// tiny_inline_slots_fixed_mode_box.c - Phase 78-1: Inline Slots Fixed Mode Gate
|
||||
|
||||
#include "tiny_inline_slots_fixed_mode_box.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// Cached gate state read by the *_enabled_fast() helpers. Everything starts at
// 0, so fixed mode is off until a refresh turns it on.
uint8_t g_tiny_inline_slots_fixed_enabled = 0;  // master switch for fixed mode
uint8_t g_tiny_c3_inline_slots_fixed = 0;       // snapshot of HAKMEM_TINY_C3_INLINE_SLOTS
uint8_t g_tiny_c4_inline_slots_fixed = 0;       // snapshot of HAKMEM_TINY_C4_INLINE_SLOTS
uint8_t g_tiny_c5_inline_slots_fixed = 0;       // snapshot of HAKMEM_TINY_C5_INLINE_SLOTS
uint8_t g_tiny_c6_inline_slots_fixed = 0;       // snapshot of HAKMEM_TINY_C6_INLINE_SLOTS

// Read environment variable `key` as a boolean that defaults to 0.
// Unset, empty, or a value whose first character is '0' yields 0; anything
// else yields 1.
static inline uint8_t hak_env_bool0(const char* key) {
    const char* text = getenv(key);
    if (text == NULL || text[0] == '\0' || text[0] == '0') {
        return 0;
    }
    return 1;
}

// Re-read the fixed-mode configuration from the environment.
//
// The master switch HAKMEM_TINY_INLINE_SLOTS_FIXED is always refreshed. The
// per-class snapshots are refreshed only when the master switch is on; when it
// is off they keep whatever value they had before this call.
void tiny_inline_slots_fixed_mode_refresh_from_env(void) {
    g_tiny_inline_slots_fixed_enabled = hak_env_bool0("HAKMEM_TINY_INLINE_SLOTS_FIXED");

    if (g_tiny_inline_slots_fixed_enabled) {
        g_tiny_c3_inline_slots_fixed = hak_env_bool0("HAKMEM_TINY_C3_INLINE_SLOTS");
        g_tiny_c4_inline_slots_fixed = hak_env_bool0("HAKMEM_TINY_C4_INLINE_SLOTS");
        g_tiny_c5_inline_slots_fixed = hak_env_bool0("HAKMEM_TINY_C5_INLINE_SLOTS");
        g_tiny_c6_inline_slots_fixed = hak_env_bool0("HAKMEM_TINY_C6_INLINE_SLOTS");
    }
}
|
||||
|
||||
78
core/box/tiny_inline_slots_fixed_mode_box.h
Normal file
78
core/box/tiny_inline_slots_fixed_mode_box.h
Normal file
@ -0,0 +1,78 @@
|
||||
// tiny_inline_slots_fixed_mode_box.h - Phase 78-1: Inline Slots Fixed Mode Gate
|
||||
//
|
||||
// Goal: Remove per-operation ENV gate overhead for C3/C4/C5/C6 inline slots.
|
||||
//
|
||||
// Design (Box Theory):
|
||||
// - Single boundary: bench_profile calls tiny_inline_slots_fixed_mode_refresh_from_env()
|
||||
// after applying presets (putenv defaults).
|
||||
// - Hot path: tiny_c{3,4,5,6}_inline_slots_enabled_fast() reads cached globals when
|
||||
// HAKMEM_TINY_INLINE_SLOTS_FIXED=1, otherwise falls back to the legacy ENV gates.
|
||||
// - Reversible: toggle HAKMEM_TINY_INLINE_SLOTS_FIXED=0/1.
|
||||
//
|
||||
// ENV:
|
||||
// - HAKMEM_TINY_INLINE_SLOTS_FIXED=0/1 (default 0)
|
||||
// - Uses existing per-class ENVs when fixed:
|
||||
// - HAKMEM_TINY_C3_INLINE_SLOTS
|
||||
// - HAKMEM_TINY_C4_INLINE_SLOTS
|
||||
// - HAKMEM_TINY_C5_INLINE_SLOTS
|
||||
// - HAKMEM_TINY_C6_INLINE_SLOTS
|
||||
|
||||
#ifndef HAK_BOX_TINY_INLINE_SLOTS_FIXED_MODE_BOX_H
|
||||
#define HAK_BOX_TINY_INLINE_SLOTS_FIXED_MODE_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "tiny_c3_inline_slots_env_box.h"
|
||||
#include "tiny_c4_inline_slots_env_box.h"
|
||||
#include "tiny_c5_inline_slots_env_box.h"
|
||||
#include "tiny_c6_inline_slots_env_box.h"
|
||||
|
||||
// Refresh (single boundary): bench_profile calls this after putenv defaults.
|
||||
void tiny_inline_slots_fixed_mode_refresh_from_env(void);
|
||||
|
||||
// Cached state (read in hot path).
|
||||
extern uint8_t g_tiny_inline_slots_fixed_enabled;
|
||||
extern uint8_t g_tiny_c3_inline_slots_fixed;
|
||||
extern uint8_t g_tiny_c4_inline_slots_fixed;
|
||||
extern uint8_t g_tiny_c5_inline_slots_fixed;
|
||||
extern uint8_t g_tiny_c6_inline_slots_fixed;
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_inline_slots_fixed_mode_enabled_fast(void) {
|
||||
return (int)g_tiny_inline_slots_fixed_enabled;
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_c3_inline_slots_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_c3_inline_slots_fixed;
|
||||
}
|
||||
return tiny_c3_inline_slots_enabled();
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_c4_inline_slots_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_c4_inline_slots_fixed;
|
||||
}
|
||||
return tiny_c4_inline_slots_enabled();
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_c5_inline_slots_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_c5_inline_slots_fixed;
|
||||
}
|
||||
return tiny_c5_inline_slots_enabled();
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_c6_inline_slots_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_c6_inline_slots_fixed;
|
||||
}
|
||||
return tiny_c6_inline_slots_enabled();
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_INLINE_SLOTS_FIXED_MODE_BOX_H
|
||||
|
||||
45
core/box/tiny_inline_slots_switch_dispatch_box.h
Normal file
45
core/box/tiny_inline_slots_switch_dispatch_box.h
Normal file
@ -0,0 +1,45 @@
|
||||
// tiny_inline_slots_switch_dispatch_box.h - Phase 80-1: Switch Dispatch for C4/C5/C6
|
||||
//
|
||||
// Goal: Eliminate multi-if comparison overhead for C4/C5/C6 inline slots
|
||||
// Scope: C4/C5/C6 only (C2/C3 are NO-GO, excluded from switch)
|
||||
// Design: Switch-case dispatch instead of if-chain
|
||||
//
|
||||
// Rationale:
|
||||
// - Current if-chain: C6 requires 4 failed comparisons (C2→C3→C4→C5→C6)
|
||||
// - Switch dispatch: Direct jump to case 4/5/6 (zero comparison overhead)
|
||||
// - C4-C6 are hot (SSOT from Phase 76-2), branch reduction has high ROI
|
||||
//
|
||||
// ENV Variable: HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH
|
||||
// - Value 0, unset, or empty: disabled (use if-chain, Phase 79-1 baseline)
|
||||
// - Non-zero (e.g., 1): enabled (use switch dispatch)
|
||||
// - Decision cached at first call
|
||||
//
|
||||
// Phase 80-0 Analysis:
|
||||
// - Baseline (if-chain): 1.35B branches, 4.84B instructions, 2.29 IPC
|
||||
// - Expected reduction: ~10-20% branch count for C4-C6 traffic
|
||||
// - Expected gain: +1-3% throughput (based on instruction/branch reduction)
|
||||
|
||||
#ifndef HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_BOX_H
|
||||
#define HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_BOX_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// ============================================================================
|
||||
// Switch Dispatch: Environment Decision Gate
|
||||
// ============================================================================
|
||||
|
||||
// Report whether switch dispatch is enabled via the environment.
//
// The first call reads HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH and caches the
// decision in a function-local static; later calls return the cached value
// without consulting the environment again.
//
// Returns 1 when the variable is set, non-empty, and its first character is
// not '0'; returns 0 otherwise (unset, empty, or starting with '0').
//
// NOTE(review): the cache is a plain static int with no synchronization in
// this function; confirm the first call cannot race with other threads.
static inline int tiny_inline_slots_switch_dispatch_enabled(void) {
    static int cached_decision = -1;  // -1 = environment not consulted yet

    // Common case: the decision was already made on an earlier call.
    if (__builtin_expect(cached_decision != -1, 1)) {
        return cached_decision;
    }

    // First call: read the environment once and remember the outcome.
    const char *raw = getenv("HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH");
    int enabled = 0;
    if (raw != NULL && raw[0] != '\0' && raw[0] != '0') {
        enabled = 1;
    }
    cached_decision = enabled;
    return enabled;
}
|
||||
|
||||
#endif // HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_BOX_H
|
||||
22
core/box/tiny_inline_slots_switch_dispatch_fixed_box.c
Normal file
22
core/box/tiny_inline_slots_switch_dispatch_fixed_box.c
Normal file
@ -0,0 +1,22 @@
|
||||
// tiny_inline_slots_switch_dispatch_fixed_box.c - Phase 83-1: Switch Dispatch Fixed Mode Gate
|
||||
|
||||
#include "tiny_inline_slots_switch_dispatch_fixed_box.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// Non-zero when HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED was truthy at the
// last tiny_inline_slots_switch_dispatch_fixed_refresh_from_env() call.
// Starts at 0 (fixed mode off).
uint8_t g_tiny_inline_slots_switch_dispatch_fixed_enabled = 0;

// Value of HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH captured by the refresh
// function while fixed mode is on. Starts at 0.
uint8_t g_tiny_inline_slots_switch_dispatch_fixed = 0;
|
||||
|
||||
// Read an environment variable as a boolean that defaults to 0.
//
// key: name of the environment variable to look up.
//
// Returns 1 when the variable is set, non-empty, and its first character is
// not '0'; returns 0 otherwise. Only the first character is inspected, so
// "01" yields 0 and "yes" yields 1.
static inline uint8_t hak_env_bool0(const char *key) {
    const char *value = getenv(key);
    if (value == NULL || value[0] == '\0' || value[0] == '0') {
        return 0;
    }
    return 1;
}
|
||||
|
||||
void tiny_inline_slots_switch_dispatch_fixed_refresh_from_env(void) {
|
||||
g_tiny_inline_slots_switch_dispatch_fixed_enabled = hak_env_bool0("HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED");
|
||||
if (!g_tiny_inline_slots_switch_dispatch_fixed_enabled) {
|
||||
return;
|
||||
}
|
||||
|
||||
g_tiny_inline_slots_switch_dispatch_fixed = hak_env_bool0("HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH");
|
||||
}
|
||||
48
core/box/tiny_inline_slots_switch_dispatch_fixed_box.h
Normal file
48
core/box/tiny_inline_slots_switch_dispatch_fixed_box.h
Normal file
@ -0,0 +1,48 @@
|
||||
// tiny_inline_slots_switch_dispatch_fixed_box.h - Phase 83-1: Switch Dispatch Fixed Mode Gate
|
||||
//
|
||||
// Goal: Remove per-operation ENV gate overhead for switch dispatch check.
|
||||
//
|
||||
// Design (Box Theory):
|
||||
// - Single boundary: bench_profile calls tiny_inline_slots_switch_dispatch_fixed_refresh_from_env()
|
||||
// after applying presets (putenv defaults).
|
||||
// - Hot path: tiny_inline_slots_switch_dispatch_enabled_fast() reads cached global when
|
||||
// HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED=1, otherwise falls back to the legacy ENV gate.
|
||||
// - Reversible: toggle HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED=0/1.
|
||||
//
|
||||
// ENV:
|
||||
// - HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH_FIXED=0/1 (default 0 for A/B testing)
|
||||
// - HAKMEM_TINY_INLINE_SLOTS_SWITCHDISPATCH (existing): when fixed mode is on, its value is read once at refresh time and cached
|
||||
//
|
||||
// Rationale:
|
||||
// - Phase 80-1: switch dispatch gives +1.65% by eliminating if-chain comparisons
|
||||
// - Current: per-op ENV gate check `tiny_inline_slots_switch_dispatch_enabled()` adds 1 branch
|
||||
// - Phase 83-1: Pre-compute decision at startup, eliminate per-op branch
|
||||
// - Expected gain: +0.3-1.0% (similar to Phase 78-1 pattern)
|
||||
|
||||
#ifndef HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_FIXED_BOX_H
|
||||
#define HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_FIXED_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "tiny_inline_slots_switch_dispatch_box.h"
|
||||
|
||||
// Refresh (single boundary): bench_profile calls this after putenv defaults.
|
||||
void tiny_inline_slots_switch_dispatch_fixed_refresh_from_env(void);
|
||||
|
||||
// Cached state (read in hot path).
|
||||
extern uint8_t g_tiny_inline_slots_switch_dispatch_fixed_enabled;
|
||||
extern uint8_t g_tiny_inline_slots_switch_dispatch_fixed;
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_inline_slots_switch_dispatch_fixed_mode_enabled_fast(void) {
|
||||
return (int)g_tiny_inline_slots_switch_dispatch_fixed_enabled;
|
||||
}
|
||||
|
||||
__attribute__((always_inline))
|
||||
static inline int tiny_inline_slots_switch_dispatch_enabled_fast(void) {
|
||||
if (__builtin_expect(g_tiny_inline_slots_switch_dispatch_fixed_enabled, 0)) {
|
||||
return (int)g_tiny_inline_slots_switch_dispatch_fixed;
|
||||
}
|
||||
return tiny_inline_slots_switch_dispatch_enabled();
|
||||
}
|
||||
|
||||
#endif // HAK_BOX_TINY_INLINE_SLOTS_SWITCH_DISPATCH_FIXED_BOX_H
|
||||
@ -16,6 +16,15 @@
|
||||
#include "../front/tiny_c6_inline_slots.h" // Phase 75-1: C6 inline slots API
|
||||
#include "tiny_c5_inline_slots_env_box.h" // Phase 75-2: C5 inline slots ENV gate
|
||||
#include "../front/tiny_c5_inline_slots.h" // Phase 75-2: C5 inline slots API
|
||||
#include "tiny_c4_inline_slots_env_box.h" // Phase 76-1: C4 inline slots ENV gate
|
||||
#include "../front/tiny_c4_inline_slots.h" // Phase 76-1: C4 inline slots API
|
||||
#include "tiny_c2_local_cache_env_box.h" // Phase 79-1: C2 local cache ENV gate
|
||||
#include "../front/tiny_c2_local_cache.h" // Phase 79-1: C2 local cache API
|
||||
#include "tiny_c3_inline_slots_env_box.h" // Phase 77-1: C3 inline slots ENV gate
|
||||
#include "../front/tiny_c3_inline_slots.h" // Phase 77-1: C3 inline slots API
|
||||
#include "tiny_inline_slots_fixed_mode_box.h" // Phase 78-1: Optional fixed-mode gating
|
||||
#include "tiny_inline_slots_switch_dispatch_box.h" // Phase 80-1: Switch dispatch for C4/C5/C6
|
||||
#include "tiny_inline_slots_switch_dispatch_fixed_box.h" // Phase 83-1: Switch dispatch fixed mode
|
||||
|
||||
// Purpose: Encapsulate legacy free logic (shared by multiple paths)
|
||||
// Called by: malloc_tiny_fast.h (free path) + tiny_c6_ultra_free_box.c (C6 fallback)
|
||||
@ -27,9 +36,85 @@
|
||||
//
|
||||
__attribute__((always_inline))
|
||||
static inline void tiny_legacy_fallback_free_base_with_env(void* base, uint32_t class_idx, const HakmemEnvSnapshot* env) {
|
||||
// Phase 80-1: Switch dispatch for C4/C5/C6 (branch reduction optimization)
|
||||
// Phase 83-1: Per-op branch removed via fixed-mode caching
|
||||
// C2/C3 excluded (NO-GO from Phase 77-1/79-1)
|
||||
if (tiny_inline_slots_switch_dispatch_enabled_fast()) {
|
||||
// Switch mode: Direct jump to case (zero comparison overhead for C4/C5/C6)
|
||||
switch (class_idx) {
|
||||
case 4:
|
||||
if (tiny_c4_inline_slots_enabled_fast()) {
|
||||
if (c4_inline_push(c4_inline_tls(), base)) {
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
if (tiny_c5_inline_slots_enabled_fast()) {
|
||||
if (c5_inline_push(c5_inline_tls(), base)) {
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
if (tiny_c6_inline_slots_enabled_fast()) {
|
||||
if (c6_inline_push(c6_inline_tls(), base)) {
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// C0-C3, C7: fall through to unified_cache push
|
||||
break;
|
||||
}
|
||||
// Switch mode: fall through to unified_cache push after miss
|
||||
} else {
|
||||
// If-chain mode (Phase 80-1 baseline): C3/C4/C5/C6 sequential checks
|
||||
// NOTE: C2 local cache (Phase 79-1 NO-GO) removed from hot path
|
||||
|
||||
// Phase 77-1: C3 Inline Slots early-exit (ENV gated)
|
||||
// Try C3 inline slots SECOND (before C4/C5/C6/unified cache) for class 3
|
||||
if (class_idx == 3 && tiny_c3_inline_slots_enabled_fast()) {
|
||||
if (c3_inline_push(c3_inline_tls(), base)) {
|
||||
// Success: pushed to C3 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
// FULL → fall through to C4/C5/C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 76-1: C4 Inline Slots early-exit (ENV gated)
|
||||
// Try C4 inline slots SECOND (before C5/C6/unified cache) for class 4
|
||||
if (class_idx == 4 && tiny_c4_inline_slots_enabled_fast()) {
|
||||
if (c4_inline_push(c4_inline_tls(), base)) {
|
||||
// Success: pushed to C4 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
// FULL → fall through to C5/C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 75-2: C5 Inline Slots early-exit (ENV gated)
|
||||
// Try C5 inline slots FIRST (before C6 and unified cache) for class 5
|
||||
if (class_idx == 5 && tiny_c5_inline_slots_enabled()) {
|
||||
// Try C5 inline slots SECOND (before C6 and unified cache) for class 5
|
||||
if (class_idx == 5 && tiny_c5_inline_slots_enabled_fast()) {
|
||||
if (c5_inline_push(c5_inline_tls(), base)) {
|
||||
// Success: pushed to C5 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
@ -41,19 +126,20 @@ static inline void tiny_legacy_fallback_free_base_with_env(void* base, uint32_t
|
||||
// FULL → fall through to C6/unified cache
|
||||
}
|
||||
|
||||
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
|
||||
// Try C6 inline slots SECOND (before unified cache) for class 6
|
||||
if (class_idx == 6 && tiny_c6_inline_slots_enabled()) {
|
||||
if (c6_inline_push(c6_inline_tls(), base)) {
|
||||
// Success: pushed to C6 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
// Phase 75-1: C6 Inline Slots early-exit (ENV gated)
|
||||
// Try C6 inline slots THIRD (before unified cache) for class 6
|
||||
if (class_idx == 6 && tiny_c6_inline_slots_enabled_fast()) {
|
||||
if (c6_inline_push(c6_inline_tls(), base)) {
|
||||
// Success: pushed to C6 inline slots
|
||||
FREE_PATH_STAT_INC(legacy_fallback);
|
||||
if (__builtin_expect(free_path_stats_enabled(), 0)) {
|
||||
g_free_path_stats.legacy_by_class[class_idx]++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
return;
|
||||
// FULL → fall through to unified cache
|
||||
}
|
||||
// FULL → fall through to unified cache
|
||||
}
|
||||
} // End of if-chain mode
|
||||
|
||||
const TinyFrontV3Snapshot* front_snap =
|
||||
env ? (env->tiny_front_v3_enabled ? tiny_front_v3_snapshot_get() : NULL)
|
||||
|
||||
Reference in New Issue
Block a user