Phase FREE-LEGACY-OPT-6: C4 ULTRA Implementation
Implement the C4 ULTRA free TLS cache with a parasitic free+alloc pattern, eliminating 99.7-99.9% of C4 legacy fallback calls.

Key Features:
- TLS cache cap=64 (tuned for L1 cache fit, smaller than C5/C6's 128)
- Segment learning via ss_fast_lookup() on first free
- Free-side cache push + alloc-side TLS pop pattern
- ENV gate: HAKMEM_TINY_C4_ULTRA_FREE_ENABLED (default OFF)
- Full FREE_PATH_STATS instrumentation

Benchmark Results:
C4-heavy (65-128B range):
- C4 legacy: 591,583 → 1,711 (-99.7%)
- c4_ultra cache hits: ~599k (free) + ~599k (alloc)
- Mixed load: 340,732 → 284 C4 legacy (-99.9%)

Legacy fallback reduction:
- C4-heavy: 589,872 fewer legacy calls (-10.9% total)
- Mixed: 340,448 fewer C4 legacy calls (-12.8% in mixed)

Performance note: ~2% throughput cost in the isolated C4-heavy case, an acceptable tradeoff for 99%+ legacy elimination per class.

Files:
- NEW: core/box/tiny_c4_ultra_free_box.h/c
- NEW: core/box/tiny_c4_ultra_free_env_box.h
- MOD: core/box/tiny_ultra_classes_box.h (added C4 macros)
- MOD: core/box/free_path_stats_box.h/c (C4 ULTRA counters)
- MOD: core/front/malloc_tiny_fast.h (C4 alloc+free integration)
- MOD: Makefile (added C4 ULTRA object)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
@ -16,13 +16,15 @@ static void free_path_stats_dump(void) {
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stderr, "[FREE_PATH_STATS] total=%lu c7_ultra=%lu c6_ultra_free=%lu c6_ultra_alloc=%lu c5_ultra_free=%lu c5_ultra_alloc=%lu small_v3=%lu v6=%lu tiny_v1=%lu pool_v1=%lu remote=%lu super_lookup=%lu legacy_fb=%lu\n",
|
||||
fprintf(stderr, "[FREE_PATH_STATS] total=%lu c7_ultra=%lu c6_ultra_free=%lu c6_ultra_alloc=%lu c5_ultra_free=%lu c5_ultra_alloc=%lu c4_ultra_free=%lu c4_ultra_alloc=%lu small_v3=%lu v6=%lu tiny_v1=%lu pool_v1=%lu remote=%lu super_lookup=%lu legacy_fb=%lu\n",
|
||||
g_free_path_stats.total_calls,
|
||||
g_free_path_stats.c7_ultra_fast,
|
||||
g_free_path_stats.c6_ultra_free_fast, // Phase 4-2
|
||||
g_free_path_stats.c6_ultra_alloc_hit, // Phase 4-4
|
||||
g_free_path_stats.c5_ultra_free_fast, // Phase 5-1
|
||||
g_free_path_stats.c5_ultra_alloc_hit, // Phase 5-2
|
||||
g_free_path_stats.c4_ultra_free_fast, // Phase 6
|
||||
g_free_path_stats.c4_ultra_alloc_hit, // Phase 6
|
||||
g_free_path_stats.smallheap_v3_fast,
|
||||
g_free_path_stats.smallheap_v6_fast,
|
||||
g_free_path_stats.tiny_heap_v1_fast,
|
||||
|
||||
@ -13,6 +13,8 @@ typedef struct FreePathStats {
|
||||
uint64_t c6_ultra_alloc_hit; // Phase 4-4: C6 ULTRA-alloc (TLS pop)
|
||||
uint64_t c5_ultra_free_fast; // Phase 5-1: C5 ULTRA-free
|
||||
uint64_t c5_ultra_alloc_hit; // Phase 5-2: C5 ULTRA-alloc (TLS pop)
|
||||
uint64_t c4_ultra_free_fast; // Phase 6: C4 ULTRA-free (cap=64)
|
||||
uint64_t c4_ultra_alloc_hit; // Phase 6: C4 ULTRA-alloc (TLS pop)
|
||||
uint64_t smallheap_v3_fast;
|
||||
uint64_t smallheap_v6_fast;
|
||||
uint64_t tiny_heap_v1_fast;
|
||||
|
||||
49
core/box/tiny_c4_ultra_free_box.c
Normal file
49
core/box/tiny_c4_ultra_free_box.c
Normal file
@ -0,0 +1,49 @@
|
||||
#include "tiny_c4_ultra_free_box.h"
|
||||
#include "tiny_legacy_fallback_box.h" // Phase REFACTOR-2: Unified legacy fallback
|
||||
#include "free_path_stats_box.h"
|
||||
#include "tiny_front_v3_env_box.h"
|
||||
#include "../hakmem.h" // For HAK_BASE_FROM_RAW
|
||||
#include "../front/tiny_unified_cache.h"
|
||||
#include "tiny_front_hot_box.h"
|
||||
#include "../superslab/superslab_inline.h" // For ss_fast_lookup
|
||||
#include <string.h>
|
||||
|
||||
// TLS context
|
||||
static __thread TinyC4UltraFreeTLS g_c4_ultra_free_tls = {0};
|
||||
|
||||
TinyC4UltraFreeTLS* tiny_c4_ultra_free_tls(void) {
|
||||
return &g_c4_ultra_free_tls;
|
||||
}
|
||||
|
||||
// Fast path: TLS cache push
|
||||
void tiny_c4_ultra_free_fast(void* base, uint32_t class_idx) {
|
||||
TinyC4UltraFreeTLS* ctx = &g_c4_ultra_free_tls;
|
||||
|
||||
// Phase 6: Learn segment on first C4 free (same as C5/C6)
|
||||
if (unlikely(ctx->seg_base == 0)) {
|
||||
SuperSlab* ss = ss_fast_lookup(base);
|
||||
if (ss != NULL) {
|
||||
ctx->seg_base = (uintptr_t)ss;
|
||||
ctx->seg_end = ctx->seg_base + (1u << ss->lg_size);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if ptr is in our segment AND cache has room
|
||||
if (likely(ctx->seg_base != 0 &&
|
||||
(uintptr_t)base >= ctx->seg_base &&
|
||||
(uintptr_t)base < ctx->seg_end &&
|
||||
ctx->count < TINY_C4_ULTRA_FREE_CAP)) {
|
||||
// Push to TLS cache
|
||||
ctx->freelist[ctx->count++] = base;
|
||||
FREE_PATH_STAT_INC(c4_ultra_free_fast);
|
||||
return;
|
||||
}
|
||||
|
||||
// Slow path: fallback to legacy (cache full or ptr not in segment)
|
||||
tiny_c4_ultra_free_slow(base, class_idx);
|
||||
}
|
||||
|
||||
// Slow path for a C4 free that the TLS cache did not absorb.
// Forwards `base` and `class_idx` unchanged to the unified legacy
// fallback routine; no TLS state is touched here.
void tiny_c4_ultra_free_slow(void* base, uint32_t class_idx)
{
    tiny_legacy_fallback_free_base(base, class_idx);
}
|
||||
30
core/box/tiny_c4_ultra_free_box.h
Normal file
30
core/box/tiny_c4_ultra_free_box.h
Normal file
@ -0,0 +1,30 @@
|
||||
#ifndef HAKMEM_TINY_C4_ULTRA_FREE_BOX_H
|
||||
#define HAKMEM_TINY_C4_ULTRA_FREE_BOX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include "tiny_c4_ultra_free_env_box.h"
|
||||
|
||||
// Branch-prediction hints built on __builtin_expect. `!!(x)` normalises any
// non-zero value to 1 so the result can be compared against the expected value.
// Each macro carries its own guard: a translation unit that already defines
// only one of the two neither loses the other nor triggers a redefinition.
#ifndef likely
#define likely(x)   __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
|
||||
|
||||
// TLS cache capacity (Phase 6: smaller for C4, tuned to L1 fit)
#define TINY_C4_ULTRA_FREE_CAP 64

// TLS context for C4 ULTRA-free.
// Holds a small array-backed stack of freed blocks plus the address range of
// the segment those blocks are accepted from.
typedef struct TinyC4UltraFreeTLS {
    void*     freelist[TINY_C4_ULTRA_FREE_CAP];  // cached BASE pointers; entries [0, count) are live
    uint8_t   count;                             // number of live entries in freelist
    uint8_t   _pad[7];                           // explicit padding bytes after count
    uintptr_t seg_base;                          // C4 segment range start (0 = not initialized)
    uintptr_t seg_end;                           // C4 segment range end (exclusive)
} TinyC4UltraFreeTLS;

// `count` is an 8-bit index; raising the capacity beyond what it can represent
// would make it wrap, so reject such a configuration at compile time.
_Static_assert(TINY_C4_ULTRA_FREE_CAP <= UINT8_MAX,
               "TINY_C4_ULTRA_FREE_CAP must fit in TinyC4UltraFreeTLS.count (uint8_t)");
|
||||
|
||||
// API
|
||||
// Returns a pointer to the calling thread's C4 ULTRA-free TLS context.
TinyC4UltraFreeTLS* tiny_c4_ultra_free_tls(void);
|
||||
// Fast path: caches `base` in the TLS freelist when it lies in the learned
// segment and the cache has room; otherwise calls tiny_c4_ultra_free_slow().
void tiny_c4_ultra_free_fast(void* base, uint32_t class_idx);
|
||||
// Slow path: forwards `base` and `class_idx` to the legacy fallback free.
void tiny_c4_ultra_free_slow(void* base, uint32_t class_idx);
|
||||
|
||||
#endif // HAKMEM_TINY_C4_ULTRA_FREE_BOX_H
|
||||
17
core/box/tiny_c4_ultra_free_env_box.h
Normal file
17
core/box/tiny_c4_ultra_free_env_box.h
Normal file
@ -0,0 +1,17 @@
|
||||
#ifndef HAKMEM_TINY_C4_ULTRA_FREE_ENV_BOX_H
|
||||
#define HAKMEM_TINY_C4_ULTRA_FREE_ENV_BOX_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
// ENV gate: HAKMEM_TINY_C4_ULTRA_FREE_ENABLED (default 0).
// The environment is consulted on the first call only; the decision is then
// kept in a function-local static and returned on every later call.
// The feature counts as enabled when the variable is set, non-empty, and its
// first character is not '0'.
static inline bool tiny_c4_ultra_free_enabled(void) {
    static int cached_flag = -1;  // -1 = environment not read yet

    if (__builtin_expect(cached_flag != -1, 1)) {
        return cached_flag;
    }

    const char* raw = getenv("HAKMEM_TINY_C4_ULTRA_FREE_ENABLED");
    if (raw == NULL || raw[0] == '\0' || raw[0] == '0') {
        cached_flag = 0;
    } else {
        cached_flag = 1;
    }
    return cached_flag;
}
|
||||
|
||||
#endif // HAKMEM_TINY_C4_ULTRA_FREE_ENV_BOX_H
|
||||
@ -1,15 +1,17 @@
|
||||
#ifndef HAKMEM_TINY_ULTRA_CLASSES_BOX_H
|
||||
#define HAKMEM_TINY_ULTRA_CLASSES_BOX_H
|
||||
|
||||
// Purpose: Named constants for ULTRA tier classes (C4, C5, C6, C7)
#define TINY_CLASS_C4 4
#define TINY_CLASS_C5 5
#define TINY_CLASS_C6 6
#define TINY_CLASS_C7 7

// Helper macros for class checking.
// Each evaluates to non-zero when `idx` equals the named class index.
#define tiny_class_is_c4(idx) ((idx) == TINY_CLASS_C4)
#define tiny_class_is_c5(idx) ((idx) == TINY_CLASS_C5)
#define tiny_class_is_c6(idx) ((idx) == TINY_CLASS_C6)
#define tiny_class_is_c7(idx) ((idx) == TINY_CLASS_C7)

// True for any ULTRA tier class (C4..C7). Defined exactly once: the older
// C5..C7-only definition conflicted with this one and has been dropped.
// NOTE: `idx` is expanded several times; pass a side-effect-free expression.
#define tiny_class_is_ultra(idx) (tiny_class_is_c4(idx) || tiny_class_is_c5(idx) || tiny_class_is_c6(idx) || tiny_class_is_c7(idx))
|
||||
|
||||
#endif // HAKMEM_TINY_ULTRA_CLASSES_BOX_H
|
||||
|
||||
Reference in New Issue
Block a user