// hakmem_tiny_lazy_init.inc.h - Phase 22: Lazy Per-Class Initialization // Goal: Reduce cold-start page faults by initializing only used classes // // ChatGPT Analysis (2025-11-16): // - hak_tiny_init() page faults: 94.94% of all page faults // - Cause: Eager init of all 8 classes even if only C2/C3 used // - Solution: Lazy init per class on first use // // Expected Impact: // - Page faults: -90% (only touch C2/C3 for 256B workload) // - Cold start: +30-40% performance (16.2M → 22-25M ops/s) #ifndef HAKMEM_TINY_LAZY_INIT_INC_H #define HAKMEM_TINY_LAZY_INIT_INC_H #include #include #include // For fprintf #include "superslab/superslab_types.h" // For SuperSlabACEState #include "box/ss_addr_map_box.h" // Phase 9-1: SuperSlab address map // ============================================================================ // Phase 22-1: Per-Class Initialization State // ============================================================================ // Track which classes are initialized (per-thread) __thread uint8_t g_class_initialized[TINY_NUM_CLASSES] = {0}; // Global one-time init flag (for shared resources) static int g_tiny_global_initialized = 0; static pthread_mutex_t g_lazy_init_lock = PTHREAD_MUTEX_INITIALIZER; // ============================================================================ // Phase 22-2: Lazy Init Implementation // ============================================================================ // Initialize one class lazily (called on first use) static inline void lazy_init_class(int class_idx) { // Fast path: already initialized if (__builtin_expect(g_class_initialized[class_idx], 1)) { return; } // Slow path: need to initialize this class pthread_mutex_lock(&g_lazy_init_lock); // Double-check after acquiring lock if (g_class_initialized[class_idx]) { pthread_mutex_unlock(&g_lazy_init_lock); return; } // Extract from hak_tiny_init.inc lines 84-103: TLS List Init { TinyTLSList* tls = &g_tls_lists[class_idx]; tls->head = NULL; tls->count = 0; uint32_t base_cap = (uint32_t)tiny_default_cap(class_idx); uint32_t class_max = (uint32_t)tiny_cap_max_for_class(class_idx); if (base_cap > class_max) base_cap = class_max; // Apply global cap limit if set extern int g_mag_cap_limit; extern int g_mag_cap_override[TINY_NUM_CLASSES]; if ((uint32_t)g_mag_cap_limit < base_cap) base_cap = (uint32_t)g_mag_cap_limit; if (g_mag_cap_override[class_idx] > 0) { uint32_t ov = (uint32_t)g_mag_cap_override[class_idx]; if (ov > class_max) ov = class_max; if (ov > (uint32_t)g_mag_cap_limit) ov = (uint32_t)g_mag_cap_limit; if (ov != 0u) base_cap = ov; } if (base_cap == 0u) base_cap = 32u; tls->cap = base_cap; tls->refill_low = tiny_tls_default_refill(base_cap); tls->spill_high = tiny_tls_default_spill(base_cap); tiny_tls_publish_targets(class_idx, base_cap); } // CRITICAL FIX: Clear TLS SLL (Phase 3d-B unified structure) to purge stale blocks // This prevents C7 1024B→2048B stride upgrade issues where old misaligned blocks // remain in TLS SLL from previous runs or initialization paths. // Note: g_tls_sll is defined in hakmem_tiny_tls_state_box.inc, already visible here g_tls_sll[class_idx].head = NULL; g_tls_sll[class_idx].count = 0; #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[LAZY_INIT] Cleared TLS SLL for class %d (purge stale blocks)\n", class_idx); #endif // Extract from hak_tiny_init.inc lines 623-625: Per-class lock pthread_mutex_init(&g_tiny_class_locks[class_idx].m, NULL); // Extract from hak_tiny_init.inc lines 628-637: ACE state { extern SuperSlabACEState g_ss_ace[TINY_NUM_CLASSES]; g_ss_ace[class_idx].current_lg = 20; // Start with 1MB SuperSlabs g_ss_ace[class_idx].target_lg = 20; g_ss_ace[class_idx].hot_score = 0; g_ss_ace[class_idx].alloc_count = 0; g_ss_ace[class_idx].refill_count = 0; g_ss_ace[class_idx].spill_count = 0; g_ss_ace[class_idx].live_blocks = 0; g_ss_ace[class_idx].last_tick_ns = 0; } // Mark as initialized g_class_initialized[class_idx] = 1; pthread_mutex_unlock(&g_lazy_init_lock); #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[LAZY_INIT] Class %d initialized\n", class_idx); #endif } // Global initialization (called once, for non-class resources) static inline void lazy_init_global(void) { if (__builtin_expect(g_tiny_global_initialized, 1)) { return; } pthread_mutex_lock(&g_lazy_init_lock); if (g_tiny_global_initialized) { pthread_mutex_unlock(&g_lazy_init_lock); return; } // Initialize SuperSlab subsystem (only once) extern int g_use_superslab; if (g_use_superslab) { extern void hak_super_registry_init(void); extern void hak_ss_lru_init(void); extern void hak_ss_prewarm_init(void); hak_super_registry_init(); hak_ss_lru_init(); hak_ss_prewarm_init(); // Phase 9-1: Initialize SuperSlab address map (hash table O(1) lookup) ss_map_init(&g_ss_addr_map); #if !HAKMEM_BUILD_RELEASE if (getenv("HAKMEM_SS_MAP_TRACE")) { fprintf(stderr, "[SS_MAP] Initialized hash table with %d buckets\n", SS_MAP_HASH_SIZE); } #endif } // Mark global resources as initialized g_tiny_global_initialized = 1; pthread_mutex_unlock(&g_lazy_init_lock); #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[LAZY_INIT] Global resources initialized\n"); #endif } #endif // HAKMEM_TINY_LAZY_INIT_INC_H