Files
hakmem/core/hakmem_tiny_hot_pop_v4.inc.h
Moe Charm (CI) 52386401b3 Debug Counters Implementation - Clean History
Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 12:31:14 +09:00

184 lines
5.8 KiB
C

// hakmem_tiny_hot_pop_v4.inc.h
// Phase 4-A1: TLS-BUMP Immediate-Value Hot Functions
//
// This file contains Phase 4-A1 optimized hot-path functions with:
// - Immediate-value block sizes (no g_tiny_class_sizes[] lookup)
// - Direct TLS bump allocation (2-register path)
// - Branch minimization
//
// Expected improvement: +5-8% (16.53 → 17.5-18.0 M ops/sec)
#ifndef HAKMEM_TINY_HOT_POP_V4_INC_H
#define HAKMEM_TINY_HOT_POP_V4_INC_H
#include "hakmem_tiny.h"
#include <stdint.h>
// External state used by the hot-pop functions below (defined elsewhere).
// Note: only the variables marked __thread are thread-local; g_fast_enable
// and g_fast_cap are ordinary globals.
// Gate for the linked-list tier: when zero, the list pop returns NULL.
extern int g_fast_enable;
// Per-class value checked before a list pop; zero makes the list pop return NULL.
extern uint16_t g_fast_cap[TINY_NUM_CLASSES];
// Per-thread, per-class head of the singly linked free list. The first word
// of each listed block holds the pointer to the next block.
extern __thread void* g_fast_head[TINY_NUM_CLASSES];
// Per-thread, per-class counter decremented (clamped at zero) on each list pop.
extern __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
// Per-thread, per-class bump window: next unallocated byte (NULL = no window).
extern __thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES];
// Per-thread, per-class bump window: exclusive end of the window.
extern __thread uint8_t* g_tls_bend[TINY_NUM_CLASSES];
// ============================================================================
// Phase 4-A1: Immediate-Value TLS-BUMP Functions
// ============================================================================
// Class 0: 8-byte blocks (block size is a compile-time immediate).
//
// Carves one block out of the calling thread's bump window for class 0.
// g_tls_bcur[0] points at the next unallocated byte and g_tls_bend[0] is the
// exclusive end of the window.
//
// Returns the block on success. Returns NULL when no window is installed
// (g_tls_bcur[0] == NULL) or when fewer than 8 bytes remain; in the latter
// case both window pointers are reset to NULL so the caller falls back to
// the next tier.
static inline __attribute__((always_inline))
void* tiny_hot_bump_class0_v4(void) {
    uint8_t* cur = g_tls_bcur[0];
    if (__builtin_expect(cur != NULL, 1)) {
        uint8_t* end = g_tls_bend[0];
        // Test the bytes remaining rather than forming `cur + 8` first:
        // creating a pointer past the end of the backing memory is undefined
        // behaviour (C11 6.5.6), even if the pointer is only compared.
        if (__builtin_expect(end - cur >= 8, 1)) {
            g_tls_bcur[0] = cur + 8;
            return cur;
        }
        // Window exhausted - clear it and fall back.
        g_tls_bcur[0] = NULL;
        g_tls_bend[0] = NULL;
    }
    return NULL;  // Fall back to the next tier.
}
// Class 1: 16-byte blocks (block size is a compile-time immediate).
//
// Carves one block out of the calling thread's bump window for class 1.
// g_tls_bcur[1] points at the next unallocated byte and g_tls_bend[1] is the
// exclusive end of the window.
//
// Returns the block on success. Returns NULL when no window is installed
// (g_tls_bcur[1] == NULL) or when fewer than 16 bytes remain; in the latter
// case both window pointers are reset to NULL so the caller falls back to
// the next tier.
static inline __attribute__((always_inline))
void* tiny_hot_bump_class1_v4(void) {
    uint8_t* cur = g_tls_bcur[1];
    if (__builtin_expect(cur != NULL, 1)) {
        uint8_t* end = g_tls_bend[1];
        // Test the bytes remaining rather than forming `cur + 16` first:
        // creating a pointer past the end of the backing memory is undefined
        // behaviour (C11 6.5.6), even if the pointer is only compared.
        if (__builtin_expect(end - cur >= 16, 1)) {
            g_tls_bcur[1] = cur + 16;
            return cur;
        }
        // Window exhausted - clear it and fall back.
        g_tls_bcur[1] = NULL;
        g_tls_bend[1] = NULL;
    }
    return NULL;  // Fall back to the next tier.
}
// Class 2: 32-byte blocks (block size is a compile-time immediate).
//
// Carves one block out of the calling thread's bump window for class 2.
// g_tls_bcur[2] points at the next unallocated byte and g_tls_bend[2] is the
// exclusive end of the window.
//
// Returns the block on success. Returns NULL when no window is installed
// (g_tls_bcur[2] == NULL) or when fewer than 32 bytes remain; in the latter
// case both window pointers are reset to NULL so the caller falls back to
// the next tier.
static inline __attribute__((always_inline))
void* tiny_hot_bump_class2_v4(void) {
    uint8_t* cur = g_tls_bcur[2];
    if (__builtin_expect(cur != NULL, 1)) {
        uint8_t* end = g_tls_bend[2];
        // Test the bytes remaining rather than forming `cur + 32` first:
        // creating a pointer past the end of the backing memory is undefined
        // behaviour (C11 6.5.6), even if the pointer is only compared.
        if (__builtin_expect(end - cur >= 32, 1)) {
            g_tls_bcur[2] = cur + 32;
            return cur;
        }
        // Window exhausted - clear it and fall back.
        g_tls_bcur[2] = NULL;
        g_tls_bend[2] = NULL;
    }
    return NULL;  // Fall back to the next tier.
}
// ============================================================================
// Phase 4-A1: Hot-Class Wrapper Functions (BUMP → Linked-List)
// ============================================================================
// Phase 4-A1: Replace original hot-pop functions (keep same names for compatibility)
// Hot-path pop for class 0.
//
// Tier 1: bump allocation via tiny_hot_bump_class0_v4().
// Tier 2: pop the head of this thread's fast free list (g_fast_head[0]).
//
// Returns NULL when the bump tier yields nothing and the list tier is
// disabled (g_fast_enable == 0), has g_fast_cap[0] == 0, or is empty.
static inline __attribute__((always_inline))
void* tiny_hot_pop_class0(void) {
    // Tier 1: TLS bump window.
    void* fresh = tiny_hot_bump_class0_v4();
    if (__builtin_expect(fresh != NULL, 1)) {
        return fresh;
    }

    // Tier 2: linked-list fallback, gated by the global enable flag and cap.
    if (__builtin_expect(!g_fast_enable, 0) ||
        __builtin_expect(g_fast_cap[0] == 0, 0)) {
        return NULL;
    }

    void* node = g_fast_head[0];
    if (__builtin_expect(node == NULL, 0)) {
        return NULL;
    }

    // The first word of a listed block stores the next block.
    g_fast_head[0] = *(void**)node;

    // Decrement the cached length, never letting it go below zero.
    const uint16_t listed = g_fast_count[0];
    g_fast_count[0] = (listed > 0) ? (uint16_t)(listed - 1) : (uint16_t)0;
    return node;
}
// Hot-path pop for class 1.
//
// Tier 1: bump allocation via tiny_hot_bump_class1_v4().
// Tier 2: pop the head of this thread's fast free list (g_fast_head[1]).
//
// Returns NULL when the bump tier yields nothing and the list tier is
// disabled (g_fast_enable == 0), has g_fast_cap[1] == 0, or is empty.
static inline __attribute__((always_inline))
void* tiny_hot_pop_class1(void) {
    // Tier 1: TLS bump window.
    void* fresh = tiny_hot_bump_class1_v4();
    if (__builtin_expect(fresh != NULL, 1)) {
        return fresh;
    }

    // Tier 2: linked-list fallback, gated by the global enable flag and cap.
    if (__builtin_expect(!g_fast_enable, 0) ||
        __builtin_expect(g_fast_cap[1] == 0, 0)) {
        return NULL;
    }

    void* node = g_fast_head[1];
    if (__builtin_expect(node == NULL, 0)) {
        return NULL;
    }

    // The first word of a listed block stores the next block.
    g_fast_head[1] = *(void**)node;

    // Decrement the cached length, never letting it go below zero.
    const uint16_t listed = g_fast_count[1];
    g_fast_count[1] = (listed > 0) ? (uint16_t)(listed - 1) : (uint16_t)0;
    return node;
}
// Hot-path pop for class 2.
//
// Tier 1: bump allocation via tiny_hot_bump_class2_v4().
// Tier 2: pop the head of this thread's fast free list (g_fast_head[2]).
//
// Returns NULL when the bump tier yields nothing and the list tier is
// disabled (g_fast_enable == 0), has g_fast_cap[2] == 0, or is empty.
static inline __attribute__((always_inline))
void* tiny_hot_pop_class2(void) {
    // Tier 1: TLS bump window.
    void* fresh = tiny_hot_bump_class2_v4();
    if (__builtin_expect(fresh != NULL, 1)) {
        return fresh;
    }

    // Tier 2: linked-list fallback, gated by the global enable flag and cap.
    if (__builtin_expect(!g_fast_enable, 0) ||
        __builtin_expect(g_fast_cap[2] == 0, 0)) {
        return NULL;
    }

    void* node = g_fast_head[2];
    if (__builtin_expect(node == NULL, 0)) {
        return NULL;
    }

    // The first word of a listed block stores the next block.
    g_fast_head[2] = *(void**)node;

    // Decrement the cached length, never letting it go below zero.
    const uint16_t listed = g_fast_count[2];
    g_fast_count[2] = (listed > 0) ? (uint16_t)(listed - 1) : (uint16_t)0;
    return node;
}
// Class 3: 64B - no immediate-value bump variant yet; this keeps the original
// implementation (from hakmem_tiny_hot_pop.inc.h).

// Forward declaration of the generic bump helper (defined elsewhere).
static inline void* superslab_tls_bump_fast(int class_idx);

// Hot-path pop for class 3.
//
// Tier 1: superslab_tls_bump_fast(3), attempted only while
//         g_ultra_bump_shadow is non-zero (the expected case).
// Tier 2: pop the head of this thread's fast free list (g_fast_head[3]).
//
// Returns NULL when the bump tier yields nothing and the list tier is
// disabled (g_fast_enable == 0), has g_fast_cap[3] == 0, or is empty.
static inline __attribute__((always_inline))
void* tiny_hot_pop_class3(void) {
    extern int g_ultra_bump_shadow;

    // Tier 1: generic bump path, expected to be enabled.
    if (__builtin_expect(g_ultra_bump_shadow != 0, 1)) {
        void* fresh = superslab_tls_bump_fast(3);
        if (__builtin_expect(fresh != NULL, 1)) {
            return fresh;
        }
    }

    // Tier 2: linked-list fallback, gated by the global enable flag and cap.
    if (__builtin_expect(!g_fast_enable, 0) ||
        __builtin_expect(g_fast_cap[3] == 0, 0)) {
        return NULL;
    }

    void* node = g_fast_head[3];
    if (__builtin_expect(node == NULL, 0)) {
        return NULL;
    }

    // The first word of a listed block stores the next block.
    g_fast_head[3] = *(void**)node;

    // Decrement the cached length, never letting it go below zero.
    const uint16_t listed = g_fast_count[3];
    g_fast_count[3] = (listed > 0) ? (uint16_t)(listed - 1) : (uint16_t)0;
    return node;
}
#endif // HAKMEM_TINY_HOT_POP_V4_INC_H