2025-11-07 01:27:04 +09:00
|
|
|
// hakmem_tiny_sfc.c - Box 5-NEW: Super Front Cache (SFC) Implementation
|
|
|
|
|
// Purpose: Slow path (refill/spill/config/stats), not inline
|
|
|
|
|
// Fast path is in tiny_alloc_fast_sfc.inc.h (inline)
|
|
|
|
|
|
|
|
|
|
#include "tiny_alloc_fast_sfc.inc.h"
|
|
|
|
|
#include "hakmem_tiny.h"
|
|
|
|
|
#include "hakmem_tiny_config.h"
|
|
|
|
|
#include "hakmem_tiny_superslab.h"
|
|
|
|
|
#include "tiny_tls.h"
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
#include "box/tls_sll_box.h" // static inline tls_sll_pop/push API (Box TLS-SLL)
|
2025-11-07 01:27:04 +09:00
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: TLS Variables (defined here, extern in header)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
// Per-thread head pointer of the SFC singly linked block list, one slot per
// tiny size class.
__thread void* g_sfc_head[TINY_NUM_CLASSES] = { NULL };

// Per-thread number of blocks currently linked on each g_sfc_head[] list.
__thread uint32_t g_sfc_count[TINY_NUM_CLASSES] = { 0 };

// Per-class capacity limit. Not thread-local: one copy for the whole process,
// written by sfc_init() and sfc_set_config(); 0 means SFC is unused for that class.
uint32_t g_sfc_capacity[TINY_NUM_CLASSES] = { 0 };
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Statistics (compile-time gated)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
// Per-class counters; the symbol only exists when HAKMEM_DEBUG_COUNTERS is
// non-zero, so every user of it must sit under the same guard.
sfc_stats_t g_sfc_stats[TINY_NUM_CLASSES] = { 0 };
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Global Config (from ENV)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
2025-11-11 21:49:05 +09:00
|
|
|
int g_sfc_enabled = 1; // Default: ON (bench-focused; A/B via HAKMEM_SFC_ENABLE)
|
2025-11-07 01:27:04 +09:00
|
|
|
|
|
|
|
|
static int g_sfc_default_capacity = SFC_DEFAULT_CAPACITY;
|
|
|
|
|
static int g_sfc_default_refill = SFC_DEFAULT_REFILL_COUNT;
|
|
|
|
|
static int g_sfc_default_spill_thresh = SFC_DEFAULT_SPILL_THRESH;
|
|
|
|
|
|
|
|
|
|
// Per-class overrides (0 = use default)
|
|
|
|
|
static int g_sfc_capacity_override[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
static int g_sfc_refill_override[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Initialization
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
void sfc_init(void) {
|
|
|
|
|
// Parse ENV: HAKMEM_SFC_ENABLE
|
|
|
|
|
const char* env_enable = getenv("HAKMEM_SFC_ENABLE");
|
|
|
|
|
if (env_enable && *env_enable && *env_enable != '0') {
|
|
|
|
|
g_sfc_enabled = 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!g_sfc_enabled) {
|
|
|
|
|
// SFC disabled, skip initialization
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Parse ENV: HAKMEM_SFC_CAPACITY (default capacity for all classes)
|
|
|
|
|
const char* env_cap = getenv("HAKMEM_SFC_CAPACITY");
|
|
|
|
|
if (env_cap && *env_cap) {
|
|
|
|
|
int cap = atoi(env_cap);
|
|
|
|
|
if (cap >= SFC_MIN_CAPACITY && cap <= SFC_MAX_CAPACITY) {
|
|
|
|
|
g_sfc_default_capacity = cap;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Parse ENV: HAKMEM_SFC_REFILL_COUNT (default refill for all classes)
|
|
|
|
|
const char* env_refill = getenv("HAKMEM_SFC_REFILL_COUNT");
|
|
|
|
|
if (env_refill && *env_refill) {
|
|
|
|
|
int refill = atoi(env_refill);
|
|
|
|
|
if (refill >= 8 && refill <= 256) {
|
|
|
|
|
g_sfc_default_refill = refill;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Parse ENV: HAKMEM_SFC_CAPACITY_CLASS{0..7} (per-class capacity override)
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
char var[64];
|
|
|
|
|
snprintf(var, sizeof(var), "HAKMEM_SFC_CAPACITY_CLASS%d", cls);
|
|
|
|
|
const char* env_cls_cap = getenv(var);
|
|
|
|
|
if (env_cls_cap && *env_cls_cap) {
|
|
|
|
|
int cap = atoi(env_cls_cap);
|
|
|
|
|
if (cap >= SFC_MIN_CAPACITY && cap <= SFC_MAX_CAPACITY) {
|
|
|
|
|
g_sfc_capacity_override[cls] = cap;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Parse ENV: HAKMEM_SFC_REFILL_COUNT_CLASS{0..7} (per-class refill override)
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
char var[64];
|
|
|
|
|
snprintf(var, sizeof(var), "HAKMEM_SFC_REFILL_COUNT_CLASS%d", cls);
|
|
|
|
|
const char* env_cls_refill = getenv(var);
|
|
|
|
|
if (env_cls_refill && *env_cls_refill) {
|
|
|
|
|
int refill = atoi(env_cls_refill);
|
|
|
|
|
if (refill >= 8 && refill <= 256) {
|
|
|
|
|
g_sfc_refill_override[cls] = refill;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Initialize per-class capacities (use override or default)
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
if (g_sfc_capacity_override[cls] > 0) {
|
|
|
|
|
g_sfc_capacity[cls] = g_sfc_capacity_override[cls];
|
|
|
|
|
} else {
|
|
|
|
|
g_sfc_capacity[cls] = g_sfc_default_capacity;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
// If class5 hotpath is enabled, disable SFC for class 5 by default
|
|
|
|
|
// unless explicitly overridden via HAKMEM_SFC_CAPACITY_CLASS5
|
|
|
|
|
extern int g_tiny_hotpath_class5;
|
|
|
|
|
if (g_tiny_hotpath_class5 && g_sfc_capacity_override[5] == 0) {
|
|
|
|
|
g_sfc_capacity[5] = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-11 21:49:05 +09:00
|
|
|
// Register shutdown hook for optional stats dump
|
|
|
|
|
atexit(sfc_shutdown);
|
|
|
|
|
|
2025-11-07 01:27:04 +09:00
|
|
|
// One-shot debug log
|
|
|
|
|
static int debug_printed = 0;
|
|
|
|
|
if (!debug_printed) {
|
|
|
|
|
debug_printed = 1;
|
|
|
|
|
const char* env_debug = getenv("HAKMEM_SFC_DEBUG");
|
|
|
|
|
if (env_debug && *env_debug && *env_debug != '0') {
|
|
|
|
|
fprintf(stderr, "[SFC] Initialized: enabled=%d, default_cap=%d, default_refill=%d\n",
|
|
|
|
|
g_sfc_enabled, g_sfc_default_capacity, g_sfc_default_refill);
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
if (g_sfc_capacity_override[cls] > 0 || g_sfc_refill_override[cls] > 0) {
|
|
|
|
|
fprintf(stderr, "[SFC] Class %d: cap=%u, refill_override=%d\n",
|
|
|
|
|
cls, g_sfc_capacity[cls], g_sfc_refill_override[cls]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Ensure stats (if requested) are printed at process exit.
|
|
|
|
|
// This is inexpensive and guarded inside sfc_shutdown by HAKMEM_SFC_STATS_DUMP.
|
|
|
|
|
atexit(sfc_shutdown);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void sfc_shutdown(void) {
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
// Optional: Print stats at exit (full stats when counters enabled)
|
2025-11-07 01:27:04 +09:00
|
|
|
const char* env_dump = getenv("HAKMEM_SFC_STATS_DUMP");
|
|
|
|
|
if (env_dump && *env_dump && *env_dump != '0') {
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
2025-11-07 01:27:04 +09:00
|
|
|
sfc_print_stats();
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
#else
|
|
|
|
|
// Minimal summary in release builds (no counters): capacity and current counts
|
|
|
|
|
fprintf(stderr, "\n=== SFC Minimal Summary (release) ===\n");
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
if (g_sfc_capacity[cls] == 0) continue;
|
|
|
|
|
fprintf(stderr, "Class %d: cap=%u, count=%u\n",
|
|
|
|
|
cls, g_sfc_capacity[cls], g_sfc_count[cls]);
|
|
|
|
|
}
|
|
|
|
|
fprintf(stderr, "===========================\n\n");
|
|
|
|
|
#endif
|
2025-11-07 01:27:04 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// No cleanup needed (TLS memory freed by OS)
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-11 21:49:05 +09:00
|
|
|
// Cascade a first batch from TLS SLL into SFC after TLS prewarm.
|
|
|
|
|
// Hot classes only (0..3 and 5) to focus on 256B/小サイズ。
|
|
|
|
|
void sfc_cascade_from_tls_initial(void) {
|
|
|
|
|
if (!g_sfc_enabled) return;
|
|
|
|
|
// TLS SLL externs
|
|
|
|
|
extern __thread void* g_tls_sll_head[];
|
|
|
|
|
extern __thread uint32_t g_tls_sll_count[];
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
if (!(cls <= 3 || cls == 5)) continue; // focus: 8..64B and 256B
|
|
|
|
|
uint32_t cap = g_sfc_capacity[cls];
|
|
|
|
|
if (cap == 0) continue;
|
|
|
|
|
// target: max half of SFC cap or available SLL count
|
|
|
|
|
uint32_t avail = g_tls_sll_count[cls];
|
|
|
|
|
if (avail == 0) continue;
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
// Target: 75% of cap by default, bounded by available
|
|
|
|
|
uint32_t target = (cap * 75u) / 100u;
|
2025-11-11 21:49:05 +09:00
|
|
|
if (target == 0) target = (avail < 16 ? avail : 16);
|
|
|
|
|
if (target > avail) target = avail;
|
|
|
|
|
// transfer
|
|
|
|
|
while (target-- > 0 && g_tls_sll_count[cls] > 0 && g_sfc_count[cls] < g_sfc_capacity[cls]) {
|
|
|
|
|
void* ptr = NULL;
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
// pop one from SLL via Box TLS-SLL API (static inline)
|
2025-11-11 21:49:05 +09:00
|
|
|
if (!tls_sll_pop(cls, &ptr)) break;
|
|
|
|
|
// push into SFC
|
|
|
|
|
tiny_next_store(ptr, cls, g_sfc_head[cls]);
|
|
|
|
|
g_sfc_head[cls] = ptr;
|
|
|
|
|
g_sfc_count[cls]++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-07 01:27:04 +09:00
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Refill (Slow Path) - STUB (real logic in hakmem.c)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
// Stub - real implementation is inline in hakmem.c malloc() to avoid LTO issues
|
|
|
|
|
// This is just a placeholder for future modular refactoring
|
|
|
|
|
int sfc_refill(int cls, int target_count) {
|
|
|
|
|
if (cls < 0 || cls >= TINY_NUM_CLASSES) return 0;
|
|
|
|
|
if (!g_sfc_enabled) return 0;
|
|
|
|
|
(void)target_count;
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
|
|
|
|
g_sfc_stats[cls].refill_calls++;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
return 0; // Actual refill happens inline in hakmem.c
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Spill (Slow Path) - STUB (real logic in hakmem.c)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
// Stub - real implementation is inline in hakmem.c free() to avoid LTO issues
|
|
|
|
|
// This is just a placeholder for future modular refactoring
|
|
|
|
|
int sfc_spill(int cls, int spill_count) {
|
|
|
|
|
if (cls < 0 || cls >= TINY_NUM_CLASSES) return 0;
|
|
|
|
|
if (!g_sfc_enabled) return 0;
|
|
|
|
|
(void)spill_count;
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
|
|
|
|
g_sfc_stats[cls].spill_calls++;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
return 0; // Actual spill happens inline in hakmem.c
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Configuration API
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
sfc_config_t sfc_get_config(int cls) {
|
|
|
|
|
sfc_config_t cfg = {0};
|
|
|
|
|
|
|
|
|
|
if (cls >= 0 && cls < TINY_NUM_CLASSES) {
|
|
|
|
|
cfg.capacity = g_sfc_capacity[cls];
|
|
|
|
|
|
|
|
|
|
// Refill count (use override or default)
|
|
|
|
|
cfg.refill_count = (g_sfc_refill_override[cls] > 0)
|
|
|
|
|
? g_sfc_refill_override[cls]
|
|
|
|
|
: g_sfc_default_refill;
|
|
|
|
|
|
|
|
|
|
cfg.spill_thresh = g_sfc_default_spill_thresh;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return cfg;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Apply a configuration to size class `cls`.
//
// Each field is validated on its own and silently skipped when out of range:
//   capacity     - must lie in SFC_MIN_CAPACITY..SFC_MAX_CAPACITY
//   refill_count - must lie in 8..256; stored as the per-class override
//   spill_thresh - not stored anywhere yet (reserved for future use)
// An out-of-range `cls` makes the whole call a no-op.
void sfc_set_config(int cls, sfc_config_t cfg) {
    if (cls < 0 || cls >= TINY_NUM_CLASSES) return;

    // Validate capacity
    const int capacity_ok =
        !(cfg.capacity < SFC_MIN_CAPACITY || cfg.capacity > SFC_MAX_CAPACITY);
    if (capacity_ok) {
        g_sfc_capacity[cls] = cfg.capacity;
    }

    // Validate refill count
    const int refill_ok = !(cfg.refill_count < 8 || cfg.refill_count > 256);
    if (refill_ok) {
        g_sfc_refill_override[cls] = cfg.refill_count;
    }

    // Spill threshold (future use): currently unused, so nothing is stored.
    (void)cfg.spill_thresh;
}
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Statistics API
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
|
|
|
|
|
|
|
|
|
sfc_stats_t sfc_get_stats(int cls) {
|
|
|
|
|
sfc_stats_t stats = {0};
|
|
|
|
|
|
|
|
|
|
if (cls >= 0 && cls < TINY_NUM_CLASSES) {
|
|
|
|
|
stats = g_sfc_stats[cls];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return stats;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void sfc_reset_stats(int cls) {
|
|
|
|
|
if (cls >= 0 && cls < TINY_NUM_CLASSES) {
|
|
|
|
|
memset(&g_sfc_stats[cls], 0, sizeof(sfc_stats_t));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void sfc_print_stats(void) {
|
|
|
|
|
fprintf(stderr, "\n=== SFC Statistics (Box 5-NEW) ===\n");
|
|
|
|
|
|
|
|
|
|
uint64_t total_alloc_hits = 0;
|
|
|
|
|
uint64_t total_alloc_misses = 0;
|
|
|
|
|
uint64_t total_refill_calls = 0;
|
|
|
|
|
uint64_t total_refill_blocks = 0;
|
|
|
|
|
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
sfc_stats_t* s = &g_sfc_stats[cls];
|
|
|
|
|
|
|
|
|
|
uint64_t total_allocs = s->alloc_hits + s->alloc_misses;
|
|
|
|
|
if (total_allocs == 0) continue; // Skip unused classes
|
|
|
|
|
|
|
|
|
|
total_alloc_hits += s->alloc_hits;
|
|
|
|
|
total_alloc_misses += s->alloc_misses;
|
|
|
|
|
total_refill_calls += s->refill_calls;
|
|
|
|
|
total_refill_blocks += s->refill_blocks;
|
|
|
|
|
|
|
|
|
|
double hit_rate = (double)s->alloc_hits / total_allocs * 100.0;
|
|
|
|
|
double refill_freq = (double)s->refill_calls / total_allocs * 100.0;
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "Class %d (%3zu B): allocs=%llu, hit_rate=%.2f%%, "
|
|
|
|
|
"refills=%llu (%.4f%%), spills=%llu, cap=%u\n",
|
|
|
|
|
cls, g_tiny_class_sizes[cls],
|
|
|
|
|
(unsigned long long)total_allocs, hit_rate,
|
|
|
|
|
(unsigned long long)s->refill_calls, refill_freq,
|
|
|
|
|
(unsigned long long)s->spill_calls,
|
|
|
|
|
g_sfc_capacity[cls]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Summary
|
|
|
|
|
uint64_t grand_total = total_alloc_hits + total_alloc_misses;
|
|
|
|
|
if (grand_total > 0) {
|
|
|
|
|
double overall_hit_rate = (double)total_alloc_hits / grand_total * 100.0;
|
|
|
|
|
double overall_refill_freq = (double)total_refill_calls / grand_total * 100.0;
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "\n=== SFC Summary ===\n");
|
|
|
|
|
fprintf(stderr, "Total allocs: %llu\n", (unsigned long long)grand_total);
|
|
|
|
|
fprintf(stderr, "Overall hit rate: %.2f%% (target: >95%%)\n", overall_hit_rate);
|
|
|
|
|
fprintf(stderr, "Refill frequency: %.4f%% (target: <0.03%%)\n", overall_refill_freq);
|
|
|
|
|
fprintf(stderr, "Refill calls: %llu (target: <50K for 4M ops/s workload)\n",
|
|
|
|
|
(unsigned long long)total_refill_calls);
|
|
|
|
|
fprintf(stderr, "Refill blocks: %llu (avg %.1f blocks/refill)\n",
|
|
|
|
|
(unsigned long long)total_refill_blocks,
|
|
|
|
|
total_refill_calls > 0 ? (double)total_refill_blocks / total_refill_calls : 0.0);
|
|
|
|
|
|
|
|
|
|
// Check targets
|
|
|
|
|
if (overall_hit_rate >= 95.0) {
|
|
|
|
|
fprintf(stderr, "✅ Hit rate target achieved!\n");
|
|
|
|
|
} else {
|
|
|
|
|
fprintf(stderr, "⚠️ Hit rate below target (increase capacity?)\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (overall_refill_freq < 0.03) {
|
|
|
|
|
fprintf(stderr, "✅ Refill frequency target achieved (-98.5%% reduction)!\n");
|
|
|
|
|
} else {
|
|
|
|
|
fprintf(stderr, "⚠️ Refill frequency above target (increase refill_count?)\n");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "===========================\n\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#endif // HAKMEM_DEBUG_COUNTERS
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// End of hakmem_tiny_sfc.c
|
|
|
|
|
// ============================================================================
|