2025-11-07 01:27:04 +09:00
|
|
|
// hakmem_tiny_sfc.c - Box 5-NEW: Super Front Cache (SFC) Implementation
|
|
|
|
|
// Purpose: Slow path (refill/spill/config/stats), not inline
|
|
|
|
|
// Fast path is in tiny_alloc_fast_sfc.inc.h (inline)
|
|
|
|
|
|
|
|
|
|
#include "tiny_alloc_fast_sfc.inc.h"
|
|
|
|
|
#include "hakmem_tiny.h"
|
|
|
|
|
#include "hakmem_tiny_config.h"
|
|
|
|
|
#include "hakmem_tiny_superslab.h"
|
Phase 4d: Add master stats control (HAKMEM_STATS)
Add unified stats/dump control that allows enabling specific stats
modules using comma-separated values or "all" to enable everything.
New file: core/hakmem_stats_master.h
- HAKMEM_STATS=all: Enable all stats modules
- HAKMEM_STATS=sfc,fast,pool: Enable specific modules
- HAKMEM_STATS_DUMP=1: Dump stats at exit
- hak_stats_check(): Check if module should enable stats
Available stats modules:
sfc, fast, heap, refill, counters, ring, invariant,
pagefault, front, pool, slim, guard, nearempty
Updated files:
- core/hakmem_tiny_sfc.c: Use hak_stats_check() for SFC stats
- core/hakmem_shared_pool.c: Use hak_stats_check() for pool stats
Performance: No regression (72.9M ops/s)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 16:11:15 +09:00
|
|
|
#include "hakmem_stats_master.h" // Phase 4d: Master stats control
|
2025-11-07 01:27:04 +09:00
|
|
|
#include "tiny_tls.h"
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
#include "box/tls_sll_box.h" // static inline tls_sll_pop/push API (Box TLS-SLL)
|
2025-12-02 20:32:22 +09:00
|
|
|
#include "hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
|
2025-11-07 01:27:04 +09:00
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: TLS Variables (defined here, extern in header)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
__thread void* g_sfc_head[TINY_NUM_CLASSES] = {NULL};
|
|
|
|
|
__thread uint32_t g_sfc_count[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
uint32_t g_sfc_capacity[TINY_NUM_CLASSES] = {0}; // Non-TLS: shared read-only config
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Statistics (compile-time gated)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
|
|
|
|
sfc_stats_t g_sfc_stats[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Global Config (from ENV)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
2025-11-11 21:49:05 +09:00
|
|
|
int g_sfc_enabled = 1; // Default: ON (bench-focused; A/B via HAKMEM_SFC_ENABLE)
|
2025-11-27 03:18:33 +09:00
|
|
|
int g_sfc_debug = 0; // Set once at init from HAKMEM_SFC_DEBUG
|
2025-11-07 01:27:04 +09:00
|
|
|
|
|
|
|
|
static int g_sfc_default_capacity = SFC_DEFAULT_CAPACITY;
|
|
|
|
|
static int g_sfc_default_refill = SFC_DEFAULT_REFILL_COUNT;
|
|
|
|
|
static int g_sfc_default_spill_thresh = SFC_DEFAULT_SPILL_THRESH;
|
|
|
|
|
|
|
|
|
|
// Per-class overrides (0 = use default)
|
|
|
|
|
static int g_sfc_capacity_override[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
static int g_sfc_refill_override[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Initialization
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
void sfc_init(void) {
|
2025-12-02 20:32:22 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate init syscall overhead)
|
|
|
|
|
g_sfc_debug = HAK_ENV_SFC_DEBUG();
|
|
|
|
|
g_sfc_enabled = HAK_ENV_SFC_ENABLE();
|
2025-11-07 01:27:04 +09:00
|
|
|
|
|
|
|
|
if (!g_sfc_enabled) {
|
|
|
|
|
// SFC disabled, skip initialization
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-02 20:32:22 +09:00
|
|
|
// Priority-2: Use cached ENV (eliminate config syscall overhead)
|
|
|
|
|
int cap = HAK_ENV_SFC_CAPACITY();
|
|
|
|
|
if (cap >= SFC_MIN_CAPACITY && cap <= SFC_MAX_CAPACITY) {
|
|
|
|
|
g_sfc_default_capacity = cap;
|
2025-11-07 01:27:04 +09:00
|
|
|
}
|
|
|
|
|
|
2025-12-02 20:32:22 +09:00
|
|
|
int refill = HAK_ENV_SFC_REFILL_COUNT();
|
|
|
|
|
if (refill >= 8 && refill <= 256) {
|
|
|
|
|
g_sfc_default_refill = refill;
|
2025-11-07 01:27:04 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Parse ENV: HAKMEM_SFC_CAPACITY_CLASS{0..7} (per-class capacity override)
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
char var[64];
|
|
|
|
|
snprintf(var, sizeof(var), "HAKMEM_SFC_CAPACITY_CLASS%d", cls);
|
|
|
|
|
const char* env_cls_cap = getenv(var);
|
|
|
|
|
if (env_cls_cap && *env_cls_cap) {
|
|
|
|
|
int cap = atoi(env_cls_cap);
|
|
|
|
|
if (cap >= SFC_MIN_CAPACITY && cap <= SFC_MAX_CAPACITY) {
|
|
|
|
|
g_sfc_capacity_override[cls] = cap;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Parse ENV: HAKMEM_SFC_REFILL_COUNT_CLASS{0..7} (per-class refill override)
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
char var[64];
|
|
|
|
|
snprintf(var, sizeof(var), "HAKMEM_SFC_REFILL_COUNT_CLASS%d", cls);
|
|
|
|
|
const char* env_cls_refill = getenv(var);
|
|
|
|
|
if (env_cls_refill && *env_cls_refill) {
|
|
|
|
|
int refill = atoi(env_cls_refill);
|
|
|
|
|
if (refill >= 8 && refill <= 256) {
|
|
|
|
|
g_sfc_refill_override[cls] = refill;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Initialize per-class capacities (use override or default)
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
if (g_sfc_capacity_override[cls] > 0) {
|
|
|
|
|
g_sfc_capacity[cls] = g_sfc_capacity_override[cls];
|
|
|
|
|
} else {
|
|
|
|
|
g_sfc_capacity[cls] = g_sfc_default_capacity;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
|
2025-11-11 21:49:05 +09:00
|
|
|
// Register shutdown hook for optional stats dump
|
|
|
|
|
atexit(sfc_shutdown);
|
|
|
|
|
|
2025-11-13 13:32:58 +09:00
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-07 01:27:04 +09:00
|
|
|
// One-shot debug log
|
|
|
|
|
static int debug_printed = 0;
|
|
|
|
|
if (!debug_printed) {
|
|
|
|
|
debug_printed = 1;
|
2025-11-27 03:18:33 +09:00
|
|
|
if (g_sfc_debug) {
|
2025-11-07 01:27:04 +09:00
|
|
|
fprintf(stderr, "[SFC] Initialized: enabled=%d, default_cap=%d, default_refill=%d\n",
|
|
|
|
|
g_sfc_enabled, g_sfc_default_capacity, g_sfc_default_refill);
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
if (g_sfc_capacity_override[cls] > 0 || g_sfc_refill_override[cls] > 0) {
|
|
|
|
|
fprintf(stderr, "[SFC] Class %d: cap=%u, refill_override=%d\n",
|
|
|
|
|
cls, g_sfc_capacity[cls], g_sfc_refill_override[cls]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2025-11-13 13:32:58 +09:00
|
|
|
#endif
|
2025-11-07 01:27:04 +09:00
|
|
|
|
|
|
|
|
// Ensure stats (if requested) are printed at process exit.
|
|
|
|
|
// This is inexpensive and guarded inside sfc_shutdown by HAKMEM_SFC_STATS_DUMP.
|
|
|
|
|
atexit(sfc_shutdown);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void sfc_shutdown(void) {
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
// Optional: Print stats at exit (full stats when counters enabled)
|
Phase 4d: Add master stats control (HAKMEM_STATS)
Add unified stats/dump control that allows enabling specific stats
modules using comma-separated values or "all" to enable everything.
New file: core/hakmem_stats_master.h
- HAKMEM_STATS=all: Enable all stats modules
- HAKMEM_STATS=sfc,fast,pool: Enable specific modules
- HAKMEM_STATS_DUMP=1: Dump stats at exit
- hak_stats_check(): Check if module should enable stats
Available stats modules:
sfc, fast, heap, refill, counters, ring, invariant,
pagefault, front, pool, slim, guard, nearempty
Updated files:
- core/hakmem_tiny_sfc.c: Use hak_stats_check() for SFC stats
- core/hakmem_shared_pool.c: Use hak_stats_check() for pool stats
Performance: No regression (72.9M ops/s)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 16:11:15 +09:00
|
|
|
// Phase 4d: Now uses hak_stats_check() for unified stats control
|
|
|
|
|
if (hak_stats_check("HAKMEM_SFC_STATS_DUMP", "sfc")) {
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
2025-11-07 01:27:04 +09:00
|
|
|
sfc_print_stats();
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
#else
|
|
|
|
|
// Minimal summary in release builds (no counters): capacity and current counts
|
|
|
|
|
fprintf(stderr, "\n=== SFC Minimal Summary (release) ===\n");
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
if (g_sfc_capacity[cls] == 0) continue;
|
|
|
|
|
fprintf(stderr, "Class %d: cap=%u, count=%u\n",
|
|
|
|
|
cls, g_sfc_capacity[cls], g_sfc_count[cls]);
|
|
|
|
|
}
|
|
|
|
|
fprintf(stderr, "===========================\n\n");
|
|
|
|
|
#endif
|
2025-11-07 01:27:04 +09:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// No cleanup needed (TLS memory freed by OS)
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-11 21:49:05 +09:00
|
|
|
// Cascade a first batch from TLS SLL into SFC after TLS prewarm.
|
|
|
|
|
// Hot classes only (0..3 and 5) to focus on 256B/小サイズ。
|
|
|
|
|
void sfc_cascade_from_tls_initial(void) {
|
|
|
|
|
if (!g_sfc_enabled) return;
|
2025-11-20 07:32:30 +09:00
|
|
|
// TLS SLL extern
|
|
|
|
|
extern __thread TinyTLSSLL g_tls_sll[];
|
2025-11-11 21:49:05 +09:00
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
if (!(cls <= 3 || cls == 5)) continue; // focus: 8..64B and 256B
|
|
|
|
|
uint32_t cap = g_sfc_capacity[cls];
|
|
|
|
|
if (cap == 0) continue;
|
|
|
|
|
// target: max half of SFC cap or available SLL count
|
2025-11-20 07:32:30 +09:00
|
|
|
uint32_t avail = g_tls_sll[cls].count;
|
2025-11-11 21:49:05 +09:00
|
|
|
if (avail == 0) continue;
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
// Target: 75% of cap by default, bounded by available
|
|
|
|
|
uint32_t target = (cap * 75u) / 100u;
|
2025-11-11 21:49:05 +09:00
|
|
|
if (target == 0) target = (avail < 16 ? avail : 16);
|
|
|
|
|
if (target > avail) target = avail;
|
|
|
|
|
// transfer
|
2025-11-20 07:32:30 +09:00
|
|
|
while (target-- > 0 && g_tls_sll[cls].count > 0 && g_sfc_count[cls] < g_sfc_capacity[cls]) {
|
2025-11-11 21:49:05 +09:00
|
|
|
void* ptr = NULL;
|
Add Box 3 (Pointer Conversion Layer) and fix POOL_TLS_PHASE1 default
## Major Changes
### 1. Box 3: Pointer Conversion Module (NEW)
- File: core/box/ptr_conversion_box.h
- Purpose: Unified BASE ↔ USER pointer conversion (single source of truth)
- API: PTR_BASE_TO_USER(), PTR_USER_TO_BASE()
- Features: Zero-overhead inline, debug mode, NULL-safe, class 7 headerless support
- Design: Header-only, fully modular, no external dependencies
### 2. POOL_TLS_PHASE1 Default OFF (CRITICAL FIX)
- File: build.sh
- Change: POOL_TLS_PHASE1 now defaults to 0 (was hardcoded to 1)
- Impact: Eliminates pthread_mutex overhead on every free() (was causing 3.3x slowdown)
- Usage: Set POOL_TLS_PHASE1=1 env var to enable if needed
### 3. Pointer Conversion Fixes (PARTIAL)
- Files: core/box/front_gate_box.c, core/tiny_alloc_fast.inc.h, etc.
- Status: Partial implementation using Box 3 API
- Note: Work in progress, some conversions still need review
### 4. Performance Investigation Report (NEW)
- File: HOTPATH_PERFORMANCE_INVESTIGATION.md
- Findings:
- Hotpath works (+24% vs baseline) after POOL_TLS fix
- Still 9.2x slower than system malloc due to:
* Heavy initialization (23.85% of cycles)
* Syscall overhead (2,382 syscalls per 100K ops)
* Workload mismatch (C7 1KB is 49.8%, but only C5 256B has hotpath)
* 9.4x more instructions than system malloc
### 5. Known Issues
- SEGV at 20K-30K iterations (pre-existing bug, not related to pointer conversions)
- Root cause: Likely active counter corruption or TLS-SLL chain issues
- Status: Under investigation
## Performance Results (100K iterations, 256B)
- Baseline (Hotpath OFF): 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- System malloc: 82.2M ops/s (still 9.2x faster)
## Next Steps
- P0: Fix 20K-30K SEGV bug (GDB investigation needed)
- P1: Lazy initialization (+20-25% expected)
- P1: C7 (1KB) hotpath (+30-40% expected, biggest win)
- P2: Reduce syscalls (+15-20% expected)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 01:01:23 +09:00
|
|
|
// pop one from SLL via Box TLS-SLL API (static inline)
|
2025-11-11 21:49:05 +09:00
|
|
|
if (!tls_sll_pop(cls, &ptr)) break;
|
Phase E3-FINAL: Fix Box API offset bugs - ALL classes now use correct offsets
## Root Cause Analysis (GPT5)
**Physical Layout Constraints**:
- Class 0: 8B = [1B header][7B payload] → offset 1 = 9B needed = ❌ IMPOSSIBLE
- Class 1-6: >=16B = [1B header][15B+ payload] → offset 1 = ✅ POSSIBLE
- Class 7: 1KB → offset 0 (compatibility)
**Correct Specification**:
- HAKMEM_TINY_HEADER_CLASSIDX != 0:
- Class 0, 7: next at offset 0 (overwrites header when on freelist)
- Class 1-6: next at offset 1 (after header)
- HAKMEM_TINY_HEADER_CLASSIDX == 0:
- All classes: next at offset 0
**Previous Bug**:
- Attempted "ALL classes offset 1" unification
- Class 0 with offset 1 caused immediate SEGV (9B > 8B block size)
- Mixed 2-arg/3-arg API caused confusion
## Fixes Applied
### 1. Restored 3-Argument Box API (core/box/tiny_next_ptr_box.h)
```c
// Correct signatures
void tiny_next_write(int class_idx, void* base, void* next_value)
void* tiny_next_read(int class_idx, const void* base)
// Correct offset calculation
size_t offset = (class_idx == 0 || class_idx == 7) ? 0 : 1;
```
### 2. Updated 123+ Call Sites Across 34 Files
- hakmem_tiny_hot_pop_v4.inc.h (4 locations)
- hakmem_tiny_fastcache.inc.h (3 locations)
- hakmem_tiny_tls_list.h (12 locations)
- superslab_inline.h (5 locations)
- tiny_fastcache.h (3 locations)
- ptr_trace.h (macro definitions)
- tls_sll_box.h (2 locations)
- + 27 additional files
Pattern: `tiny_next_read(base)` → `tiny_next_read(class_idx, base)`
Pattern: `tiny_next_write(base, next)` → `tiny_next_write(class_idx, base, next)`
### 3. Added Sentinel Detection Guards
- tiny_fast_push(): Block nodes with sentinel in ptr or ptr->next
- tls_list_push(): Block nodes with sentinel in ptr or ptr->next
- Defense-in-depth against remote free sentinel leakage
## Verification (GPT5 Report)
**Test Command**: `./out/release/bench_random_mixed_hakmem --iterations=70000`
**Results**:
- ✅ Main loop completed successfully
- ✅ Drain phase completed successfully
- ✅ NO SEGV (previous crash at iteration 66151 is FIXED)
- ℹ️ Final log: "tiny_alloc(1024) failed" is normal fallback to Mid/ACE layers
**Analysis**:
- Class 0 immediate SEGV: ✅ RESOLVED (correct offset 0 now used)
- 66K iteration crash: ✅ RESOLVED (offset consistency fixed)
- Box API conflicts: ✅ RESOLVED (unified 3-arg API)
## Technical Details
### Offset Logic Justification
```
Class 0: 8B block → next pointer (8B) fits ONLY at offset 0
Class 1: 16B block → next pointer (8B) fits at offset 1 (after 1B header)
Class 2: 32B block → next pointer (8B) fits at offset 1
...
Class 6: 512B block → next pointer (8B) fits at offset 1
Class 7: 1024B block → offset 0 for legacy compatibility
```
### Files Modified (Summary)
- Core API: `box/tiny_next_ptr_box.h`
- Hot paths: `hakmem_tiny_hot_pop*.inc.h`, `tiny_fastcache.h`
- TLS layers: `hakmem_tiny_tls_list.h`, `hakmem_tiny_tls_ops.h`
- SuperSlab: `superslab_inline.h`, `tiny_superslab_*.inc.h`
- Refill: `hakmem_tiny_refill.inc.h`, `tiny_refill_opt.h`
- Free paths: `tiny_free_magazine.inc.h`, `tiny_superslab_free.inc.h`
- Documentation: Multiple Phase E3 reports
## Remaining Work
None for Box API offset bugs - all structural issues resolved.
Future enhancements (non-critical):
- Periodic `grep -R '*(void**)' core/` to detect direct pointer access violations
- Enforce Box API usage via static analysis
- Document offset rationale in architecture docs
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 06:50:20 +09:00
|
|
|
// Phase E1-CORRECT: Use Box API for next pointer write
|
|
|
|
|
tiny_next_write(cls, ptr, g_sfc_head[cls]);
|
2025-11-11 21:49:05 +09:00
|
|
|
g_sfc_head[cls] = ptr;
|
|
|
|
|
g_sfc_count[cls]++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-07 01:27:04 +09:00
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Refill (Slow Path) - STUB (real logic in hakmem.c)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
// Stub - real implementation is inline in hakmem.c malloc() to avoid LTO issues
|
|
|
|
|
// This is just a placeholder for future modular refactoring
|
|
|
|
|
int sfc_refill(int cls, int target_count) {
|
|
|
|
|
if (cls < 0 || cls >= TINY_NUM_CLASSES) return 0;
|
|
|
|
|
if (!g_sfc_enabled) return 0;
|
|
|
|
|
(void)target_count;
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
|
|
|
|
g_sfc_stats[cls].refill_calls++;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
return 0; // Actual refill happens inline in hakmem.c
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Spill (Slow Path) - STUB (real logic in hakmem.c)
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
// Stub - real implementation is inline in hakmem.c free() to avoid LTO issues
|
|
|
|
|
// This is just a placeholder for future modular refactoring
|
|
|
|
|
int sfc_spill(int cls, int spill_count) {
|
|
|
|
|
if (cls < 0 || cls >= TINY_NUM_CLASSES) return 0;
|
|
|
|
|
if (!g_sfc_enabled) return 0;
|
|
|
|
|
(void)spill_count;
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
|
|
|
|
g_sfc_stats[cls].spill_calls++;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
return 0; // Actual spill happens inline in hakmem.c
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Configuration API
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
sfc_config_t sfc_get_config(int cls) {
|
|
|
|
|
sfc_config_t cfg = {0};
|
|
|
|
|
|
|
|
|
|
if (cls >= 0 && cls < TINY_NUM_CLASSES) {
|
|
|
|
|
cfg.capacity = g_sfc_capacity[cls];
|
|
|
|
|
|
|
|
|
|
// Refill count (use override or default)
|
|
|
|
|
cfg.refill_count = (g_sfc_refill_override[cls] > 0)
|
|
|
|
|
? g_sfc_refill_override[cls]
|
|
|
|
|
: g_sfc_default_refill;
|
|
|
|
|
|
|
|
|
|
cfg.spill_thresh = g_sfc_default_spill_thresh;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return cfg;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void sfc_set_config(int cls, sfc_config_t cfg) {
|
|
|
|
|
if (cls < 0 || cls >= TINY_NUM_CLASSES) return;
|
|
|
|
|
|
|
|
|
|
// Validate capacity
|
|
|
|
|
if (cfg.capacity >= SFC_MIN_CAPACITY && cfg.capacity <= SFC_MAX_CAPACITY) {
|
|
|
|
|
g_sfc_capacity[cls] = cfg.capacity;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Validate refill count
|
|
|
|
|
if (cfg.refill_count >= 8 && cfg.refill_count <= 256) {
|
|
|
|
|
g_sfc_refill_override[cls] = cfg.refill_count;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Spill threshold (future use)
|
|
|
|
|
if (cfg.spill_thresh > 0 && cfg.spill_thresh <= 100) {
|
|
|
|
|
// Currently unused
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// Box 5-NEW: Statistics API
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
|
|
|
|
|
|
|
|
|
sfc_stats_t sfc_get_stats(int cls) {
|
|
|
|
|
sfc_stats_t stats = {0};
|
|
|
|
|
|
|
|
|
|
if (cls >= 0 && cls < TINY_NUM_CLASSES) {
|
|
|
|
|
stats = g_sfc_stats[cls];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return stats;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void sfc_reset_stats(int cls) {
|
|
|
|
|
if (cls >= 0 && cls < TINY_NUM_CLASSES) {
|
|
|
|
|
memset(&g_sfc_stats[cls], 0, sizeof(sfc_stats_t));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void sfc_print_stats(void) {
|
|
|
|
|
fprintf(stderr, "\n=== SFC Statistics (Box 5-NEW) ===\n");
|
|
|
|
|
|
|
|
|
|
uint64_t total_alloc_hits = 0;
|
|
|
|
|
uint64_t total_alloc_misses = 0;
|
|
|
|
|
uint64_t total_refill_calls = 0;
|
|
|
|
|
uint64_t total_refill_blocks = 0;
|
|
|
|
|
|
|
|
|
|
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
|
|
|
|
|
sfc_stats_t* s = &g_sfc_stats[cls];
|
|
|
|
|
|
|
|
|
|
uint64_t total_allocs = s->alloc_hits + s->alloc_misses;
|
|
|
|
|
if (total_allocs == 0) continue; // Skip unused classes
|
|
|
|
|
|
|
|
|
|
total_alloc_hits += s->alloc_hits;
|
|
|
|
|
total_alloc_misses += s->alloc_misses;
|
|
|
|
|
total_refill_calls += s->refill_calls;
|
|
|
|
|
total_refill_blocks += s->refill_blocks;
|
|
|
|
|
|
|
|
|
|
double hit_rate = (double)s->alloc_hits / total_allocs * 100.0;
|
|
|
|
|
double refill_freq = (double)s->refill_calls / total_allocs * 100.0;
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "Class %d (%3zu B): allocs=%llu, hit_rate=%.2f%%, "
|
|
|
|
|
"refills=%llu (%.4f%%), spills=%llu, cap=%u\n",
|
|
|
|
|
cls, g_tiny_class_sizes[cls],
|
|
|
|
|
(unsigned long long)total_allocs, hit_rate,
|
|
|
|
|
(unsigned long long)s->refill_calls, refill_freq,
|
|
|
|
|
(unsigned long long)s->spill_calls,
|
|
|
|
|
g_sfc_capacity[cls]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Summary
|
|
|
|
|
uint64_t grand_total = total_alloc_hits + total_alloc_misses;
|
|
|
|
|
if (grand_total > 0) {
|
|
|
|
|
double overall_hit_rate = (double)total_alloc_hits / grand_total * 100.0;
|
|
|
|
|
double overall_refill_freq = (double)total_refill_calls / grand_total * 100.0;
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "\n=== SFC Summary ===\n");
|
|
|
|
|
fprintf(stderr, "Total allocs: %llu\n", (unsigned long long)grand_total);
|
|
|
|
|
fprintf(stderr, "Overall hit rate: %.2f%% (target: >95%%)\n", overall_hit_rate);
|
|
|
|
|
fprintf(stderr, "Refill frequency: %.4f%% (target: <0.03%%)\n", overall_refill_freq);
|
|
|
|
|
fprintf(stderr, "Refill calls: %llu (target: <50K for 4M ops/s workload)\n",
|
|
|
|
|
(unsigned long long)total_refill_calls);
|
|
|
|
|
fprintf(stderr, "Refill blocks: %llu (avg %.1f blocks/refill)\n",
|
|
|
|
|
(unsigned long long)total_refill_blocks,
|
|
|
|
|
total_refill_calls > 0 ? (double)total_refill_blocks / total_refill_calls : 0.0);
|
|
|
|
|
|
|
|
|
|
// Check targets
|
|
|
|
|
if (overall_hit_rate >= 95.0) {
|
|
|
|
|
fprintf(stderr, "✅ Hit rate target achieved!\n");
|
|
|
|
|
} else {
|
|
|
|
|
fprintf(stderr, "⚠️ Hit rate below target (increase capacity?)\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (overall_refill_freq < 0.03) {
|
|
|
|
|
fprintf(stderr, "✅ Refill frequency target achieved (-98.5%% reduction)!\n");
|
|
|
|
|
} else {
|
|
|
|
|
fprintf(stderr, "⚠️ Refill frequency above target (increase refill_count?)\n");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "===========================\n\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#endif // HAKMEM_DEBUG_COUNTERS
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
// End of hakmem_tiny_sfc.c
|
|
|
|
|
// ============================================================================
|