2025-11-07 00:37:33 +09:00
|
|
|
|
// hak_wrappers.inc.h — malloc/free/calloc/realloc wrappers (LD_PRELOAD-aware)
|
|
|
|
|
|
#ifndef HAK_WRAPPERS_INC_H
|
|
|
|
|
|
#define HAK_WRAPPERS_INC_H
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef HAKMEM_FORCE_LIBC_ALLOC_BUILD
|
|
|
|
|
|
|
|
|
|
|
|
// Sanitizer/diagnostic builds: bypass hakmem allocator completely.
|
|
|
|
|
|
void* malloc(size_t size) {
|
|
|
|
|
|
extern void* __libc_malloc(size_t);
|
|
|
|
|
|
return __libc_malloc(size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void free(void* ptr) {
|
|
|
|
|
|
if (!ptr) return;
|
|
|
|
|
|
extern void __libc_free(void*);
|
|
|
|
|
|
__libc_free(ptr);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void* calloc(size_t nmemb, size_t size) {
|
|
|
|
|
|
extern void* __libc_calloc(size_t, size_t);
|
|
|
|
|
|
return __libc_calloc(nmemb, size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void* realloc(void* ptr, size_t size) {
|
|
|
|
|
|
extern void* __libc_realloc(void*, size_t);
|
|
|
|
|
|
return __libc_realloc(ptr, size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
2025-11-11 00:02:24 +09:00
|
|
|
|
#include "../ptr_trace.h" // Debug: pointer trace immediate dump on libc fallback
|
|
|
|
|
|
#include "front_gate_classifier.h" // Box FG: pointer classification (header/reg)
|
2025-12-01 22:06:10 +09:00
|
|
|
|
#include "../hakmem_pool.h" // Mid registry lookup (failsafe for headerless Mid)
|
2025-12-04 12:06:14 +09:00
|
|
|
|
#include "../front/malloc_tiny_fast.h" // Phase 26: Front Gate Unification (Tiny fast alloc)
|
|
|
|
|
|
#include "tiny_alloc_gate_box.h" // Tiny Alloc Gatekeeper Box (BASE/USER+Bridge 入口)
|
Phase 4-Step3: Add Front Config Box (+2.7-4.9% dead code elimination)
Implement compile-time configuration system for dead code elimination in Tiny
allocation hot paths. The Config Box provides dual-mode configuration:
- Normal mode: Runtime ENV checks (backward compatible, flexible)
- PGO mode: Compile-time constants (dead code elimination, performance)
PERFORMANCE:
- Baseline (runtime config): 50.32 M ops/s (avg of 5 runs)
- Config Box (PGO mode): 52.77 M ops/s (avg of 5 runs)
- Improvement: +2.45 M ops/s (+4.87% with outlier, +2.72% without)
- Target: +5-8% (partially achieved)
IMPLEMENTATION:
1. core/box/tiny_front_config_box.h (NEW):
- Defines TINY_FRONT_*_ENABLED macros for all config checks
- PGO mode (#if HAKMEM_TINY_FRONT_PGO): Macros expand to constants (0/1)
- Normal mode (#else): Macros expand to function calls
- Functions remain in their original locations (no code duplication)
2. core/hakmem_build_flags.h:
- Added HAKMEM_TINY_FRONT_PGO build flag (default: 0, off)
- Documentation: Usage with make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1"
3. core/box/hak_wrappers.inc.h:
- Replaced front_gate_unified_enabled() with TINY_FRONT_UNIFIED_GATE_ENABLED
- 2 call sites updated (malloc and free fast paths)
- Added config box include
EXPECTED DEAD CODE ELIMINATION (PGO mode):
if (TINY_FRONT_UNIFIED_GATE_ENABLED) { ... }
→ if (1) { ... } // Constant, always true
→ Compiler optimizes away the branch, keeps body
SCOPE:
Currently only front_gate_unified_enabled() is replaced (2 call sites).
To achieve full +5-8% target, expand to other config checks:
- ultra_slim_mode_enabled()
- tiny_heap_v2_enabled()
- sfc_cascade_enabled()
- tiny_fastcache_enabled()
- tiny_metrics_enabled()
- tiny_diag_enabled()
BUILD USAGE:
Normal mode (runtime config, default):
make bench_random_mixed_hakmem
PGO mode (compile-time config, dead code elimination):
make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" bench_random_mixed_hakmem
BOX PATTERN COMPLIANCE:
✅ Single Responsibility: Configuration management ONLY
✅ Clear Contract: Dual-mode (PGO = constants, Normal = runtime)
✅ Observable: Config report function (debug builds)
✅ Safe: Backward compatible (default is normal mode)
✅ Testable: Easy A/B comparison (PGO vs normal builds)
WHY +2.7-4.9% (below +5-8% target)?
- Limited scope: Only 2 call sites for 1 config function replaced
- Lazy init overhead: front_gate_unified_enabled() cached after first call
- Need to expand to more config checks for full benefit
NEXT STEPS:
- Expand config macro usage to other functions (optional)
- OR proceed with PGO re-enablement (Final polish)
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 12:18:37 +09:00
|
|
|
|
#include "tiny_front_config_box.h" // Phase 4-Step3: Compile-time config for dead code elimination
|
2025-12-02 16:16:51 +09:00
|
|
|
|
#include "wrapper_env_box.h" // Wrapper env cache (step trace / LD safe / free trace)
|
2025-12-03 09:15:59 +09:00
|
|
|
|
#include "../hakmem_internal.h" // AllocHeader helpers for diagnostics
|
|
|
|
|
|
#include "../hakmem_super_registry.h" // Superslab lookup for diagnostics
|
|
|
|
|
|
#include "../superslab/superslab_inline.h" // slab_index_for, capacity
|
|
|
|
|
|
#include <sys/mman.h> // mincore for safe mapping checks
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
#include <unistd.h> // write for diagnostics
|
|
|
|
|
|
#include <string.h> // strlen for diagnostics
|
2025-11-11 00:02:24 +09:00
|
|
|
|
|
2025-11-07 00:37:33 +09:00
|
|
|
|
// malloc wrapper - intercepts system malloc() calls
|
|
|
|
|
|
__thread uint64_t g_malloc_total_calls = 0;
|
|
|
|
|
|
__thread uint64_t g_malloc_tiny_size_match = 0;
|
|
|
|
|
|
__thread uint64_t g_malloc_fast_path_tried = 0;
|
|
|
|
|
|
__thread uint64_t g_malloc_fast_path_null = 0;
|
|
|
|
|
|
__thread uint64_t g_malloc_slow_path = 0;
|
|
|
|
|
|
|
2025-11-20 07:32:30 +09:00
|
|
|
|
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
|
2025-11-07 00:37:33 +09:00
|
|
|
|
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
// CRITICAL FIX (BUG #10): Use cached g_jemalloc_loaded instead of calling hak_jemalloc_loaded()
|
|
|
|
|
|
// The function call version triggers infinite recursion: malloc → hak_jemalloc_loaded → dlopen → malloc
|
|
|
|
|
|
extern int g_jemalloc_loaded; // Cached during hak_init_impl(), defined in hakmem.c
|
|
|
|
|
|
|
Fix #16: Resolve double BASE→USER conversion causing header corruption
🎯 ROOT CAUSE: Internal allocation helpers were prematurely converting
BASE → USER pointers before returning to caller. The caller then applied
HAK_RET_ALLOC/tiny_region_id_write_header which performed ANOTHER BASE→USER
conversion, resulting in double offset (BASE+2) and header written at
wrong location.
📦 BOX THEORY SOLUTION: Establish clean pointer conversion boundary at
tiny_region_id_write_header, making it the single source of truth for
BASE → USER conversion.
🔧 CHANGES:
- Fix #16: Remove premature BASE→USER conversions (6 locations)
* core/tiny_alloc_fast.inc.h (3 fixes)
* core/hakmem_tiny_refill.inc.h (2 fixes)
* core/hakmem_tiny_fastcache.inc.h (1 fix)
- Fix #12: Add header validation in tls_sll_pop (detect corruption)
- Fix #14: Defense-in-depth header restoration in tls_sll_splice
- Fix #15: USER pointer detection (for debugging)
- Fix #13: Bump window header restoration
- Fix #2, #6, #7, #8: Various header restoration & NULL termination
🧪 TEST RESULTS: 100% SUCCESS
- 10K-500K iterations: All passed
- 8 seeds × 100K: All passed (42,123,456,789,999,314,271,161)
- Performance: ~630K ops/s average (stable)
- Header corruption: ZERO
📋 FIXES SUMMARY:
Fix #1-8: Initial header restoration & chain fixes (chatgpt-san)
Fix #9-10: USER pointer auto-fix (later disabled)
Fix #12: Validation system (caught corruption at call 14209)
Fix #13: Bump window header writes
Fix #14: Splice defense-in-depth
Fix #15: USER pointer detection (debugging tool)
Fix #16: Double conversion fix (FINAL SOLUTION) ✅
🎓 LESSONS LEARNED:
1. Validation catches bugs early (Fix #12 was critical)
2. Class-specific inline logging reveals patterns (Option C)
3. Box Theory provides clean architectural boundaries
4. Multiple investigation approaches (Task/chatgpt-san collaboration)
📄 DOCUMENTATION:
- P0_BUG_STATUS.md: Complete bug tracking timeline
- C2_CORRUPTION_ROOT_CAUSE_FINAL.md: Detailed root cause analysis
- FINAL_ANALYSIS_C2_CORRUPTION.md: Investigation methodology
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task Agent <task@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-11-12 10:33:57 +09:00
|
|
|
|
// Global malloc call counter for debugging (exposed for validation code)
|
|
|
|
|
|
// Defined here, accessed from tls_sll_box.h for corruption detection
|
|
|
|
|
|
_Atomic uint64_t malloc_count = 0;
|
|
|
|
|
|
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
// Lightweight fallback diagnostics (enabled with HAKMEM_WRAP_DIAG=1)
|
|
|
|
|
|
typedef enum {
|
|
|
|
|
|
FB_INIT_WAIT_FAIL = 0,
|
|
|
|
|
|
FB_INIT_LD_WAIT_FAIL,
|
|
|
|
|
|
FB_FORCE_LIBC,
|
|
|
|
|
|
FB_LD_SAFE,
|
|
|
|
|
|
FB_JEMALLOC_BLOCK,
|
|
|
|
|
|
FB_LOCKDEPTH,
|
|
|
|
|
|
FB_NOT_OWNED,
|
|
|
|
|
|
FB_OTHER,
|
|
|
|
|
|
FB_REASON_COUNT
|
|
|
|
|
|
} wrapper_fb_reason_t;
|
|
|
|
|
|
static _Atomic uint64_t g_fb_counts[FB_REASON_COUNT];
|
|
|
|
|
|
static _Atomic int g_fb_log_count[FB_REASON_COUNT];
|
|
|
|
|
|
|
|
|
|
|
|
static inline void wrapper_record_fallback(wrapper_fb_reason_t reason, const char* msg) {
|
|
|
|
|
|
atomic_fetch_add_explicit(&g_fb_counts[reason], 1, memory_order_relaxed);
|
|
|
|
|
|
const wrapper_env_cfg_t* wcfg = wrapper_env_cfg();
|
|
|
|
|
|
if (__builtin_expect(wcfg->wrap_diag, 0)) {
|
|
|
|
|
|
int n = atomic_fetch_add_explicit(&g_fb_log_count[reason], 1, memory_order_relaxed);
|
|
|
|
|
|
if (n < 4 && msg) {
|
|
|
|
|
|
write(2, msg, strlen(msg));
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-07 00:37:33 +09:00
|
|
|
|
void* malloc(size_t size) {
|
2025-12-03 07:56:38 +09:00
|
|
|
|
#ifndef NDEBUG
|
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure
## Major Additions
### 1. Box I: Integrity Verification System (NEW - 703 lines)
- Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines)
- Purpose: Unified integrity checking across all HAKMEM subsystems
- Features:
* 4-level integrity checking (0-4, compile-time controlled)
* Priority 1: TLS array bounds validation
* Priority 2: Freelist pointer validation
* Priority 3: TLS canary monitoring
* Priority ALPHA: Slab metadata invariant checking (5 invariants)
* Atomic statistics tracking (thread-safe)
* Beautiful BOX_BOUNDARY design pattern
### 2. Box E: SuperSlab Expansion System (COMPLETE)
- Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c
- Purpose: Safe SuperSlab expansion with TLS state guarantee
- Features:
* Immediate slab 0 binding after expansion
* TLS state snapshot and restoration
* Design by Contract (pre/post-conditions, invariants)
* Thread-safe with mutex protection
### 3. Comprehensive Integrity Checking System
- File: core/hakmem_tiny_integrity.h (NEW)
- Unified validation functions for all allocator subsystems
- Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe)
- Pointer range validation (null-page, kernel-space)
### 4. P0 Bug Investigation - Root Cause Identified
**Bug**: SEGV at iteration 28440 (deterministic with seed 42)
**Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning)
**Location**: TLS SLL (Single-Linked List) cache layer
**Root Cause**: Race condition or use-after-free in TLS list management (class 0)
**Detection**: Box I successfully caught invalid pointer at exact crash point
### 5. Defensive Improvements
- Defensive memset in SuperSlab allocation (all metadata arrays)
- Enhanced pointer validation with pattern detection
- BOX_BOUNDARY markers throughout codebase (beautiful modular design)
- 5 metadata invariant checks in allocation/free/refill paths
## Integration Points
- Modified 13 files with Box I/E integration
- Added 10+ BOX_BOUNDARY markers
- 5 critical integrity check points in P0 refill path
## Test Results (100K iterations)
- Baseline: 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition)
- Root cause: Identified but not yet fixed (requires deeper investigation)
## Performance
- Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0)
- Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4)
- Beautiful modular design maintains clean separation of concerns
## Known Issues
- P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0)
- Cause: Use-after-free or race in remote free draining
- Next step: Valgrind investigation to pinpoint exact corruption location
## Code Quality
- Total new code: ~1400 lines (Box I + Box E + integrity system)
- Design: Beautiful Box Theory with clear boundaries
- Modularity: Complete separation of concerns
- Documentation: Comprehensive inline comments and BOX_BOUNDARY markers
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
|
|
|
|
uint64_t count = atomic_fetch_add(&malloc_count, 1);
|
2025-12-03 07:56:38 +09:00
|
|
|
|
#endif
|
2025-12-04 19:20:44 +09:00
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
// Debug-only trace counter: in release builds this atomic increment
|
|
|
|
|
|
// is disabled to avoid hot-path cache misses and contention.
|
2025-12-03 20:42:28 +09:00
|
|
|
|
static _Atomic int g_wrap_malloc_trace_count = 0;
|
|
|
|
|
|
if (atomic_fetch_add_explicit(&g_wrap_malloc_trace_count, 1, memory_order_relaxed) < 256) {
|
|
|
|
|
|
HAK_TRACE("[wrap_malloc_enter]\n");
|
|
|
|
|
|
}
|
2025-12-04 19:20:44 +09:00
|
|
|
|
#endif
|
2025-12-03 07:56:38 +09:00
|
|
|
|
// NDEBUG: malloc_count increment disabled - removes 27.55% bottleneck
|
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure
## Major Additions
### 1. Box I: Integrity Verification System (NEW - 703 lines)
- Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines)
- Purpose: Unified integrity checking across all HAKMEM subsystems
- Features:
* 4-level integrity checking (0-4, compile-time controlled)
* Priority 1: TLS array bounds validation
* Priority 2: Freelist pointer validation
* Priority 3: TLS canary monitoring
* Priority ALPHA: Slab metadata invariant checking (5 invariants)
* Atomic statistics tracking (thread-safe)
* Beautiful BOX_BOUNDARY design pattern
### 2. Box E: SuperSlab Expansion System (COMPLETE)
- Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c
- Purpose: Safe SuperSlab expansion with TLS state guarantee
- Features:
* Immediate slab 0 binding after expansion
* TLS state snapshot and restoration
* Design by Contract (pre/post-conditions, invariants)
* Thread-safe with mutex protection
### 3. Comprehensive Integrity Checking System
- File: core/hakmem_tiny_integrity.h (NEW)
- Unified validation functions for all allocator subsystems
- Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe)
- Pointer range validation (null-page, kernel-space)
### 4. P0 Bug Investigation - Root Cause Identified
**Bug**: SEGV at iteration 28440 (deterministic with seed 42)
**Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning)
**Location**: TLS SLL (Single-Linked List) cache layer
**Root Cause**: Race condition or use-after-free in TLS list management (class 0)
**Detection**: Box I successfully caught invalid pointer at exact crash point
### 5. Defensive Improvements
- Defensive memset in SuperSlab allocation (all metadata arrays)
- Enhanced pointer validation with pattern detection
- BOX_BOUNDARY markers throughout codebase (beautiful modular design)
- 5 metadata invariant checks in allocation/free/refill paths
## Integration Points
- Modified 13 files with Box I/E integration
- Added 10+ BOX_BOUNDARY markers
- 5 critical integrity check points in P0 refill path
## Test Results (100K iterations)
- Baseline: 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition)
- Root cause: Identified but not yet fixed (requires deeper investigation)
## Performance
- Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0)
- Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4)
- Beautiful modular design maintains clean separation of concerns
## Known Issues
- P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0)
- Cause: Use-after-free or race in remote free draining
- Next step: Valgrind investigation to pinpoint exact corruption location
## Code Quality
- Total new code: ~1400 lines (Box I + Box E + integrity system)
- Design: Beautiful Box Theory with clear boundaries
- Modularity: Complete separation of concerns
- Documentation: Comprehensive inline comments and BOX_BOUNDARY markers
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
|
|
|
|
|
Phase 20-2: BenchFast mode - Structural bottleneck analysis (+4.5% ceiling)
## Summary
Implemented BenchFast mode to measure HAKMEM's structural performance ceiling
by removing ALL safety costs. Result: +4.5% improvement reveals safety mechanisms
are NOT the bottleneck - 95% of the performance gap is structural.
## Critical Discovery: Safety Costs ≠ Bottleneck
**BenchFast Performance** (500K iterations, 256B fixed-size):
- Baseline (normal): 54.4M ops/s (53.3% of System malloc)
- BenchFast (no safety): 56.9M ops/s (55.7% of System malloc) **+4.5%**
- System malloc: 102.1M ops/s (100%)
**Key Finding**: Removing classify_ptr, Pool/Mid routing, registry, mincore,
and ExternalGuard yields only +4.5% improvement. This proves these safety
mechanisms account for <5% of total overhead.
**Real Bottleneck** (estimated 75% of overhead):
- SuperSlab metadata access (~35% CPU)
- TLS SLL pointer chasing (~25% CPU)
- Refill + carving logic (~15% CPU)
## Implementation Details
**BenchFast Bypass Strategy**:
- Alloc: size → class_idx → TLS SLL pop → write header (6-8 instructions)
- Free: read header → BASE pointer → TLS SLL push (3-5 instructions)
- Bypasses: classify_ptr, Pool/Mid routing, registry, mincore, refill
**Recursion Fix** (User's "C案" - Prealloc Pool):
1. bench_fast_init() pre-allocates 50K blocks per class using normal path
2. bench_fast_init_in_progress guard prevents BenchFast during init
3. bench_fast_alloc() pop-only (NO REFILL) during benchmark
**Files**:
- core/box/bench_fast_box.{h,c}: Ultra-minimal alloc/free + prealloc pool
- core/box/hak_wrappers.inc.h: malloc wrapper with init guard check
- Makefile: bench_fast_box.o integration
- CURRENT_TASK.md: Phase 20-2 results documentation
**Activation**:
export HAKMEM_BENCH_FAST_MODE=1
./bench_fixed_size_hakmem 500000 256 128
## Implications for Future Work
**Incremental Optimization Ceiling Confirmed**:
- Phase 9-11 lesson reinforced: symptom relief ≠ root cause fix
- Safety costs: 4.5% (removable via BenchFast)
- Structural bottleneck: 95.5% (requires Phase 12 redesign)
**Phase 12 Shared SuperSlab Pool Priority**:
- 877 SuperSlab → 100-200 (reduce metadata footprint)
- Dynamic slab sharing (mimalloc-style)
- Expected: 70-90M ops/s (70-90% of System malloc)
**Bottleneck Breakdown**:
| Component | CPU Time | BenchFast Removed? |
|------------------------|----------|-------------------|
| SuperSlab metadata | ~35% | ❌ Structural |
| TLS SLL pointer chase | ~25% | ❌ Structural |
| Refill + carving | ~15% | ❌ Structural |
| classify_ptr/registry | ~10% | ✅ Removed |
| Pool/Mid routing | ~5% | ✅ Removed |
| mincore/guards | ~5% | ✅ Removed |
**Conclusion**: Structural bottleneck (75%) >> Safety costs (20%)
## Phase 20 Complete
- Phase 20-1: SS-HotPrewarm (+3.3% from cache warming)
- Phase 20-2: BenchFast mode (proved safety costs = 4.5%)
- **Total Phase 20 improvement**: +7.8% (Phase 19 baseline → BenchFast)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-16 06:36:02 +09:00
|
|
|
|
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
|
|
|
|
|
// WARNING: Bypasses ALL safety checks - benchmark only!
|
|
|
|
|
|
// IMPORTANT: Do NOT use BenchFast during preallocation/init to avoid recursion.
|
2025-11-30 05:12:32 +09:00
|
|
|
|
// Phase 8-TLS-Fix: Use atomic_load for cross-thread safety
|
|
|
|
|
|
if (__builtin_expect(!atomic_load(&g_bench_fast_init_in_progress) && bench_fast_enabled(), 0)) {
|
Phase 20-2: BenchFast mode - Structural bottleneck analysis (+4.5% ceiling)
## Summary
Implemented BenchFast mode to measure HAKMEM's structural performance ceiling
by removing ALL safety costs. Result: +4.5% improvement reveals safety mechanisms
are NOT the bottleneck - 95% of the performance gap is structural.
## Critical Discovery: Safety Costs ≠ Bottleneck
**BenchFast Performance** (500K iterations, 256B fixed-size):
- Baseline (normal): 54.4M ops/s (53.3% of System malloc)
- BenchFast (no safety): 56.9M ops/s (55.7% of System malloc) **+4.5%**
- System malloc: 102.1M ops/s (100%)
**Key Finding**: Removing classify_ptr, Pool/Mid routing, registry, mincore,
and ExternalGuard yields only +4.5% improvement. This proves these safety
mechanisms account for <5% of total overhead.
**Real Bottleneck** (estimated 75% of overhead):
- SuperSlab metadata access (~35% CPU)
- TLS SLL pointer chasing (~25% CPU)
- Refill + carving logic (~15% CPU)
## Implementation Details
**BenchFast Bypass Strategy**:
- Alloc: size → class_idx → TLS SLL pop → write header (6-8 instructions)
- Free: read header → BASE pointer → TLS SLL push (3-5 instructions)
- Bypasses: classify_ptr, Pool/Mid routing, registry, mincore, refill
**Recursion Fix** (User's "C案" - Prealloc Pool):
1. bench_fast_init() pre-allocates 50K blocks per class using normal path
2. bench_fast_init_in_progress guard prevents BenchFast during init
3. bench_fast_alloc() pop-only (NO REFILL) during benchmark
**Files**:
- core/box/bench_fast_box.{h,c}: Ultra-minimal alloc/free + prealloc pool
- core/box/hak_wrappers.inc.h: malloc wrapper with init guard check
- Makefile: bench_fast_box.o integration
- CURRENT_TASK.md: Phase 20-2 results documentation
**Activation**:
export HAKMEM_BENCH_FAST_MODE=1
./bench_fixed_size_hakmem 500000 256 128
## Implications for Future Work
**Incremental Optimization Ceiling Confirmed**:
- Phase 9-11 lesson reinforced: symptom relief ≠ root cause fix
- Safety costs: 4.5% (removable via BenchFast)
- Structural bottleneck: 95.5% (requires Phase 12 redesign)
**Phase 12 Shared SuperSlab Pool Priority**:
- 877 SuperSlab → 100-200 (reduce metadata footprint)
- Dynamic slab sharing (mimalloc-style)
- Expected: 70-90M ops/s (70-90% of System malloc)
**Bottleneck Breakdown**:
| Component | CPU Time | BenchFast Removed? |
|------------------------|----------|-------------------|
| SuperSlab metadata | ~35% | ❌ Structural |
| TLS SLL pointer chase | ~25% | ❌ Structural |
| Refill + carving | ~15% | ❌ Structural |
| classify_ptr/registry | ~10% | ✅ Removed |
| Pool/Mid routing | ~5% | ✅ Removed |
| mincore/guards | ~5% | ✅ Removed |
**Conclusion**: Structural bottleneck (75%) >> Safety costs (20%)
## Phase 20 Complete
- Phase 20-1: SS-HotPrewarm (+3.3% from cache warming)
- Phase 20-2: BenchFast mode (proved safety costs = 4.5%)
- **Total Phase 20 improvement**: +7.8% (Phase 19 baseline → BenchFast)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-16 06:36:02 +09:00
|
|
|
|
if (size <= 1024) { // Tiny range
|
|
|
|
|
|
return bench_fast_alloc(size);
|
|
|
|
|
|
}
|
|
|
|
|
|
// Fallback to normal path for large allocations
|
|
|
|
|
|
}
|
|
|
|
|
|
|
Fix #16: Resolve double BASE→USER conversion causing header corruption
🎯 ROOT CAUSE: Internal allocation helpers were prematurely converting
BASE → USER pointers before returning to caller. The caller then applied
HAK_RET_ALLOC/tiny_region_id_write_header which performed ANOTHER BASE→USER
conversion, resulting in double offset (BASE+2) and header written at
wrong location.
📦 BOX THEORY SOLUTION: Establish clean pointer conversion boundary at
tiny_region_id_write_header, making it the single source of truth for
BASE → USER conversion.
🔧 CHANGES:
- Fix #16: Remove premature BASE→USER conversions (6 locations)
* core/tiny_alloc_fast.inc.h (3 fixes)
* core/hakmem_tiny_refill.inc.h (2 fixes)
* core/hakmem_tiny_fastcache.inc.h (1 fix)
- Fix #12: Add header validation in tls_sll_pop (detect corruption)
- Fix #14: Defense-in-depth header restoration in tls_sll_splice
- Fix #15: USER pointer detection (for debugging)
- Fix #13: Bump window header restoration
- Fix #2, #6, #7, #8: Various header restoration & NULL termination
🧪 TEST RESULTS: 100% SUCCESS
- 10K-500K iterations: All passed
- 8 seeds × 100K: All passed (42,123,456,789,999,314,271,161)
- Performance: ~630K ops/s average (stable)
- Header corruption: ZERO
📋 FIXES SUMMARY:
Fix #1-8: Initial header restoration & chain fixes (chatgpt-san)
Fix #9-10: USER pointer auto-fix (later disabled)
Fix #12: Validation system (caught corruption at call 14209)
Fix #13: Bump window header writes
Fix #14: Splice defense-in-depth
Fix #15: USER pointer detection (debugging tool)
Fix #16: Double conversion fix (FINAL SOLUTION) ✅
🎓 LESSONS LEARNED:
1. Validation catches bugs early (Fix #12 was critical)
2. Class-specific inline logging reveals patterns (Option C)
3. Box Theory provides clean architectural boundaries
4. Multiple investigation approaches (Task/chatgpt-san collaboration)
📄 DOCUMENTATION:
- P0_BUG_STATUS.md: Complete bug tracking timeline
- C2_CORRUPTION_ROOT_CAUSE_FINAL.md: Detailed root cause analysis
- FINAL_ANALYSIS_C2_CORRUPTION.md: Investigation methodology
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Task Agent <task@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-11-12 10:33:57 +09:00
|
|
|
|
// DEBUG BAILOUT DISABLED - Testing full path
|
|
|
|
|
|
// if (__builtin_expect(count >= 14270 && count <= 14285, 0)) {
|
|
|
|
|
|
// extern void* __libc_malloc(size_t);
|
|
|
|
|
|
// fprintf(stderr, "[MALLOC_WRAPPER] count=%lu size=%zu - BAILOUT TO LIBC!\n", count, size);
|
|
|
|
|
|
// fflush(stderr);
|
|
|
|
|
|
// return __libc_malloc(size);
|
|
|
|
|
|
// }
|
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure
## Major Additions
### 1. Box I: Integrity Verification System (NEW - 703 lines)
- Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines)
- Purpose: Unified integrity checking across all HAKMEM subsystems
- Features:
* 4-level integrity checking (0-4, compile-time controlled)
* Priority 1: TLS array bounds validation
* Priority 2: Freelist pointer validation
* Priority 3: TLS canary monitoring
* Priority ALPHA: Slab metadata invariant checking (5 invariants)
* Atomic statistics tracking (thread-safe)
* Beautiful BOX_BOUNDARY design pattern
### 2. Box E: SuperSlab Expansion System (COMPLETE)
- Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c
- Purpose: Safe SuperSlab expansion with TLS state guarantee
- Features:
* Immediate slab 0 binding after expansion
* TLS state snapshot and restoration
* Design by Contract (pre/post-conditions, invariants)
* Thread-safe with mutex protection
### 3. Comprehensive Integrity Checking System
- File: core/hakmem_tiny_integrity.h (NEW)
- Unified validation functions for all allocator subsystems
- Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe)
- Pointer range validation (null-page, kernel-space)
### 4. P0 Bug Investigation - Root Cause Identified
**Bug**: SEGV at iteration 28440 (deterministic with seed 42)
**Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning)
**Location**: TLS SLL (Single-Linked List) cache layer
**Root Cause**: Race condition or use-after-free in TLS list management (class 0)
**Detection**: Box I successfully caught invalid pointer at exact crash point
### 5. Defensive Improvements
- Defensive memset in SuperSlab allocation (all metadata arrays)
- Enhanced pointer validation with pattern detection
- BOX_BOUNDARY markers throughout codebase (beautiful modular design)
- 5 metadata invariant checks in allocation/free/refill paths
## Integration Points
- Modified 13 files with Box I/E integration
- Added 10+ BOX_BOUNDARY markers
- 5 critical integrity check points in P0 refill path
## Test Results (100K iterations)
- Baseline: 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition)
- Root cause: Identified but not yet fixed (requires deeper investigation)
## Performance
- Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0)
- Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4)
- Beautiful modular design maintains clean separation of concerns
## Known Issues
- P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0)
- Cause: Use-after-free or race in remote free draining
- Next step: Valgrind investigation to pinpoint exact corruption location
## Code Quality
- Total new code: ~1400 lines (Box I + Box E + integrity system)
- Design: Beautiful Box Theory with clear boundaries
- Modularity: Complete separation of concerns
- Documentation: Comprehensive inline comments and BOX_BOUNDARY markers
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
|
|
|
|
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
// CRITICAL FIX (BUG #7): Increment lock depth FIRST, before ANY libc calls
|
|
|
|
|
|
// This prevents infinite recursion when getenv/fprintf/dlopen call malloc
|
|
|
|
|
|
g_hakmem_lock_depth++;
|
2025-12-01 22:06:10 +09:00
|
|
|
|
// Debug step trace for 33KB: gated by env HAKMEM_STEP_TRACE (default: OFF)
|
2025-12-02 16:16:51 +09:00
|
|
|
|
const wrapper_env_cfg_t* wcfg = wrapper_env_cfg();
|
|
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "STEP:1 Lock++\n", 14);
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
|
|
|
|
|
|
// Guard against recursion during initialization
|
2025-12-02 16:44:27 +09:00
|
|
|
|
int init_wait = hak_init_wait_for_ready();
|
|
|
|
|
|
if (__builtin_expect(init_wait <= 0, 0)) {
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_INIT_WAIT_FAIL, "[wrap] libc malloc: init_wait\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
g_hakmem_lock_depth--;
|
2025-11-07 00:37:33 +09:00
|
|
|
|
extern void* __libc_malloc(size_t);
|
2025-12-01 16:37:59 +09:00
|
|
|
|
if (size == 33000) write(2, "RET:Initializing\n", 17);
|
2025-11-07 00:37:33 +09:00
|
|
|
|
return __libc_malloc(size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
// Now safe to call getenv/fprintf/dlopen (will use __libc_malloc if needed)
|
2025-11-27 03:18:33 +09:00
|
|
|
|
extern int g_sfc_debug;
|
2025-11-07 00:37:33 +09:00
|
|
|
|
static _Atomic int debug_count = 0;
|
2025-11-27 03:18:33 +09:00
|
|
|
|
if (__builtin_expect(g_sfc_debug, 0) && debug_count < 100) {
|
2025-11-07 00:37:33 +09:00
|
|
|
|
int n = atomic_fetch_add(&debug_count, 1);
|
|
|
|
|
|
if (n < 20) fprintf(stderr, "[SFC_DEBUG] malloc(%zu)\n", size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (__builtin_expect(hak_force_libc_alloc(), 0)) {
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_FORCE_LIBC, "[wrap] libc malloc: force_libc\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
g_hakmem_lock_depth--;
|
2025-11-07 00:37:33 +09:00
|
|
|
|
extern void* __libc_malloc(size_t);
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "RET:ForceLibc\n", 14);
|
2025-11-07 00:37:33 +09:00
|
|
|
|
return __libc_malloc(size);
|
|
|
|
|
|
}
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "STEP:2 ForceLibc passed\n", 24);
|
2025-11-07 00:37:33 +09:00
|
|
|
|
|
|
|
|
|
|
int ld_mode = hak_ld_env_mode();
|
|
|
|
|
|
if (ld_mode) {
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "STEP:3 LD Mode\n", 15);
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
// BUG FIX: g_jemalloc_loaded == -1 (unknown) should not trigger fallback
|
|
|
|
|
|
// Only fallback if jemalloc is ACTUALLY loaded (> 0)
|
|
|
|
|
|
if (hak_ld_block_jemalloc() && g_jemalloc_loaded > 0) {
|
|
|
|
|
|
wrapper_record_fallback(FB_JEMALLOC_BLOCK, "[wrap] libc malloc: jemalloc block\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
g_hakmem_lock_depth--;
|
2025-11-07 00:37:33 +09:00
|
|
|
|
extern void* __libc_malloc(size_t);
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "RET:Jemalloc\n", 13);
|
2025-11-07 00:37:33 +09:00
|
|
|
|
return __libc_malloc(size);
|
|
|
|
|
|
}
|
|
|
|
|
|
if (!g_initialized) { hak_init(); }
|
2025-12-02 16:44:27 +09:00
|
|
|
|
int ld_init_wait = hak_init_wait_for_ready();
|
|
|
|
|
|
if (__builtin_expect(ld_init_wait <= 0, 0)) {
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_INIT_LD_WAIT_FAIL, "[wrap] libc malloc: ld init_wait\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
g_hakmem_lock_depth--;
|
2025-11-07 00:37:33 +09:00
|
|
|
|
extern void* __libc_malloc(size_t);
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "RET:Init2\n", 10);
|
2025-11-07 00:37:33 +09:00
|
|
|
|
return __libc_malloc(size);
|
|
|
|
|
|
}
|
2025-11-07 01:15:28 +09:00
|
|
|
|
// Cache HAKMEM_LD_SAFE to avoid repeated getenv on hot path
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->ld_safe_mode >= 2) {
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_LD_SAFE, "[wrap] libc malloc: ld_safe\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
g_hakmem_lock_depth--;
|
2025-11-07 00:37:33 +09:00
|
|
|
|
extern void* __libc_malloc(size_t);
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "RET:LDSafe\n", 11);
|
2025-11-07 00:37:33 +09:00
|
|
|
|
return __libc_malloc(size);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "STEP:4 LD Check passed\n", 23);
|
2025-11-07 00:37:33 +09:00
|
|
|
|
|
2025-11-17 05:29:08 +09:00
|
|
|
|
// Phase 26: CRITICAL - Ensure initialization before fast path
|
|
|
|
|
|
// (fast path bypasses hak_alloc_at, so we need to init here)
|
|
|
|
|
|
if (!g_initialized) hak_init();
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 26: Front Gate Unification (Tiny fast path)
|
|
|
|
|
|
// Placed AFTER all safety checks (lock depth, initializing, LD_SAFE, jemalloc)
|
|
|
|
|
|
// Bypasses: hak_alloc_at routing (236 lines) + wrapper diagnostics + tiny overhead
|
|
|
|
|
|
// Target: +10-15% performance (11.35M → 12.5-13.5M ops/s)
|
|
|
|
|
|
// ENV: HAKMEM_FRONT_GATE_UNIFIED=1 to enable (default: OFF)
|
Phase 4-Step3: Add Front Config Box (+2.7-4.9% dead code elimination)
Implement compile-time configuration system for dead code elimination in Tiny
allocation hot paths. The Config Box provides dual-mode configuration:
- Normal mode: Runtime ENV checks (backward compatible, flexible)
- PGO mode: Compile-time constants (dead code elimination, performance)
PERFORMANCE:
- Baseline (runtime config): 50.32 M ops/s (avg of 5 runs)
- Config Box (PGO mode): 52.77 M ops/s (avg of 5 runs)
- Improvement: +2.45 M ops/s (+4.87% with outlier, +2.72% without)
- Target: +5-8% (partially achieved)
IMPLEMENTATION:
1. core/box/tiny_front_config_box.h (NEW):
- Defines TINY_FRONT_*_ENABLED macros for all config checks
- PGO mode (#if HAKMEM_TINY_FRONT_PGO): Macros expand to constants (0/1)
- Normal mode (#else): Macros expand to function calls
- Functions remain in their original locations (no code duplication)
2. core/hakmem_build_flags.h:
- Added HAKMEM_TINY_FRONT_PGO build flag (default: 0, off)
- Documentation: Usage with make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1"
3. core/box/hak_wrappers.inc.h:
- Replaced front_gate_unified_enabled() with TINY_FRONT_UNIFIED_GATE_ENABLED
- 2 call sites updated (malloc and free fast paths)
- Added config box include
EXPECTED DEAD CODE ELIMINATION (PGO mode):
if (TINY_FRONT_UNIFIED_GATE_ENABLED) { ... }
→ if (1) { ... } // Constant, always true
→ Compiler optimizes away the branch, keeps body
SCOPE:
Currently only front_gate_unified_enabled() is replaced (2 call sites).
To achieve full +5-8% target, expand to other config checks:
- ultra_slim_mode_enabled()
- tiny_heap_v2_enabled()
- sfc_cascade_enabled()
- tiny_fastcache_enabled()
- tiny_metrics_enabled()
- tiny_diag_enabled()
BUILD USAGE:
Normal mode (runtime config, default):
make bench_random_mixed_hakmem
PGO mode (compile-time config, dead code elimination):
make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" bench_random_mixed_hakmem
BOX PATTERN COMPLIANCE:
✅ Single Responsibility: Configuration management ONLY
✅ Clear Contract: Dual-mode (PGO = constants, Normal = runtime)
✅ Observable: Config report function (debug builds)
✅ Safe: Backward compatible (default is normal mode)
✅ Testable: Easy A/B comparison (PGO vs normal builds)
WHY +2.7-4.9% (below +5-8% target)?
- Limited scope: Only 2 call sites for 1 config function replaced
- Lazy init overhead: front_gate_unified_enabled() cached after first call
- Need to expand to more config checks for full benefit
NEXT STEPS:
- Expand config macro usage to other functions (optional)
- OR proceed with PGO re-enablement (Final polish)
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 12:18:37 +09:00
|
|
|
|
// Phase 4-Step3: Use config macro for compile-time optimization
|
2025-11-29 16:17:34 +09:00
|
|
|
|
// Phase 7-Step1: Changed expect hint from 0→1 (unified path is now LIKELY)
|
|
|
|
|
|
if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 1)) {
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "STEP:5 Unified Gate check\n", 26);
|
2025-11-17 05:29:08 +09:00
|
|
|
|
if (size <= tiny_get_max_size()) {
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "STEP:5.1 Inside Unified\n", 24);
|
2025-12-04 12:06:14 +09:00
|
|
|
|
// Tiny Alloc Gate Box: malloc_tiny_fast() の薄いラッパ
|
|
|
|
|
|
// (診断 OFF 時は従来どおりの挙動・コスト)
|
|
|
|
|
|
void* ptr = tiny_alloc_gate_fast(size);
|
2025-11-17 05:29:08 +09:00
|
|
|
|
if (__builtin_expect(ptr != NULL, 1)) {
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "RET:TinyFast\n", 13);
|
2025-11-17 05:29:08 +09:00
|
|
|
|
return ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
// Unified Cache miss → fallback to normal path (hak_alloc_at)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-12-02 16:16:51 +09:00
|
|
|
|
if (wcfg->step_trace && size == 33000) write(2, "STEP:6 All checks passed\n", 25);
|
2025-11-17 05:29:08 +09:00
|
|
|
|
|
2025-11-13 13:32:58 +09:00
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure
## Major Additions
### 1. Box I: Integrity Verification System (NEW - 703 lines)
- Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines)
- Purpose: Unified integrity checking across all HAKMEM subsystems
- Features:
* 4-level integrity checking (0-4, compile-time controlled)
* Priority 1: TLS array bounds validation
* Priority 2: Freelist pointer validation
* Priority 3: TLS canary monitoring
* Priority ALPHA: Slab metadata invariant checking (5 invariants)
* Atomic statistics tracking (thread-safe)
* Beautiful BOX_BOUNDARY design pattern
### 2. Box E: SuperSlab Expansion System (COMPLETE)
- Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c
- Purpose: Safe SuperSlab expansion with TLS state guarantee
- Features:
* Immediate slab 0 binding after expansion
* TLS state snapshot and restoration
* Design by Contract (pre/post-conditions, invariants)
* Thread-safe with mutex protection
### 3. Comprehensive Integrity Checking System
- File: core/hakmem_tiny_integrity.h (NEW)
- Unified validation functions for all allocator subsystems
- Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe)
- Pointer range validation (null-page, kernel-space)
### 4. P0 Bug Investigation - Root Cause Identified
**Bug**: SEGV at iteration 28440 (deterministic with seed 42)
**Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning)
**Location**: TLS SLL (Single-Linked List) cache layer
**Root Cause**: Race condition or use-after-free in TLS list management (class 0)
**Detection**: Box I successfully caught invalid pointer at exact crash point
### 5. Defensive Improvements
- Defensive memset in SuperSlab allocation (all metadata arrays)
- Enhanced pointer validation with pattern detection
- BOX_BOUNDARY markers throughout codebase (beautiful modular design)
- 5 metadata invariant checks in allocation/free/refill paths
## Integration Points
- Modified 13 files with Box I/E integration
- Added 10+ BOX_BOUNDARY markers
- 5 critical integrity check points in P0 refill path
## Test Results (100K iterations)
- Baseline: 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition)
- Root cause: Identified but not yet fixed (requires deeper investigation)
## Performance
- Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0)
- Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4)
- Beautiful modular design maintains clean separation of concerns
## Known Issues
- P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0)
- Cause: Use-after-free or race in remote free draining
- Next step: Valgrind investigation to pinpoint exact corruption location
## Code Quality
- Total new code: ~1400 lines (Box I + Box E + integrity system)
- Design: Beautiful Box Theory with clear boundaries
- Modularity: Complete separation of concerns
- Documentation: Comprehensive inline comments and BOX_BOUNDARY markers
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
|
|
|
|
if (count > 14250 && count < 14280 && size <= 1024) {
|
|
|
|
|
|
fprintf(stderr, "[MALLOC_WRAPPER] count=%lu calling hak_alloc_at\n", count);
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
}
|
2025-11-13 13:32:58 +09:00
|
|
|
|
#endif
|
2025-11-07 00:37:33 +09:00
|
|
|
|
void* ptr = hak_alloc_at(size, HAK_CALLSITE());
|
2025-11-13 13:32:58 +09:00
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure
## Major Additions
### 1. Box I: Integrity Verification System (NEW - 703 lines)
- Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines)
- Purpose: Unified integrity checking across all HAKMEM subsystems
- Features:
* 4-level integrity checking (0-4, compile-time controlled)
* Priority 1: TLS array bounds validation
* Priority 2: Freelist pointer validation
* Priority 3: TLS canary monitoring
* Priority ALPHA: Slab metadata invariant checking (5 invariants)
* Atomic statistics tracking (thread-safe)
* Beautiful BOX_BOUNDARY design pattern
### 2. Box E: SuperSlab Expansion System (COMPLETE)
- Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c
- Purpose: Safe SuperSlab expansion with TLS state guarantee
- Features:
* Immediate slab 0 binding after expansion
* TLS state snapshot and restoration
* Design by Contract (pre/post-conditions, invariants)
* Thread-safe with mutex protection
### 3. Comprehensive Integrity Checking System
- File: core/hakmem_tiny_integrity.h (NEW)
- Unified validation functions for all allocator subsystems
- Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe)
- Pointer range validation (null-page, kernel-space)
### 4. P0 Bug Investigation - Root Cause Identified
**Bug**: SEGV at iteration 28440 (deterministic with seed 42)
**Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning)
**Location**: TLS SLL (Single-Linked List) cache layer
**Root Cause**: Race condition or use-after-free in TLS list management (class 0)
**Detection**: Box I successfully caught invalid pointer at exact crash point
### 5. Defensive Improvements
- Defensive memset in SuperSlab allocation (all metadata arrays)
- Enhanced pointer validation with pattern detection
- BOX_BOUNDARY markers throughout codebase (beautiful modular design)
- 5 metadata invariant checks in allocation/free/refill paths
## Integration Points
- Modified 13 files with Box I/E integration
- Added 10+ BOX_BOUNDARY markers
- 5 critical integrity check points in P0 refill path
## Test Results (100K iterations)
- Baseline: 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition)
- Root cause: Identified but not yet fixed (requires deeper investigation)
## Performance
- Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0)
- Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4)
- Beautiful modular design maintains clean separation of concerns
## Known Issues
- P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0)
- Cause: Use-after-free or race in remote free draining
- Next step: Valgrind investigation to pinpoint exact corruption location
## Code Quality
- Total new code: ~1400 lines (Box I + Box E + integrity system)
- Design: Beautiful Box Theory with clear boundaries
- Modularity: Complete separation of concerns
- Documentation: Comprehensive inline comments and BOX_BOUNDARY markers
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
|
|
|
|
if (count > 14250 && count < 14280 && size <= 1024) {
|
|
|
|
|
|
fprintf(stderr, "[MALLOC_WRAPPER] count=%lu hak_alloc_at returned %p\n", count, ptr);
|
|
|
|
|
|
fflush(stderr);
|
|
|
|
|
|
}
|
2025-11-13 13:32:58 +09:00
|
|
|
|
#endif
|
2025-11-07 00:37:33 +09:00
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
return ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void free(void* ptr) {
|
2025-12-04 19:20:44 +09:00
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
// Debug-only trace counters; disabled in release to keep free() hot path
|
|
|
|
|
|
// free of atomic increments.
|
2025-12-03 20:42:28 +09:00
|
|
|
|
static _Atomic int g_wrap_free_trace_count = 0;
|
|
|
|
|
|
if (atomic_fetch_add_explicit(&g_wrap_free_trace_count, 1, memory_order_relaxed) < 256) {
|
|
|
|
|
|
HAK_TRACE("[wrap_free_enter]\n");
|
|
|
|
|
|
}
|
2025-11-07 00:37:33 +09:00
|
|
|
|
atomic_fetch_add_explicit(&g_free_wrapper_calls, 1, memory_order_relaxed);
|
2025-12-04 19:20:44 +09:00
|
|
|
|
#endif
|
2025-11-07 00:37:33 +09:00
|
|
|
|
if (!ptr) return;
|
Phase 20-2: BenchFast mode - Structural bottleneck analysis (+4.5% ceiling)
## Summary
Implemented BenchFast mode to measure HAKMEM's structural performance ceiling
by removing ALL safety costs. Result: +4.5% improvement reveals safety mechanisms
are NOT the bottleneck - 95% of the performance gap is structural.
## Critical Discovery: Safety Costs ≠ Bottleneck
**BenchFast Performance** (500K iterations, 256B fixed-size):
- Baseline (normal): 54.4M ops/s (53.3% of System malloc)
- BenchFast (no safety): 56.9M ops/s (55.7% of System malloc) **+4.5%**
- System malloc: 102.1M ops/s (100%)
**Key Finding**: Removing classify_ptr, Pool/Mid routing, registry, mincore,
and ExternalGuard yields only +4.5% improvement. This proves these safety
mechanisms account for <5% of total overhead.
**Real Bottleneck** (estimated 75% of overhead):
- SuperSlab metadata access (~35% CPU)
- TLS SLL pointer chasing (~25% CPU)
- Refill + carving logic (~15% CPU)
## Implementation Details
**BenchFast Bypass Strategy**:
- Alloc: size → class_idx → TLS SLL pop → write header (6-8 instructions)
- Free: read header → BASE pointer → TLS SLL push (3-5 instructions)
- Bypasses: classify_ptr, Pool/Mid routing, registry, mincore, refill
**Recursion Fix** (User's "C案" - Prealloc Pool):
1. bench_fast_init() pre-allocates 50K blocks per class using normal path
2. bench_fast_init_in_progress guard prevents BenchFast during init
3. bench_fast_alloc() pop-only (NO REFILL) during benchmark
**Files**:
- core/box/bench_fast_box.{h,c}: Ultra-minimal alloc/free + prealloc pool
- core/box/hak_wrappers.inc.h: malloc wrapper with init guard check
- Makefile: bench_fast_box.o integration
- CURRENT_TASK.md: Phase 20-2 results documentation
**Activation**:
export HAKMEM_BENCH_FAST_MODE=1
./bench_fixed_size_hakmem 500000 256 128
## Implications for Future Work
**Incremental Optimization Ceiling Confirmed**:
- Phase 9-11 lesson reinforced: symptom relief ≠ root cause fix
- Safety costs: 4.5% (removable via BenchFast)
- Structural bottleneck: 95.5% (requires Phase 12 redesign)
**Phase 12 Shared SuperSlab Pool Priority**:
- 877 SuperSlab → 100-200 (reduce metadata footprint)
- Dynamic slab sharing (mimalloc-style)
- Expected: 70-90M ops/s (70-90% of System malloc)
**Bottleneck Breakdown**:
| Component | CPU Time | BenchFast Removed? |
|------------------------|----------|-------------------|
| SuperSlab metadata | ~35% | ❌ Structural |
| TLS SLL pointer chase | ~25% | ❌ Structural |
| Refill + carving | ~15% | ❌ Structural |
| classify_ptr/registry | ~10% | ✅ Removed |
| Pool/Mid routing | ~5% | ✅ Removed |
| mincore/guards | ~5% | ✅ Removed |
**Conclusion**: Structural bottleneck (75%) >> Safety costs (20%)
## Phase 20 Complete
- Phase 20-1: SS-HotPrewarm (+3.3% from cache warming)
- Phase 20-2: BenchFast mode (proved safety costs = 4.5%)
- **Total Phase 20 improvement**: +7.8% (Phase 19 baseline → BenchFast)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-16 06:36:02 +09:00
|
|
|
|
|
|
|
|
|
|
// Phase 20-2: BenchFast mode (structural ceiling measurement)
|
|
|
|
|
|
// WARNING: Bypasses ALL safety checks - benchmark only!
|
|
|
|
|
|
if (__builtin_expect(bench_fast_enabled(), 0)) {
|
|
|
|
|
|
// Trust header magic to identify Tiny allocations
|
2025-12-03 12:11:27 +09:00
|
|
|
|
#if HAKMEM_TINY_HEADER_CLASSIDX
|
Phase 20-2: BenchFast mode - Structural bottleneck analysis (+4.5% ceiling)
## Summary
Implemented BenchFast mode to measure HAKMEM's structural performance ceiling
by removing ALL safety costs. Result: +4.5% improvement reveals safety mechanisms
are NOT the bottleneck - 95% of the performance gap is structural.
## Critical Discovery: Safety Costs ≠ Bottleneck
**BenchFast Performance** (500K iterations, 256B fixed-size):
- Baseline (normal): 54.4M ops/s (53.3% of System malloc)
- BenchFast (no safety): 56.9M ops/s (55.7% of System malloc) **+4.5%**
- System malloc: 102.1M ops/s (100%)
**Key Finding**: Removing classify_ptr, Pool/Mid routing, registry, mincore,
and ExternalGuard yields only +4.5% improvement. This proves these safety
mechanisms account for <5% of total overhead.
**Real Bottleneck** (estimated 75% of overhead):
- SuperSlab metadata access (~35% CPU)
- TLS SLL pointer chasing (~25% CPU)
- Refill + carving logic (~15% CPU)
## Implementation Details
**BenchFast Bypass Strategy**:
- Alloc: size → class_idx → TLS SLL pop → write header (6-8 instructions)
- Free: read header → BASE pointer → TLS SLL push (3-5 instructions)
- Bypasses: classify_ptr, Pool/Mid routing, registry, mincore, refill
**Recursion Fix** (User's "C案" - Prealloc Pool):
1. bench_fast_init() pre-allocates 50K blocks per class using normal path
2. bench_fast_init_in_progress guard prevents BenchFast during init
3. bench_fast_alloc() pop-only (NO REFILL) during benchmark
**Files**:
- core/box/bench_fast_box.{h,c}: Ultra-minimal alloc/free + prealloc pool
- core/box/hak_wrappers.inc.h: malloc wrapper with init guard check
- Makefile: bench_fast_box.o integration
- CURRENT_TASK.md: Phase 20-2 results documentation
**Activation**:
export HAKMEM_BENCH_FAST_MODE=1
./bench_fixed_size_hakmem 500000 256 128
## Implications for Future Work
**Incremental Optimization Ceiling Confirmed**:
- Phase 9-11 lesson reinforced: symptom relief ≠ root cause fix
- Safety costs: 4.5% (removable via BenchFast)
- Structural bottleneck: 95.5% (requires Phase 12 redesign)
**Phase 12 Shared SuperSlab Pool Priority**:
- 877 SuperSlab → 100-200 (reduce metadata footprint)
- Dynamic slab sharing (mimalloc-style)
- Expected: 70-90M ops/s (70-90% of System malloc)
**Bottleneck Breakdown**:
| Component | CPU Time | BenchFast Removed? |
|------------------------|----------|-------------------|
| SuperSlab metadata | ~35% | ❌ Structural |
| TLS SLL pointer chase | ~25% | ❌ Structural |
| Refill + carving | ~15% | ❌ Structural |
| classify_ptr/registry | ~10% | ✅ Removed |
| Pool/Mid routing | ~5% | ✅ Removed |
| mincore/guards | ~5% | ✅ Removed |
**Conclusion**: Structural bottleneck (75%) >> Safety costs (20%)
## Phase 20 Complete
- Phase 20-1: SS-HotPrewarm (+3.3% from cache warming)
- Phase 20-2: BenchFast mode (proved safety costs = 4.5%)
- **Total Phase 20 improvement**: +7.8% (Phase 19 baseline → BenchFast)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-16 06:36:02 +09:00
|
|
|
|
uint8_t header = *((uint8_t*)ptr - 1);
|
|
|
|
|
|
if ((header & 0xf0) == 0xa0) { // Tiny header magic (0xa0-0xa7)
|
|
|
|
|
|
bench_fast_free(ptr);
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
// Fallback to normal path for non-Tiny or no-header mode
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-03 09:15:59 +09:00
|
|
|
|
const wrapper_env_cfg_t* wcfg = wrapper_env_cfg();
|
|
|
|
|
|
|
2025-11-17 05:29:08 +09:00
|
|
|
|
// Phase 26: Front Gate Unification (Tiny free fast path)
|
|
|
|
|
|
// Placed AFTER BenchFast check, BEFORE expensive classify_ptr()
|
|
|
|
|
|
// Bypasses: hak_free_at routing + wrapper overhead + classification
|
|
|
|
|
|
// Target: +10-15% performance (pairs with malloc_tiny_fast)
|
|
|
|
|
|
// ENV: HAKMEM_FRONT_GATE_UNIFIED=1 to enable (default: OFF)
|
Phase 4-Step3: Add Front Config Box (+2.7-4.9% dead code elimination)
Implement compile-time configuration system for dead code elimination in Tiny
allocation hot paths. The Config Box provides dual-mode configuration:
- Normal mode: Runtime ENV checks (backward compatible, flexible)
- PGO mode: Compile-time constants (dead code elimination, performance)
PERFORMANCE:
- Baseline (runtime config): 50.32 M ops/s (avg of 5 runs)
- Config Box (PGO mode): 52.77 M ops/s (avg of 5 runs)
- Improvement: +2.45 M ops/s (+4.87% with outlier, +2.72% without)
- Target: +5-8% (partially achieved)
IMPLEMENTATION:
1. core/box/tiny_front_config_box.h (NEW):
- Defines TINY_FRONT_*_ENABLED macros for all config checks
- PGO mode (#if HAKMEM_TINY_FRONT_PGO): Macros expand to constants (0/1)
- Normal mode (#else): Macros expand to function calls
- Functions remain in their original locations (no code duplication)
2. core/hakmem_build_flags.h:
- Added HAKMEM_TINY_FRONT_PGO build flag (default: 0, off)
- Documentation: Usage with make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1"
3. core/box/hak_wrappers.inc.h:
- Replaced front_gate_unified_enabled() with TINY_FRONT_UNIFIED_GATE_ENABLED
- 2 call sites updated (malloc and free fast paths)
- Added config box include
EXPECTED DEAD CODE ELIMINATION (PGO mode):
if (TINY_FRONT_UNIFIED_GATE_ENABLED) { ... }
→ if (1) { ... } // Constant, always true
→ Compiler optimizes away the branch, keeps body
SCOPE:
Currently only front_gate_unified_enabled() is replaced (2 call sites).
To achieve full +5-8% target, expand to other config checks:
- ultra_slim_mode_enabled()
- tiny_heap_v2_enabled()
- sfc_cascade_enabled()
- tiny_fastcache_enabled()
- tiny_metrics_enabled()
- tiny_diag_enabled()
BUILD USAGE:
Normal mode (runtime config, default):
make bench_random_mixed_hakmem
PGO mode (compile-time config, dead code elimination):
make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" bench_random_mixed_hakmem
BOX PATTERN COMPLIANCE:
✅ Single Responsibility: Configuration management ONLY
✅ Clear Contract: Dual-mode (PGO = constants, Normal = runtime)
✅ Observable: Config report function (debug builds)
✅ Safe: Backward compatible (default is normal mode)
✅ Testable: Easy A/B comparison (PGO vs normal builds)
WHY +2.7-4.9% (below +5-8% target)?
- Limited scope: Only 2 call sites for 1 config function replaced
- Lazy init overhead: front_gate_unified_enabled() cached after first call
- Need to expand to more config checks for full benefit
NEXT STEPS:
- Expand config macro usage to other functions (optional)
- OR proceed with PGO re-enablement (Final polish)
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 12:18:37 +09:00
|
|
|
|
// Phase 4-Step3: Use config macro for compile-time optimization
|
2025-11-29 16:17:34 +09:00
|
|
|
|
// Phase 7-Step1: Changed expect hint from 0→1 (unified path is now LIKELY)
|
|
|
|
|
|
if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 1)) {
|
2025-11-17 05:29:08 +09:00
|
|
|
|
int freed = free_tiny_fast(ptr);
|
|
|
|
|
|
if (__builtin_expect(freed, 1)) {
|
|
|
|
|
|
return; // Success (pushed to Unified Cache)
|
|
|
|
|
|
}
|
|
|
|
|
|
// Unified Cache full OR invalid header → fallback to normal path
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-11 00:02:24 +09:00
|
|
|
|
do { static int on=-1; if (on==-1){ const char* e=getenv("HAKMEM_FREE_WRAP_TRACE"); on=(e&&*e&&*e!='0')?1:0;} if(on){ fprintf(stderr,"[WRAP_FREE_ENTER] ptr=%p depth=%d init=%d\n", ptr, g_hakmem_lock_depth, g_initializing); } } while(0);
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
// Debug safety: guard obviously invalid tiny integers to avoid libc crash and collect trace
|
|
|
|
|
|
if ((uintptr_t)ptr < 4096) {
|
|
|
|
|
|
ptr_trace_dump_now("wrap_small_ptr");
|
|
|
|
|
|
fprintf(stderr, "[FREE_SMALL_PTR] ignore ptr=%p (likely header-corruption sentinel)\n", ptr);
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
// Classify pointer BEFORE early libc fallbacks to avoid misrouting Tiny pointers
|
|
|
|
|
|
// This is safe: classifier uses header probe and registry; does not allocate.
|
|
|
|
|
|
int is_hakmem_owned = 0;
|
|
|
|
|
|
{
|
|
|
|
|
|
ptr_classification_t c = classify_ptr(ptr);
|
|
|
|
|
|
switch (c.kind) {
|
|
|
|
|
|
case PTR_KIND_TINY_HEADER:
|
|
|
|
|
|
case PTR_KIND_TINY_HEADERLESS:
|
|
|
|
|
|
case PTR_KIND_POOL_TLS:
|
Phase 23 Unified Cache + PageFaultTelemetry generalization: Mid/VM page-fault bottleneck identified
Summary:
- Phase 23 Unified Cache: +30% improvement (Random Mixed 256B: 18.18M → 23.68M ops/s)
- PageFaultTelemetry: Extended to generic buckets (C0-C7, MID, L25, SSM)
- Measurement-driven decision: Mid/VM page-faults (80-100K) >> Tiny (6K) → prioritize Mid/VM optimization
Phase 23 Changes:
1. Unified Cache implementation (core/front/tiny_unified_cache.{c,h})
- Direct SuperSlab carve (TLS SLL bypass)
- Self-contained pop-or-refill pattern
- ENV: HAKMEM_TINY_UNIFIED_CACHE=1, HAKMEM_TINY_UNIFIED_C{0-7}=128
2. Fast path pruning (tiny_alloc_fast.inc.h, tiny_free_fast_v2.inc.h)
- Unified ON → direct cache access (skip all intermediate layers)
- Alloc: unified_cache_pop_or_refill() → immediate fail to slow
- Free: unified_cache_push() → fallback to SLL only if full
PageFaultTelemetry Changes:
3. Generic bucket architecture (core/box/pagefault_telemetry_box.{c,h})
- PF_BUCKET_{C0-C7, MID, L25, SSM} for domain-specific measurement
- Integration: hak_pool_try_alloc(), l25_alloc_new_run(), shared_pool_allocate_superslab_unlocked()
4. Measurement results (Random Mixed 500K / 256B):
- Tiny C2-C7: 2-33 pages, high reuse (64-3.8 touches/page)
- SSM: 512 pages (initialization footprint)
- MID/L25: 0 (unused in this workload)
- Mid/Large VM benchmarks: 80-100K page-faults (13-16x higher than Tiny)
Ring Cache Enhancements:
5. Hot Ring Cache (core/front/tiny_ring_cache.{c,h})
- ENV: HAKMEM_TINY_HOT_RING_ENABLE=1, HAKMEM_TINY_HOT_RING_C{0-7}=size
- Conditional compilation cleanup
Documentation:
6. Analysis reports
- RANDOM_MIXED_BOTTLENECK_ANALYSIS.md: Page-fault breakdown
- RANDOM_MIXED_SUMMARY.md: Phase 23 summary
- RING_CACHE_ACTIVATION_GUIDE.md: Ring cache usage
- CURRENT_TASK.md: Updated with Phase 23 results and Phase 24 plan
Next Steps (Phase 24):
- Target: Mid/VM PageArena/HotSpanBox (page-fault reduction 80-100K → 30-40K)
- Tiny SSM optimization deferred (low ROI, ~6K page-faults already optimal)
- Expected improvement: +30-50% for Mid/Large workloads
Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 02:47:58 +09:00
|
|
|
|
case PTR_KIND_MID_LARGE: // FIX: Include Mid-Large (mmap/ACE) pointers
|
2025-11-11 00:02:24 +09:00
|
|
|
|
is_hakmem_owned = 1; break;
|
|
|
|
|
|
default: break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-12-01 22:06:10 +09:00
|
|
|
|
if (!is_hakmem_owned) {
|
|
|
|
|
|
// Failsafe: Mid registry lookup catches headerless/corrupted Mid allocations
|
|
|
|
|
|
if (hak_pool_mid_lookup(ptr, NULL)) {
|
|
|
|
|
|
is_hakmem_owned = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-11 00:02:24 +09:00
|
|
|
|
|
|
|
|
|
|
if (is_hakmem_owned) {
|
|
|
|
|
|
// Route to hak_free_at even if lock_depth>0(ログ抑制のためptr_traceのみ使用)
|
|
|
|
|
|
g_hakmem_lock_depth++;
|
|
|
|
|
|
hak_free_at(ptr, 0, HAK_CALLSITE());
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
2025-11-11 01:47:06 +09:00
|
|
|
|
// Front Gate libc bypass detection (quiet in release)
|
2025-11-11 00:02:24 +09:00
|
|
|
|
static _Atomic uint64_t fg_libc_bypass_count = 0;
|
2025-11-11 01:47:06 +09:00
|
|
|
|
|
|
|
|
|
|
if (g_hakmem_lock_depth > 0) {
|
|
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-11 00:02:24 +09:00
|
|
|
|
uint64_t count = atomic_fetch_add_explicit(&fg_libc_bypass_count, 1, memory_order_relaxed);
|
2025-11-11 01:47:06 +09:00
|
|
|
|
if (count < 10) {
|
2025-11-11 00:02:24 +09:00
|
|
|
|
fprintf(stderr, "[FG_LIBC_BYPASS] lockdepth=%d count=%llu ptr=%p\n", g_hakmem_lock_depth, (unsigned long long)count, ptr);
|
|
|
|
|
|
}
|
2025-11-11 01:47:06 +09:00
|
|
|
|
#else
|
|
|
|
|
|
(void)fg_libc_bypass_count;
|
|
|
|
|
|
#endif
|
2025-11-11 10:00:36 +09:00
|
|
|
|
// Safety: If this is a HAKMEM-owned header allocation, free raw correctly
|
|
|
|
|
|
do {
|
|
|
|
|
|
void* raw = (char*)ptr - HEADER_SIZE;
|
|
|
|
|
|
int safe_same_page = (((uintptr_t)ptr & 0xFFFu) >= HEADER_SIZE);
|
|
|
|
|
|
if (!safe_same_page) {
|
|
|
|
|
|
if (!hak_is_memory_readable(raw)) break;
|
|
|
|
|
|
}
|
|
|
|
|
|
AllocHeader* hdr = (AllocHeader*)raw;
|
|
|
|
|
|
if (hdr->magic == HAKMEM_MAGIC) {
|
|
|
|
|
|
// Dispatch based on allocation method
|
|
|
|
|
|
if (hdr->method == ALLOC_METHOD_MALLOC) {
|
|
|
|
|
|
extern void __libc_free(void*);
|
|
|
|
|
|
ptr_trace_dump_now("wrap_libc_lockdepth_hak_hdr_malloc");
|
|
|
|
|
|
__libc_free(raw);
|
|
|
|
|
|
return;
|
|
|
|
|
|
} else if (hdr->method == ALLOC_METHOD_MMAP) {
|
|
|
|
|
|
ptr_trace_dump_now("wrap_libc_lockdepth_hak_hdr_mmap");
|
|
|
|
|
|
hkm_sys_munmap(raw, hdr->size);
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
} while (0);
|
|
|
|
|
|
// Unknown pointer or non-HAKMEM: fall back to libc free(ptr)
|
2025-11-11 01:47:06 +09:00
|
|
|
|
extern void __libc_free(void*);
|
|
|
|
|
|
ptr_trace_dump_now("wrap_libc_lockdepth");
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_LOCKDEPTH, "[wrap] libc free: lockdepth\n");
|
2025-11-11 01:47:06 +09:00
|
|
|
|
__libc_free(ptr);
|
|
|
|
|
|
return;
|
2025-11-11 00:02:24 +09:00
|
|
|
|
}
|
2025-12-02 16:44:27 +09:00
|
|
|
|
int free_init_wait = hak_init_wait_for_ready();
|
|
|
|
|
|
if (__builtin_expect(free_init_wait <= 0, 0)) {
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_INIT_WAIT_FAIL, "[wrap] libc free: init_wait\n");
|
2025-11-11 01:47:06 +09:00
|
|
|
|
#if !HAKMEM_BUILD_RELEASE
|
2025-11-11 00:02:24 +09:00
|
|
|
|
uint64_t count = atomic_fetch_add_explicit(&fg_libc_bypass_count, 1, memory_order_relaxed);
|
2025-11-11 01:47:06 +09:00
|
|
|
|
if (count < 10) {
|
2025-11-11 00:02:24 +09:00
|
|
|
|
fprintf(stderr, "[FG_LIBC_BYPASS] init=%d count=%llu ptr=%p\n", g_initializing, (unsigned long long)count, ptr);
|
|
|
|
|
|
}
|
2025-11-11 01:47:06 +09:00
|
|
|
|
#endif
|
|
|
|
|
|
extern void __libc_free(void*);
|
|
|
|
|
|
ptr_trace_dump_now("wrap_libc_init");
|
|
|
|
|
|
__libc_free(ptr);
|
|
|
|
|
|
return;
|
2025-11-11 00:02:24 +09:00
|
|
|
|
}
|
|
|
|
|
|
if (__builtin_expect(hak_force_libc_alloc(), 0)) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_force"); __libc_free(ptr); return; }
|
2025-11-07 00:37:33 +09:00
|
|
|
|
if (hak_ld_env_mode()) {
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
// BUG FIX: g_jemalloc_loaded == -1 (unknown) should not trigger fallback
|
|
|
|
|
|
if (hak_ld_block_jemalloc() && g_jemalloc_loaded > 0) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_ld_jemalloc"); __libc_free(ptr); return; }
|
2025-11-07 00:37:33 +09:00
|
|
|
|
if (!g_initialized) { hak_init(); }
|
2025-12-02 16:44:27 +09:00
|
|
|
|
int free_ld_wait = hak_init_wait_for_ready();
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
if (__builtin_expect(free_ld_wait <= 0, 0)) { wrapper_record_fallback(FB_INIT_LD_WAIT_FAIL, "[wrap] libc free: ld init_wait\n"); extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_ld_init"); __libc_free(ptr); return; }
|
2025-11-07 00:37:33 +09:00
|
|
|
|
}
|
2025-11-16 00:38:29 +09:00
|
|
|
|
|
|
|
|
|
|
// Phase 15: Box Separation - Domain check to distinguish hakmem vs external pointers
|
|
|
|
|
|
// CRITICAL: Prevent BenchMeta (slots[]) from entering CoreAlloc (hak_free_at)
|
|
|
|
|
|
// Strategy: Check 1-byte header at ptr-1 for HEADER_MAGIC (0xa0/0xb0)
|
|
|
|
|
|
// - If hakmem Tiny allocation → route to hak_free_at()
|
|
|
|
|
|
// - Otherwise → delegate to __libc_free() (external/BenchMeta)
|
|
|
|
|
|
//
|
|
|
|
|
|
// Safety: Only check header if ptr is NOT page-aligned (ptr-1 is safe to read)
|
|
|
|
|
|
uintptr_t offset_in_page = (uintptr_t)ptr & 0xFFF;
|
|
|
|
|
|
if (offset_in_page > 0) {
|
|
|
|
|
|
// Not page-aligned, safe to check ptr-1
|
|
|
|
|
|
uint8_t header = *((uint8_t*)ptr - 1);
|
2025-12-01 22:06:10 +09:00
|
|
|
|
if ((header & 0xF0) == 0xA0) {
|
|
|
|
|
|
// Tiny header byte → require Superslab to avoid誤分類
|
|
|
|
|
|
SuperSlab* ss = hak_super_lookup(ptr);
|
|
|
|
|
|
if (ss && ss->magic == SUPERSLAB_MAGIC) {
|
|
|
|
|
|
g_hakmem_lock_depth++;
|
|
|
|
|
|
hak_free_at(ptr, 0, HAK_CALLSITE());
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
// Superslab未登録 → hakmem管理外。libc free にも渡さず無視(ワークセットのゴミ対策)。
|
|
|
|
|
|
return;
|
|
|
|
|
|
} else if ((header & 0xF0) == 0xB0) {
|
|
|
|
|
|
// Pool TLS header (if enabled) — no registry check needed
|
|
|
|
|
|
#ifdef HAKMEM_POOL_TLS_PHASE1
|
2025-11-16 00:38:29 +09:00
|
|
|
|
g_hakmem_lock_depth++;
|
|
|
|
|
|
hak_free_at(ptr, 0, HAK_CALLSITE());
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
return;
|
2025-12-01 22:06:10 +09:00
|
|
|
|
#endif
|
2025-11-16 00:38:29 +09:00
|
|
|
|
}
|
2025-12-01 22:06:10 +09:00
|
|
|
|
// No valid hakmem header → external pointer (BenchMeta, libc allocation, etc.)
|
2025-12-03 09:15:59 +09:00
|
|
|
|
if (__builtin_expect(wcfg->wrap_diag, 0)) {
|
|
|
|
|
|
SuperSlab* ss = hak_super_lookup(ptr);
|
|
|
|
|
|
int slab_idx = -1;
|
|
|
|
|
|
int meta_cls = -1;
|
|
|
|
|
|
int alloc_method = -1;
|
|
|
|
|
|
if (__builtin_expect(ss && ss->magic == SUPERSLAB_MAGIC, 0)) {
|
|
|
|
|
|
slab_idx = slab_index_for(ss, (void*)((uint8_t*)ptr - 1));
|
|
|
|
|
|
if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
|
|
|
|
|
|
meta_cls = ss->slabs[slab_idx].class_idx;
|
|
|
|
|
|
}
|
|
|
|
|
|
} else if (offset_in_page >= HEADER_SIZE) {
|
|
|
|
|
|
AllocHeader* ah = hak_header_from_user(ptr);
|
|
|
|
|
|
if (hak_header_validate(ah)) {
|
|
|
|
|
|
alloc_method = ah->method;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
fprintf(stderr,
|
|
|
|
|
|
"[WRAP_FREE_NOT_OWNED] ptr=%p hdr=0x%02x off=0x%lx lockdepth=%d init=%d ss=%p slab=%d meta_cls=%d alloc_method=%d\n",
|
|
|
|
|
|
ptr,
|
|
|
|
|
|
header,
|
|
|
|
|
|
(unsigned long)offset_in_page,
|
|
|
|
|
|
g_hakmem_lock_depth,
|
|
|
|
|
|
g_initializing,
|
|
|
|
|
|
(void*)ss,
|
|
|
|
|
|
slab_idx,
|
|
|
|
|
|
meta_cls,
|
|
|
|
|
|
alloc_method);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Self-heal: if this looks like a SuperSlab (magic matches) but registry lookup failed,
|
|
|
|
|
|
// re-register on the fly and route to hakmem free to avoid libc abort.
|
|
|
|
|
|
{
|
|
|
|
|
|
SuperSlab* ss_guess = (SuperSlab*)((uintptr_t)ptr & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
|
|
|
|
|
|
long page_sz = sysconf(_SC_PAGESIZE);
|
|
|
|
|
|
unsigned char mincore_vec = 0;
|
|
|
|
|
|
int mapped = (page_sz > 0) &&
|
|
|
|
|
|
(mincore((void*)((uintptr_t)ss_guess & ~(uintptr_t)(page_sz - 1)),
|
|
|
|
|
|
(size_t)page_sz,
|
|
|
|
|
|
&mincore_vec) == 0);
|
|
|
|
|
|
if (mapped && ss_guess->magic == SUPERSLAB_MAGIC) {
|
|
|
|
|
|
hak_super_register((uintptr_t)ss_guess, ss_guess); // idempotent if already registered
|
|
|
|
|
|
g_hakmem_lock_depth++;
|
|
|
|
|
|
hak_free_at(ptr, 0, HAK_CALLSITE());
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-11-16 00:38:29 +09:00
|
|
|
|
extern void __libc_free(void*);
|
|
|
|
|
|
ptr_trace_dump_now("wrap_libc_external_nomag");
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_NOT_OWNED, "[wrap] libc free: not_owned\n");
|
2025-11-16 00:38:29 +09:00
|
|
|
|
__libc_free(ptr);
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Page-aligned pointer → cannot safely check header, use full classification
|
|
|
|
|
|
// (This includes Pool/Mid/L25 allocations which may be page-aligned)
|
2025-11-07 00:37:33 +09:00
|
|
|
|
g_hakmem_lock_depth++;
|
|
|
|
|
|
hak_free_at(ptr, 0, HAK_CALLSITE());
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void* calloc(size_t nmemb, size_t size) {
|
2025-12-03 20:42:28 +09:00
|
|
|
|
static _Atomic int g_wrap_calloc_trace_count = 0;
|
|
|
|
|
|
if (atomic_fetch_add_explicit(&g_wrap_calloc_trace_count, 1, memory_order_relaxed) < 128) {
|
|
|
|
|
|
HAK_TRACE("[wrap_calloc_enter]\n");
|
|
|
|
|
|
}
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
// CRITICAL FIX (BUG #8): Increment lock depth FIRST, before ANY libc calls
|
|
|
|
|
|
g_hakmem_lock_depth++;
|
|
|
|
|
|
|
|
|
|
|
|
// Early check for recursion (lock depth already incremented by outer call)
|
|
|
|
|
|
if (g_hakmem_lock_depth > 1) {
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
extern void* __libc_calloc(size_t, size_t);
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_LOCKDEPTH, "[wrap] libc calloc: lockdepth\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
return __libc_calloc(nmemb, size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-12-02 16:44:27 +09:00
|
|
|
|
int calloc_init_wait = hak_init_wait_for_ready();
|
|
|
|
|
|
if (__builtin_expect(calloc_init_wait <= 0, 0)) {
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
extern void* __libc_calloc(size_t, size_t);
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_INIT_WAIT_FAIL, "[wrap] libc calloc: init_wait\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
return __libc_calloc(nmemb, size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Overflow check
|
|
|
|
|
|
if (size != 0 && nmemb > (SIZE_MAX / size)) {
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
errno = ENOMEM;
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (__builtin_expect(hak_force_libc_alloc(), 0)) {
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
extern void* __libc_calloc(size_t, size_t);
|
|
|
|
|
|
return __libc_calloc(nmemb, size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-11-07 00:37:33 +09:00
|
|
|
|
int ld_mode = hak_ld_env_mode();
|
|
|
|
|
|
if (ld_mode) {
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
// BUG FIX: g_jemalloc_loaded == -1 (unknown) should not trigger fallback
|
|
|
|
|
|
if (hak_ld_block_jemalloc() && g_jemalloc_loaded > 0) {
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
extern void* __libc_calloc(size_t, size_t);
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_JEMALLOC_BLOCK, "[wrap] libc calloc: jemalloc block\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
return __libc_calloc(nmemb, size);
|
|
|
|
|
|
}
|
2025-11-07 00:37:33 +09:00
|
|
|
|
if (!g_initialized) { hak_init(); }
|
2025-12-02 16:44:27 +09:00
|
|
|
|
int calloc_ld_wait = hak_init_wait_for_ready();
|
|
|
|
|
|
if (__builtin_expect(calloc_ld_wait <= 0, 0)) {
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
extern void* __libc_calloc(size_t, size_t);
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
wrapper_record_fallback(FB_INIT_LD_WAIT_FAIL, "[wrap] libc calloc: ld init_wait\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
return __libc_calloc(nmemb, size);
|
|
|
|
|
|
}
|
2025-11-07 01:15:28 +09:00
|
|
|
|
// Reuse cached ld_safe_mode from malloc (same static variable scope won't work, use inline function instead)
|
|
|
|
|
|
// For now, duplicate the caching logic
|
|
|
|
|
|
static _Atomic int ld_safe_mode_calloc = -1;
|
|
|
|
|
|
if (__builtin_expect(ld_safe_mode_calloc < 0, 0)) {
|
|
|
|
|
|
const char* lds = getenv("HAKMEM_LD_SAFE");
|
|
|
|
|
|
ld_safe_mode_calloc = (lds ? atoi(lds) : 1);
|
|
|
|
|
|
}
|
2025-11-07 00:37:33 +09:00
|
|
|
|
size_t total = nmemb * size;
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
if (ld_safe_mode_calloc >= 2 || total > TINY_MAX_SIZE) {
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
extern void* __libc_calloc(size_t, size_t);
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
if (ld_safe_mode_calloc >= 2) wrapper_record_fallback(FB_LD_SAFE, "[wrap] libc calloc: ld_safe\n");
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
return __libc_calloc(nmemb, size);
|
|
|
|
|
|
}
|
2025-11-07 00:37:33 +09:00
|
|
|
|
}
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
|
2025-11-07 00:37:33 +09:00
|
|
|
|
size_t total_size = nmemb * size;
|
|
|
|
|
|
void* ptr = hak_alloc_at(total_size, HAK_CALLSITE());
|
|
|
|
|
|
if (ptr) { memset(ptr, 0, total_size); }
|
|
|
|
|
|
g_hakmem_lock_depth--;
|
|
|
|
|
|
return ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void* realloc(void* ptr, size_t size) {
|
2025-12-03 20:42:28 +09:00
|
|
|
|
static _Atomic int g_wrap_realloc_trace_count = 0;
|
|
|
|
|
|
if (atomic_fetch_add_explicit(&g_wrap_realloc_trace_count, 1, memory_order_relaxed) < 128) {
|
|
|
|
|
|
HAK_TRACE("[wrap_realloc_enter]\n");
|
|
|
|
|
|
}
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
if (g_hakmem_lock_depth > 0) { wrapper_record_fallback(FB_LOCKDEPTH, "[wrap] libc realloc: lockdepth\n"); extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
|
2025-12-02 16:44:27 +09:00
|
|
|
|
int realloc_init_wait = hak_init_wait_for_ready();
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
if (__builtin_expect(realloc_init_wait <= 0, 0)) { wrapper_record_fallback(FB_INIT_WAIT_FAIL, "[wrap] libc realloc: init_wait\n"); extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
|
|
|
|
|
|
if (__builtin_expect(hak_force_libc_alloc(), 0)) { wrapper_record_fallback(FB_FORCE_LIBC, "[wrap] libc realloc: force_libc\n"); extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
|
2025-11-07 00:37:33 +09:00
|
|
|
|
int ld_mode = hak_ld_env_mode();
|
|
|
|
|
|
if (ld_mode) {
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
// BUG FIX: g_jemalloc_loaded == -1 (unknown) should not trigger fallback
|
|
|
|
|
|
if (hak_ld_block_jemalloc() && g_jemalloc_loaded > 0) { wrapper_record_fallback(FB_JEMALLOC_BLOCK, "[wrap] libc realloc: jemalloc block\n"); extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
|
2025-11-07 00:37:33 +09:00
|
|
|
|
if (!g_initialized) { hak_init(); }
|
2025-12-02 16:44:27 +09:00
|
|
|
|
int realloc_ld_wait = hak_init_wait_for_ready();
|
feat(Phase 2-1): Lane Classification + Fallback Reduction
## Phase 2-1: Lane Classification Box (Single Source of Truth)
### New Module: hak_lane_classify.inc.h
- Centralized size-to-lane mapping with unified boundary definitions
- Lane architecture:
- LANE_TINY: [0, 1024B] SuperSlab (unchanged)
- LANE_POOL: [1025, 52KB] Pool per-thread (extended!)
- LANE_ACE: [52KB, 2MB] ACE learning
- LANE_HUGE: [2MB+] mmap direct
- Key invariant: POOL_MIN = TINY_MAX + 1 (no gaps)
### Fixed: Tiny/Pool Boundary Mismatch
- Before: TINY_MAX_SIZE=1024 vs tiny_get_max_size()=2047 (inconsistent!)
- After: Both reference LANE_TINY_MAX=1024 (authoritative)
- Impact: Eliminates 1025-2047B "unmanaged zone" causing libc fragmentation
### Updated Files
- core/hakmem_tiny.h: Use LANE_TINY_MAX, fix sizes[7]=1024 (was 2047)
- core/hakmem_pool.h: Use POOL_MIN_REQUEST_SIZE=1025 (was 2048)
- core/box/hak_alloc_api.inc.h: Lane-based routing (HAK_LANE_IS_*)
## jemalloc Block Bug Fix
### Root Cause
- g_jemalloc_loaded initialized to -1 (unknown)
- Condition `if (block && g_jemalloc_loaded)` treated -1 as true
- Result: ALL allocations fallback to libc (even when jemalloc not loaded!)
### Fix
- Change condition to `g_jemalloc_loaded > 0`
- Only fallback when jemalloc is ACTUALLY loaded
- Applied to: malloc/free/calloc/realloc
### Impact
- Before: 100% libc fallback (jemalloc block false positive)
- After: Only genuine cases fallback (init_wait, lockdepth, etc.)
## Fallback Diagnostics (ChatGPT contribution)
### New Feature: HAKMEM_WRAP_DIAG
- ENV flag to enable fallback logging
- Reason-specific counters (init_wait, jemalloc_block, lockdepth, etc.)
- First 4 occurrences logged per reason
- Helps identify unwanted fallback paths
### Implementation
- core/box/wrapper_env_box.{c,h}: ENV cache + DIAG flag
- core/box/hak_wrappers.inc.h: wrapper_record_fallback() calls
## Verification
### Fallback Reduction
- Before fix: [wrap] libc malloc: jemalloc block (100% fallback)
- After fix: Only init_wait + lockdepth (expected, minimal)
### Known Issue
- Tiny allocator OOM (size=8) still crashes
- This is a pre-existing bug, unrelated to Phase 2-1
- Was hidden by jemalloc block false positive
- Will be investigated separately
## Performance Impact
### sh8bench 8 threads
- Phase 1-1: 15秒
- Phase 2-1: 14秒 (~7% improvement)
### Note
- True hakmem performance now measurable (no more 100% fallback)
- Tiny OOM prevents full benchmark completion
- Next: Fix Tiny allocator for complete evaluation
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <chatgpt@openai.com>
2025-12-02 19:13:28 +09:00
|
|
|
|
if (__builtin_expect(realloc_ld_wait <= 0, 0)) { wrapper_record_fallback(FB_INIT_LD_WAIT_FAIL, "[wrap] libc realloc: ld init_wait\n"); extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); }
|
2025-11-07 00:37:33 +09:00
|
|
|
|
}
|
|
|
|
|
|
if (ptr == NULL) { return malloc(size); }
|
|
|
|
|
|
if (size == 0) { free(ptr); return NULL; }
|
|
|
|
|
|
void* new_ptr = malloc(size);
|
|
|
|
|
|
if (!new_ptr) return NULL;
|
|
|
|
|
|
memcpy(new_ptr, ptr, size);
|
|
|
|
|
|
free(ptr);
|
|
|
|
|
|
return new_ptr;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#endif // HAKMEM_FORCE_LIBC_ALLOC_BUILD
|
|
|
|
|
|
|
|
|
|
|
|
#endif // HAK_WRAPPERS_INC_H
|