2025-11-05 12:31:14 +09:00
|
|
|
|
#include "hakmem_tiny.h"
|
|
|
|
|
|
#include "hakmem_tiny_config.h" // Centralized configuration
|
2025-11-08 12:54:52 +09:00
|
|
|
|
#include "hakmem_phase7_config.h" // Phase 7: Task 3 constants (PREWARM_COUNT, etc.)
|
2025-11-20 02:01:52 +09:00
|
|
|
|
#include "hakmem_tiny_superslab.h"
|
|
|
|
|
|
#include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary // Phase 6.22: SuperSlab allocator
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#include "hakmem_super_registry.h" // Phase 8.2: SuperSlab registry for memory profiling
|
|
|
|
|
|
#include "hakmem_internal.h"
|
|
|
|
|
|
#include "hakmem_syscall.h" // Phase 6.X P0 Fix: Box 3 syscall layer (bypasses LD_PRELOAD)
|
|
|
|
|
|
#include "hakmem_tiny_magazine.h"
|
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure
## Major Additions
### 1. Box I: Integrity Verification System (NEW - 703 lines)
- Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines)
- Purpose: Unified integrity checking across all HAKMEM subsystems
- Features:
* 4-level integrity checking (0-4, compile-time controlled)
* Priority 1: TLS array bounds validation
* Priority 2: Freelist pointer validation
* Priority 3: TLS canary monitoring
* Priority ALPHA: Slab metadata invariant checking (5 invariants)
* Atomic statistics tracking (thread-safe)
* Beautiful BOX_BOUNDARY design pattern
### 2. Box E: SuperSlab Expansion System (COMPLETE)
- Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c
- Purpose: Safe SuperSlab expansion with TLS state guarantee
- Features:
* Immediate slab 0 binding after expansion
* TLS state snapshot and restoration
* Design by Contract (pre/post-conditions, invariants)
* Thread-safe with mutex protection
### 3. Comprehensive Integrity Checking System
- File: core/hakmem_tiny_integrity.h (NEW)
- Unified validation functions for all allocator subsystems
- Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe)
- Pointer range validation (null-page, kernel-space)
### 4. P0 Bug Investigation - Root Cause Identified
**Bug**: SEGV at iteration 28440 (deterministic with seed 42)
**Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning)
**Location**: TLS SLL (Singly-Linked List) cache layer
**Root Cause**: Race condition or use-after-free in TLS list management (class 0)
**Detection**: Box I successfully caught invalid pointer at exact crash point
### 5. Defensive Improvements
- Defensive memset in SuperSlab allocation (all metadata arrays)
- Enhanced pointer validation with pattern detection
- BOX_BOUNDARY markers throughout codebase (beautiful modular design)
- 5 metadata invariant checks in allocation/free/refill paths
## Integration Points
- Modified 13 files with Box I/E integration
- Added 10+ BOX_BOUNDARY markers
- 5 critical integrity check points in P0 refill path
## Test Results (100K iterations)
- Baseline: 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition)
- Root cause: Identified but not yet fixed (requires deeper investigation)
## Performance
- Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0)
- Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4)
- Beautiful modular design maintains clean separation of concerns
## Known Issues
- P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0)
- Cause: Use-after-free or race in remote free draining
- Next step: Valgrind investigation to pinpoint exact corruption location
## Code Quality
- Total new code: ~1400 lines (Box I + Box E + integrity system)
- Design: Beautiful Box Theory with clear boundaries
- Modularity: Complete separation of concerns
- Documentation: Comprehensive inline comments and BOX_BOUNDARY markers
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
|
|
|
|
#include "hakmem_tiny_integrity.h" // PRIORITY 1-4: Corruption detection
|
2025-11-13 01:45:30 +09:00
|
|
|
|
#include "box/tiny_next_ptr_box.h" // Box API: next pointer read/write
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Phase 1 modules (must come AFTER hakmem_tiny.h for TinyPool definition)
|
|
|
|
|
|
#include "hakmem_tiny_batch_refill.h" // Phase 1: Batch refill/spill for mini-magazine
|
|
|
|
|
|
#include "hakmem_tiny_stats.h" // Phase 1: Batched statistics (replaces XOR RNG)
|
|
|
|
|
|
// Phase 2B modules
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
#include "tiny_api.h" // Consolidated: stats_api, query_api, rss_api, registry_api
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#include "tiny_tls.h"
|
|
|
|
|
|
#include "tiny_debug.h"
|
|
|
|
|
|
#include "tiny_mmap_gate.h"
|
|
|
|
|
|
#include "tiny_debug_ring.h"
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
#include "tiny_route.h"
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#include "tiny_tls_guard.h"
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
#include "tiny_ready.h"
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#include "hakmem_tiny_tls_list.h"
|
|
|
|
|
|
#include "hakmem_tiny_remote_target.h" // Phase 2C-1: Remote target queue
|
|
|
|
|
|
#include "hakmem_tiny_bg_spill.h" // Phase 2C-2: Background spill queue
|
feat: Phase 7 + Phase 2 - Massive performance & stability improvements
Performance Achievements:
- Tiny allocations: +180-280% (21M → 59-70M ops/s random mixed)
- Single-thread: +24% (2.71M → 3.36M ops/s Larson)
- 4T stability: 0% → 95% (19/20 success rate)
- Overall: 91.3% of System malloc average (target was 40-55%) ✓
Phase 7 (Tasks 1-3): Core Optimizations
- Task 1: Header validation removal (Region-ID direct lookup)
- Task 2: Aggressive inline (TLS cache access optimization)
- Task 3: Pre-warm TLS cache (eliminate cold-start penalty)
Result: +180-280% improvement, 85-146% of System malloc
Critical Bug Fixes:
- Fix 64B allocation crash (size-to-class +1 for header)
- Fix 4T wrapper recursion bugs (BUG #7, #8, #10, #11)
- Remove malloc fallback (30% → 50% stability)
Phase 2a: SuperSlab Dynamic Expansion (CRITICAL)
- Implement mimalloc-style chunk linking
- Unlimited slab expansion (no more OOM at 32 slabs)
- Fix chunk initialization bug (bitmap=0x00000001 after expansion)
Files: core/hakmem_tiny_superslab.c/h, core/superslab/superslab_types.h
Result: 50% → 95% stability (19/20 4T success)
Phase 2b: TLS Cache Adaptive Sizing
- Dynamic capacity: 16-2048 slots based on usage
- High-water mark tracking + exponential growth/shrink
- Expected: +3-10% performance, -30-50% memory
Files: core/tiny_adaptive_sizing.c/h (new)
Phase 2c: BigCache Dynamic Hash Table
- Migrate from fixed 256×8 array to dynamic hash table
- Auto-resize: 256 → 512 → 1024 → 65,536 buckets
- Improved hash function (FNV-1a) + collision chaining
Files: core/hakmem_bigcache.c/h
Expected: +10-20% cache hit rate
Design Flaws Analysis:
- Identified 6 components with fixed-capacity bottlenecks
- SuperSlab (CRITICAL), TLS Cache (HIGH), BigCache/L2.5 (MEDIUM)
- Report: DESIGN_FLAWS_ANALYSIS.md (11 chapters)
Documentation:
- 13 comprehensive reports (PHASE*.md, DESIGN_FLAWS*.md)
- Implementation guides, test results, production readiness
- Bug fix reports, root cause analysis
Build System:
- Makefile: phase7 targets, PREWARM_TLS flag
- Auto dependency generation (-MMD -MP) for .inc files
Known Issues:
- 4T stability: 19/20 (95%) - investigating 1 failure for 100%
- L2.5 Pool dynamic sharding: design only (needs 2-3 days integration)
🤖 Generated with Claude Code (https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 17:08:00 +09:00
|
|
|
|
#include "tiny_adaptive_sizing.h" // Phase 2b: Adaptive TLS cache sizing
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// NOTE: hakmem_tiny_tls_ops.h included later (after type definitions)
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
#include "tiny_system.h" // Consolidated: stdio, stdlib, string, etc.
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#include "hakmem_prof.h"
|
|
|
|
|
|
#include "hakmem_trace.h" // Optional USDT (perf) tracepoints
|
|
|
|
|
|
|
2025-11-20 02:01:52 +09:00
|
|
|
|
// Phase E5: Ultra fast path (8-instruction alloc/free)
|
|
|
|
|
|
#if HAKMEM_ULTRA_FAST_PATH
|
|
|
|
|
|
#include "tiny_ultra_fast.inc.h"
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
extern uint64_t g_bytes_allocated; // from hakmem_tiny_superslab.c
|
|
|
|
|
|
|
2025-11-21 01:16:45 +09:00
|
|
|
|
// Tiny allocator configuration, debug counters, and return helpers
|
|
|
|
|
|
#include "hakmem_tiny_config_box.inc"
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
|
|
|
|
|
// Forward declarations for static helpers used before definition
|
|
|
|
|
|
struct TinySlab; // forward
|
|
|
|
|
|
static void move_to_free_list(int class_idx, struct TinySlab* target_slab);
|
|
|
|
|
|
static void move_to_full_list(int class_idx, struct TinySlab* target_slab);
|
|
|
|
|
|
static void release_slab(struct TinySlab* slab);
|
|
|
|
|
|
static TinySlab* allocate_new_slab(int class_idx);
|
|
|
|
|
|
static void tiny_tls_cache_drain(int class_idx);
|
|
|
|
|
|
static void tiny_apply_mem_diet(void);
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 6.23: SuperSlab allocation forward declaration
|
|
|
|
|
|
static inline void* hak_tiny_alloc_superslab(int class_idx);
|
|
|
|
|
|
static inline void* superslab_tls_bump_fast(int class_idx);
|
2025-11-13 01:45:30 +09:00
|
|
|
|
SuperSlab* superslab_refill(int class_idx);
|
2025-11-05 12:31:14 +09:00
|
|
|
|
static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx);
|
|
|
|
|
|
static inline uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap);
|
|
|
|
|
|
// Forward decl: used by tiny_spec_pop_path before its definition
|
2025-11-09 22:12:34 +09:00
|
|
|
|
#if HAKMEM_TINY_P0_BATCH_REFILL
|
2025-11-13 16:33:03 +09:00
|
|
|
|
// P0 enabled: sll_refill_batch_from_ss is defined in hakmem_tiny_refill_p0.inc.h
|
2025-11-09 22:12:34 +09:00
|
|
|
|
static inline int sll_refill_batch_from_ss(int class_idx, int max_take);
|
|
|
|
|
|
#else
|
2025-11-13 16:33:03 +09:00
|
|
|
|
// Phase 12: sll_refill_small_from_ss is defined in hakmem_tiny_refill.inc.h
|
|
|
|
|
|
// Only a single implementation exists there; declare here for callers.
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
|
|
|
|
|
|
int sll_refill_small_from_ss(int class_idx, int max_take);
|
|
|
|
|
|
#else
|
|
|
|
|
|
static inline int sll_refill_small_from_ss(int class_idx, int max_take);
|
|
|
|
|
|
#endif
|
2025-11-09 22:12:34 +09:00
|
|
|
|
#endif
|
2025-11-05 12:31:14 +09:00
|
|
|
|
static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss);
|
|
|
|
|
|
static void* __attribute__((cold, noinline)) tiny_slow_alloc_fast(int class_idx);
|
|
|
|
|
|
static inline void tiny_remote_drain_owner(struct TinySlab* slab);
|
|
|
|
|
|
static void tiny_remote_drain_locked(struct TinySlab* slab);
|
|
|
|
|
|
// Ultra-fast try-only variant: attempt a direct SuperSlab bump/freelist pop
|
|
|
|
|
|
// without any refill or slow-path work. Returns NULL on miss.
|
|
|
|
|
|
/* moved below TinyTLSSlab definition */
|
|
|
|
|
|
|
|
|
|
|
|
// Step 3d: Forced inlining for readability + performance (306M target)
|
|
|
|
|
|
__attribute__((always_inline))
|
|
|
|
|
|
static inline void* hak_tiny_alloc_wrapper(int class_idx);
|
|
|
|
|
|
// Helpers for SuperSlab active block accounting (atomic, saturating dec)
|
2025-11-13 01:45:30 +09:00
|
|
|
|
/*
 * ss_active_add - bump a SuperSlab's active-block counter by `n`.
 *
 * @ss: SuperSlab whose `total_active_blocks` counter is updated.
 * @n:  amount to add.
 *
 * The addition is a single atomic read-modify-write with relaxed ordering;
 * the previous counter value is intentionally discarded.
 */
void ss_active_add(SuperSlab* ss, uint32_t n) {
    (void)atomic_fetch_add_explicit(&ss->total_active_blocks, n, memory_order_relaxed);
}
|
|
|
|
|
|
/*
 * ss_active_inc - add exactly one to a SuperSlab's active-block counter.
 *
 * @ss: SuperSlab whose `total_active_blocks` counter is updated.
 *
 * Single-block counterpart of ss_active_add(): one relaxed atomic
 * fetch-add whose previous value is discarded. Forced inline.
 */
static inline __attribute__((always_inline)) void ss_active_inc(SuperSlab* ss) {
    (void)atomic_fetch_add_explicit(&ss->total_active_blocks, 1u, memory_order_relaxed);
}
|
|
|
|
|
|
// EXTRACTED: ss_active_dec_one() moved to hakmem_tiny_superslab.h (Phase 2C-2)
|
|
|
|
|
|
|
2025-11-05 17:45:11 +09:00
|
|
|
|
// Front refill count global config (declare before init.inc uses them)
|
|
|
|
|
|
extern int g_refill_count_global;
|
|
|
|
|
|
extern int g_refill_count_hot;
|
|
|
|
|
|
extern int g_refill_count_mid;
|
|
|
|
|
|
extern int g_refill_count_class[TINY_NUM_CLASSES];
|
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Step 3d: Forced inlining for slow path (maintain monolithic performance)
|
|
|
|
|
|
// Phase 6-1.7: Export for box refactor (Box 5 needs access from hakmem.c)
|
|
|
|
|
|
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
|
|
|
|
|
|
void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class_idx);
|
|
|
|
|
|
#else
|
|
|
|
|
|
static void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class_idx);
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Box: adopt_gate_try (implementation moved from header for robust linkage)
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
#include "box/adopt_gate_box.h"
|
|
|
|
|
|
extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
|
|
|
|
|
|
extern int g_super_reg_class_size[TINY_NUM_CLASSES];
|
|
|
|
|
|
extern unsigned long long g_adopt_gate_calls[];
|
|
|
|
|
|
extern unsigned long long g_adopt_gate_success[];
|
|
|
|
|
|
extern unsigned long long g_reg_scan_attempts[];
|
|
|
|
|
|
extern unsigned long long g_reg_scan_hits[];
|
|
|
|
|
|
/*
 * adopt_gate_bind_if_safe - try to acquire one slab and bind it to `tls`.
 *
 * Mirrors adopt_bind_if_safe so that the alloc inline header does not have
 * to be included here. Previously written as a GCC nested function inside
 * adopt_gate_try(); it captures nothing from the enclosing scope, so it is
 * a plain file-scope helper now (nested functions are a GNU C extension).
 *
 * @tls:      TLS slab descriptor to bind on success.
 * @ss:       candidate SuperSlab.
 * @slab_idx: slab index inside `ss`.
 * @self_tid: caller's thread id as returned by tiny_self_u32().
 *
 * Returns 1 when the slab was acquired, drained and bound; 0 otherwise.
 * The slab handle is released on every path after a successful acquire.
 */
static inline int adopt_gate_bind_if_safe(TinyTLSSlab* tls, SuperSlab* ss,
                                          int slab_idx, uint32_t self_tid) {
    SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
    if (!slab_is_valid(&h)) return 0;

    slab_drain_remote_full(&h);

    int bound = 0;
    if (__builtin_expect(slab_is_safe_to_bind(&h), 1)) {
        tiny_tls_bind_slab(tls, h.ss, h.slab_idx);
        bound = 1;
    }
    slab_release(&h);
    return bound;
}

/*
 * adopt_gate_try - find a SuperSlab for `class_idx` and bind a slab to `tls`.
 *
 * Order of attempts:
 *   1. tiny_refill_try_fast(); on success its result is returned directly.
 *   2. Scan up to min(tiny_reg_scan_max(), g_super_reg_class_size[class_idx])
 *      entries of g_super_reg_by_class[class_idx]. For each entry with a
 *      valid magic, walk the set bits of nonempty_mask (or, when that is
 *      zero, of the atomically loaded freelist_mask) and try to bind the
 *      corresponding slab.
 *
 * @class_idx: size-class index; used unchecked as an array index.
 * @tls:       TLS slab descriptor to bind on success.
 *
 * Returns the SuperSlab that now backs `tls`, or NULL when nothing could be
 * adopted. Updates the g_adopt_gate_* / g_reg_scan_* counters as it goes.
 */
SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) {
    g_adopt_gate_calls[class_idx]++;
    ROUTE_MARK(13);

    SuperSlab* ss = tiny_refill_try_fast(class_idx, tls);
    if (ss) {
        g_adopt_gate_success[class_idx]++;
        return ss;
    }

    g_reg_scan_attempts[class_idx]++;
    int reg_size = g_super_reg_class_size[class_idx];
    int scan_limit = tiny_reg_scan_max();
    if (scan_limit > reg_size) scan_limit = reg_size;

    /* Looked up once and handed to every bind attempt below. */
    uint32_t self_tid = tiny_self_u32();

    for (int i = 0; i < scan_limit; i++) {
        SuperSlab* cand = g_super_reg_by_class[class_idx][i];
        if (!(cand && cand->magic == SUPERSLAB_MAGIC)) continue;

        // Fast path: use nonempty_mask / freelist_mask to locate candidates in O(1)
        uint32_t mask = cand->nonempty_mask;
        // Fallback to atomic freelist_mask for cross-thread visibility
        if (mask == 0) {
            mask = atomic_load_explicit(&cand->freelist_mask, memory_order_acquire);
        }
        if (mask == 0) continue; // No visible freelists in this SS

        int cap = ss_slabs_capacity(cand);
        while (mask) {
            int sidx = __builtin_ctz(mask);
            mask &= (mask - 1); /* clear the lowest set bit */

            /* Bits are visited in increasing order, so once one index is
             * past the capacity every remaining bit is as well. */
            if (sidx >= cap) break;

            if (adopt_gate_bind_if_safe(tls, cand, sidx, self_tid)) {
                g_adopt_gate_success[class_idx]++;
                g_reg_scan_hits[class_idx]++;
                ROUTE_MARK(14); ROUTE_COMMIT(class_idx, 0x07);
                return cand;
            }
        }
    }
    return NULL;
}
|
|
|
|
|
|
|
2025-11-21 01:16:45 +09:00
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// ============================================================================
|
2025-11-21 01:16:45 +09:00
|
|
|
|
// Global State - EXTRACTED to hakmem_tiny_globals_box.inc
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// ============================================================================
|
2025-11-21 01:16:45 +09:00
|
|
|
|
#include "hakmem_tiny_globals_box.inc"
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
2025-11-21 01:16:45 +09:00
|
|
|
|
#include "hakmem_tiny_publish_box.inc"
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_fastcache.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Functions: tiny_fast_pop(), tiny_fast_push() - 28 lines (lines 377-404)
|
|
|
|
|
|
// Forward declarations for functions defined in hakmem_tiny_fastcache.inc.h
|
|
|
|
|
|
static inline void* tiny_fast_pop(int class_idx);
|
|
|
|
|
|
static inline int tiny_fast_push(int class_idx, void* ptr);
|
2025-11-11 21:49:05 +09:00
|
|
|
|
static inline void* fastcache_pop(int class_idx);
|
|
|
|
|
|
static inline int fastcache_push(int class_idx, void* ptr);
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_hot_pop.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Functions: tiny_hot_pop_class0(), tiny_hot_pop_class1(), tiny_hot_pop_class2(), tiny_hot_pop_class3()
|
|
|
|
|
|
// 88 lines (lines 407-494)
|
|
|
|
|
|
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
static __attribute__((cold, noinline, unused)) void* tiny_slow_alloc_fast(int class_idx) {
|
2025-11-05 12:31:14 +09:00
|
|
|
|
int tls_enabled = g_tls_list_enable;
|
|
|
|
|
|
TinyTLSList* tls = &g_tls_lists[class_idx];
|
|
|
|
|
|
pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
|
|
|
|
|
|
pthread_mutex_lock(lock);
|
|
|
|
|
|
|
|
|
|
|
|
TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
|
|
|
|
|
|
if (slab) {
|
|
|
|
|
|
g_tiny_pool.free_slabs[class_idx] = slab->next;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
slab = allocate_new_slab(class_idx);
|
|
|
|
|
|
if (!slab) {
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
slab->next = NULL;
|
|
|
|
|
|
|
|
|
|
|
|
if (atomic_load_explicit(&slab->remote_head, memory_order_acquire)) {
|
|
|
|
|
|
tiny_remote_drain_locked(slab);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int block_idx = hak_tiny_find_free_block(slab);
|
|
|
|
|
|
if (block_idx < 0) {
|
|
|
|
|
|
slab->next = g_tiny_pool.free_slabs[class_idx];
|
|
|
|
|
|
g_tiny_pool.free_slabs[class_idx] = slab;
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
hak_tiny_set_used(slab, block_idx);
|
|
|
|
|
|
slab->free_count--;
|
|
|
|
|
|
size_t block_size = g_tiny_class_sizes[class_idx];
|
|
|
|
|
|
uint8_t* base = (uint8_t*)slab->base;
|
|
|
|
|
|
void* ret = (void*)(base + ((size_t)block_idx * block_size));
|
|
|
|
|
|
g_tiny_pool.alloc_count[class_idx]++;
|
|
|
|
|
|
|
|
|
|
|
|
uint16_t cap = g_fast_cap_defaults[class_idx];
|
|
|
|
|
|
uint16_t count = g_fast_count[class_idx];
|
|
|
|
|
|
uint16_t fast_need = (cap > count) ? (uint16_t)(cap - count) : 0;
|
|
|
|
|
|
if (fast_need > slab->free_count) fast_need = (uint16_t)slab->free_count;
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t tls_need = 0;
|
|
|
|
|
|
if (tls_enabled && tls_list_needs_refill(tls)) {
|
|
|
|
|
|
uint32_t target = tls_list_refill_threshold(tls);
|
|
|
|
|
|
if (tls->count < target) {
|
|
|
|
|
|
tls_need = target - tls->count;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
uint32_t remaining = slab->free_count;
|
|
|
|
|
|
if (fast_need > remaining) fast_need = (uint16_t)remaining;
|
|
|
|
|
|
remaining -= fast_need;
|
|
|
|
|
|
if (tls_need > remaining) tls_need = remaining;
|
|
|
|
|
|
|
|
|
|
|
|
while (fast_need > 0) {
|
|
|
|
|
|
int extra_idx = hak_tiny_find_free_block(slab);
|
|
|
|
|
|
if (extra_idx < 0) break;
|
|
|
|
|
|
hak_tiny_set_used(slab, extra_idx);
|
|
|
|
|
|
slab->free_count--;
|
|
|
|
|
|
void* extra = (void*)(base + ((size_t)extra_idx * block_size));
|
2025-11-11 21:49:05 +09:00
|
|
|
|
int pushed = 0;
|
|
|
|
|
|
if (__builtin_expect(g_fastcache_enable && class_idx <= 3, 1)) {
|
|
|
|
|
|
pushed = fastcache_push(class_idx, extra);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
pushed = tiny_fast_push(class_idx, extra);
|
|
|
|
|
|
}
|
|
|
|
|
|
if (!pushed) {
|
2025-11-05 12:31:14 +09:00
|
|
|
|
if (tls_enabled) {
|
|
|
|
|
|
tiny_tls_list_guard_push(class_idx, tls, extra);
|
2025-11-11 10:00:36 +09:00
|
|
|
|
tls_list_push(tls, extra, class_idx);
|
2025-11-05 12:31:14 +09:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
fast_need--;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
while (tls_enabled && tls_need > 0) {
|
|
|
|
|
|
int extra_idx = hak_tiny_find_free_block(slab);
|
|
|
|
|
|
if (extra_idx < 0) break;
|
|
|
|
|
|
hak_tiny_set_used(slab, extra_idx);
|
|
|
|
|
|
slab->free_count--;
|
|
|
|
|
|
void* extra = (void*)(base + ((size_t)extra_idx * block_size));
|
|
|
|
|
|
tiny_tls_list_guard_push(class_idx, tls, extra);
|
2025-11-11 10:00:36 +09:00
|
|
|
|
tls_list_push(tls, extra, class_idx);
|
2025-11-05 12:31:14 +09:00
|
|
|
|
tls_need--;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (slab->free_count == 0) {
|
|
|
|
|
|
move_to_full_list(class_idx, slab);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
slab->next = g_tiny_pool.free_slabs[class_idx];
|
|
|
|
|
|
g_tiny_pool.free_slabs[class_idx] = slab;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return ret;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: tiny_fast_refill_and_take() - 39 lines (lines 584-622)
|
|
|
|
|
|
// Hot-path cheap sampling counter to avoid rand() in allocation path
|
|
|
|
|
|
// Phase 9.4: TLS single-linked freelist (mimalloc-inspired) for hottest classes (≤128B/≤256B)
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable
|
2025-11-20 07:32:30 +09:00
|
|
|
|
int g_tiny_hotpath_class5 = 0; // HAKMEM_TINY_HOTPATH_CLASS5=1 to enable class 5 hotpath
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Phase 6-1.7: Export TLS variables for box refactor (Box 5/6 need access from hakmem.c)
|
2025-11-07 01:27:04 +09:00
|
|
|
|
// CRITICAL FIX: Explicit initializers prevent SEGV from uninitialized TLS in worker threads
|
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure
## Major Additions
### 1. Box I: Integrity Verification System (NEW - 703 lines)
- Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines)
- Purpose: Unified integrity checking across all HAKMEM subsystems
- Features:
* 4-level integrity checking (0-4, compile-time controlled)
* Priority 1: TLS array bounds validation
* Priority 2: Freelist pointer validation
* Priority 3: TLS canary monitoring
* Priority ALPHA: Slab metadata invariant checking (5 invariants)
* Atomic statistics tracking (thread-safe)
* Beautiful BOX_BOUNDARY design pattern
### 2. Box E: SuperSlab Expansion System (COMPLETE)
- Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c
- Purpose: Safe SuperSlab expansion with TLS state guarantee
- Features:
* Immediate slab 0 binding after expansion
* TLS state snapshot and restoration
* Design by Contract (pre/post-conditions, invariants)
* Thread-safe with mutex protection
### 3. Comprehensive Integrity Checking System
- File: core/hakmem_tiny_integrity.h (NEW)
- Unified validation functions for all allocator subsystems
- Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe)
- Pointer range validation (null-page, kernel-space)
### 4. P0 Bug Investigation - Root Cause Identified
**Bug**: SEGV at iteration 28440 (deterministic with seed 42)
**Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning)
**Location**: TLS SLL (Single-Linked List) cache layer
**Root Cause**: Race condition or use-after-free in TLS list management (class 0)
**Detection**: Box I successfully caught invalid pointer at exact crash point
### 5. Defensive Improvements
- Defensive memset in SuperSlab allocation (all metadata arrays)
- Enhanced pointer validation with pattern detection
- BOX_BOUNDARY markers throughout codebase (beautiful modular design)
- 5 metadata invariant checks in allocation/free/refill paths
## Integration Points
- Modified 13 files with Box I/E integration
- Added 10+ BOX_BOUNDARY markers
- 5 critical integrity check points in P0 refill path
## Test Results (100K iterations)
- Baseline: 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition)
- Root cause: Identified but not yet fixed (requires deeper investigation)
## Performance
- Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0)
- Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4)
- Beautiful modular design maintains clean separation of concerns
## Known Issues
- P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0)
- Cause: Use-after-free or race in remote free draining
- Next step: Valgrind investigation to pinpoint exact corruption location
## Code Quality
- Total new code: ~1400 lines (Box I + Box E + integrity system)
- Design: Beautiful Box Theory with clear boundaries
- Modularity: Complete separation of concerns
- Documentation: Comprehensive inline comments and BOX_BOUNDARY markers
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
|
|
|
|
// PRIORITY 3: TLS Canaries - Add canaries around TLS arrays to detect buffer overruns
|
|
|
|
|
|
#define TLS_CANARY_MAGIC 0xDEADBEEFDEADBEEFULL
|
2025-11-20 07:32:30 +09:00
|
|
|
|
// Phase 3d-B: Unified TLS SLL (head+count in same cache line for +12-18% cache hit rate)
|
|
|
|
|
|
__thread uint64_t g_tls_canary_before_sll = TLS_CANARY_MAGIC;
|
|
|
|
|
|
__thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES] __attribute__((aligned(64))) = {0};
|
|
|
|
|
|
__thread uint64_t g_tls_canary_after_sll = TLS_CANARY_MAGIC;
|
2025-11-05 12:31:14 +09:00
|
|
|
|
static int g_tiny_ultra = 0; // HAKMEM_TINY_ULTRA=1 for SLL-only ultra mode
|
|
|
|
|
|
static int g_ultra_validate = 0; // HAKMEM_TINY_ULTRA_VALIDATE=1 to enable per-pop validation
|
|
|
|
|
|
// Ultra debug counters
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
static __attribute__((unused)) uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0};
|
2025-11-05 12:31:14 +09:00
|
|
|
|
static uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES] = {0};
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
static __attribute__((unused)) uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0};
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
// Path counters (normal mode visibility): lightweight, for debugging/bench only
|
|
|
|
|
|
#if HAKMEM_DEBUG_COUNTERS
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
static __attribute__((unused)) uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0};
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// New: slow/bitmap/bump/bin instrumentation
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
static __attribute__((unused)) uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static __attribute__((unused)) uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0};
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#endif
|
|
|
|
|
|
static int g_path_debug_enabled = 0;
|
|
|
|
|
|
|
|
|
|
|
|
// Spill hysteresis (keeps getenv out of the free hot path)
|
|
|
|
|
|
static int g_spill_hyst = 32; // default margin (configured at init; never getenv on hot path)
|
|
|
|
|
|
|
|
|
|
|
|
// Optional per-class refill batch overrides (0=use global defaults)
|
|
|
|
|
|
static int g_refill_max_c[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static int g_refill_max_hot_c[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static inline __attribute__((always_inline)) int tiny_refill_max_for_class(int class_idx) {
|
|
|
|
|
|
int v = g_refill_max_c[class_idx];
|
|
|
|
|
|
if (v > 0) return v;
|
|
|
|
|
|
if (class_idx <= 3) {
|
|
|
|
|
|
int hv = g_refill_max_hot_c[class_idx];
|
|
|
|
|
|
if (hv > 0) return hv;
|
|
|
|
|
|
return g_tiny_refill_max_hot;
|
|
|
|
|
|
}
|
|
|
|
|
|
return g_tiny_refill_max;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
Front-Direct implementation: SS→FC direct refill + SLL complete bypass
## Summary
Implemented Front-Direct architecture with complete SLL bypass:
- Direct SuperSlab → FastCache refill (1-hop, bypasses SLL)
- SLL-free allocation/free paths when Front-Direct enabled
- Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only)
## New Modules
- core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point
- Remote drain → Freelist → Carve priority
- Header restoration for C1-C6 (NOT C0/C7)
- ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN
- core/front/fast_cache.h: FastCache (L1) type definition
- core/front/quick_slot.h: QuickSlot (L0) type definition
## Allocation Path (core/tiny_alloc_fast.inc.h)
- Added s_front_direct_alloc TLS flag (lazy ENV check)
- SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc
- Refill dispatch:
- Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop)
- Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only)
- SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in)
## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h)
- FC priority: Try fastcache_push() first (same-thread free)
- tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable
- Fallback: Magazine/slow path (safe, bypasses SLL)
## Legacy Sealing
- SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1)
- Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak
- Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry
## ENV Controls
- HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct)
- HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name)
- HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct)
- HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF)
- HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE)
## Benchmarks (Front-Direct Enabled)
```bash
ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1
HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1
HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96
HAKMEM_TINY_BUMP_CHUNK=256
bench_random_mixed (16-1040B random, 200K iter):
256 slots: 1.44M ops/s (STABLE, 0 SEGV)
128 slots: 1.44M ops/s (STABLE, 0 SEGV)
bench_fixed_size (fixed size, 200K iter):
256B: 4.06M ops/s (has debug logs, expected >10M without logs)
128B: Similar (debug logs affect)
```
## Verification
- TRACE_RING test (10K iter): **0 SLL events** detected ✅
- Complete SLL bypass confirmed when Front-Direct=1
- Stable execution: 200K iterations × multiple sizes, 0 SEGV
## Next Steps
- Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range)
- Re-benchmark with clean Release build (target: 10-15M ops/s)
- 128/256B shortcut path optimization (FC hit rate improvement)
Co-Authored-By: ChatGPT <chatgpt@openai.com>
Suggested-By: ultrathink
2025-11-14 05:41:49 +09:00
|
|
|
|
// Phase 9.5: Frontend/Backend split - Tiny Front modules (QuickSlot / FastCache)
|
|
|
|
|
|
#include "front/quick_slot.h"
|
|
|
|
|
|
#include "front/fast_cache.h"
|
|
|
|
|
|
__thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
|
2025-11-05 12:31:14 +09:00
|
|
|
|
static int g_frontend_enable = 0; // HAKMEM_TINY_FRONTEND=1 (experimental ultra-fast frontend)
|
|
|
|
|
|
// SLL capacity multiplier for hot tiny classes (env: HAKMEM_SLL_MULTIPLIER)
|
2025-11-13 01:45:30 +09:00
|
|
|
|
int g_sll_multiplier = 2;
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Per-thread cache of a 32-bit thread id, so hot paths do not need to call
// pthread_self() every time.
static __thread uint32_t g_tls_tid32;    // cached id; meaningful once the flag below is set
static __thread int g_tls_tid32_inited;  // 0 until tiny_self_u32() has run on this thread

// Return the calling thread's cached 32-bit id.
//
// The first call on a thread stores pthread_self(), converted through
// uintptr_t and truncated to 32 bits; later calls return the stored value.
//
// Phase 6-1.7: when HAKMEM_TINY_PHASE6_BOX_REFACTOR is defined the function
// is not static, so that Box 6 code in hakmem.c can reach it.
// NOTE(review): in that configuration a non-static `inline` function refers
// to the file-local TLS variables above; confirm that a non-inline prototype
// is visible (or that gnu89 inline semantics are in effect), otherwise C11
// 6.7.4 is violated and no external definition may be emitted.
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
#else
static inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
#endif
    // Common case: already cached for this thread.
    if (__builtin_expect(g_tls_tid32_inited, 1)) {
        return g_tls_tid32;
    }

    const uint32_t tid = (uint32_t)(uintptr_t)pthread_self();
    g_tls_tid32 = tid;
    g_tls_tid32_inited = 1;
    return tid;
}
|
|
|
|
|
|
// Cached pthread_t as-is for APIs that require pthread_t comparison
|
|
|
|
|
|
static __thread pthread_t g_tls_pt_self;
|
|
|
|
|
|
static __thread int g_tls_pt_inited;
|
2025-11-11 21:49:05 +09:00
|
|
|
|
|
|
|
|
|
|
// Frontend FastCache hit/miss counters (Small diagnostics)
|
|
|
|
|
|
unsigned long long g_front_fc_hit[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
unsigned long long g_front_fc_miss[TINY_NUM_CLASSES] = {0};
|
2025-11-14 01:05:30 +09:00
|
|
|
|
// TLS SLL class mask: bit i = 1 allows SLL for class i. Default: all 8 classes enabled.
|
|
|
|
|
|
int g_tls_sll_class_mask = 0xFF;
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
|
|
|
|
|
|
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
|
|
|
|
|
|
inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
|
|
|
|
|
|
#else
|
|
|
|
|
|
static inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
|
|
|
|
|
|
#endif
|
|
|
|
|
|
if (__builtin_expect(!g_tls_pt_inited, 0)) {
|
|
|
|
|
|
g_tls_pt_self = pthread_self();
|
|
|
|
|
|
g_tls_pt_inited = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
return g_tls_pt_self;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#include "tiny_refill.h"
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
// tiny_mmap_gate.h already included at top
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#include "tiny_publish.h"
|
|
|
|
|
|
|
2025-11-14 01:02:00 +09:00
|
|
|
|
int g_sll_cap_override[TINY_NUM_CLASSES] = {0};  // LEGACY (no longer referenced from Phase 12 onward; dummy kept for compatibility)
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Optional prefetch on SLL pop (guarded by env: HAKMEM_TINY_PREFETCH=1)
|
|
|
|
|
|
static int g_tiny_prefetch = 0;
|
|
|
|
|
|
|
|
|
|
|
|
// Small-class magazine pre-initialization (to avoid cap==0 checks on hot path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Hot-class small TLS magazine (definitions and switches)
|
|
|
|
|
|
// Storage for one hot-class magazine: a fixed array of cached block pointers
// plus a fill level and a capacity.
typedef struct {
    void *slots[128];  // cached block pointers
    uint16_t top;      // fill level, 0..128
    uint16_t cap;      // capacity; original note says "=128"
                       // NOTE(review): presumably bounded by the 128-entry slots[] — confirm
} TinyHotMag;
|
|
|
|
|
|
static int g_hotmag_cap_default = 128; // default capacity (env override)
|
|
|
|
|
|
static int g_hotmag_refill_default = 32; // default refill batch (env override)
|
|
|
|
|
|
static int g_hotmag_enable = 0; // Off by default (for A/B testing); can be turned on via env.
|
|
|
|
|
|
static uint16_t g_hotmag_cap_current[TINY_NUM_CLASSES];
|
|
|
|
|
|
static uint8_t g_hotmag_cap_locked[TINY_NUM_CLASSES];
|
|
|
|
|
|
static uint16_t g_hotmag_refill_current[TINY_NUM_CLASSES];
|
|
|
|
|
|
static uint8_t g_hotmag_refill_locked[TINY_NUM_CLASSES];
|
|
|
|
|
|
static uint8_t g_hotmag_class_en[TINY_NUM_CLASSES]; // 0=disabled for class, 1=enabled
|
|
|
|
|
|
static __thread TinyHotMag g_tls_hot_mag[TINY_NUM_CLASSES];
|
|
|
|
|
|
// Inline helpers
|
|
|
|
|
|
|
2025-11-10 16:48:20 +09:00
|
|
|
|
#include "box/tls_sll_box.h" // Box TLS-SLL: Safe SLL operations API (needed by hotmag)
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#include "hakmem_tiny_hotmag.inc.h"
|
|
|
|
|
|
|
|
|
|
|
|
// Size-specialized tiny alloc (32B/64B) via function pointers (for A/B testing)
|
|
|
|
|
|
// TinyQuickSlot: 1 cache line per class (quick 6 items + small metadata)
|
|
|
|
|
|
// Opt-in via HAKMEM_TINY_QUICK=1
|
|
|
|
|
|
// NOTE: This type definition must come BEFORE the Phase 2D-1 includes below
|
Front-Direct implementation: SS→FC direct refill + SLL complete bypass
## Summary
Implemented Front-Direct architecture with complete SLL bypass:
- Direct SuperSlab → FastCache refill (1-hop, bypasses SLL)
- SLL-free allocation/free paths when Front-Direct enabled
- Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only)
## New Modules
- core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point
- Remote drain → Freelist → Carve priority
- Header restoration for C1-C6 (NOT C0/C7)
- ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN
- core/front/fast_cache.h: FastCache (L1) type definition
- core/front/quick_slot.h: QuickSlot (L0) type definition
## Allocation Path (core/tiny_alloc_fast.inc.h)
- Added s_front_direct_alloc TLS flag (lazy ENV check)
- SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc
- Refill dispatch:
- Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop)
- Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only)
- SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in)
## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h)
- FC priority: Try fastcache_push() first (same-thread free)
- tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable
- Fallback: Magazine/slow path (safe, bypasses SLL)
## Legacy Sealing
- SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1)
- Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak
- Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry
## ENV Controls
- HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct)
- HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name)
- HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct)
- HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF)
- HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE)
## Benchmarks (Front-Direct Enabled)
```bash
ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1
HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1
HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96
HAKMEM_TINY_BUMP_CHUNK=256
bench_random_mixed (16-1040B random, 200K iter):
256 slots: 1.44M ops/s (STABLE, 0 SEGV)
128 slots: 1.44M ops/s (STABLE, 0 SEGV)
bench_fixed_size (fixed size, 200K iter):
256B: 4.06M ops/s (has debug logs, expected >10M without logs)
128B: Similar (debug logs affect)
```
## Verification
- TRACE_RING test (10K iter): **0 SLL events** detected ✅
- Complete SLL bypass confirmed when Front-Direct=1
- Stable execution: 200K iterations × multiple sizes, 0 SEGV
## Next Steps
- Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range)
- Re-benchmark with clean Release build (target: 10-15M ops/s)
- 128/256B shortcut path optimization (FC hit rate improvement)
Co-Authored-By: ChatGPT <chatgpt@openai.com>
Suggested-By: ultrathink
2025-11-14 05:41:49 +09:00
|
|
|
|
int g_quick_enable = 0; // HAKMEM_TINY_QUICK=1
|
|
|
|
|
|
__thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES]; // compile-out via guards below
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 2D-1: Hot-path inline function extractions (Front)
|
|
|
|
|
|
// NOTE: TinyFastCache/TinyQuickSlot are already defined under front/
|
2025-11-05 12:31:14 +09:00
|
|
|
|
#include "hakmem_tiny_hot_pop.inc.h" // 4 functions: tiny_hot_pop_class{0..3}
|
|
|
|
|
|
#include "hakmem_tiny_refill.inc.h" // 8 functions: refill operations
|
Front-Direct implementation: SS→FC direct refill + SLL complete bypass
## Summary
Implemented Front-Direct architecture with complete SLL bypass:
- Direct SuperSlab → FastCache refill (1-hop, bypasses SLL)
- SLL-free allocation/free paths when Front-Direct enabled
- Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only)
## New Modules
- core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point
- Remote drain → Freelist → Carve priority
- Header restoration for C1-C6 (NOT C0/C7)
- ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN
- core/front/fast_cache.h: FastCache (L1) type definition
- core/front/quick_slot.h: QuickSlot (L0) type definition
## Allocation Path (core/tiny_alloc_fast.inc.h)
- Added s_front_direct_alloc TLS flag (lazy ENV check)
- SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc
- Refill dispatch:
- Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop)
- Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only)
- SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in)
## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h)
- FC priority: Try fastcache_push() first (same-thread free)
- tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable
- Fallback: Magazine/slow path (safe, bypasses SLL)
## Legacy Sealing
- SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1)
- Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak
- Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry
## ENV Controls
- HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct)
- HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name)
- HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct)
- HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF)
- HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE)
## Benchmarks (Front-Direct Enabled)
```bash
ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1
HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1
HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96
HAKMEM_TINY_BUMP_CHUNK=256
bench_random_mixed (16-1040B random, 200K iter):
256 slots: 1.44M ops/s (STABLE, 0 SEGV)
128 slots: 1.44M ops/s (STABLE, 0 SEGV)
bench_fixed_size (fixed size, 200K iter):
256B: 4.06M ops/s (has debug logs, expected >10M without logs)
128B: Similar (debug logs affect)
```
## Verification
- TRACE_RING test (10K iter): **0 SLL events** detected ✅
- Complete SLL bypass confirmed when Front-Direct=1
- Stable execution: 200K iterations × multiple sizes, 0 SEGV
## Next Steps
- Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range)
- Re-benchmark with clean Release build (target: 10-15M ops/s)
- 128/256B shortcut path optimization (FC hit rate improvement)
Co-Authored-By: ChatGPT <chatgpt@openai.com>
Suggested-By: ultrathink
2025-11-14 05:41:49 +09:00
|
|
|
|
#if HAKMEM_TINY_P0_BATCH_REFILL
|
|
|
|
|
|
#include "hakmem_tiny_refill_p0.inc.h"  // P0 batch refill → direct refill into FastCache
|
|
|
|
|
|
#endif
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
2025-11-08 12:54:52 +09:00
|
|
|
|
// Phase 7 Task 3: pre-warm the TLS cache at init.
// Refilling every class once up front is meant to reduce the miss penalty of
// the first allocations.
#if HAKMEM_TINY_PREWARM_TLS
// Request HAKMEM_TINY_PREWARM_COUNT blocks from the SuperSlab refill path for
// each tiny class 0 .. TINY_NUM_CLASSES-1.
//
// Phase E1-CORRECT: all classes (including C7) use the TLS SLL, so no class
// is skipped. The refill calls' results are not inspected (best effort).
void hak_tiny_prewarm_tls_cache(void) {
    int class_idx = 0;
    while (class_idx < TINY_NUM_CLASSES) {
        // Original comment cites 16 blocks per class as the default —
        // TODO confirm against hakmem_phase7_config.h.
        int blocks = HAKMEM_TINY_PREWARM_COUNT;

        // P0 fix: the refill entry point must match the P0 batch-refill build setting.
#if HAKMEM_TINY_P0_BATCH_REFILL
        sll_refill_batch_from_ss(class_idx, blocks);
#else
        sll_refill_small_from_ss(class_idx, blocks);
#endif
        class_idx++;
    }
}
#endif
|
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// Ultra-Simple front (small per-class stack) — combines tiny front to minimize
|
|
|
|
|
|
// instructions and memory touches on alloc/free. Uses existing TLS bump shadow
|
|
|
|
|
|
// (g_tls_bcur/bend) when enabled to avoid per-alloc header writes.
|
|
|
|
|
|
// UltraFront capacity for 32/64B fast pop
|
|
|
|
|
|
#ifndef ULTRA_FRONT_CAP
#define ULTRA_FRONT_CAP 64  // slots per class unless defined earlier (e.g. by the build)
#endif

// Storage for one UltraFront: ULTRA_FRONT_CAP cached block pointers plus a
// fill level. The type is aligned to 64 bytes.
typedef struct __attribute__((aligned(64))) {
    void *slots[ULTRA_FRONT_CAP];  // cached block pointers
    uint16_t top;                  // fill level, 0..ULTRA_FRONT_CAP
    uint16_t _pad;                 // explicit padding after top
} TinyUltraFront;
|
|
|
|
|
|
static int g_ultra_simple = 0; // HAKMEM_TINY_ULTRA_SIMPLE=1
|
|
|
|
|
|
static __thread TinyUltraFront g_tls_ultra[TINY_NUM_CLASSES];
|
|
|
|
|
|
// Inline helpers
|
|
|
|
|
|
#include "hakmem_tiny_ultra_front.inc.h"
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra-Bump TLS shadow (bench/opt-in): keep a TLS-only bump window
|
|
|
|
|
|
// to avoid per-alloc header writes. Header is updated per-chunk reservation.
|
|
|
|
|
|
// NOTE: Non-static because used in hakmem_tiny_refill.inc.h
|
|
|
|
|
|
int g_bump_chunk = 32; // HAKMEM_TINY_BUMP_CHUNK (blocks)
|
2025-11-07 01:27:04 +09:00
|
|
|
|
__thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
__thread uint8_t* g_tls_bend[TINY_NUM_CLASSES] = {0};
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
|
|
|
|
|
// SLL small refill batch for specialized class (32/64B)
|
|
|
|
|
|
// Specialized order toggle: 1 = mag-first, 0 = sll-first
|
|
|
|
|
|
// HotMag helpers (for classes 0..3)
|
|
|
|
|
|
// Returns 1 when class_idx is one of the hot tiny classes 0..3, otherwise 0
// (negative indices included).
static inline int is_hot_class(int class_idx) {
    return !(class_idx < 0 || class_idx > 3);
}
|
|
|
|
|
|
|
|
|
|
|
|
// Optional front (Ultra/HotMag) push helper: compile-out in release builds
|
|
|
|
|
|
static inline int tiny_optional_push(int class_idx, void* ptr) {
|
|
|
|
|
|
#if HAKMEM_BUILD_RELEASE
|
|
|
|
|
|
(void)class_idx;
|
|
|
|
|
|
(void)ptr;
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
#else
|
|
|
|
|
|
if (__builtin_expect(g_ultra_simple && is_hot_class(class_idx), 0)) {
|
|
|
|
|
|
if (__builtin_expect(ultra_push(class_idx, ptr), 0)) {
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (__builtin_expect(is_hot_class(class_idx), 0)) {
|
|
|
|
|
|
if (__builtin_expect(hotmag_push(class_idx, ptr), 0)) {
|
|
|
|
|
|
return 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
#endif
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra-Simple helpers
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 9.6: Deferred Intelligence (event queue + background)
|
|
|
|
|
|
// Extended event for FLINT Intelligence (lightweight; recorded off hot path only)
|
|
|
|
|
|
// Observability, ACE, and intelligence helpers
|
|
|
|
|
|
#include "hakmem_tiny_intel.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_rss.c (Phase 2B-2)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED: static int get_rss_kb_self(void) {
|
|
|
|
|
|
// EXTRACTED: FILE* f = fopen("/proc/self/status", "r");
|
|
|
|
|
|
// EXTRACTED: if (!f) return 0;
|
|
|
|
|
|
// EXTRACTED: char buf[256];
|
|
|
|
|
|
// EXTRACTED: int kb = 0;
|
|
|
|
|
|
// EXTRACTED: while (fgets(buf, sizeof(buf), f)) {
|
|
|
|
|
|
// EXTRACTED: if (strncmp(buf, "VmRSS:", 6) == 0) {
|
|
|
|
|
|
// EXTRACTED: char* p = buf;
|
|
|
|
|
|
// EXTRACTED: while (*p && (*p < '0' || *p > '9')) {
|
|
|
|
|
|
// EXTRACTED: p++;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: kb = atoi(p);
|
|
|
|
|
|
// EXTRACTED: break;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: fclose(f);
|
|
|
|
|
|
// EXTRACTED: return kb;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
|
|
|
|
|
|
// Option: on a miss, do not bulk-refill the magazine; obtain a single block
// and return it immediately.
// Env: HAKMEM_TINY_REFILL_ONE_ON_MISS=1 enables it (default: 0).
int g_refill_one_on_miss = 0;
|
|
|
|
|
|
|
|
|
|
|
|
// Frontend fill target per class (adaptive)
|
|
|
|
|
|
// NOTE: Non-static because used in hakmem_tiny_refill.inc.h
|
|
|
|
|
|
_Atomic uint32_t g_frontend_fill_target[TINY_NUM_CLASSES];
|
|
|
|
|
|
|
|
|
|
|
|
// Forward declarations for helpers referenced by frontend_refill_fc
static inline int ultra_batch_for_class(int class_idx);

// Tier identifiers; eventq_push_ex() stores one of these in AllocEvent.tier_hit.
enum {
    HAK_TIER_SLL   = 1,
    HAK_TIER_MAG   = 2,
    HAK_TIER_SLAB  = 3,
    HAK_TIER_SUPER = 4,
    HAK_TIER_FRONT = 5
};
|
|
|
|
|
|
|
|
|
|
|
|
// Best-effort compression of the thread id from tiny_self_u32() to 16 bits:
// the upper half is XOR-folded into the lower half, so distinct threads may
// map to the same value.
static inline uint16_t hak_thread_id16(void) {
    uint32_t tid = tiny_self_u32();
    return (uint16_t)(tid ^ (tid >> 16));
}
|
|
|
|
|
|
|
|
|
|
|
|
static inline void eventq_push_ex(int class_idx, uint32_t size, uint8_t tier, uint8_t flags,
|
|
|
|
|
|
uint32_t site_id, uint16_t lat_bucket) {
|
|
|
|
|
|
(void)flags;
|
|
|
|
|
|
|
|
|
|
|
|
(void)lat_bucket;
|
|
|
|
|
|
(void)site_id;
|
|
|
|
|
|
|
|
|
|
|
|
if (!g_int_engine) return;
|
|
|
|
|
|
// Lightweight sampling: if mask set, log 1 out of 2^N
|
|
|
|
|
|
unsigned m = g_int_sample_mask;
|
|
|
|
|
|
if (m != 0) {
|
|
|
|
|
|
unsigned x = g_tls_ev_seq++;
|
|
|
|
|
|
if ((x & m) != 0) return;
|
|
|
|
|
|
}
|
|
|
|
|
|
uint32_t t = atomic_fetch_add_explicit(&g_ev_tail, 1u, memory_order_relaxed);
|
|
|
|
|
|
AllocEvent ev;
|
|
|
|
|
|
ev.ts_ns = g_int_event_ts ? hak_now_ns() : 0;
|
|
|
|
|
|
ev.size = size;
|
|
|
|
|
|
ev.site_id = 0; // keep minimal
|
|
|
|
|
|
ev.latency_bucket = 0;
|
|
|
|
|
|
ev.tier_hit = tier;
|
|
|
|
|
|
ev.flags = 0;
|
|
|
|
|
|
ev.class_idx = (uint16_t)class_idx;
|
|
|
|
|
|
ev.thread_id = 0;
|
|
|
|
|
|
g_ev_ring[t & EVENTQ_MASK] = ev; // best-effort overwrite on overflow
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Background refill workers and intelligence engine
|
|
|
|
|
|
#include "hakmem_tiny_background.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_fastcache.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Functions: fastcache_pop(), fastcache_push(), quick_pop() - 25 lines (lines 873-896)
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra-fast try-only variant: attempt a direct SuperSlab bump/freelist pop
|
|
|
|
|
|
// without any refill or slow-path work. Returns NULL on miss.
|
|
|
|
|
|
static inline void* hak_tiny_alloc_superslab_try_fast(int class_idx) {
|
|
|
|
|
|
if (!g_use_superslab) return NULL;
|
|
|
|
|
|
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
|
|
|
|
|
|
TinySlabMeta* meta = tls->meta;
|
|
|
|
|
|
if (!meta) return NULL;
|
|
|
|
|
|
// Try linear (bump) allocation first when freelist is empty
|
|
|
|
|
|
if (meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) {
|
2025-11-13 16:33:03 +09:00
|
|
|
|
// Use per-slab class_idx to get stride
|
|
|
|
|
|
size_t block_size = tiny_stride_for_class(meta->class_idx);
|
2025-11-05 12:31:14 +09:00
|
|
|
|
void* block = tls->slab_base + ((size_t)meta->used * block_size);
|
|
|
|
|
|
meta->used++;
|
|
|
|
|
|
// Track active blocks in SuperSlab for conservative reclamation
|
|
|
|
|
|
ss_active_inc(tls->ss);
|
|
|
|
|
|
return block;
|
|
|
|
|
|
}
|
|
|
|
|
|
// Do not pop freelist here (keep magazine/SLL handling consistent)
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Functions: quick_refill_from_sll(), quick_refill_from_mag() - 31 lines (lines 918-949)
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: sll_refill_small_from_ss() - 45 lines (lines 952-996)
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 2C-3: TLS operations module (included after helper function definitions)
|
|
|
|
|
|
#include "hakmem_tiny_tls_ops.h"
|
|
|
|
|
|
|
|
|
|
|
|
// New TLS list refill: owner-only bulk take from TLS-cached SuperSlab slab
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_tls_ops.h (Phase 2C-3)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: tls_refill_from_tls_slab() - 101 lines
|
|
|
|
|
|
// Hot path refill operation, moved to inline function in header
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_tls_ops.h (Phase 2C-3)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: tls_list_spill_excess() - 97 lines
|
|
|
|
|
|
// Hot path spill operation, moved to inline function in header
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: superslab_tls_bump_fast() - 45 lines (lines 1016-1060)
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: frontend_refill_fc() - 44 lines (lines 1063-1106)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// SLL capacity policy: for hot tiny classes (0..3), allow larger SLL up to multiplier * mag_cap
|
|
|
|
|
|
// for >=4 keep current conservative half (to limit footprint).
|
|
|
|
|
|
static inline uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap) {
|
2025-11-14 01:02:00 +09:00
|
|
|
|
// Phase12: g_sll_cap_override は非推奨。ここでは無視して通常capを返す。
|
2025-11-05 12:31:14 +09:00
|
|
|
|
uint32_t cap = mag_cap;
|
|
|
|
|
|
if (class_idx <= 3) {
|
|
|
|
|
|
uint32_t mult = (g_sll_multiplier > 0 ? (uint32_t)g_sll_multiplier : 1u);
|
|
|
|
|
|
uint64_t want = (uint64_t)cap * (uint64_t)mult;
|
|
|
|
|
|
if (want > (uint64_t)TINY_TLS_MAG_CAP) cap = TINY_TLS_MAG_CAP; else cap = (uint32_t)want;
|
|
|
|
|
|
} else if (class_idx >= 4) {
|
|
|
|
|
|
cap = (mag_cap > 1u ? (mag_cap / 2u) : 1u);
|
|
|
|
|
|
}
|
|
|
|
|
|
return cap;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: bulk_mag_to_sll_if_room() - 22 lines (lines 1133-1154)
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra helpers forward declarations (defined later)
static inline int ultra_sll_cap_for_class(int class_idx);
static inline int ultra_validate_sll_head(int class_idx, void* head);
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra-mode (SLL-only) helpers
|
|
|
|
|
|
// Ultra batch overrides via env: HAKMEM_TINY_ULTRA_BATCH_C{0..7}
|
|
|
|
|
|
static int g_ultra_batch_override[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
static int g_ultra_sll_cap_override[TINY_NUM_CLASSES] = {0};
|
|
|
|
|
|
|
|
|
|
|
|
static inline int ultra_batch_for_class(int class_idx) {
|
|
|
|
|
|
int ov = g_ultra_batch_override[class_idx];
|
|
|
|
|
|
if (ov > 0) return ov;
|
|
|
|
|
|
switch (class_idx) {
|
|
|
|
|
|
case 0: return 64; // 8B
|
|
|
|
|
|
case 1: return 96; // 16B(A/B最良)
|
|
|
|
|
|
case 2: return 96; // 32B(A/B最良)
|
|
|
|
|
|
case 3: return 224; // 64B(A/B最良)
|
2025-11-09 18:55:50 +09:00
|
|
|
|
case 4: return 96; // 128B (promote front refill a bit)
|
2025-11-09 17:31:30 +09:00
|
|
|
|
case 5: return 64; // 256B (promote front refill)
|
|
|
|
|
|
case 6: return 64; // 512B (promote front refill)
|
|
|
|
|
|
default: return 32; // 1024B and others
|
2025-11-05 12:31:14 +09:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_refill.inc.h (Phase 2D-1)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: ultra_refill_sll() - 56 lines (lines 1178-1233)
|
|
|
|
|
|
|
|
|
|
|
|
#include "hakmem_tiny_remote.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Internal Helpers
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// Step 2: Slab Registry Operations
|
|
|
|
|
|
|
|
|
|
|
|
// Hash function for slab_base (64KB aligned)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_registry.c (Phase 2B-3)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED: static inline int registry_hash(uintptr_t slab_base) {
|
|
|
|
|
|
// EXTRACTED: return (slab_base >> 16) & SLAB_REGISTRY_MASK;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
|
|
|
|
|
|
// Register slab in hash table (returns 1 on success, 0 on failure)
|
|
|
|
|
|
// EXTRACTED: static int registry_register(uintptr_t slab_base, TinySlab* owner) {
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_lock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: int hash = registry_hash(slab_base);
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: // Linear probing (max 8 attempts)
|
|
|
|
|
|
// EXTRACTED: for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) {
|
|
|
|
|
|
// EXTRACTED: int idx = (hash + i) & SLAB_REGISTRY_MASK;
|
|
|
|
|
|
// EXTRACTED: SlabRegistryEntry* entry = &g_slab_registry[idx];
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: if (entry->slab_base == 0) {
|
|
|
|
|
|
// EXTRACTED: // Empty slot found
|
|
|
|
|
|
// EXTRACTED: entry->slab_base = slab_base;
|
|
|
|
|
|
// EXTRACTED: atomic_store_explicit(&entry->owner, owner, memory_order_release);
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: return 1;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: // Registry full (collision limit exceeded)
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: return 0;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
|
|
|
|
|
|
// Unregister slab from hash table
|
|
|
|
|
|
// EXTRACTED: static void registry_unregister(uintptr_t slab_base) {
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_lock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: int hash = registry_hash(slab_base);
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: // Linear probing search
|
|
|
|
|
|
// EXTRACTED: for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) {
|
|
|
|
|
|
// EXTRACTED: int idx = (hash + i) & SLAB_REGISTRY_MASK;
|
|
|
|
|
|
// EXTRACTED: SlabRegistryEntry* entry = &g_slab_registry[idx];
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: if (entry->slab_base == slab_base) {
|
|
|
|
|
|
// EXTRACTED: // Found - clear entry (atomic store prevents TOCTOU race)
|
|
|
|
|
|
// EXTRACTED: atomic_store_explicit(&entry->owner, NULL, memory_order_release);
|
|
|
|
|
|
// EXTRACTED: entry->slab_base = 0;
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: return;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED:
|
|
|
|
|
|
// EXTRACTED: if (entry->slab_base == 0) {
|
|
|
|
|
|
// EXTRACTED: // Empty slot - not found
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: return;
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
// EXTRACTED: pthread_mutex_unlock(&g_tiny_registry_lock);
|
|
|
|
|
|
// EXTRACTED: }
|
|
|
|
|
|
|
|
|
|
|
|
// Lookup slab by base address (O(1) average)
|
|
|
|
|
|
static TinySlab* registry_lookup(uintptr_t slab_base) {
|
|
|
|
|
|
// Lock-free read with atomic owner access (MT-safe)
|
|
|
|
|
|
int hash = registry_hash(slab_base);
|
|
|
|
|
|
|
|
|
|
|
|
// Linear probing search
|
|
|
|
|
|
for (int i = 0; i < SLAB_REGISTRY_MAX_PROBE; i++) {
|
|
|
|
|
|
int idx = (hash + i) & SLAB_REGISTRY_MASK;
|
|
|
|
|
|
SlabRegistryEntry* entry = &g_slab_registry[idx];
|
|
|
|
|
|
|
|
|
|
|
|
if (entry->slab_base == slab_base) {
|
|
|
|
|
|
// Atomic load to prevent TOCTOU race with registry_unregister()
|
|
|
|
|
|
TinySlab* owner = atomic_load_explicit(&entry->owner, memory_order_acquire);
|
|
|
|
|
|
if (!owner) return NULL; // Entry cleared by unregister
|
|
|
|
|
|
return owner;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (entry->slab_base == 0) {
|
|
|
|
|
|
return NULL; // Empty slot - not found
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return NULL; // Not found after max probes
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_slab_mgmt.inc (Phase 2D-4 FINAL)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: allocate_new_slab() - 79 lines (lines 952-1030)
|
|
|
|
|
|
// Allocate new slab for a class
|
|
|
|
|
|
|
|
|
|
|
|
// Function: release_slab() - 23 lines (lines 1033-1055)
|
|
|
|
|
|
// Release a slab back to system
|
|
|
|
|
|
|
|
|
|
|
|
// Step 2: Find slab owner by pointer (O(1) via hash table registry, or O(N) fallback)
|
|
|
|
|
|
TinySlab* hak_tiny_owner_slab(void* ptr) {
|
|
|
|
|
|
if (!ptr || !g_tiny_initialized) return NULL;
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 6.14: Runtime toggle between Registry (O(1)) and List (O(N))
|
|
|
|
|
|
if (g_use_registry) {
|
|
|
|
|
|
// O(1) lookup via hash table
|
|
|
|
|
|
uintptr_t slab_base = (uintptr_t)ptr & ~(TINY_SLAB_SIZE - 1);
|
|
|
|
|
|
TinySlab* slab = registry_lookup(slab_base);
|
|
|
|
|
|
if (!slab) return NULL;
|
|
|
|
|
|
// SAFETY: validate membership (ptr must be inside [base, base+64KB))
|
|
|
|
|
|
uintptr_t start = (uintptr_t)slab->base;
|
|
|
|
|
|
uintptr_t end = start + TINY_SLAB_SIZE;
|
|
|
|
|
|
if ((uintptr_t)ptr < start || (uintptr_t)ptr >= end) {
|
|
|
|
|
|
return NULL; // false positive from registry → treat as non-Tiny
|
|
|
|
|
|
}
|
|
|
|
|
|
return slab;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
// O(N) fallback: linear search through all slab lists (lock per class)
|
|
|
|
|
|
for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) {
|
|
|
|
|
|
pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
|
|
|
|
|
|
pthread_mutex_lock(lock);
|
|
|
|
|
|
// Search free slabs
|
|
|
|
|
|
for (TinySlab* slab = g_tiny_pool.free_slabs[class_idx]; slab; slab = slab->next) {
|
|
|
|
|
|
uintptr_t slab_start = (uintptr_t)slab->base;
|
|
|
|
|
|
uintptr_t slab_end = slab_start + TINY_SLAB_SIZE;
|
|
|
|
|
|
if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) {
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return slab;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
// Search full slabs
|
|
|
|
|
|
for (TinySlab* slab = g_tiny_pool.full_slabs[class_idx]; slab; slab = slab->next) {
|
|
|
|
|
|
uintptr_t slab_start = (uintptr_t)slab->base;
|
|
|
|
|
|
uintptr_t slab_end = slab_start + TINY_SLAB_SIZE;
|
|
|
|
|
|
if ((uintptr_t)ptr >= slab_start && (uintptr_t)ptr < slab_end) {
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
return slab;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
pthread_mutex_unlock(lock);
|
|
|
|
|
|
}
|
|
|
|
|
|
return NULL; // Not found
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Function: move_to_full_list() - 20 lines (lines 1104-1123)
|
|
|
|
|
|
// Move slab to full list
|
|
|
|
|
|
|
|
|
|
|
|
// Function: move_to_free_list() - 20 lines (lines 1126-1145)
|
|
|
|
|
|
// Move slab to free list
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Public API
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 2D-2: Initialization function (extracted to hakmem_tiny_init.inc)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
#include "hakmem_tiny_init.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// 3-Layer Architecture (2025-11-01 Simplification)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Layer 1: TLS Bump Allocator (ultra-fast, 2-3 instructions/op)
|
|
|
|
|
|
#include "hakmem_tiny_bump.inc.h"
|
|
|
|
|
|
|
|
|
|
|
|
// Layer 2: TLS Small Magazine (fast, 5-10 instructions/op)
|
|
|
|
|
|
#include "hakmem_tiny_smallmag.inc.h"
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 6 Fast Path Options (mutually exclusive)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Choose ONE of the following Phase 6 optimizations:
|
|
|
|
|
|
//
|
|
|
|
|
|
// Phase 6-1.5: Alignment Guessing (LEGACY - committed 2025-11-02)
|
|
|
|
|
|
// - Enable: -DHAKMEM_TINY_PHASE6_ULTRA_SIMPLE=1
|
|
|
|
|
|
// - Speed: 235 M ops/sec
|
|
|
|
|
|
// - Memory: 0% overhead
|
|
|
|
|
|
// - Method: Guess size class from pointer alignment (__builtin_ctzl)
|
|
|
|
|
|
// - Risk: Alignment assumptions may break with future changes
|
|
|
|
|
|
//
|
|
|
|
|
|
// Phase 6-1.6: Metadata Header (NEW - recommended for production)
|
|
|
|
|
|
// - Enable: -DHAKMEM_TINY_PHASE6_METADATA=1
|
|
|
|
|
|
// - Speed: 450-480 M ops/sec (expected, Phase 6-1 level)
|
|
|
|
|
|
// - Memory: ~6-12% overhead (8 bytes/allocation)
|
|
|
|
|
|
// - Method: Store pool_type + size_class in 8-byte header
|
|
|
|
|
|
// - Benefit: Extends to ALL pools (Tiny/Mid/L25/Whale)
|
|
|
|
|
|
// - Eliminates: Registry lookups, mid_lookup, owner checks
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
// Forward declarations for Phase 6 alloc/free functions
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
void* hak_tiny_alloc_ultra_simple(size_t size);
void hak_tiny_free_ultra_simple(void* ptr);
#endif

// The two Phase 6 fast-path options are mutually exclusive; fail the build
// when both are requested.
#if defined(HAKMEM_TINY_PHASE6_METADATA) && defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
#error "Cannot enable both PHASE6_METADATA and PHASE6_ULTRA_SIMPLE"
#endif
|
|
|
|
|
|
|
Add Box I (Integrity), Box E (Expansion), and comprehensive P0 debugging infrastructure
## Major Additions
### 1. Box I: Integrity Verification System (NEW - 703 lines)
- Files: core/box/integrity_box.h (267 lines), core/box/integrity_box.c (436 lines)
- Purpose: Unified integrity checking across all HAKMEM subsystems
- Features:
* 4-level integrity checking (0-4, compile-time controlled)
* Priority 1: TLS array bounds validation
* Priority 2: Freelist pointer validation
* Priority 3: TLS canary monitoring
* Priority ALPHA: Slab metadata invariant checking (5 invariants)
* Atomic statistics tracking (thread-safe)
* Beautiful BOX_BOUNDARY design pattern
### 2. Box E: SuperSlab Expansion System (COMPLETE)
- Files: core/box/superslab_expansion_box.h, core/box/superslab_expansion_box.c
- Purpose: Safe SuperSlab expansion with TLS state guarantee
- Features:
* Immediate slab 0 binding after expansion
* TLS state snapshot and restoration
* Design by Contract (pre/post-conditions, invariants)
* Thread-safe with mutex protection
### 3. Comprehensive Integrity Checking System
- File: core/hakmem_tiny_integrity.h (NEW)
- Unified validation functions for all allocator subsystems
- Uninitialized memory pattern detection (0xa2, 0xcc, 0xdd, 0xfe)
- Pointer range validation (null-page, kernel-space)
### 4. P0 Bug Investigation - Root Cause Identified
**Bug**: SEGV at iteration 28440 (deterministic with seed 42)
**Pattern**: 0xa2a2a2a2a2a2a2a2 (uninitialized/ASan poisoning)
**Location**: TLS SLL (Single-Linked List) cache layer
**Root Cause**: Race condition or use-after-free in TLS list management (class 0)
**Detection**: Box I successfully caught invalid pointer at exact crash point
### 5. Defensive Improvements
- Defensive memset in SuperSlab allocation (all metadata arrays)
- Enhanced pointer validation with pattern detection
- BOX_BOUNDARY markers throughout codebase (beautiful modular design)
- 5 metadata invariant checks in allocation/free/refill paths
## Integration Points
- Modified 13 files with Box I/E integration
- Added 10+ BOX_BOUNDARY markers
- 5 critical integrity check points in P0 refill path
## Test Results (100K iterations)
- Baseline: 7.22M ops/s
- Hotpath ON: 8.98M ops/s (+24% improvement ✓)
- P0 Bug: Still crashes at 28440 iterations (TLS SLL race condition)
- Root cause: Identified but not yet fixed (requires deeper investigation)
## Performance
- Box I overhead: Zero in release builds (HAKMEM_INTEGRITY_LEVEL=0)
- Debug builds: Full validation enabled (HAKMEM_INTEGRITY_LEVEL=4)
- Beautiful modular design maintains clean separation of concerns
## Known Issues
- P0 Bug at 28440 iterations: Race condition in TLS SLL cache (class 0)
- Cause: Use-after-free or race in remote free draining
- Next step: Valgrind investigation to pinpoint exact corruption location
## Code Quality
- Total new code: ~1400 lines (Box I + Box E + integrity system)
- Design: Beautiful Box Theory with clear boundaries
- Modularity: Complete separation of concerns
- Documentation: Comprehensive inline comments and BOX_BOUNDARY markers
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 02:45:00 +09:00
|
|
|
|
|
2025-11-21 01:16:45 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Phase 6 Wrapper Functions - EXTRACTED to hakmem_tiny_phase6_wrappers_box.inc
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
#include "hakmem_tiny_phase6_wrappers_box.inc"
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
|
|
|
|
|
|
// Layer 1-3: Main allocation function (simplified)
|
Phase 1: Box Theory refactoring + include reduction
Phase 1-1: Split hakmem_tiny_free.inc (1,711 → 452 lines, -73%)
- Created tiny_free_magazine.inc.h (413 lines) - Magazine layer
- Created tiny_superslab_alloc.inc.h (394 lines) - SuperSlab alloc
- Created tiny_superslab_free.inc.h (305 lines) - SuperSlab free
Phase 1-2++: Refactor hakmem_pool.c (1,481 → 907 lines, -38.8%)
- Created pool_tls_types.inc.h (32 lines) - TLS structures
- Created pool_mf2_types.inc.h (266 lines) - MF2 data structures
- Created pool_mf2_helpers.inc.h (158 lines) - Helper functions
- Created pool_mf2_adoption.inc.h (129 lines) - Adoption logic
Phase 1-3: Reduce hakmem_tiny.c includes (60 → 46, -23.3%)
- Created tiny_system.h - System headers umbrella (stdio, stdlib, etc.)
- Created tiny_api.h - API headers umbrella (stats, query, rss, registry)
Performance: 4.19M ops/s maintained (±0% regression)
Verified: Larson benchmark 2×8×128×1024 = 4,192,128 ops/s
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-06 21:54:12 +09:00
|
|
|
|
// Build-time configurable via: -DHAKMEM_TINY_USE_NEW_3LAYER=1
#ifndef HAKMEM_TINY_USE_NEW_3LAYER
#define HAKMEM_TINY_USE_NEW_3LAYER 0 // default OFF (legacy path)
#endif

// Exactly one allocation front-end implementation is included.
#if HAKMEM_TINY_USE_NEW_3LAYER
#include "hakmem_tiny_alloc_new.inc"
#else
// Old 6-7 layer architecture (backup)
#include "hakmem_tiny_alloc.inc"
#endif
|
|
|
|
|
|
|
|
|
|
|
|
#include "hakmem_tiny_slow.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// Free path implementations
|
|
|
|
|
|
#include "hakmem_tiny_free.inc"
|
|
|
|
|
|
|
Front-Direct implementation: SS→FC direct refill + SLL complete bypass
## Summary
Implemented Front-Direct architecture with complete SLL bypass:
- Direct SuperSlab → FastCache refill (1-hop, bypasses SLL)
- SLL-free allocation/free paths when Front-Direct enabled
- Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only)
## New Modules
- core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point
- Remote drain → Freelist → Carve priority
- Header restoration for C1-C6 (NOT C0/C7)
- ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN
- core/front/fast_cache.h: FastCache (L1) type definition
- core/front/quick_slot.h: QuickSlot (L0) type definition
## Allocation Path (core/tiny_alloc_fast.inc.h)
- Added s_front_direct_alloc TLS flag (lazy ENV check)
- SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc
- Refill dispatch:
- Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop)
- Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only)
- SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in)
## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h)
- FC priority: Try fastcache_push() first (same-thread free)
- tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable
- Fallback: Magazine/slow path (safe, bypasses SLL)
## Legacy Sealing
- SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1)
- Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak
- Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry
## ENV Controls
- HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct)
- HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name)
- HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct)
- HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF)
- HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE)
## Benchmarks (Front-Direct Enabled)
```bash
ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1
HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1
HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96
HAKMEM_TINY_BUMP_CHUNK=256
bench_random_mixed (16-1040B random, 200K iter):
256 slots: 1.44M ops/s (STABLE, 0 SEGV)
128 slots: 1.44M ops/s (STABLE, 0 SEGV)
bench_fixed_size (fixed size, 200K iter):
256B: 4.06M ops/s (has debug logs, expected >10M without logs)
128B: Similar (debug logs affect)
```
## Verification
- TRACE_RING test (10K iter): **0 SLL events** detected ✅
- Complete SLL bypass confirmed when Front-Direct=1
- Stable execution: 200K iterations × multiple sizes, 0 SEGV
## Next Steps
- Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range)
- Re-benchmark with clean Release build (target: 10-15M ops/s)
- 128/256B shortcut path optimization (FC hit rate improvement)
Co-Authored-By: ChatGPT <chatgpt@openai.com>
Suggested-By: ultrathink
2025-11-14 05:41:49 +09:00
|
|
|
|
// ---- Phase 1: Provide default batch-refill symbol (fallback to small refill)
// Allows runtime gate HAKMEM_TINY_REFILL_BATCH=1 without requiring a rebuild.
//
// The fallback is weak and simply forwards to sll_refill_small_from_ss(),
// returning its result unchanged.
//
// NOTE(review): this guard tests whether the macro is *defined*, while the
// refill call site earlier in this file tests its *value* (#if). A build with
// -DHAKMEM_TINY_P0_BATCH_REFILL=0 therefore gets no fallback from here --
// confirm that is intended.
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
int sll_refill_small_from_ss(int class_idx, int max_take);

__attribute__((weak)) int sll_refill_batch_from_ss(int class_idx, int max_take)
{
    return sll_refill_small_from_ss(class_idx, max_take);
}
#endif
|
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_lifecycle.inc (Phase 2D-3)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: hak_tiny_trim() - 116 lines (lines 1164-1279)
|
|
|
|
|
|
// Public trim and cleanup operation for lifecycle management
|
|
|
|
|
|
|
|
|
|
|
|
// Forward decl for internal registry lookup used by ultra safety validation
|
|
|
|
|
|
static TinySlab* registry_lookup(uintptr_t slab_base);
|
|
|
|
|
|
|
|
|
|
|
|
// Ultra helpers: per-class SLL cap and pointer validation
|
|
|
|
|
|
static inline int ultra_sll_cap_for_class(int class_idx) {
|
|
|
|
|
|
int ov = g_ultra_sll_cap_override[class_idx];
|
|
|
|
|
|
if (ov > 0) return ov;
|
|
|
|
|
|
switch (class_idx) {
|
|
|
|
|
|
case 0: return 256; // 8B
|
|
|
|
|
|
case 1: return 384; // 16B(A/B最良)
|
|
|
|
|
|
case 2: return 384; // 32B(A/B最良)
|
|
|
|
|
|
case 3: return 768; // 64B(A/B最良)
|
|
|
|
|
|
case 4: return 256; // 128B
|
|
|
|
|
|
default: return 128; // others
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static inline int ultra_validate_sll_head(int class_idx, void* head) {
|
|
|
|
|
|
uintptr_t base = ((uintptr_t)head) & ~(TINY_SLAB_SIZE - 1);
|
|
|
|
|
|
TinySlab* owner = registry_lookup(base);
|
|
|
|
|
|
if (!owner) return 0;
|
|
|
|
|
|
uintptr_t start = (uintptr_t)owner->base;
|
|
|
|
|
|
if ((uintptr_t)head < start || (uintptr_t)head >= start + TINY_SLAB_SIZE) return 0;
|
|
|
|
|
|
return (owner->class_idx == class_idx);
|
|
|
|
|
|
}
|
|
|
|
|
|
// Optional: wrapper TLS guard (detects wrapper re-entry via a TLS counter)
|
|
|
|
|
|
#ifndef HAKMEM_WRAPPER_TLS_GUARD
|
|
|
|
|
|
#define HAKMEM_WRAPPER_TLS_GUARD 0
|
|
|
|
|
|
#endif
|
|
|
|
|
|
#if HAKMEM_WRAPPER_TLS_GUARD
|
|
|
|
|
|
extern __thread int g_tls_in_wrapper;
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// EXTRACTED TO hakmem_tiny_lifecycle.inc (Phase 2D-3)
|
|
|
|
|
|
// ============================================================================
|
|
|
|
|
|
// Function: tiny_tls_cache_drain() - 90 lines (lines 1314-1403)
|
|
|
|
|
|
// Static function for draining TLS caches
|
|
|
|
|
|
//
|
|
|
|
|
|
// Function: tiny_apply_mem_diet() - 20 lines (lines 1405-1424)
|
|
|
|
|
|
// Static function for memory diet mode application
|
|
|
|
|
|
//
|
|
|
|
|
|
// Phase 2D-3: Lifecycle management functions (226 lines total)
|
|
|
|
|
|
#include "hakmem_tiny_lifecycle.inc"
|
|
|
|
|
|
|
|
|
|
|
|
// Phase 2D-4 (FINAL): Slab management functions (142 lines total)
|
|
|
|
|
|
#include "hakmem_tiny_slab_mgmt.inc"
|
|
|
|
|
|
|
2025-11-21 01:16:45 +09:00
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// ============================================================================
|
2025-11-21 01:16:45 +09:00
|
|
|
|
// ACE Learning Layer & Tiny Guard - EXTRACTED to hakmem_tiny_ace_guard_box.inc
|
2025-11-05 12:31:14 +09:00
|
|
|
|
// ============================================================================
|
2025-11-21 01:16:45 +09:00
|
|
|
|
#include "hakmem_tiny_ace_guard_box.inc"
|