## Summary
- ChatGPT により bench_profile.h の setenv segfault を修正(RTLD_NEXT 経由に切り替え)
- core/box/pool_zero_mode_box.h 新設:ENV キャッシュ経由で ZERO_MODE を統一管理
- core/hakmem_pool.c で zero mode に応じた memset 制御(FULL/header/off)
- A/B テスト結果:ZERO_MODE=header で +15.34% improvement(1M iterations, C6-heavy)

## Files Modified
- core/box/pool_api.inc.h: pool_zero_mode_box.h include
- core/bench_profile.h: glibc setenv → malloc+putenv(segfault 回避)
- core/hakmem_pool.c: zero mode 参照・制御ロジック
- core/box/pool_zero_mode_box.h (新設): enum/getter
- CURRENT_TASK.md: Phase ML1 結果記載

## Test Results
| Iterations | ZERO_MODE=full | ZERO_MODE=header | Improvement |
|-----------|----------------|-----------------|------------|
| 10K | 3.06 M ops/s | 3.17 M ops/s | +3.65% |
| 1M | 23.71 M ops/s | 27.34 M ops/s | **+15.34%** |

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
179 lines
6.6 KiB
C
179 lines
6.6 KiB
C
// tiny_front_cold_box.h - Phase 4-Step2: Tiny Front Cold Path Box
// Purpose: Slow path allocation (refill, diagnostics, error handling)
// Contract: Called on cache miss, handles SuperSlab refill + diagnostics
// Performance: Optimized for correctness, not speed (noinline, cold)
//
// Design Principles (Box Pattern):
//   1. Single Responsibility: Cold path ONLY (refill, errors, diagnostics)
//   2. Clear Contract: Returns USER pointer or NULL, handles all edge cases
//   3. Observable: Debug logging, error reporting, telemetry
//   4. Safe: Full error checking, defensive programming
//   5. Testable: Isolated from hot path, easy to test edge cases
//
// Performance Impact:
//   - noinline: Keeps hot path small (better i-cache locality)
//   - cold attribute: Hints compiler to optimize for size, not speed
//   - Infrequent execution: Called only on cache miss (~1-5% of allocations)

#ifndef TINY_FRONT_COLD_BOX_H
|
|
#define TINY_FRONT_COLD_BOX_H
|
|
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
#include <stdio.h>
|
|
#include "../hakmem_build_flags.h"
|
|
#include "../hakmem_tiny_config.h"
|
|
#include "../tiny_region_id.h"
|
|
#include "../front/tiny_unified_cache.h" // For TinyUnifiedCache, unified_cache_refill
|
|
#include "tiny_layout_box.h" // For tiny_user_offset()
|
|
|
|
// ============================================================================
// Box 3: Tiny Cold Refill + Alloc
// ============================================================================

// Refill cache from SuperSlab + allocate one object
//
// CONTRACT:
//   Input: class_idx (0-7, pre-validated by caller)
//   Output: USER pointer on success, NULL on failure
//   Precondition: Cache miss detected by hot path
//   Postcondition: Cache refilled (if possible), one object allocated
//
// DESIGN:
//   - noinline: Keeps hot path small (better i-cache)
//   - cold: Hints compiler this is infrequent code
//   - Defensive: Full error checking, diagnostics
//
// PERFORMANCE:
//   - Called infrequently (~1-5% of allocations)
//   - Optimized for correctness, not speed
//   - Refill amortizes cost over batch (e.g., 64 objects)
//
// ERROR HANDLING:
//   - SuperSlab allocation failure → NULL
//   - Cache refill failure → NULL (fallback to normal path)
//   - Logs errors in debug builds (at most 10 messages per class, per thread)
//
__attribute__((noinline, cold, unused))
|
|
static void* tiny_cold_refill_and_alloc(int class_idx) {
|
|
// Refill cache from SuperSlab (batch allocation)
|
|
// unified_cache_refill() returns first BASE block (wrapped)
|
|
hak_base_ptr_t base = unified_cache_refill(class_idx);
|
|
|
|
if (hak_base_is_null(base)) {
|
|
// Refill failed (SuperSlab allocation error, or cache disabled)
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
static __thread uint64_t g_refill_fail_count[TINY_NUM_CLASSES] = {0};
|
|
if (g_refill_fail_count[class_idx] < 10) {
|
|
fprintf(stderr, "[COLD_BOX] Refill failed: class_idx=%d\n", class_idx);
|
|
fflush(stderr);
|
|
g_refill_fail_count[class_idx]++;
|
|
}
|
|
#endif
|
|
return NULL;
|
|
}
|
|
|
|
// Success: return USER pointer
|
|
// NOTE: Header already written by unified_cache_refill()
|
|
// (Removed redundant tiny_region_id_write_header() - P2 fix)
|
|
#if HAKMEM_TINY_HEADER_CLASSIDX
|
|
// Use centralized layout API for offset calculation
|
|
size_t user_offset = tiny_user_offset(class_idx);
|
|
void* raw_base = HAK_BASE_TO_RAW(base);
|
|
return (void*)((char*)raw_base + user_offset); // USER pointer
|
|
#else
|
|
return HAK_BASE_TO_RAW(base);
|
|
#endif
|
|
}
|
|
|
|
// ============================================================================
// Box 3b: Tiny Cold Drain + Free
// ============================================================================

// Drain cache to SuperSlab + free one object
//
// STATUS: Batch drain is NOT implemented yet. The function currently only
//         logs (debug builds) and returns 0, so the caller must fall back to
//         the normal free path. The contract below describes the intended
//         behavior once the drain logic exists.
//
// CONTRACT:
//   Input: class_idx (0-7), base pointer (BASE, not USER)
//   Output: 1=SUCCESS, 0=FAILURE
//   Precondition: Cache full detected by hot path
//   Postcondition: Cache drained (if possible), object freed
//
// DESIGN:
//   - noinline: Keeps hot path small
//   - cold: Infrequent execution
//   - Batch drain: Drain multiple objects to amortize cost
//
// PERFORMANCE:
//   - Called infrequently (~1-5% of frees)
//   - Batch drain amortizes cost (e.g., drain 32 objects)
//
__attribute__((noinline, cold, unused))
|
|
static int tiny_cold_drain_and_free(int class_idx, void* base) {
|
|
extern __thread TinyUnifiedCache g_unified_cache[];
|
|
TinyUnifiedCache* cache = &g_unified_cache[class_idx];
|
|
#if HAKMEM_BUILD_RELEASE
|
|
(void)cache;
|
|
#endif
|
|
|
|
// TODO: Implement batch drain logic
|
|
// For now, just reject the free (caller falls back to normal path)
|
|
#if !HAKMEM_BUILD_RELEASE
|
|
static __thread uint64_t g_drain_count[TINY_NUM_CLASSES] = {0};
|
|
if (g_drain_count[class_idx] < 10) {
|
|
fprintf(stderr, "[COLD_BOX] Cache full, drain needed: class_idx=%d tail=%u head=%u\n",
|
|
class_idx, cache->tail, cache->head);
|
|
fflush(stderr);
|
|
g_drain_count[class_idx]++;
|
|
}
|
|
#endif
|
|
|
|
// Fallback: Return 0 (caller handles via normal free path)
|
|
(void)base; // Unused for now
|
|
return 0;
|
|
}
|
|
|
|
// ============================================================================
// Box 3c: Tiny Cold Error Reporting
// ============================================================================

// Report error (debug builds only)
//
// CONTRACT:
//   Input: class_idx, error reason string
//   Output: void (logs to stderr)
//   Precondition: Error detected in hot/cold path
//   Postcondition: Error logged (debug only; release builds compile to an empty body)
//
// Writes one "[COLD_BOX_ERROR]" diagnostic line to stderr and flushes it.
//   class_idx: size-class index to include in the message (printed as-is).
//   reason:    human-readable cause; a NULL pointer is reported as "(null)"
//              instead of being handed to fprintf's %s, which would be
//              undefined behavior.
// Release builds (HAKMEM_BUILD_RELEASE != 0) ignore both arguments and print
// nothing.
__attribute__((noinline, cold, unused))
static void tiny_cold_report_error(int class_idx, const char* reason) {
#if !HAKMEM_BUILD_RELEASE
    // An error reporter must not itself crash on a missing reason string.
    const char* text = reason ? reason : "(null)";
    fprintf(stderr, "[COLD_BOX_ERROR] class_idx=%d reason=%s\n", class_idx, text);
    fflush(stderr);
#else
    (void)class_idx;
    (void)reason;
#endif
}
|
|
|
|
// ============================================================================
// Performance Notes
// ============================================================================

// Cold path optimizations:
//   1. noinline: Reduces hot path code size → better i-cache
//   2. cold attribute: Compiler optimizes for size, not speed
//   3. Batch operations: Refill/drain multiple objects (amortize cost)
//   4. Defensive code: Full error checking (correctness > speed)
//
// Expected call frequency:
//   - Refill: ~1-5% of allocations (depends on cache size)
//   - Drain: ~1-5% of frees (depends on allocation pattern)
//   - Error: <0.01% (only on actual errors)
//
// Impact on hot path:
//   - Hot path stays small (~10-20 instructions)
//   - Better i-cache locality (hot path doesn't include cold code)
//   - CPU branch predictor learns hot path quickly

#endif // TINY_FRONT_COLD_BOX_H
|