Files
hakmem/core/refill/ss_refill_fc.h
Moe Charm (CI) a9ddb52ad4 ENV cleanup: Remove BG/HotMag vars & guard fprintf (Larson 52.3M ops/s)
Phase 1 complete: ENV variable cleanup + fprintf debug guards

Removed ENV variables (BG/HotMag):
- core/hakmem_tiny_init.inc: HotMag ENV handling removed (~131 lines)
- core/hakmem_tiny_bg_spill.c: BG spill ENV handling removed
- core/tiny_refill.h: BG remote settings replaced with fixed values
- core/hakmem_tiny_slow.inc: BG references removed

fprintf debug guards (#if !HAKMEM_BUILD_RELEASE):
- core/hakmem_shared_pool.c: lock stats (~18 fprintf calls)
- core/page_arena.c: init/shutdown/stats (~27 fprintf calls)
- core/hakmem.c: SIGSEGV init message

Documentation cleanup:
- 328 markdown files removed (old reports and duplicate docs)

Performance check:
- Larson: 52.35M ops/s (52.8M last run; stable)
- No functional impact from the ENV cleanup
- Some debug output remains (to be handled in the next phase)

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 14:45:26 +09:00

243 lines
8.8 KiB
C

// ss_refill_fc.h - Direct SuperSlab → FastCache refill (bypass SLL)
// Purpose: Optimize refill path from 2 hops (SS→SLL→FC) to 1 hop (SS→FC)
//
// Box Theory Responsibility:
// - Refill FastCache directly from SuperSlab freelist/carving
// - Handle remote drain when threshold exceeded
// - Restore headers for classes 1-6 (NOT class 0 or 7)
// - Update active counters consistently
//
// Performance Impact:
// - Eliminates SLL intermediate layer overhead
// - Reduces allocation latency by ~30-50% (expected)
// - Simplifies refill path (fewer cache misses)
#ifndef HAK_REFILL_SS_REFILL_FC_H
#define HAK_REFILL_SS_REFILL_FC_H
// NOTE: This is an .inc.h file meant to be included from hakmem_tiny.c
// It assumes all types (SuperSlab, TinySlabMeta, TinyTLSSlab, etc.) are already defined.
// Do NOT include this file directly - it will be included at the appropriate point in hakmem_tiny.c
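//
// A typical inclusion site might look like this (illustrative sketch; the
// exact location inside hakmem_tiny.c is not shown in this file):
//
//   /* hakmem_tiny.c */
//   /* ... definitions of SuperSlab, TinySlabMeta, TinyTLSSlab, g_tls_slabs ... */
//   #include "refill/ss_refill_fc.h"
//   /* the allocation hot path may now call ss_refill_fc_fill() */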
#include <stdatomic.h>
// Remote drain threshold (default: 32 blocks)
#ifndef REMOTE_DRAIN_THRESHOLD
#define REMOTE_DRAIN_THRESHOLD 32
#endif
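// Example build-time override (see also Tuning Parameters at the end of this
// file), assuming a plain cc invocation:
//   cc -DREMOTE_DRAIN_THRESHOLD=64 -c hakmem_tiny.c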
// Header constants (from tiny_region_id.h - needed when HAKMEM_TINY_HEADER_CLASSIDX=1)
#ifndef HEADER_MAGIC
#define HEADER_MAGIC 0xA0
#endif
#ifndef HEADER_CLASS_MASK
#define HEADER_CLASS_MASK 0x0F
#endif
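// Illustration of the encoding used below: the magic occupies the high nibble
// and the class index the low nibble of the 1-byte header. The decoder here is
// a hypothetical helper for documentation only, not part of the hakmem API:
//
//   static inline int header_decode_class(uint8_t hdr) {
//       if ((hdr & ~HEADER_CLASS_MASK) != HEADER_MAGIC) return -1; // bad magic
//       return (int)(hdr & HEADER_CLASS_MASK); // e.g. 0xA3 -> class 3
//   }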
// ========================================================================
// REFILL CONTRACT: ss_refill_fc_fill() - Standard Refill Entry Point
// ========================================================================
//
// This is the CANONICAL refill function for the Front-Direct architecture.
// All allocation refills should route through this function when:
// - Front-Direct mode is active
// - Batch refill mode is active
// - P0 direct FastCache path is compiled in
//
// Architecture: SuperSlab → FastCache (1-hop, bypasses SLL)
//
// Replaces legacy 2-hop path: SuperSlab → SLL → FastCache
//
// Box Boundaries:
// - Input: class_idx (0-7), want (target refill count)
// - Output: BASE pointers pushed to FastCache (header at ptr-1 for C1-C6)
// - Side Effects: Updates meta->used, meta->carved, ss->total_active_blocks
//
// Guarantees:
// - Remote drain at threshold (default: 32 blocks)
// - Freelist priority (reuse before carve)
// - Header restoration for classes 1-6 (NOT class 0 or 7)
// - Atomic active counter updates (thread-safe)
// - Fail-fast on capacity exhaustion (no infinite loops)
//
// ========================================================================
/**
* ss_refill_fc_fill - Refill FastCache directly from SuperSlab
*
* @param class_idx Size class index (0-7)
* @param want Target number of blocks to refill
* @return Number of blocks successfully pushed to FastCache
*
* Algorithm:
* 1. Check TLS slab availability (call superslab_refill if needed)
* 2. Remote drain if pending count >= threshold
* 3. Refill loop (while produced < want and FC has room):
* a. Try pop from freelist (O(1))
* b. Try carve from slab (O(1))
* c. Call superslab_refill if slab exhausted
 *    d. Push to FastCache (roll back and stop if full)
 *    e. Restore header for classes 1-6 (NOT 0 or 7)
 * 4. Update active counter (per SuperSlab: flushed on slab switch and after the loop)
* 5. Return produced count
*
* Box Contract:
* - Input: valid class_idx (0 <= idx < TINY_NUM_CLASSES)
* - Output: BASE pointers (header at ptr-1 for classes 1-6)
* - Invariants: meta->used, meta->carved consistent
* - Side effects: Updates ss->total_active_blocks
*/
static inline int ss_refill_fc_fill(int class_idx, int want) {
    // ========== Step 1: Check TLS slab ==========
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    SuperSlab* ss = tls->ss;
    TinySlabMeta* meta = tls->meta;

    // If no TLS slab is configured, attempt a refill
    if (!ss || !meta) {
        ss = superslab_refill(class_idx);
        if (!ss) return 0; // Failed to get a SuperSlab

        // Reload TLS state after superslab_refill
        tls = &g_tls_slabs[class_idx];
        ss = tls->ss;
        meta = tls->meta;

        // Safety check after reload
        if (!ss || !meta) return 0;
    }

    int slab_idx = tls->slab_idx;
    if (slab_idx < 0) return 0; // Invalid slab index

    // ========== Step 2: Remote Drain (if needed) ==========
    uint32_t remote_cnt = atomic_load_explicit(&ss->remote_counts[slab_idx],
                                               memory_order_acquire);
    if (remote_cnt >= (uint32_t)REMOTE_DRAIN_THRESHOLD) {
        _ss_remote_drain_to_freelist_unsafe(ss, slab_idx, meta);
    }

    // ========== Step 3: Refill Loop ==========
    int produced = 0;            // Total blocks pushed to FastCache (return value)
    int produced_this_slab = 0;  // Blocks pushed from the current ss (Step 4 accounting)
    size_t stride = tiny_stride_for_class(class_idx);
    uint8_t* slab_base = tiny_slab_base_for_geometry(ss, slab_idx);

    while (produced < want) {
        void* p = NULL;
        int from_freelist = 0;

        // Option A: Pop from freelist (if available)
        if (meta->freelist != NULL) {
            p = meta->freelist;
            meta->freelist = tiny_next_read(class_idx, p);
            meta->used++;
            from_freelist = 1;
        }
        // Option B: Carve a new block (if capacity available)
        else if (meta->carved < meta->capacity) {
            p = (void*)(slab_base + (meta->carved * stride));
            meta->carved++;
            meta->used++;
        }
        // Option C: Slab exhausted, need a new slab
        else {
            // Credit blocks produced so far to the SuperSlab they came from
            // BEFORE it is replaced (ss may also become NULL on failure below).
            if (produced_this_slab > 0) {
                ss_active_add(ss, (uint32_t)produced_this_slab);
                produced_this_slab = 0;
            }
            ss = superslab_refill(class_idx);
            if (!ss) break; // Failed to get a new slab

            // Reload TLS state after superslab_refill
            tls = &g_tls_slabs[class_idx];
            ss = tls->ss;
            meta = tls->meta;
            slab_idx = tls->slab_idx;

            // Safety check after reload
            if (!ss || !meta || slab_idx < 0) break;

            // Update stride/base for the new slab
            stride = tiny_stride_for_class(class_idx);
            slab_base = tiny_slab_base_for_geometry(ss, slab_idx);
            continue; // Retry allocation from the new slab
        }

        // ========== Step 3d: Push to FastCache ==========
        // Push BEFORE restoring the header: the freelist next link can overlap
        // the header byte (which is why headers need restoring at all), so the
        // block must stay untouched until we know the push has succeeded.
        if (!fastcache_push(class_idx, p)) {
            // FastCache full (uncommon): roll back this block's accounting and stop.
            meta->used--;
            if (from_freelist) {
                // The block's embedded next link is still intact (only the
                // head pointer was advanced), so restoring the head fully
                // reinstates the freelist.
                meta->freelist = p;
            } else {
                // The block was carved in this iteration; undo the carve.
                meta->carved--;
            }
            break;
        }

        // ========== Step 3e: Restore Header (classes 1-6 only) ==========
#if HAKMEM_TINY_HEADER_CLASSIDX
        // Phase E1-CORRECT: Restore headers for classes 1-6
        // Rationale:
        // - Class 0 (8B): Never had a header (too small, 12.5% overhead)
        // - Classes 1-6: Standard header (0.8-6% overhead)
        // - Class 7 (1KB): Headerless by design (mimalloc compatibility)
        //
        // Note: Freelist links may have clobbered the header byte, so it is
        // restored here, after the push has succeeded.
        if (class_idx >= 1 && class_idx <= 6) {
            *(uint8_t*)p = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
        }
#endif
        produced++;
        produced_this_slab++;
    }

    // ========== Step 4: Update Active Counter ==========
    if (produced_this_slab > 0) {
        ss_active_add(ss, (uint32_t)produced_this_slab);
    }

    // ========== Step 5: Return ==========
    return produced;
}
// ============================================================================
// Performance Notes
// ============================================================================
//
// Expected Performance Improvement:
// - Before (2-hop path): SS → SLL → FC
// * Overhead: SLL list traversal, cache misses, branch mispredicts
// * Latency: ~50-100 cycles per block
//
// - After (1-hop path): SS → FC
// * Overhead: Direct array push
// * Latency: ~10-20 cycles per block
// * Improvement: 50-80% reduction in refill latency
//
// Memory Impact:
// - Zero additional memory (reuses existing FastCache)
// - Reduced pressure on SLL (can potentially shrink SLL capacity)
//
// Thread Safety:
// - All operations on TLS structures (no locks needed)
// - Remote drain uses unsafe variant (OK for TLS context)
// - Active counter updates use atomic add (safe); see the sketch below
//
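// For reference, the atomic add mentioned above can be pictured as follows
// (illustrative sketch only - the real ss_active_add and the type of
// total_active_blocks are defined elsewhere in hakmem and may differ,
// including the memory order):
//
//   static inline void ss_active_add(SuperSlab* ss, uint32_t n) {
//       atomic_fetch_add_explicit(&ss->total_active_blocks, n,
//                                 memory_order_relaxed);
//   }
//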
// ============================================================================
// Integration Notes
// ============================================================================
//
// Usage Example (from allocation hot path):
// void* p = fastcache_pop(class_idx);
// if (!p) {
// ss_refill_fc_fill(class_idx, 16); // Refill 16 blocks
// p = fastcache_pop(class_idx); // Try again
// }
//
// Tuning Parameters:
// - REMOTE_DRAIN_THRESHOLD: Default 32, override via build flag if needed
// - want parameter: Recommended 8-32 blocks (balances refill overhead against
//   hit rate); see the fuller hot-path sketch below
//
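// A slightly fuller version of the hot-path sketch above, with the
// out-of-memory case handled (illustrative; error handling in the real hot
// path may differ):
//
//   void* p = fastcache_pop(class_idx);
//   if (!p) {
//       if (ss_refill_fc_fill(class_idx, 16) == 0)
//           return NULL;               // SuperSlab exhausted
//       p = fastcache_pop(class_idx);  // refill ran in this thread, so the
//                                      // cache now holds at least one block
//   }
//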
// ============================================================================
#endif // HAK_REFILL_SS_REFILL_FC_H