# L1D Cache Miss Optimization - Quick Start Implementation Guide

**Target**: +35-50% performance gain in 1-2 days
**Priority**: P0 (Critical Path)
**Difficulty**: Medium (6-8 hours implementation, 2-3 hours testing)

---

## Phase 1: Prefetch Optimization (2-3 hours, +8-12% gain)

### Step 1.1: Add Prefetch to Refill Path

**File**: `core/hakmem_tiny_refill_p0.inc.h`
**Function**: `sll_refill_batch_from_ss()`
**Line**: ~60-70

**Code Change**:

```c
static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
    // ... existing validation ...

    TinyTLSSlab* tls = &g_tls_slabs[class_idx];

    // ✅ NEW: Prefetch SuperSlab hot fields (slab_bitmap, nonempty_mask, freelist_mask)
    if (tls->ss) {
        // Prefetch cache line 0 of the SuperSlab (contains all hot bitmasks).
        // rw = 0 (read-only), locality = 3 (high temporal locality).
        __builtin_prefetch(&tls->ss->slab_bitmap, 0, 3);
    }

    if (!tls->ss) {
        if (!superslab_refill(class_idx)) {
            return 0;
        }
        // ✅ NEW: Prefetch again after refill (the ss pointer changed)
        if (tls->ss) {
            __builtin_prefetch(&tls->ss->slab_bitmap, 0, 3);
        }
    }

    TinySlabMeta* meta = tls->meta;
    if (!meta) return 0;

    // ✅ NEW: Prefetch SlabMeta hot fields (freelist, used, capacity)
    __builtin_prefetch(&meta->freelist, 0, 3);

    // ... rest of refill logic ...
}
```
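
For reference, `__builtin_prefetch(addr, rw, locality)` is the GCC/Clang builtin used above: `rw` is 0 for a read hint and 1 for a write hint, and `locality` runs from 0 (no temporal locality, evict soon) to 3 (keep in all cache levels). Both hint arguments must be compile-time constants, and a prefetch is only a hint: it costs an instruction slot but never changes program behavior. A minimal standalone illustration (hypothetical array walk, not allocator code; `PREFETCH_DIST` is a tuning assumption):

```c
#include <stddef.h>

// Hypothetical: how far ahead to prefetch; tune per workload.
#define PREFETCH_DIST 8

static long sum_with_prefetch(const long* a, size_t n) {
    long total = 0;
    for (size_t i = 0; i < n; i++) {
        if (i + PREFETCH_DIST < n) {
            // rw = 0 (read), locality = 3 (high): pulls a later element's
            // cache line toward L1 while the current iterations execute.
            __builtin_prefetch(&a[i + PREFETCH_DIST], 0, 3);
        }
        total += a[i];
    }
    return total;
}
```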

**Expected Impact**: -10-15% L1D miss rate, +8-12% throughput

---

### Step 1.2: Add Prefetch to Allocation Path

**File**: `core/tiny_alloc_fast.inc.h`
**Function**: `tiny_alloc_fast()`
**Line**: ~510-530

**Code Change**:

```c
static inline void* tiny_alloc_fast(size_t size) {
    // ... size → class_idx conversion ...

    // ✅ NEW: Prefetch TLS cache head (likely already in L1, but hints to the CPU)
    __builtin_prefetch(&g_tls_sll_head[class_idx], 0, 3);

    void* ptr = NULL;

    // Generic front (FastCache/SFC/SLL)
    if (__builtin_expect(g_tls_sll_enable, 1)) {
        if (class_idx <= 3) {
            ptr = tiny_alloc_fast_pop(class_idx);
        } else {
            void* base = NULL;
            if (tls_sll_pop(class_idx, &base)) ptr = base;
        }

        // ✅ NEW: If the pop succeeded, prefetch the next freelist entry
        // (the new head) so the next allocation's dereference hits L1.
        // (Prefetching ptr itself would be redundant: the pop just read *ptr.)
        if (ptr) {
            void* nxt = g_tls_sll_head[class_idx];
            if (nxt) __builtin_prefetch(nxt, 0, 3);
        }
    }

    if (__builtin_expect(ptr != NULL, 1)) {
        HAK_RET_ALLOC(class_idx, ptr);
    }

    // ... refill logic ...
}
```

**Expected Impact**: -5-8% L1D miss rate (next-entry prefetch), +4-6% throughput

---

### Step 1.3: Build & Test Prefetch Changes

```bash
# Build with prefetch enabled
./build.sh bench_random_mixed_hakmem

# Benchmark before (baseline)
# (run this against the pre-change build, e.g. a saved binary or the parent commit)
perf stat -e L1-dcache-loads,L1-dcache-load-misses,cycles,instructions \
    -r 10 ./out/release/bench_random_mixed_hakmem 1000000 256 42 \
    2>&1 | tee /tmp/baseline_prefetch.txt

# Benchmark after (with prefetch)
# (no rebuild needed, prefetch is always-on in the new build)
perf stat -e L1-dcache-loads,L1-dcache-load-misses,cycles,instructions \
    -r 10 ./out/release/bench_random_mixed_hakmem 1000000 256 42 \
    2>&1 | tee /tmp/optimized_prefetch.txt

# Compare results
echo "=== L1D Miss Rate Comparison ==="
grep "L1-dcache-load-misses" /tmp/baseline_prefetch.txt
grep "L1-dcache-load-misses" /tmp/optimized_prefetch.txt

# Expected: Miss rate 1.69% → 1.45-1.55% (-10-15%)
```

**Validation**:
- L1D miss rate should decrease by 10-15%
- Throughput should increase by 8-12%
- No crashes, no memory leaks (run the AddressSanitizer build)

---

## Phase 2: Hot/Cold SlabMeta Split (4-6 hours, +15-20% gain)

### Step 2.1: Define New Structures

**File**: `core/superslab/superslab_types.h`
**After**: Line 18 (after the `TinySlabMeta` definition)

**Code Change**:

```c
// Original structure (DEPRECATED, keep for migration)
typedef struct TinySlabMeta {
    void*    freelist;      // NULL = bump-only, non-NULL = freelist head
    uint16_t used;          // blocks currently allocated from this slab
    uint16_t capacity;      // total blocks this slab can hold
    uint8_t  class_idx;     // owning tiny class (Phase 12: per-slab)
    uint8_t  carved;        // carve/owner flags
    uint8_t  owner_tid_low; // low 8 bits of owner TID (debug / locality)
} TinySlabMeta;

// ✅ NEW: Split into HOT and COLD structures

// HOT fields (accessed on every alloc/free)
typedef struct TinySlabMetaHot {
    void*    freelist;  // 8B ⭐ HOT: freelist head
    uint16_t used;      // 2B ⭐ HOT: current allocation count
    uint16_t capacity;  // 2B ⭐ HOT: total capacity
    uint32_t _pad;      // 4B (maintain 16B alignment for cache efficiency)
} __attribute__((aligned(16))) TinySlabMetaHot;

// COLD fields (accessed rarely: init, debug, stats)
typedef struct TinySlabMetaCold {
    uint8_t class_idx;     // 1B ❄️ COLD: size class (set once)
    uint8_t carved;        // 1B ❄️ COLD: carve flags (rarely changed)
    uint8_t owner_tid_low; // 1B ❄️ COLD: owner TID (debug only)
    uint8_t _reserved;     // 1B (future use)
} __attribute__((packed)) TinySlabMetaCold;

// Validation: Ensure sizes are correct
_Static_assert(sizeof(TinySlabMetaHot) == 16, "TinySlabMetaHot must be 16 bytes");
_Static_assert(sizeof(TinySlabMetaCold) == 4, "TinySlabMetaCold must be 4 bytes");
```

---

### Step 2.2: Update SuperSlab Structure

**File**: `core/superslab/superslab_types.h`
**Replace**: Lines 49-83 (the `SuperSlab` definition)

**Code Change**:

```c
// SuperSlab: backing region for multiple TinySlabMeta+data slices
typedef struct SuperSlab {
    uint32_t magic;   // SUPERSLAB_MAGIC
    uint8_t  lg_size; // log2(superslab size), 20=1MB, 21=2MB
    uint8_t  _pad0[3];

    // Phase 12: per-SS size_class removed; classes are per-slab via TinySlabMeta.class_idx
    _Atomic uint32_t total_active_blocks;
    _Atomic uint32_t refcount;
    _Atomic uint32_t listed;

    uint32_t slab_bitmap;   // active slabs (bit i = 1 → slab i in use)
    uint32_t nonempty_mask; // non-empty slabs (for partial tracking)
    uint32_t freelist_mask; // slabs with non-empty freelist (for fast scan)
    uint8_t  active_slabs;  // count of active slabs
    uint8_t  publish_hint;
    uint16_t partial_epoch;

    struct SuperSlab* next_chunk;   // legacy per-class chain
    struct SuperSlab* partial_next; // partial list link

    // LRU integration
    uint64_t last_used_ns;
    uint32_t generation;
    struct SuperSlab* lru_prev;
    struct SuperSlab* lru_next;

    // Remote free queues (per slab)
    _Atomic uintptr_t remote_heads[SLABS_PER_SUPERSLAB_MAX];
    _Atomic uint32_t  remote_counts[SLABS_PER_SUPERSLAB_MAX];
    _Atomic uint32_t  slab_listed[SLABS_PER_SUPERSLAB_MAX];

    // ✅ NEW: Split hot/cold metadata arrays
    TinySlabMetaHot  slabs_hot[SLABS_PER_SUPERSLAB_MAX];  // 512B (hot path)
    TinySlabMetaCold slabs_cold[SLABS_PER_SUPERSLAB_MAX]; // 128B (cold path)

    // ❌ DEPRECATED: Remove the original slabs[] array
    // TinySlabMeta slabs[SLABS_PER_SUPERSLAB_MAX];
} SuperSlab;

// Validation: Check total size (should be ~1240 bytes now, was 1112 bytes)
_Static_assert(sizeof(SuperSlab) < 1300, "SuperSlab size increased unexpectedly");
```

**Note**: Total size increase: 1112 → 1240 bytes (+128 bytes for the cold array separation). This is acceptable given the cache locality improvement.
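
To keep that layout honest as the struct evolves, a few build-time checks can sit next to the definition. A sketch, assuming the field order above and 64-byte cache lines:

```c
#include <stddef.h>  // offsetof

// Size math behind the note: 32 hot × 16B = 512B plus 32 cold × 4B = 128B,
// replacing the old 32 × 16B slabs[] array (512B) → net +128B (1112 → 1240).
_Static_assert(offsetof(SuperSlab, slabs_hot) % 16 == 0,
               "hot array must start 16B-aligned");
_Static_assert(sizeof(((SuperSlab*)0)->slabs_hot[0]) == 16,
               "one hot entry = a quarter of a 64B cache line");
// Four hot entries per 64B line: slab i's hot metadata lives in line i/4 of
// the hot array, so one miss also warms three neighboring slabs' metadata.
```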

---

### Step 2.3: Add Migration Accessors (Compatibility Layer)

**File**: `core/superslab/superslab_inline.h` (create it if it doesn't exist)

**Code**:

```c
#ifndef SUPERSLAB_INLINE_H
#define SUPERSLAB_INLINE_H

#include "superslab_types.h"

// ============================================================================
// Compatibility Layer: Migrate from TinySlabMeta to the Hot/Cold Split
// ============================================================================
// Usage: Replace `ss->slabs[idx].field` with `ss_meta_get_*(ss, idx)`.
// This allows gradual migration without breaking existing code.

// Get freelist pointer (HOT field)
static inline void* ss_meta_get_freelist(const SuperSlab* ss, int slab_idx) {
    return ss->slabs_hot[slab_idx].freelist;
}

// Set freelist pointer (HOT field)
static inline void ss_meta_set_freelist(SuperSlab* ss, int slab_idx, void* ptr) {
    ss->slabs_hot[slab_idx].freelist = ptr;
}

// Get used count (HOT field)
static inline uint16_t ss_meta_get_used(const SuperSlab* ss, int slab_idx) {
    return ss->slabs_hot[slab_idx].used;
}

// Set used count (HOT field)
static inline void ss_meta_set_used(SuperSlab* ss, int slab_idx, uint16_t val) {
    ss->slabs_hot[slab_idx].used = val;
}

// Increment used count (HOT field, common operation)
static inline void ss_meta_inc_used(SuperSlab* ss, int slab_idx) {
    ss->slabs_hot[slab_idx].used++;
}

// Decrement used count (HOT field, common operation)
static inline void ss_meta_dec_used(SuperSlab* ss, int slab_idx) {
    ss->slabs_hot[slab_idx].used--;
}

// Get capacity (HOT field)
static inline uint16_t ss_meta_get_capacity(const SuperSlab* ss, int slab_idx) {
    return ss->slabs_hot[slab_idx].capacity;
}

// Set capacity (HOT field, set once at init)
static inline void ss_meta_set_capacity(SuperSlab* ss, int slab_idx, uint16_t val) {
    ss->slabs_hot[slab_idx].capacity = val;
}

// Get class_idx (COLD field)
static inline uint8_t ss_meta_get_class_idx(const SuperSlab* ss, int slab_idx) {
    return ss->slabs_cold[slab_idx].class_idx;
}

// Set class_idx (COLD field, set once at init)
static inline void ss_meta_set_class_idx(SuperSlab* ss, int slab_idx, uint8_t val) {
    ss->slabs_cold[slab_idx].class_idx = val;
}

// Get carved flags (COLD field)
static inline uint8_t ss_meta_get_carved(const SuperSlab* ss, int slab_idx) {
    return ss->slabs_cold[slab_idx].carved;
}

// Set carved flags (COLD field)
static inline void ss_meta_set_carved(SuperSlab* ss, int slab_idx, uint8_t val) {
    ss->slabs_cold[slab_idx].carved = val;
}

// Get owner_tid_low (COLD field, debug only)
static inline uint8_t ss_meta_get_owner_tid_low(const SuperSlab* ss, int slab_idx) {
    return ss->slabs_cold[slab_idx].owner_tid_low;
}

// Set owner_tid_low (COLD field, debug only)
static inline void ss_meta_set_owner_tid_low(SuperSlab* ss, int slab_idx, uint8_t val) {
    ss->slabs_cold[slab_idx].owner_tid_low = val;
}

// ============================================================================
// Direct Access Macros (for the performance-critical hot path)
// ============================================================================
// Use with caution: no bounds checking!
#define SS_META_HOT(ss, idx)  (&(ss)->slabs_hot[idx])
#define SS_META_COLD(ss, idx) (&(ss)->slabs_cold[idx])

#endif // SUPERSLAB_INLINE_H
```

---

### Step 2.4: Migrate Critical Hot Path (Refill Code)

**File**: `core/hakmem_tiny_refill_p0.inc.h`
**Function**: `sll_refill_batch_from_ss()`

**Example Migration** (before/after):

```c
// BEFORE (direct field access):
if (meta->used >= meta->capacity) {
    // slab full
}
meta->used += batch_count;

// AFTER (use accessors):
if (ss_meta_get_used(tls->ss, tls->slab_idx) >=
    ss_meta_get_capacity(tls->ss, tls->slab_idx)) {
    // slab full
}
ss_meta_set_used(tls->ss, tls->slab_idx,
                 ss_meta_get_used(tls->ss, tls->slab_idx) + batch_count);

// OPTIMAL (use the hot pointer macro):
TinySlabMetaHot* hot = SS_META_HOT(tls->ss, tls->slab_idx);
if (hot->used >= hot->capacity) {
    // slab full
}
hot->used += batch_count;
```

**Migration Strategy**:
1. Day 1 morning: Add the accessors (Step 2.3) and update the SuperSlab struct (Step 2.2)
2. Day 1 afternoon: Migrate the 3-5 critical hot-path functions (refill, alloc, free)
3. Day 1 evening: Build, test, benchmark

**Files to Migrate** (priority order; a free-path sketch follows the list):
1. ✅ `core/hakmem_tiny_refill_p0.inc.h` - Refill path (CRITICAL)
2. ✅ `core/tiny_free_fast.inc.h` - Free path (CRITICAL)
3. ✅ `core/hakmem_tiny_superslab.c` - Carve logic (HIGH)
4. 🟡 Other files can keep the legacy `meta->field` access (migrate gradually)
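
The free path migrates the same way. A minimal sketch of its hot fragment after the split (hypothetical helper name; the real `tiny_free_fast_ss()` also handles remote frees and mask updates):

```c
// Hypothetical: push a freed block onto its slab's freelist after the
// hot/cold split. ss, slab_idx, and base are assumed already resolved.
static inline void tiny_free_fast_slab_push(SuperSlab* ss, int slab_idx, void* base) {
    TinySlabMetaHot* hot = SS_META_HOT(ss, slab_idx);  // one 16B hot entry

    *(void**)base = hot->freelist;  // link block to current freelist head
    hot->freelist = base;           // publish the new head
    hot->used--;                    // same cache line as freelist ✅

    // Cold fields (class_idx, carved, owner_tid_low) stay untouched, so the
    // cold array never enters the L1 working set on this path.
}
```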

---

### Step 2.5: Build & Test Hot/Cold Split

```bash
# Build with the hot/cold split
./build.sh bench_random_mixed_hakmem

# Run regression tests
./build.sh test_all

# Run the AddressSanitizer build (catch memory errors)
./build.sh asan bench_random_mixed_hakmem
ASAN_OPTIONS=detect_leaks=1 ./out/asan/bench_random_mixed_hakmem 10000 256 42

# Benchmark
perf stat -e L1-dcache-loads,L1-dcache-load-misses,cycles,instructions \
    -r 10 ./out/release/bench_random_mixed_hakmem 1000000 256 42 \
    2>&1 | tee /tmp/optimized_hotcold.txt

# Compare with the prefetch-only baseline
echo "=== L1D Miss Rate Comparison ==="
echo "Prefetch-only:"
grep "L1-dcache-load-misses" /tmp/optimized_prefetch.txt
echo "Prefetch + Hot/Cold Split:"
grep "L1-dcache-load-misses" /tmp/optimized_hotcold.txt

# Expected: Miss rate 1.45-1.55% → 1.2-1.3% (an additional -15-20%)
```

**Validation Checklist**:
- ✅ L1D miss rate decreased by 15-20% (cumulative: -25-35% from baseline)
- ✅ Throughput increased by 15-20% (cumulative: +25-35% from baseline)
- ✅ No crashes in a 1M-iteration run
- ✅ No memory leaks (AddressSanitizer clean)
- ✅ No corruption (random-seed fuzzing: 100 runs with different seeds)

---

## Phase 3: TLS Cache Merge (Day 2, 6-8 hours, +12-18% gain)

### Step 3.1: Define Merged TLS Cache Structure

**File**: `core/hakmem_tiny.h` (or create `core/tiny_tls_cache.h`)

**Code**:

```c
#ifndef TINY_TLS_CACHE_H
#define TINY_TLS_CACHE_H

#include <stdint.h>

// ============================================================================
// TLS Cache Entry (merged head + count + capacity)
// ============================================================================
// Design: Merge g_tls_sll_head[] and g_tls_sll_count[] into a single structure
// to reduce cache line accesses from 2 to 1.
//
// Layout (16 bytes per class, 4 classes per cache line):
//   Cache line 0: classes 0-3 (64 bytes)
//   Cache line 1: classes 4-7 (64 bytes)
//
// Before: 2 cache lines (head[] and count[] separate)
// After:  1 cache line (head and count share the same line!)

typedef struct TLSCacheEntry {
    void*    head;     // 8B ⭐ HOT: TLS freelist head pointer
    uint32_t count;    // 4B ⭐ HOT: current TLS freelist count
    uint16_t capacity; // 2B ⭐ HOT: adaptive TLS capacity (Phase 2b)
    uint16_t _pad;     // 2B (alignment padding)
} __attribute__((aligned(16))) TLSCacheEntry;

// Validation
_Static_assert(sizeof(TLSCacheEntry) == 16, "TLSCacheEntry must be 16 bytes");

// TLS cache array (128 bytes total, 2 cache lines)
#define TINY_NUM_CLASSES 8
extern __thread TLSCacheEntry g_tls_cache[TINY_NUM_CLASSES] __attribute__((aligned(64)));

#endif // TINY_TLS_CACHE_H
```

---

### Step 3.2: Replace TLS Arrays in hakmem_tiny.c

**File**: `core/hakmem_tiny.c`
**Find**: Lines ~1019-1020 (TLS variable declarations)

**BEFORE**:
```c
__thread void* g_tls_sll_head[TINY_NUM_CLASSES] = {0};
__thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0};
```

**AFTER**:
```c
#include "tiny_tls_cache.h"

// ✅ NEW: Unified TLS cache (replaces g_tls_sll_head + g_tls_sll_count)
__thread TLSCacheEntry g_tls_cache[TINY_NUM_CLASSES] __attribute__((aligned(64))) = {{0}};

// ❌ DEPRECATED: Legacy TLS arrays (keep for gradual migration)
// Uncomment to support both old and new code paths simultaneously:
// #define HAKMEM_TLS_MIGRATION_MODE 1
// #if HAKMEM_TLS_MIGRATION_MODE
// __thread void* g_tls_sll_head[TINY_NUM_CLASSES] = {0};
// __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0};
// #endif
```
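
If many call sites still reference the old array names during the migration, thin shims can bridge them onto the merged structure. A sketch (whether rewriting call sites from `g_tls_sll_head[ci]` to `*tls_head(ci)` is acceptable here is an assumption):

```c
// Hypothetical migration shims: old-array semantics on top of g_tls_cache.
static inline void**    tls_head(int class_idx)  { return &g_tls_cache[class_idx].head;  }
static inline uint32_t* tls_count(int class_idx) { return &g_tls_cache[class_idx].count; }

// Example: the old `g_tls_sll_count[ci]++` becomes `(*tls_count(ci))++`.
```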

---

### Step 3.3: Update Allocation Fast Path

**File**: `core/tiny_alloc_fast.inc.h`
**Function**: `tiny_alloc_fast_pop()`

**BEFORE**:
```c
static inline void* tiny_alloc_fast_pop(int class_idx) {
    void* ptr = g_tls_sll_head[class_idx]; // Cache line 0
    if (!ptr) return NULL;
    void* next = *(void**)ptr;             // Random cache line
    g_tls_sll_head[class_idx] = next;      // Cache line 0
    g_tls_sll_count[class_idx]--;          // Cache line 1 ❌
    return ptr;
}
```

**AFTER**:
```c
static inline void* tiny_alloc_fast_pop(int class_idx) {
    TLSCacheEntry* cache = &g_tls_cache[class_idx]; // Cache line 0 or 1
    void* ptr = cache->head;                        // SAME cache line ✅
    if (!ptr) return NULL;
    void* next = *(void**)ptr;                      // Random (unchanged)
    cache->head = next;                             // SAME cache line ✅
    cache->count--;                                 // SAME cache line ✅
    return ptr;
}
```

**Performance Impact**: TLS metadata now touches 1 cache line per allocation instead of 2!
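
The 4-entries-per-line layout can be made compiler-checked rather than assumed. A small sketch, relying only on the definitions from Step 3.1:

```c
// Build-time checks that the merged TLS cache packs as intended.
_Static_assert(sizeof(TLSCacheEntry) * 4 == 64,
               "4 TLS entries must fill exactly one 64B cache line");
_Static_assert(sizeof(g_tls_cache) == 128,
               "8 classes -> 2 cache lines total");
// With __attribute__((aligned(64))) on the array, entry i lives entirely in
// cache line i/4, so head, count, and capacity can never straddle lines.
```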

---

### Step 3.4: Update Free Fast Path

**File**: `core/tiny_free_fast.inc.h`
**Function**: `tiny_free_fast_ss()`

**BEFORE**:
```c
void* head = g_tls_sll_head[class_idx]; // Cache line 0
*(void**)base = head;                   // Write to block
g_tls_sll_head[class_idx] = base;       // Cache line 0
g_tls_sll_count[class_idx]++;           // Cache line 1 ❌
```

**AFTER**:
```c
TLSCacheEntry* cache = &g_tls_cache[class_idx]; // Cache line 0 or 1
void* head = cache->head;                       // SAME cache line ✅
*(void**)base = head;                           // Write to block
cache->head = base;                             // SAME cache line ✅
cache->count++;                                 // SAME cache line ✅
```
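
Refill is the remaining writer of these fields: after pulling a batch from the SuperSlab, it splices the harvested chain into the TLS cache. A sketch of that push under the merged layout (hypothetical helper; assumes the refill code already linked the `n` harvested blocks from `first` to `last`):

```c
// Hypothetical: splice a pre-linked chain [first .. last] of n blocks
// into the TLS cache in O(1), touching only one metadata cache line.
static inline void tls_cache_push_batch(int class_idx,
                                        void* first, void* last, uint32_t n) {
    TLSCacheEntry* cache = &g_tls_cache[class_idx];
    *(void**)last = cache->head;  // chain tail -> old head
    cache->head   = first;        // SAME cache line ✅
    cache->count += n;            // SAME cache line ✅
}
```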

---

### Step 3.5: Build & Test TLS Cache Merge

```bash
# Build with the TLS cache merge
./build.sh bench_random_mixed_hakmem

# Regression tests
./build.sh test_all
./build.sh asan bench_random_mixed_hakmem
ASAN_OPTIONS=detect_leaks=1 ./out/asan/bench_random_mixed_hakmem 10000 256 42

# Benchmark
perf stat -e L1-dcache-loads,L1-dcache-load-misses,cycles,instructions \
    -r 10 ./out/release/bench_random_mixed_hakmem 1000000 256 42 \
    2>&1 | tee /tmp/optimized_tls_merge.txt

# Compare cumulative improvements
echo "=== Cumulative L1D Optimization Results ==="
echo "Baseline (no optimizations):"
grep "dcache-load-misses\|operations per second" /tmp/baseline_prefetch.txt
echo ""
echo "After Prefetch:"
grep "dcache-load-misses\|operations per second" /tmp/optimized_prefetch.txt
echo ""
echo "After Hot/Cold Split:"
grep "dcache-load-misses\|operations per second" /tmp/optimized_hotcold.txt
echo ""
echo "After TLS Merge (FINAL):"
grep "dcache-load-misses\|operations per second" /tmp/optimized_tls_merge.txt
```

**Expected Results**:

| Stage | L1D Miss Rate | Throughput | Improvement |
|-------|---------------|------------|-------------|
| Baseline | 1.69% | 24.9M ops/s | - |
| + Prefetch | 1.45-1.55% | 27-28M ops/s | +8-12% |
| + Hot/Cold Split | 1.2-1.3% | 31-34M ops/s | +25-35% |
| + TLS Merge | **1.0-1.1%** | **34-37M ops/s** | **+36-49%** 🎯 |

---

## Final Validation & Deployment

### Validation Checklist (Before Merge to main)

- [ ] **Performance**: Throughput > 34M ops/s (+36% minimum)
- [ ] **L1D Misses**: Miss rate < 1.1% (down from 1.69%)
- [ ] **Correctness**: All tests pass (unit, integration, regression)
- [ ] **Memory Safety**: AddressSanitizer clean (no leaks, no overflows)
- [ ] **Stability**: 1-hour stress test (100M ops, no crashes)
- [ ] **Multi-threaded**: Larson 4T benchmark stable (no deadlocks)

### Rollback Plan

If any issues occur, rollback is simple: each phase lands as its own commit, so a problem can be undone by reverting that commit.

1. **Rollback TLS Merge** (Phase 3):
   ```bash
   git revert <tls_merge_commit>
   ./build.sh bench_random_mixed_hakmem
   ```

2. **Rollback Hot/Cold Split** (Phase 2):
   ```bash
   git revert <hotcold_split_commit>
   ./build.sh bench_random_mixed_hakmem
   ```

3. **Rollback Prefetch** (Phase 1):
   ```bash
   git revert <prefetch_commit>
   ./build.sh bench_random_mixed_hakmem
   ```

Revert the most recent phase first: later phases touch code introduced by earlier ones (for example, Phase 3 rewrites the TLS arrays that Phase 1's prefetch references), so reverting out of order can conflict.

---

## Next Steps (After P1 Quick Wins)

Once P1 is complete and validated (+36-49% gain), proceed to the **Priority 2 optimizations**:

1. **Proposal 2.1**: SuperSlab Hot Field Clustering (3-4 days, +18-25% additional)
2. **Proposal 2.2**: Dynamic SlabMeta Allocation (1-2 days, +20-28% additional)

**Cumulative target**: 42-50M ops/s (+70-100% total) within 1 week.

See `L1D_CACHE_MISS_ANALYSIS_REPORT.md` for the full roadmap and Priority 2-3 details.

---

## Support & Troubleshooting

### Common Issues

1. **Build Error: `TinySlabMetaHot` undeclared**
   - Ensure `#include "superslab/superslab_inline.h"` is present in the affected files
   - Check that `superslab_types.h` has the correct structure definitions

2. **Perf Regression: Throughput decreased**
   - Likely a cache line alignment issue
   - Verify `__attribute__((aligned(64)))` on `g_tls_cache[]`
   - Check `pahole` output for struct sizes

3. **AddressSanitizer Error: Stack buffer overflow**
   - Check that all `ss->slabs_hot[idx]` accesses have bounds checks
   - Verify `SLABS_PER_SUPERSLAB_MAX` is correct (32)

4. **Segfault in refill path**
   - Likely a NULL pointer dereference (`tls->ss` or `meta`)
   - Add NULL checks before prefetch calls (see the sketch below)
   - Validate `slab_idx` is in range [0, 31]
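
A defensive pattern covering the last two points, as a sketch (uses the standard `assert`; a project-specific debug macro could be substituted):

```c
#include <assert.h>

// Hypothetical guarded-prefetch helper for the refill path.
static inline void ss_prefetch_slab_hot(SuperSlab* ss, int slab_idx) {
    if (!ss) return;  // tolerate a NULL SuperSlab: nothing to warm
    assert(slab_idx >= 0 && slab_idx < SLABS_PER_SUPERSLAB_MAX);  // [0, 31]
    __builtin_prefetch(&ss->slabs_hot[slab_idx], 0, 3);  // read, high locality
}
```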

### Debug Commands

```bash
# Check struct sizes and alignment
pahole ./out/release/bench_random_mixed_hakmem | grep -A 20 "struct SuperSlab"
pahole ./out/release/bench_random_mixed_hakmem | grep -A 10 "struct TLSCacheEntry"

# Profile the L1D cache miss pattern
perf record -e mem_load_retired.l1_miss -c 1000 \
    ./out/release/bench_random_mixed_hakmem 100000 256 42
perf report --stdio --sort symbol

# Verify TLS cache alignment
gdb ./out/release/bench_random_mixed_hakmem
(gdb) break main
(gdb) run 1000 256 42
(gdb) info threads
(gdb) thread 1
(gdb) p &g_tls_cache[0]
# The address should be 64-byte aligned (low 6 bits = 0)
```

---

**Good luck!** 🚀 Expecting +36-49% gain within 1-2 days of focused implementation.