Major Features: - Debug counter infrastructure for Refill Stage tracking - Free Pipeline counters (ss_local, ss_remote, tls_sll) - Diagnostic counters for early return analysis - Unified larson.sh benchmark runner with profiles - Phase 6-3 regression analysis documentation Bug Fixes: - Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB) - Fix profile variable naming consistency - Add .gitignore patterns for large files Performance: - Phase 6-3: 4.79 M ops/s (has OOM risk) - With SuperSlab: 3.13 M ops/s (+19% improvement) This is a clean repository without large log files. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
134 lines
6.1 KiB
C
134 lines
6.1 KiB
C
#include <stdio.h>
|
||
|
||
/*
 * Prints a step-by-step, back-of-the-envelope investigation of where the
 * measured per-allocation memory overhead comes from for the 16-byte size
 * class, assuming a workload of one million live allocations.
 *
 * The report walks through each candidate overhead source in turn (bitmaps,
 * slab struct, registry entry, TLS magazine, slab rounding waste, slab
 * pre-allocation, the benchmark's own pointer array) and prints the amortized
 * cost of each. All figures are derived from the constants declared at the
 * top of the function; nothing is measured at runtime.
 *
 * Returns 0 unconditionally.
 */
int main(void) {
    /* Workload and slab layout for the 16B class (class 1). */
    const int block_size = 16;          /* bytes per block */
    const int total_allocs = 1000000;   /* live allocations in the workload */
    const int blocks_per_slab = 4096;
    const int slab_size = 64 * 1024;    /* bytes */

    /* TLS magazine layout: one array of pointers per size class. */
    const int mag_capacity = 2048;      /* items per class */
    const int mag_classes = 8;
    const int ptr_size = 8;             /* bytes per stored pointer */

    printf("=== WHERE DOES 24.4 BYTES/ALLOCATION COME FROM? ===\n\n");

    printf("Slab configuration (16B class):\n");
    printf(" Blocks per slab: %d\n", blocks_per_slab);
    printf(" Slab size: %d KB\n\n", slab_size / 1024);

    /* ---- Per-block metadata overhead ---- */
    printf("Per-block overhead breakdown:\n\n");

    /* 1. Primary bitmap: 1 bit per block = 0.125 bytes. */
    double bitmap_per_block = 1.0 / 8.0;
    printf("1. Primary bitmap: 1 bit/block = %.3f bytes\n", bitmap_per_block);

    /*
     * 2. Summary bitmap: 4096 blocks -> 64 bitmap words -> 1 summary word of
     * 64 bits. Dividing by 8 converts bits to bytes, then amortize per block.
     */
    double summary_per_block = 64.0 / (blocks_per_slab * 8.0);
    printf("2. Summary bitmap: %.3f bytes\n", summary_per_block);

    /* 3. TinySlab metadata: 88 bytes per slab, amortized over its blocks. */
    double slab_meta_per_block = 88.0 / blocks_per_slab;
    printf("3. TinySlab struct: 88B / %d = %.3f bytes\n", blocks_per_slab, slab_meta_per_block);

    /* 4. Registry entry: assume one 16-byte entry per slab. */
    double registry_per_block = 16.0 / blocks_per_slab;
    printf("4. Registry entry: 16B / %d = %.3f bytes\n", blocks_per_slab, registry_per_block);

    /*
     * 5. TLS magazine: this is a per-thread cost, not a per-block one. For
     * the single-threaded case it is amortized over all allocations
     * (2048 x 8B x 8 classes = 128 KB).
     */
    double tls_mag_per_block =
        (double)(mag_capacity * ptr_size * mag_classes) / total_allocs;
    printf("5. TLS Magazine: 128KB / 1M blocks = %.3f bytes (amortized)\n", tls_mag_per_block);

    /* ---- 6. Hidden cost: rounding the block count up to whole slabs ---- */
    printf("\n=== THE REAL CULPRIT ===\n\n");

    /* Ceiling division: 245 slabs -> 1,003,520 blocks -> 3,520 unused. */
    int slabs_needed = (total_allocs + blocks_per_slab - 1) / blocks_per_slab;
    int total_blocks_allocated = slabs_needed * blocks_per_slab;
    int wasted_blocks = total_blocks_allocated - total_allocs;

    printf("Slab allocation analysis:\n");
    printf(" Blocks needed: 1,000,000\n");
    printf(" Slabs allocated: %d × %d blocks = %d total blocks\n",
           slabs_needed, blocks_per_slab, total_blocks_allocated);
    printf(" Wasted blocks: %d (%.1f%% waste)\n", wasted_blocks,
           wasted_blocks * 100.0 / total_blocks_allocated);
    printf(" Wasted space: %d blocks × 16B = %.2f KB\n\n",
           wasted_blocks, wasted_blocks * (double)block_size / 1024);

    /* Candidate root cause: slab larger than the data it holds. */
    double slab_data_kb = blocks_per_slab * (double)block_size / 1024;

    printf("ROOT CAUSE: Oversized slab allocation\n");
    printf(" Each slab: 64 KB (data + metadata + waste)\n");
    printf(" Each slab actually uses: %d blocks × 16B = %.1f KB of data\n",
           blocks_per_slab, slab_data_kb);
    printf(" Per-slab overhead: 64 KB - %.1f KB = %.1f KB\n\n",
           slab_data_kb, slab_size / 1024 - slab_data_kb);

    /*
     * ...which does not hold for the 16B class: 4096 x 16 = 65536 bytes is
     * exactly one 64 KB slab, so the hypothesis is refuted in the output.
     */
    printf("Wait... 4096 × 16B = %d bytes = 64 KB exactly!\n", blocks_per_slab * block_size);
    printf("So there's NO wasted space in the slab data region.\n\n");

    printf("=== RETHINKING THE PROBLEM ===\n\n");

    /* Check whether the TLS magazine's own footprint is the issue. */
    int mag_bytes = mag_capacity * ptr_size * mag_classes;

    printf("TLS Magazine deep dive:\n");
    printf(" Capacity: 2048 items per class\n");
    printf(" Classes: 8\n");
    printf(" Size per item: 8 bytes (pointer)\n");
    printf(" Total per thread: 2048 × 8B × 8 = %.0f KB\n", mag_bytes / 1024.0);
    printf(" For 1 thread: %.0f KB = %.2f MB\n\n", mag_bytes / 1024.0, mag_bytes / (1024.0 * 1024));

    /*
     * 128 KB per thread matches item 5 above, but spread over 1M allocations
     * that is only about 0.13 bytes each.
     */
    printf("=== MYSTERY: Where are the other 24 bytes? ===\n\n");

    /* Hypothesis: blocks parked in the magazine inflate the measured RSS. */
    double mag_held_kb = mag_capacity * (double)block_size / 1024;

    printf("Hypothesis: TLS Magazine is HOLDING allocations\n");
    printf(" If TLS Magazine holds 2048 × 16B = %.1f KB per class\n", mag_held_kb);
    printf(" For class 1 (16B): 2048 items = %.1f KB of DATA\n", mag_held_kb);
    printf(" But we measured TOTAL RSS, which includes magazine contents!\n\n");

    printf("Testing theory:\n");
    printf(" At 1M allocations:\n");
    printf(" - Active in program: 1M × 16B = 15.26 MB\n");
    printf(" - Held in TLS mag: ~2048 × 16B × 8 classes = %.2f MB\n",
           mag_capacity * block_size * mag_classes / (1024.0 * 1024));
    printf(" - But wait, TLS mag only holds FREED items, not allocated!\n\n");

    /* Next candidate: slabs pre-allocated at init time. */
    printf("Let me check the init code...\n");
    printf("From hakmem_tiny.c line 568-574:\n");
    printf(" Pre-allocate slabs for classes 0-3 (8B, 16B, 32B, 64B)\n");
    printf(" That's 4 × 64KB = 256 KB upfront!\n\n");

    printf("Pre-allocation cost:\n");
    printf(" 4 slabs × 64 KB = %.2f MB\n", 4 * (slab_size / 1024) / 1024.0);
    printf(" But this is FIXED, not per-allocation.\n\n");

    /* Final candidate: the benchmark's own array of returned pointers. */
    printf("=== THE ANSWER ===\n");
    printf("The 24.4 bytes/allocation must be in the PROGRAM's working set,\n");
    printf("not HAKMEM's metadata. Let me check if it's the POINTER ARRAY!\n\n");

    printf("Pointer array overhead:\n");
    printf(" void** ptrs = malloc(1M × 8 bytes) = %.2f MB\n",
           total_allocs * ptr_size / (1024.0 * 1024));
    printf(" This is 8 bytes per allocation!\n\n");

    /*
     * The MB figures below are quoted measurements embedded in the report
     * text; only the final MB -> bytes-per-allocation conversion is computed.
     */
    printf("Revised calculation:\n");
    printf(" Data: 1M × 16B = 15.26 MB\n");
    printf(" Pointer array: 1M × 8B = 7.63 MB\n");
    printf(" Expected total (data + ptrs): 22.89 MB\n");
    printf(" Actual measured: 39.60 MB\n");
    printf(" Real overhead: 39.60 - 22.89 = 16.71 MB\n");
    printf(" Per-allocation: 16.71 MB / 1M = %.1f bytes\n\n", 16.71 * 1024 * 1024 / 1000000.0);

    return 0;
}
|