Files
hakmem/archive/tools/find_24_bytes.c
Moe Charm (CI) 52386401b3 Debug Counters Implementation - Clean History
Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 12:31:14 +09:00

134 lines
6.1 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <stdio.h>
/*
 * Prints a step-by-step estimate of where the measured per-allocation
 * overhead for the 16-byte size class could come from, assuming 1,000,000
 * allocations.
 *
 * The program takes no input: every figure is either a constant declared at
 * the top of the function or derived from those constants. Several figures
 * in the narrative text (e.g. "15.26 MB", "39.60 MB") are hard-coded in the
 * output strings and are NOT recomputed, so changing a constant below does
 * not update them.
 *
 * Returns 0 unconditionally.
 */
int main(void) {
    /* Scenario under analysis: 16B allocations (class 1). */
    const int block_size = 16;           /* bytes per block */
    const int blocks_per_slab = 4096;
    const int slab_size = 64 * 1024;     /* bytes */
    const int alloc_count = 1000000;     /* allocations in the analysed run */

    /* TLS magazine geometry as stated in the printed deep dive. */
    const int mag_capacity = 2048;       /* items per class */
    const int mag_classes = 8;
    const int ptr_bytes = 8;             /* size of one magazine entry */
    const int mag_bytes = mag_capacity * ptr_bytes * mag_classes; /* 128 KB */

    /* Slabs the narrative says are pre-allocated at init (classes 0-3). */
    const int prealloc_slabs = 4;

    printf("=== WHERE DOES 24.4 BYTES/ALLOCATION COME FROM? ===\n\n");

    printf("Slab configuration (16B class):\n");
    printf(" Blocks per slab: %d\n", blocks_per_slab);
    printf(" Slab size: %d KB\n\n", slab_size / 1024);

    /* ---- Per-block metadata overhead -------------------------------- */
    printf("Per-block overhead breakdown:\n\n");

    /* 1. Primary bitmap: 1 bit per block = 0.125 bytes. */
    double bitmap_per_block = 1.0 / 8.0;
    printf("1. Primary bitmap: 1 bit/block = %.3f bytes\n", bitmap_per_block);

    /*
     * 2. Summary bitmap: 4096 blocks -> 64 bitmap words -> 1 summary word
     *    of 64 bits, amortized over every block in the slab (bits / 8 = bytes).
     */
    double summary_per_block = 64.0 / (blocks_per_slab * 8.0);
    printf("2. Summary bitmap: %.3f bytes\n", summary_per_block);

    /* 3. TinySlab metadata: 88 bytes per slab, amortized per block. */
    double slab_meta_per_block = 88.0 / blocks_per_slab;
    printf("3. TinySlab struct: 88B / %d = %.3f bytes\n", blocks_per_slab, slab_meta_per_block);

    /* 4. Registry entry: assumes one 16-byte registry entry per slab. */
    double registry_per_block = 16.0 / blocks_per_slab;
    printf("4. Registry entry: 16B / %d = %.3f bytes\n", blocks_per_slab, registry_per_block);

    /*
     * 5. TLS magazine: a per-thread cost, not a per-block one. For the
     *    single-threaded case it is spread over all allocations.
     */
    double tls_mag_per_block = (double)mag_bytes / alloc_count;
    printf("5. TLS Magazine: 128KB / 1M blocks = %.3f bytes (amortized)\n", tls_mag_per_block);

    /* ---- 6. Slab fragmentation: blocks left unused by whole-slab rounding */
    printf("\n=== THE REAL CULPRIT ===\n\n");

    /* Round the slab count up so the final, partially filled slab is counted. */
    int slabs_needed = (alloc_count + blocks_per_slab - 1) / blocks_per_slab; /* 245 slabs */
    int total_blocks_allocated = slabs_needed * blocks_per_slab;             /* 1,003,520 */
    int wasted_blocks = total_blocks_allocated - alloc_count;                /* 3,520 */

    printf("Slab allocation analysis:\n");
    printf(" Blocks needed: 1,000,000\n");
    printf(" Slabs allocated: %d × %d blocks = %d total blocks\n",
           slabs_needed, blocks_per_slab, total_blocks_allocated);
    printf(" Wasted blocks: %d (%.1f%% waste)\n", wasted_blocks,
           wasted_blocks * 100.0 / total_blocks_allocated);
    printf(" Wasted space: %d blocks × 16B = %.2f KB\n\n",
           wasted_blocks, wasted_blocks * (double)block_size / 1024);

    /* ---- Hypothesis: the slab is larger than the data it holds -------- */
    double slab_data_kb = blocks_per_slab * (double)block_size / 1024;

    printf("ROOT CAUSE: Oversized slab allocation\n");
    printf(" Each slab: 64 KB (data + metadata + waste)\n");
    printf(" Each slab actually uses: %d blocks × 16B = %.1f KB of data\n",
           blocks_per_slab, slab_data_kb);
    printf(" Per-slab overhead: 64 KB - %.1f KB = %.1f KB\n\n",
           slab_data_kb, slab_size / 1024 - slab_data_kb);

    /* The hypothesis fails for this class: 4096 x 16 = 65536 = 64 KB exactly. */
    printf("Wait... 4096 × 16B = %d bytes = 64 KB exactly!\n", blocks_per_slab * block_size);
    printf("So there's NO wasted space in the slab data region.\n\n");

    /* ---- Hypothesis: the TLS magazine itself -------------------------- */
    printf("=== RETHINKING THE PROBLEM ===\n\n");

    printf("TLS Magazine deep dive:\n");
    printf(" Capacity: 2048 items per class\n");
    printf(" Classes: 8\n");
    printf(" Size per item: 8 bytes (pointer)\n");
    printf(" Total per thread: 2048 × 8B × 8 = %.0f KB\n", mag_bytes / 1024.0);
    printf(" For 1 thread: %.0f KB = %.2f MB\n\n", mag_bytes / 1024.0, mag_bytes / (1024.0 * 1024));

    /*
     * 128 KB per thread matches item 5 above, but spread over 1M
     * allocations it is only ~0.13 bytes per allocation.
     */
    printf("=== MYSTERY: Where are the other 24 bytes? ===\n\n");

    /* ---- Hypothesis: blocks parked in the magazine count towards RSS --- */
    double mag_held_kb = mag_capacity * (double)block_size / 1024;

    printf("Hypothesis: TLS Magazine is HOLDING allocations\n");
    printf(" If TLS Magazine holds 2048 × 16B = %.1f KB per class\n", mag_held_kb);
    printf(" For class 1 (16B): 2048 items = %.1f KB of DATA\n", mag_held_kb);
    printf(" But we measured TOTAL RSS, which includes magazine contents!\n\n");

    printf("Testing theory:\n");
    printf(" At 1M allocations:\n");
    printf(" - Active in program: 1M × 16B = 15.26 MB\n");
    printf(" - Held in TLS mag: ~2048 × 16B × 8 classes = %.2f MB\n",
           mag_capacity * block_size * mag_classes / (1024.0 * 1024));
    printf(" - But wait, TLS mag only holds FREED items, not allocated!\n\n");

    /* ---- Hypothesis: slabs pre-allocated at init ----------------------- */
    printf("Let me check the init code...\n");
    printf("From hakmem_tiny.c line 568-574:\n");
    printf(" Pre-allocate slabs for classes 0-3 (8B, 16B, 32B, 64B)\n");
    printf(" That's 4 × 64KB = 256 KB upfront!\n\n");

    printf("Pre-allocation cost:\n");
    printf(" 4 slabs × 64 KB = %.2f MB\n", prealloc_slabs * (slab_size / 1024) / 1024.0);
    printf(" But this is FIXED, not per-allocation.\n\n");

    /* ---- Conclusion: account for the benchmark's own pointer array ----- */
    printf("=== THE ANSWER ===\n");
    printf("The 24.4 bytes/allocation must be in the PROGRAM's working set,\n");
    printf("not HAKMEM's metadata. Let me check if it's the POINTER ARRAY!\n\n");

    printf("Pointer array overhead:\n");
    printf(" void** ptrs = malloc(1M × 8 bytes) = %.2f MB\n",
           alloc_count * ptr_bytes / (1024.0 * 1024));
    printf(" This is 8 bytes per allocation!\n\n");

    /* 16.71 MB is the hard-coded measured-minus-expected figure printed above it. */
    printf("Revised calculation:\n");
    printf(" Data: 1M × 16B = 15.26 MB\n");
    printf(" Pointer array: 1M × 8B = 7.63 MB\n");
    printf(" Expected total (data + ptrs): 22.89 MB\n");
    printf(" Actual measured: 39.60 MB\n");
    printf(" Real overhead: 39.60 - 22.89 = 16.71 MB\n");
    printf(" Per-allocation: 16.71 MB / 1M = %.1f bytes\n\n", 16.71 * 1024 * 1024 / alloc_count);

    return 0;
}