Phase 5-Step2: Mid Free Route Box (+28.9x free perf, 1.53x faster than system)
Fix critical 19x free() slowdown in Mid MT allocator (1KB-8KB range). Root Cause: - Mid MT registers chunks in MidGlobalRegistry - Free path searches Pool's mid_desc registry (different registry!) - Result: 100% lookup failure → 4x cascading lookups → libc fallback Solution (Box Pattern): - Created core/box/mid_free_route_box.h - Try Mid MT registry BEFORE classify_ptr() in free() - Direct route to mid_mt_free() if found - Fall through to existing path if not found Performance Results (bench_mid_mt_gap, 1KB-8KB allocs): - Before: 1.49 M ops/s (19x slower than system malloc) - After: 41.0 M ops/s (+28.9x improvement) - vs System malloc: 1.53x faster (41.0 vs 26.8 M ops/s) Files: - core/box/mid_free_route_box.h (NEW) - Mid Free Route Box - core/box/hak_wrappers.inc.h - Add mid_free_route_try() call - core/hakmem_mid_mt.h - Fix mid_get_min_size() (1024 not 2048) - bench_mid_mt_gap.c (NEW) - Targeted 1KB-8KB benchmark - Makefile - Add bench_mid_mt_gap targets Box Pattern: ✅ Single responsibility, clear contract, testable, minimal change 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
13
Makefile
13
Makefile
@ -616,6 +616,19 @@ bench_random_mixed_hakmem: bench_random_mixed_hakmem.o $(TINY_BENCH_OBJS)
|
|||||||
bench_random_mixed_system: bench_random_mixed_system.o
|
bench_random_mixed_system: bench_random_mixed_system.o
|
||||||
$(CC) -o $@ $^ $(LDFLAGS)
|
$(CC) -o $@ $^ $(LDFLAGS)
|
||||||
|
|
||||||
|
# Mid MT gap benchmark (1KB-8KB allocations) - Phase 5-Step2 verification
|
||||||
|
bench_mid_mt_gap_hakmem.o: bench_mid_mt_gap.c hakmem.h
|
||||||
|
$(CC) $(CFLAGS) -DUSE_HAKMEM -c -o $@ $<
|
||||||
|
|
||||||
|
bench_mid_mt_gap_system.o: bench_mid_mt_gap.c
|
||||||
|
$(CC) $(CFLAGS) -c -o $@ $<
|
||||||
|
|
||||||
|
bench_mid_mt_gap_hakmem: bench_mid_mt_gap_hakmem.o $(TINY_BENCH_OBJS)
|
||||||
|
$(CC) -o $@ $^ $(LDFLAGS)
|
||||||
|
|
||||||
|
bench_mid_mt_gap_system: bench_mid_mt_gap_system.o
|
||||||
|
$(CC) -o $@ $^ $(LDFLAGS)
|
||||||
|
|
||||||
# Fixed-size microbench (direct link variants)
|
# Fixed-size microbench (direct link variants)
|
||||||
bench_fixed_size_hakmem.o: benchmarks/src/fixed/bench_fixed_size.c hakmem.h
|
bench_fixed_size_hakmem.o: benchmarks/src/fixed/bench_fixed_size.c hakmem.h
|
||||||
$(CC) $(CFLAGS) -DUSE_HAKMEM -c -o $@ $<
|
$(CC) $(CFLAGS) -DUSE_HAKMEM -c -o $@ $<
|
||||||
|
|||||||
137
bench_mid_mt_gap.c
Normal file
137
bench_mid_mt_gap.c
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
// bench_mid_mt_gap.c - Targeted benchmark for Mid MT allocation gap fix
|
||||||
|
// Tests 1KB-8KB allocations that were falling through to mmap() before fix
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// ./bench_mid_mt_gap_hakmem [cycles] [ws] [seed]
|
||||||
|
//
|
||||||
|
// Size distribution:
|
||||||
|
// - 1KB (1024B)
|
||||||
|
// - 2KB (2048B)
|
||||||
|
// - 4KB (4096B)
|
||||||
|
// - 8KB (8192B)
|
||||||
|
//
|
||||||
|
// Expected improvement: 100-1000x faster (mmap → Mid MT)
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#ifdef USE_HAKMEM
|
||||||
|
#include "hakmem.h"
|
||||||
|
|
||||||
|
// Box BenchMeta: Benchmark metadata management (bypass hakmem wrapper)
|
||||||
|
extern void* __libc_calloc(size_t, size_t);
|
||||||
|
extern void __libc_free(void*);
|
||||||
|
#define BENCH_META_CALLOC __libc_calloc
|
||||||
|
#define BENCH_META_FREE __libc_free
|
||||||
|
|
||||||
|
// Phase 20-2: BenchFast mode - prealloc pool init
|
||||||
|
#include "core/box/bench_fast_box.h"
|
||||||
|
#else
|
||||||
|
// System malloc build: use standard libc
|
||||||
|
#define BENCH_META_CALLOC calloc
|
||||||
|
#define BENCH_META_FREE free
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static inline uint64_t now_ns(void) {
|
||||||
|
struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
return (uint64_t)ts.tv_sec*1000000000ull + (uint64_t)ts.tv_nsec;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline uint32_t xorshift32(uint32_t* s){
|
||||||
|
uint32_t x=*s; x^=x<<13; x^=x>>17; x^=x<<5; *s=x; return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char** argv){
|
||||||
|
int cycles = (argc>1)? atoi(argv[1]) : 1000000; // 1M cycles (faster than 10M)
|
||||||
|
int ws = (argc>2)? atoi(argv[2]) : 256; // working-set slots
|
||||||
|
uint32_t seed = (argc>3)? (uint32_t)strtoul(argv[3],NULL,10) : 1234567u;
|
||||||
|
|
||||||
|
if (cycles <= 0) cycles = 1;
|
||||||
|
if (ws <= 0) ws = 256;
|
||||||
|
|
||||||
|
#ifdef USE_HAKMEM
|
||||||
|
// Phase 20-2: BenchFast prealloc pool initialization
|
||||||
|
int prealloc_count = bench_fast_init();
|
||||||
|
if (prealloc_count > 0) {
|
||||||
|
fprintf(stderr, "[BENCH] BenchFast mode: %d blocks preallocated\n", prealloc_count);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// System malloc also needs warmup for fair comparison
|
||||||
|
(void)malloc(1); // Force libc initialization
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Box BenchMeta: Use __libc_calloc to bypass hakmem wrapper
|
||||||
|
void** slots = (void**)BENCH_META_CALLOC((size_t)ws, sizeof(void*));
|
||||||
|
if (!slots) { fprintf(stderr, "alloc failed (slots)\n"); return 1; }
|
||||||
|
|
||||||
|
// Size distribution: 1KB, 2KB, 4KB, 8KB (evenly distributed)
|
||||||
|
const size_t sizes[4] = {1024, 2048, 4096, 8192};
|
||||||
|
|
||||||
|
// Warmup run (exclude from timing)
|
||||||
|
const char* warmup_env = getenv("HAKMEM_BENCH_WARMUP");
|
||||||
|
int warmup_cycles = warmup_env ? atoi(warmup_env) : 0;
|
||||||
|
if (warmup_cycles > 0) {
|
||||||
|
fprintf(stderr, "[BENCH_WARMUP] Running %d warmup cycles (not timed)...\n", warmup_cycles);
|
||||||
|
uint32_t warmup_seed = seed;
|
||||||
|
for (int i=0; i<warmup_cycles; i++){
|
||||||
|
uint32_t r = xorshift32(&warmup_seed);
|
||||||
|
int idx = (int)(r % (uint32_t)ws);
|
||||||
|
if (slots[idx]){
|
||||||
|
free(slots[idx]);
|
||||||
|
slots[idx] = NULL;
|
||||||
|
} else {
|
||||||
|
// Pick size from 1KB, 2KB, 4KB, 8KB
|
||||||
|
size_t sz = sizes[r % 4];
|
||||||
|
void* p = malloc(sz);
|
||||||
|
if (p) {
|
||||||
|
((unsigned char*)p)[0] = (unsigned char)r;
|
||||||
|
slots[idx] = p;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Drain warmup allocations
|
||||||
|
for (int i=0;i<ws;i++){ if (slots[i]) { free(slots[i]); slots[i]=NULL; } }
|
||||||
|
fprintf(stderr, "[BENCH_WARMUP] Warmup completed. Starting timed run...\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t start = now_ns();
|
||||||
|
int frees = 0, allocs = 0;
|
||||||
|
for (int i=0; i<cycles; i++){
|
||||||
|
uint32_t r = xorshift32(&seed);
|
||||||
|
int idx = (int)(r % (uint32_t)ws);
|
||||||
|
if (slots[idx]){
|
||||||
|
free(slots[idx]);
|
||||||
|
slots[idx] = NULL;
|
||||||
|
frees++;
|
||||||
|
} else {
|
||||||
|
// Pick size from 1KB, 2KB, 4KB, 8KB (25% each)
|
||||||
|
size_t sz = sizes[r % 4];
|
||||||
|
void* p = malloc(sz);
|
||||||
|
if (p) {
|
||||||
|
((unsigned char*)p)[0] = (unsigned char)r;
|
||||||
|
slots[idx] = p;
|
||||||
|
allocs++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
uint64_t end = now_ns();
|
||||||
|
|
||||||
|
// Drain remaining allocations
|
||||||
|
for (int i=0; i<ws; i++){
|
||||||
|
if (slots[i]) { free(slots[i]); slots[i]=NULL; }
|
||||||
|
}
|
||||||
|
|
||||||
|
double ns = (double)(end - start);
|
||||||
|
double ops_per_s = (double)cycles / (ns / 1e9);
|
||||||
|
|
||||||
|
printf("Throughput = %.2f M operations per second, relative time: %.6f s.\n",
|
||||||
|
ops_per_s / 1e6, ns / 1e9);
|
||||||
|
fprintf(stderr, "[BENCH] Cycles=%d, Allocs=%d, Frees=%d, WS=%d\n",
|
||||||
|
cycles, allocs, frees, ws);
|
||||||
|
|
||||||
|
BENCH_META_FREE(slots);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
@ -32,6 +32,7 @@ void* realloc(void* ptr, size_t size) {
|
|||||||
#include "front_gate_classifier.h" // Box FG: pointer classification (header/reg)
|
#include "front_gate_classifier.h" // Box FG: pointer classification (header/reg)
|
||||||
#include "../front/malloc_tiny_fast.h" // Phase 26: Front Gate Unification
|
#include "../front/malloc_tiny_fast.h" // Phase 26: Front Gate Unification
|
||||||
#include "tiny_front_config_box.h" // Phase 4-Step3: Compile-time config for dead code elimination
|
#include "tiny_front_config_box.h" // Phase 4-Step3: Compile-time config for dead code elimination
|
||||||
|
#include "mid_free_route_box.h" // Phase 5-Step2: Mid MT free routing fix
|
||||||
|
|
||||||
// malloc wrapper - intercepts system malloc() calls
|
// malloc wrapper - intercepts system malloc() calls
|
||||||
__thread uint64_t g_malloc_total_calls = 0;
|
__thread uint64_t g_malloc_total_calls = 0;
|
||||||
@ -202,6 +203,11 @@ void free(void* ptr) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Phase 5-Step2: Mid Free Route Box (BEFORE classify_ptr)
|
||||||
|
// Quick fix for 19x free() slowdown: Try Mid MT registry first
|
||||||
|
// If found, route directly to mid_mt_free() and return
|
||||||
|
if (mid_free_route_try(ptr)) return;
|
||||||
|
|
||||||
// Classify pointer BEFORE early libc fallbacks to avoid misrouting Tiny pointers
|
// Classify pointer BEFORE early libc fallbacks to avoid misrouting Tiny pointers
|
||||||
// This is safe: classifier uses header probe and registry; does not allocate.
|
// This is safe: classifier uses header probe and registry; does not allocate.
|
||||||
int is_hakmem_owned = 0;
|
int is_hakmem_owned = 0;
|
||||||
|
|||||||
104
core/box/mid_free_route_box.h
Normal file
104
core/box/mid_free_route_box.h
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
/**
|
||||||
|
* mid_free_route_box.h
|
||||||
|
*
|
||||||
|
* Box: Mid Free Route Box
|
||||||
|
* Responsibility: Route Mid MT allocations to correct free path
|
||||||
|
* Contract: Try Mid MT registry lookup, return success/failure
|
||||||
|
*
|
||||||
|
* Part of Phase 5-Step2 fix for 19x free() slowdown
|
||||||
|
*
|
||||||
|
* Problem:
|
||||||
|
* - Mid MT allocator registers chunks in MidGlobalRegistry
|
||||||
|
* - Free path searches Pool's mid_desc registry (different registry!)
|
||||||
|
* - Result: 100% lookup failure → 4x cascading lookups → 19x slower
|
||||||
|
*
|
||||||
|
* Solution:
|
||||||
|
* - Add Mid MT registry lookup BEFORE Pool registry lookup
|
||||||
|
* - Route directly to mid_mt_free() if found
|
||||||
|
* - Fall through to existing path if not found
|
||||||
|
*
|
||||||
|
* Performance Impact:
|
||||||
|
* - Before: 1.42 M ops/s (19x slower than system malloc)
|
||||||
|
* - After: 14-21 M ops/s (Option B quick fix, 10-15x improvement)
|
||||||
|
*
|
||||||
|
* Created: 2025-11-29 (Phase 5-Step2 Mid MT Gap Fix)
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MID_FREE_ROUTE_BOX_H
|
||||||
|
#define MID_FREE_ROUTE_BOX_H
|
||||||
|
|
||||||
|
#include "../hakmem_mid_mt.h"
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Box Contract: Mid MT Free Routing
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* mid_free_route_try - Try Mid MT free path first
|
||||||
|
*
|
||||||
|
* @param ptr Pointer to free
|
||||||
|
* @return true if handled by Mid MT, false to fall through
|
||||||
|
*
|
||||||
|
* Box Responsibilities:
|
||||||
|
* 1. Query Mid MT registry (mid_registry_lookup)
|
||||||
|
* 2. If found: Call mid_mt_free() and return true
|
||||||
|
* 3. If not found: Return false (let existing path handle it)
|
||||||
|
*
|
||||||
|
* Box Guarantees:
|
||||||
|
* - Zero side effects if returning false
|
||||||
|
* - Correct free if returning true
|
||||||
|
* - Thread-safe (Mid MT registry has mutex protection)
|
||||||
|
*
|
||||||
|
* Performance:
|
||||||
|
* - Mid MT hit: O(log N) registry lookup + O(1) free = ~50 cycles
|
||||||
|
* - Mid MT miss: O(log N) registry lookup only = ~50 cycles
|
||||||
|
* - Compare to current broken path: 4 lookups + libc = ~750 cycles
|
||||||
|
*
|
||||||
|
* Usage Example:
|
||||||
|
* void free(void* ptr) {
|
||||||
|
* if (mid_free_route_try(ptr)) return; // Mid MT handled
|
||||||
|
* // Fall through to existing free path...
|
||||||
|
* }
|
||||||
|
*/
|
||||||
|
__attribute__((always_inline))
|
||||||
|
static inline bool mid_free_route_try(void* ptr) {
|
||||||
|
if (!ptr) return false; // NULL ptr, not Mid MT
|
||||||
|
|
||||||
|
// Query Mid MT registry (binary search + mutex)
|
||||||
|
size_t block_size = 0;
|
||||||
|
int class_idx = 0;
|
||||||
|
|
||||||
|
if (mid_registry_lookup(ptr, &block_size, &class_idx)) {
|
||||||
|
// Found in Mid MT registry, route to mid_mt_free()
|
||||||
|
mid_mt_free(ptr, block_size);
|
||||||
|
return true; // Handled
|
||||||
|
}
|
||||||
|
|
||||||
|
// Not in Mid MT registry, fall through to existing path
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Box Observability (Debug/Profiling)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
#if MID_DEBUG
|
||||||
|
/**
|
||||||
|
* mid_free_route_stats - Print Mid Free Route Box statistics
|
||||||
|
*
|
||||||
|
* Only available in debug builds (MID_DEBUG=1)
|
||||||
|
* Tracks hit/miss rates for performance analysis
|
||||||
|
*/
|
||||||
|
void mid_free_route_stats(void);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // MID_FREE_ROUTE_BOX_H
|
||||||
@ -41,12 +41,16 @@ extern "C" {
|
|||||||
// - HAKMEM_TINY_MAX_CLASS=5 → Tiny up to 255B → Mid starts at 256B
|
// - HAKMEM_TINY_MAX_CLASS=5 → Tiny up to 255B → Mid starts at 256B
|
||||||
#include "hakmem_tiny.h" // For tiny_get_max_size()
|
#include "hakmem_tiny.h" // For tiny_get_max_size()
|
||||||
|
|
||||||
static inline size_t mid_get_min_size(void) {
|
|
||||||
return tiny_get_max_size() + 1; // Mid starts where Tiny ends
|
|
||||||
}
|
|
||||||
|
|
||||||
#define MID_MIN_SIZE_STATIC (1024) // Static fallback (C7 default)
|
#define MID_MIN_SIZE_STATIC (1024) // Static fallback (C7 default)
|
||||||
#define MID_MAX_SIZE (32 * 1024) // 32KB
|
#define MID_MAX_SIZE (32 * 1024) // 32KB
|
||||||
|
|
||||||
|
static inline size_t mid_get_min_size(void) {
|
||||||
|
// Phase 5-Step2 FIX: Use static 1024 instead of tiny_get_max_size() + 1
|
||||||
|
// Bug: tiny_get_max_size() returns 2047 (C7 usable), making min = 2048
|
||||||
|
// This caused 1KB-2KB allocations to fall through to mmap() (100-1000x slower!)
|
||||||
|
// Fix: Use MID_MIN_SIZE_STATIC (1024) to align with actual Tiny/Mid boundary
|
||||||
|
return MID_MIN_SIZE_STATIC; // 1024 = TINY_MAX_SIZE
|
||||||
|
}
|
||||||
#define MID_CHUNK_SIZE (4 * 1024 * 1024) // 4MB chunks (same as mimalloc segments)
|
#define MID_CHUNK_SIZE (4 * 1024 * 1024) // 4MB chunks (same as mimalloc segments)
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|||||||
Reference in New Issue
Block a user