Files
hakmem/core/tiny_mmap_gate.h
Moe Charm (CI) 4978340c02 Tiny/SuperSlab: implement per-class registry optimization for fast refill scan
Replace 262K linear registry scan with per-class indexed registry:
- Add g_super_reg_by_class[TINY_NUM_CLASSES][16384] for O(class_size) scan
- Update hak_super_register/unregister to maintain both hash table + per-class index
- Optimize refill scan in hakmem_tiny_free.inc (262K → ~10-100 entries per class)
- Optimize mmap gate scan in tiny_mmap_gate.h (same optimization)

Performance impact (Larson benchmark):
- threads=1: 2.59M → 2.61M ops/s (+0.8%)
- threads=4: 3.62M → 4.19M ops/s (+15.7%) 🎉

Root cause analysis via perf:
- superslab_refill consumed 28.51% CPU time (97.65% in loop instructions)
- 262,144-entry linear scan with 2 atomic loads per iteration
- Per-class registry reduces the scan bound by 93.75% (262K → 16K slots per class; the 98.4% figure corresponds to the earlier 4096-slot capacity)

Registry capacity:
- SUPER_REG_PER_CLASS = 16384 (increased from 4096 to avoid exhaustion)
- Total: 8 classes × 16384 = 128K entries (vs 262K unified registry)

Design:
- Dual registry: Hash table (address lookup) + Per-class index (refill scan)
- O(1) registration/unregistration with swap-with-last removal
- Lock-free reads, mutex-protected writes (same as before)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 17:02:31 +09:00

79 lines
2.7 KiB
C

// tiny_mmap_gate.h - Mmap Gate (must-adopt-before-mmap)
#pragma once
#include "hakmem_tiny_superslab.h"
#include "tiny_refill.h"
#include "hakmem_super_registry.h"
// Must-adopt gate: tries to hand the caller an already-existing SuperSlab for
// `class_idx` (the file header describes this as "must-adopt-before-mmap").
//
// Sequence, in order:
//   1. Gate switch: HAKMEM_TINY_MUST_ADOPT is read once via getenv() and the
//      result cached in a function-local static; unset or zero => return NULL.
//   2. tiny_refill_try_fast(); a non-NULL result is returned immediately.
//   3. If tls->ss is non-NULL and carries SUPERSLAB_MAGIC, it is passed to
//      ss_remote_drain_light().
//   4. If HAKMEM_TINY_MMAP_YIELD is set to a non-zero integer (also cached),
//      sched_yield() is called; then tiny_refill_try_fast() is retried.
//   5. One bounded pass over g_super_reg_by_class[class_idx]: at most
//      min(tiny_reg_scan_max(), g_super_reg_class_size[class_idx]) entries.
//      For each entry with a valid magic, every slab index below
//      ss_slabs_capacity() is tried with slab_try_acquire(); an acquired slab
//      is drained with slab_drain_remote_full() and, if slab_freelist() is
//      non-NULL, bound to `tls` and its SuperSlab returned.
//
// class_idx: used directly as an index into the per-class registry arrays;
//            it is not range-checked in this function.
// tls:       dereferenced without a NULL check once the gate is enabled and
//            the first fast attempt misses.
//
// Returns the adopted SuperSlab*, or NULL when the gate is disabled or no
// candidate was found.
//
// NOTE(review): both cached env flags are plain `static int`s; no
// synchronization is visible in this function - confirm that is intended.
static inline SuperSlab* tiny_must_adopt_gate(int class_idx, TinyTLSSlab* tls) {
    // Cached tri-state: -1 = not read yet, 0 = gate off, 1 = gate on.
    static int gate_enabled = -1;
    if (__builtin_expect(gate_enabled == -1, 0)) {
        const char* env = getenv("HAKMEM_TINY_MUST_ADOPT");
        gate_enabled = (env != NULL && atoi(env) != 0);
    }
    if (gate_enabled == 0) {
        return NULL;
    }

    // First attempt.
    SuperSlab* adopted = tiny_refill_try_fast(class_idx, tls);
    if (adopted != NULL) {
        return adopted;
    }

    // The fast attempt missed: lightly drain the SuperSlab currently held in
    // TLS (if it looks valid) before trying again.
    SuperSlab* held = tls->ss;
    if (held != NULL && held->magic == SUPERSLAB_MAGIC) {
        ss_remote_drain_light(held);
    }

    // Optional yield between the two attempts; same tri-state caching as the
    // gate switch above.
    static int yield_enabled = -1;
    if (__builtin_expect(yield_enabled == -1, 0)) {
        const char* env = getenv("HAKMEM_TINY_MMAP_YIELD");
        yield_enabled = (env != NULL && atoi(env) != 0);
    }
    if (yield_enabled != 0) {
        sched_yield();
    }

    // Second attempt.
    adopted = tiny_refill_try_fast(class_idx, tls);
    if (adopted != NULL) {
        return adopted;
    }

    // Last resort: a single bounded pass over this class's registry row.
    extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
    extern int g_super_reg_class_size[TINY_NUM_CLASSES];

    const uint32_t owner_tid = tiny_self_u32();
    const int window = tiny_reg_scan_max();
    int limit = g_super_reg_class_size[class_idx];
    if (window < limit) {
        limit = window;  // never look at more than the configured window
    }

    SuperSlab** row = g_super_reg_by_class[class_idx];
    for (int entry = 0; entry < limit; entry++) {
        SuperSlab* candidate = row[entry];
        if (candidate == NULL || candidate->magic != SUPERSLAB_MAGIC) {
            continue;
        }
        // The row is already specific to class_idx, so the candidate's class
        // is not compared here.
        const int slab_count = ss_slabs_capacity(candidate);
        for (int slab = 0; slab < slab_count; slab++) {
            SlabHandle handle = slab_try_acquire(candidate, slab, owner_tid);
            if (!slab_is_valid(&handle)) {
                continue;  // acquisition failed; try the next slab
            }
            // Drain only after a valid handle has been obtained.
            slab_drain_remote_full(&handle);
            if (slab_freelist(&handle)) {
                // Success path: bind to TLS and return; the handle is
                // intentionally not released here.
                tiny_tls_bind_slab(tls, handle.ss, handle.slab_idx);
                return handle.ss;
            }
            slab_release(&handle);
        }
    }

    return NULL;
}