Fix: workset=8192 SEGV - Unify SuperSlab geometry to Box3 (partial fix)

Problem:
- bench_random_mixed_hakmem with workset=8192 causes SEGV
- workset=256 works fine
- Root cause identified by ChatGPT analysis

Root Cause:
SuperSlab geometry was defined in two places, and the definitions disagreed on slab_base:
- Old: tiny_slab_base_for() used SLAB0_OFFSET + idx * SLAB_SIZE for every slab
- New: Box3 tiny_slab_base_for_geometry() applies the data offset only for idx = 0
- Result: slab_base for slab_idx > 0 was off by +2048 bytes
- Impact: the Unified Cache carve stepped past the slab boundary → SEGV
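
For illustration, a sketch of the mismatch (macro names taken from the diff below;
the exact Box3 formula is an assumption inferred from the description above):

  /* Old Box5 formula (removed in this commit): every slab, including idx 0,
   * carries the SLAB0 data offset. */
  uint8_t* old_base = (uint8_t*)ss + SUPERSLAB_SLAB0_DATA_OFFSET
                                   + (size_t)slab_idx * SLAB_SIZE;

  /* Box3 geometry (assumed): only slab 0 carries the data offset;
   * slabs with idx > 0 start at idx * SLAB_SIZE. */
  uint8_t* new_base = (slab_idx == 0)
      ? (uint8_t*)ss + SUPERSLAB_SLAB0_DATA_OFFSET
      : (uint8_t*)ss + (size_t)slab_idx * SLAB_SIZE;

  /* For slab_idx > 0 the two results differ by SUPERSLAB_SLAB0_DATA_OFFSET
   * (the +2048 bytes above), so a carve loop built on the old base runs
   * past the true end of the slab. */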

Fix 1: core/superslab/superslab_inline.h
========================================
Delegate SuperSlab base calculation to Box3:

  static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx) {
      if (!ss || slab_idx < 0) return NULL;
      return tiny_slab_base_for_geometry(ss, slab_idx);  // ← Box3 unified
  }

Effect:
- All tiny_slab_base_for() calls now go through the single Box3 implementation
- TLS slab_base and Box3 calculations now agree exactly
- Eliminates the geometry mismatch between layers

Fix 2: core/front/tiny_unified_cache.c
========================================
Enhanced fail-fast validation (debug builds only):
- unified_refill_validate_base(): uses the TLS slab as the source of truth
- Cross-checks the result against a registry lookup (hak_super_lookup) for safety
- Validates slab_base range, stride alignment, and meta consistency
- Consolidates the Box3/TLS boundary checks in one place
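
Distilled, the core range/alignment check looks like this (a sketch only; the
full helper with its diagnostics is in the diff below):

  uint8_t* slab_base = tiny_slab_base_for_geometry(tls_ss, slab_idx);
  size_t   stride    = tiny_stride_for_class(class_idx);
  uint8_t* slab_end  = slab_base + tiny_usable_bytes_for_slab(slab_idx);

  if ((uint8_t*)base < slab_base || (uint8_t*)base >= slab_end)
      abort();  /* BASE outside the slab's usable range */
  if (((uint8_t*)base - slab_base) % (ptrdiff_t)stride != 0)
      abort();  /* BASE not on a block (stride) boundary */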

Fix 3: core/hakmem_tiny_superslab.h
========================================
Added forward declaration:
- SuperSlab* superslab_refill(int class_idx);
- Required by tiny_unified_cache.c (replaces the local extern declaration there)

Test Results:
=============
workset=8192 SEGV threshold improved:

Before fix:
   Immediate SEGV at any iteration count

After fix:
   100K iterations: OK (9.8M ops/s)
   200K iterations: OK (15.5M ops/s)
   300K iterations: SEGV (different bug exposed)

Conclusion:
- Box3 geometry unification fixed primary SEGV
- Stability improved: 0 → 200K iterations
- Remaining issue: 300K+ iterations hit different bug
- Likely causes: memory pressure or a different corruption pattern

Known Issues:
- Debug warnings still present: FREE_FAST_HDR_META_MISMATCH, NXT_HDR_MISMATCH
- These are separate header consistency issues (not related to geometry)
- 300K+ SEGV requires further investigation

Performance:
- No performance regression observed in stable range
- workset=256 unaffected: 60M+ ops/s maintained

Credit: Root cause analysis and fix strategy by ChatGPT

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author:  Moe Charm (CI)
Date:    2025-11-22 07:40:35 +09:00
Parent:  38e4e8d4c2
Commit:  2fe970252a
3 changed files with 148 additions and 13 deletions

core/front/tiny_unified_cache.c

@@ -4,15 +4,15 @@
 #include "../tiny_tls.h"              // Phase 23-E: TinyTLSSlab, TinySlabMeta
 #include "../tiny_box_geometry.h"     // Phase 23-E: tiny_stride_for_class, tiny_slab_base_for_geometry
 #include "../box/tiny_next_ptr_box.h" // Phase 23-E: tiny_next_read (freelist traversal)
-#include "../hakmem_tiny_superslab.h" // Phase 23-E: SuperSlab
-#include "../superslab/superslab_inline.h" // Phase 23-E: ss_active_add
+#include "../hakmem_tiny_superslab.h" // Phase 23-E: SuperSlab, superslab_refill()
+#include "../superslab/superslab_inline.h" // Phase 23-E: ss_active_add, slab_index_for, ss_slabs_capacity
+#include "../hakmem_super_registry.h" // For hak_super_lookup (pointer→SuperSlab)
 #include "../box/pagefault_telemetry_box.h" // Phase 24: Box PageFaultTelemetry (Tiny page touch stats)
 #include <stdlib.h>
 #include <string.h>

 // Phase 23-E: Forward declarations
 extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c
-extern int superslab_refill(int class_idx); // From hakmem_tiny_superslab.c

 // ============================================================================
 // TLS Variables (defined here, extern in header)
@@ -131,6 +131,136 @@ void unified_cache_print_stats(void) {
 // Phase 23-E: Direct SuperSlab Carve (TLS SLL Bypass)
 // ============================================================================
+// Fail-fast helper: verify that a candidate BASE pointer belongs to a valid
+// Tiny slab within a SuperSlab. This is intentionally defensive and only
+// compiled in debug builds to avoid hot-path overhead in release.
+static inline int unified_refill_validate_base(int class_idx,
+                                               TinyTLSSlab* tls,
+                                               TinySlabMeta* meta,
+                                               void* base,
+                                               const char* stage)
+{
+#if HAKMEM_BUILD_RELEASE
+    (void)class_idx; (void)tls; (void)base; (void)stage;
+    return 1;
+#else
+    if (!base) {
+        fprintf(stderr,
+                "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=NULL tls_ss=%p meta=%p\n",
+                stage ? stage : "unified_refill",
+                class_idx,
+                (void*)(tls ? tls->ss : NULL),
+                (void*)meta);
+        abort();
+    }
+    SuperSlab* tls_ss = tls ? tls->ss : NULL;
+    if (!tls_ss || tls_ss->magic != SUPERSLAB_MAGIC) {
+        fprintf(stderr,
+                "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p meta=%p (invalid TLS ss)\n",
+                stage ? stage : "unified_refill",
+                class_idx,
+                base,
+                (void*)tls_ss,
+                (void*)meta);
+        abort();
+    }
+
+    // Cross-check registry lookup for additional safety.
+    SuperSlab* ss_lookup = hak_super_lookup(base);
+    if (!ss_lookup || ss_lookup->magic != SUPERSLAB_MAGIC) {
+        fprintf(stderr,
+                "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p meta=%p\n",
+                stage ? stage : "unified_refill",
+                class_idx,
+                base,
+                (void*)tls_ss,
+                (void*)ss_lookup,
+                (void*)meta);
+        abort();
+    }
+    if (ss_lookup != tls_ss) {
+        fprintf(stderr,
+                "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p (mismatch)\n",
+                stage ? stage : "unified_refill",
+                class_idx,
+                base,
+                (void*)tls_ss,
+                (void*)ss_lookup);
+        abort();
+    }
+
+    int slab_idx = tls ? (int)tls->slab_idx : -1;
+    int cap = ss_slabs_capacity(tls_ss);
+    if (slab_idx < 0 || slab_idx >= cap) {
+        fprintf(stderr,
+                "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d cap=%d meta_cap=%u meta_used=%u meta_carved=%u\n",
+                stage ? stage : "unified_refill",
+                class_idx,
+                base,
+                (void*)tls_ss,
+                slab_idx,
+                cap,
+                meta ? meta->capacity : 0u,
+                meta ? (unsigned)meta->used : 0u,
+                meta ? (unsigned)meta->carved : 0u);
+        abort();
+    }
+
+    // Ensure meta matches TLS view for this slab.
+    TinySlabMeta* expected_meta = &tls_ss->slabs[slab_idx];
+    if (meta && meta != expected_meta) {
+        fprintf(stderr,
+                "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d meta=%p expected_meta=%p\n",
+                stage ? stage : "unified_refill",
+                class_idx,
+                base,
+                (void*)tls_ss,
+                slab_idx,
+                (void*)meta,
+                (void*)expected_meta);
+        abort();
+    }
+
+    uint8_t* slab_base = tiny_slab_base_for_geometry(tls_ss, slab_idx);
+    size_t stride = tiny_stride_for_class(class_idx);
+    size_t usable = tiny_usable_bytes_for_slab(slab_idx);
+    uint8_t* slab_end = slab_base + usable;
+    if ((uint8_t*)base < slab_base || (uint8_t*)base >= slab_end) {
+        fprintf(stderr,
+                "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p range=[%p,%p) stride=%zu meta_cap=%u meta_used=%u meta_carved=%u\n",
+                stage ? stage : "unified_refill",
+                class_idx,
+                base,
+                (void*)slab_base,
+                (void*)slab_end,
+                stride,
+                meta ? meta->capacity : 0u,
+                meta ? (unsigned)meta->used : 0u,
+                meta ? (unsigned)meta->carved : 0u);
+        abort();
+    }
+
+    ptrdiff_t offset = (uint8_t*)base - slab_base;
+    if (offset % (ptrdiff_t)stride != 0) {
+        fprintf(stderr,
+                "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p offset=%td stride=%zu (misaligned) meta_cap=%u meta_used=%u meta_carved=%u\n",
+                stage ? stage : "unified_refill",
+                class_idx,
+                base,
+                offset,
+                stride,
+                meta ? meta->capacity : 0u,
+                meta ? (unsigned)meta->used : 0u,
+                meta ? (unsigned)meta->carved : 0u);
+        abort();
+    }
+    return 1;
+#endif
+}
 // Batch refill from SuperSlab (called on cache miss)
 // Returns: BASE pointer (first block), or NULL if failed
 // Design: Direct carve from SuperSlab to array (no TLS SLL intermediate layer)
@@ -171,6 +301,9 @@ void* unified_cache_refill(int class_idx) {
         void* p = m->freelist;
         m->freelist = tiny_next_read(class_idx, p);
+        unified_refill_validate_base(class_idx, tls, m, p,
+                                     "unified_refill_freelist");
+
         // PageFaultTelemetry: record page touch for this BASE
         pagefault_telemetry_touch(class_idx, p);
@@ -186,6 +319,9 @@ void* unified_cache_refill(int class_idx) {
         // Linear carve (fresh block, no freelist link)
         void* p = (void*)(base + ((size_t)m->carved * bs));
+        unified_refill_validate_base(class_idx, tls, m, p,
+                                     "unified_refill_carve");
+
        // PageFaultTelemetry: record page touch for this BASE
        pagefault_telemetry_touch(class_idx, p);

core/hakmem_tiny_superslab.h

@@ -118,6 +118,11 @@ int superslab_find_free_slab(SuperSlab* ss);
 // Free a SuperSlab (unregister and return to pool or munmap)
 void superslab_free(SuperSlab* ss);

+// Refill TLS slab for given tiny class from shared SuperSlab pool.
+// Returns: SuperSlab* on success (also updates g_tls_slabs[class_idx]),
+//          NULL on failure (no change to TLS state).
+SuperSlab* superslab_refill(int class_idx);
+
 // Statistics
 void superslab_print_stats(SuperSlab* ss);

core/superslab/superslab_inline.h

@@ -2,6 +2,7 @@
 #define SUPERSLAB_INLINE_H

 #include "superslab_types.h"
+#include "../tiny_box_geometry.h" // Box 3 geometry helpers (stride/base/capacity)

 // Forward declaration for unsafe remote drain used by refill/handle paths
 // Implemented in hakmem_tiny_superslab.c
@@ -19,20 +20,13 @@
 }

 // Compute slab base pointer for given (ss, slab_idx).
+// Box 5 wrapper: delegate to Box 3 canonical geometry helper.
 static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx)
 {
-    if (!ss || slab_idx < 0) return NULL;
-    if (slab_idx == 0) {
-        return (uint8_t*)ss + SUPERSLAB_SLAB0_DATA_OFFSET;
-    }
-    size_t off = SUPERSLAB_SLAB0_DATA_OFFSET + (size_t)slab_idx * SLAB_SIZE;
-    size_t ss_size = (size_t)1 << ss->lg_size;
-    if (off >= ss_size) {
+    if (!ss || slab_idx < 0) {
         return NULL;
     }
-    return (uint8_t*)ss + off;
+    return tiny_slab_base_for_geometry(ss, slab_idx);
 }

 // Compute slab index for a pointer inside ss.