Tiny: fix header/stride mismatch and harden refill paths

- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte
  header during allocation, but linear carve/refill and initial slab capacity
  still used bare class block sizes. This mismatch could overrun slab usable
  space and corrupt freelists, causing reproducible SEGV at ~100k iters.
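
  A worked example of the overrun (a sketch using the 63488-byte slab-0 usable
  size logged by the diagnostics further down; other slabs may differ): for
  class 5 (256B blocks) the old capacity was 63488 / 256 = 248 blocks, but with
  the 1-byte header each block really consumes 257 bytes, so 248 * 257 = 63736
  bytes, i.e. 248 bytes past the usable region. The header-aware capacity is
  63488 / 257 = 247 blocks, which fits.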

Changes
- Superslab: compute capacity with effective stride (block_size + header for
  classes 0..6; class7 remains headerless) in superslab_init_slab(). Add a
  debug-only bounds check in superslab_alloc_from_slab() to fail fast if a
  carve would exceed the slab's usable bytes.
- Refill (non-P0 and P0): use header-aware stride for all linear carving and
  TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h
  also uses stride, not raw class size.
- Drain: keep the existing defense-in-depth for the remote sentinel and the
  node sanitization before splicing into the freelist (both already present).

Notes
- This unifies the memory layout across alloc/linear-carve/refill with a single
  stride definition (sketched below) and keeps class7 (1024B) headerless as
  designed.
- Debug builds add fail-fast checks; release builds remain lean.
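
  The single stride definition referenced above is the helper this commit adds
  (a simplified sketch of tiny_block_stride_for_class() from the diff below;
  the one-shot debug logging is omitted):

    static inline size_t tiny_block_stride_for_class(int class_idx) {
        size_t stride = g_tiny_class_sizes[class_idx];
    #if HAKMEM_TINY_HEADER_CLASSIDX
        if (class_idx != 7) stride += 1; // classes 0..6 reserve a 1-byte header
    #endif
        return stride;                   // class 7 (1024B) stays headerless
    }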

Next
- Re-run the Tiny benches (256B/1024B) in debug to confirm stability, then in
  release. If a crash still reproduces, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0
  to isolate the P0 batch carve, and continue reducing branch misses as planned.
Author: Moe Charm (CI)
Date: 2025-11-09 18:55:50 +09:00
parent ab68ee536d
commit 1010a961fb
171 changed files with 10238 additions and 634 deletions

View File

@ -0,0 +1,130 @@
// ace_pool_connector.c - ACE-Pool Connection Box Implementation
#include "ace_pool_connector.h"
#include "../hakmem_pool.h"
#include "../hakmem_ace_controller.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h> // getenv()/atoi() used in ace_pool_get_health()
// External references (from Pool)
extern struct Pool {
int initialized;
// ... other fields
} g_pool;
extern size_t g_class_sizes[7]; // Pool class sizes
extern int g_wrap_l2_enabled;
// ============================================================================
// Box Implementation
// ============================================================================
AcePoolHealth ace_pool_get_health(void) {
AcePoolHealth health;
memset(&health, 0, sizeof(health));
// Check Pool initialization
health.pool_initialized = g_pool.initialized;
// Check ACE status
const char* ace_env = getenv("HAKMEM_ACE_ENABLED");
health.ace_enabled = (ace_env && atoi(ace_env) == 1);
// Check WRAP_L2 status
health.wrap_l2_enabled = g_wrap_l2_enabled;
// Check Bridge classes
health.bridge_class_5_size = (int)g_class_sizes[5];
health.bridge_class_6_size = (int)g_class_sizes[6];
// TODO: Track pre-allocated pages count
health.preallocated_pages = 0; // Not yet tracked
// Determine overall status
if (!health.pool_initialized) {
health.status = ACE_POOL_NOT_INIT;
health.message = "Pool not initialized";
} else if (!health.ace_enabled) {
health.status = ACE_POOL_NOT_INIT;
health.message = "ACE not enabled (set HAKMEM_ACE_ENABLED=1)";
} else if (!health.wrap_l2_enabled) {
health.status = ACE_POOL_WRAPPER_BLOCKED;
health.message = "WRAP_L2 not enabled (set HAKMEM_WRAP_L2=1)";
} else if (health.bridge_class_5_size == 0 && health.bridge_class_6_size == 0) {
health.status = ACE_POOL_SIZE_MISMATCH;
health.message = "Bridge classes disabled (class 5 and 6 are 0)";
} else if (health.preallocated_pages == 0) {
health.status = ACE_POOL_NO_PAGES;
health.message = "No pre-allocated pages (performance will be degraded)";
} else {
health.status = ACE_POOL_OK;
health.message = "ACE-Pool connection healthy";
}
return health;
}
int ace_pool_validate_connection(AcePoolStatus* out_status) {
AcePoolHealth health = ace_pool_get_health();
if (out_status) {
*out_status = health.status;
}
// Only OK status is considered "ready"
// NO_PAGES is warning but still functional
return (health.status == ACE_POOL_OK || health.status == ACE_POOL_NO_PAGES);
}
void* ace_pool_try_alloc(size_t size, uintptr_t site_id, AcePoolStatus* out_status) {
// Validate connection first
AcePoolStatus status;
if (!ace_pool_validate_connection(&status)) {
if (out_status) *out_status = status;
// Log why allocation failed
AcePoolHealth health = ace_pool_get_health();
static int logged_once = 0;
if (!logged_once) {
fprintf(stderr, "[ACE-Pool Connector] BLOCKED: %s\n", health.message);
logged_once = 1;
}
return NULL;
}
// Connection validated, try Pool allocation
void* ptr = hak_pool_try_alloc(size, site_id);
if (ptr) {
if (out_status) *out_status = ACE_POOL_OK;
} else {
if (out_status) *out_status = ACE_POOL_ALLOC_FAILED;
// Log allocation failure (but only once to avoid spam)
static int fail_logged = 0;
if (!fail_logged) {
fprintf(stderr, "[ACE-Pool Connector] Pool allocation failed for size=%zu (will fallback to mmap)\n", size);
fail_logged = 1;
}
}
return ptr;
}
void ace_pool_print_health(void) {
AcePoolHealth health = ace_pool_get_health();
fprintf(stderr, "\n=== ACE-Pool Connector Health Check ===\n");
fprintf(stderr, "Pool Initialized: %s\n", health.pool_initialized ? "YES" : "NO");
fprintf(stderr, "ACE Enabled: %s\n", health.ace_enabled ? "YES" : "NO");
fprintf(stderr, "WRAP_L2 Enabled: %s\n", health.wrap_l2_enabled ? "YES" : "NO");
fprintf(stderr, "Bridge Class 5: %d KB (%s)\n",
health.bridge_class_5_size / 1024,
health.bridge_class_5_size > 0 ? "ENABLED" : "DISABLED");
fprintf(stderr, "Bridge Class 6: %d KB (%s)\n",
health.bridge_class_6_size / 1024,
health.bridge_class_6_size > 0 ? "ENABLED" : "DISABLED");
fprintf(stderr, "Pre-allocated Pages: %d\n", health.preallocated_pages);
fprintf(stderr, "Status: %s\n", health.message);
fprintf(stderr, "========================================\n\n");
}

View File

@ -0,0 +1,70 @@
// ace_pool_connector.h - ACE-Pool Connection Box
// Box Theory: Single Responsibility - Validate and route ACE ↔ Pool connections
//
// Purpose:
// - Make ACE-Pool connection VISIBLE and VALIDATED
// - Centralize error handling and logging
// - Health check API for diagnostics
//
// Responsibilities:
// ✅ Validate Pool is initialized before ACE uses it
// ✅ Log connection status (success/failure/reason)
// ✅ Provide health check API
// ❌ NOT responsible for: allocation logic, size rounding, or memory management
//
// Box Boundaries:
// INPUT: ACE requests allocation from Pool (size, site_id)
// OUTPUT: Pool allocation result (ptr or NULL) + reason code
// ERROR: Clear error messages (not silent failures!)
#ifndef ACE_POOL_CONNECTOR_H
#define ACE_POOL_CONNECTOR_H
#include <stddef.h>
#include <stdint.h>
// ============================================================================
// Box API: ACE-Pool Connection
// ============================================================================
// Connection status codes
typedef enum {
ACE_POOL_OK = 0, // Connection healthy
ACE_POOL_NOT_INIT, // Pool not initialized
ACE_POOL_NO_PAGES, // Pool has no pre-allocated pages
ACE_POOL_WRAPPER_BLOCKED, // Wrapper protection blocking
ACE_POOL_SIZE_MISMATCH, // Size not in Pool range
ACE_POOL_ALLOC_FAILED, // Pool allocation returned NULL
} AcePoolStatus;
// Health check result
typedef struct {
int pool_initialized; // 1 if Pool is initialized
int ace_enabled; // 1 if ACE is enabled
int wrap_l2_enabled; // 1 if WRAP_L2 is enabled
int bridge_class_5_size; // Size of Bridge class 5 (40KB expected)
int bridge_class_6_size; // Size of Bridge class 6 (52KB expected)
int preallocated_pages; // Number of pre-allocated pages (should be > 0)
AcePoolStatus status; // Overall status
const char* message; // Human-readable status message
} AcePoolHealth;
// ============================================================================
// Box Functions
// ============================================================================
// Get health status (for debugging and monitoring)
AcePoolHealth ace_pool_get_health(void);
// Validate connection is ready (called by ACE before using Pool)
// Returns: 1 if ready, 0 if not (sets reason code)
int ace_pool_validate_connection(AcePoolStatus* out_status);
// Connect ACE to Pool (wrapper around hak_pool_try_alloc with validation)
// Returns: Allocated pointer or NULL (logs reason if NULL)
void* ace_pool_try_alloc(size_t size, uintptr_t site_id, AcePoolStatus* out_status);
// Print health status (for debugging)
void ace_pool_print_health(void);
#endif // ACE_POOL_CONNECTOR_H
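A minimal caller sketch for the API above (illustrative only; alloc_via_connector
and its fallback policy are assumptions, not part of this commit):
#include "ace_pool_connector.h"
static void* alloc_via_connector(size_t size, uintptr_t site_id) {
    AcePoolStatus st = ACE_POOL_OK;
    void* p = ace_pool_try_alloc(size, site_id, &st);
    if (p) return p; // Pool served the request
    // st explains why the connector declined (not initialized, wrapper blocked,
    // size mismatch, or allocation failure); the caller chooses its own
    // fallback, e.g. a direct mmap path.
    return NULL;
}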

24
core/box/free_local_box.d Normal file
View File

@ -0,0 +1,24 @@
core/box/free_local_box.o: core/box/free_local_box.c \
core/box/free_local_box.h core/hakmem_tiny_superslab.h \
core/superslab/superslab_types.h core/hakmem_tiny_superslab_constants.h \
core/superslab/superslab_inline.h core/superslab/superslab_types.h \
core/tiny_debug_ring.h core/tiny_remote.h core/tiny_debug_ring.h \
core/tiny_remote.h core/hakmem_tiny_superslab_constants.h \
core/box/free_publish_box.h core/hakmem_tiny.h core/hakmem_build_flags.h \
core/hakmem_trace.h core/hakmem_tiny_mini_mag.h
core/box/free_local_box.h:
core/hakmem_tiny_superslab.h:
core/superslab/superslab_types.h:
core/hakmem_tiny_superslab_constants.h:
core/superslab/superslab_inline.h:
core/superslab/superslab_types.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/hakmem_tiny_superslab_constants.h:
core/box/free_publish_box.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:

View File

@ -0,0 +1,28 @@
core/box/free_publish_box.o: core/box/free_publish_box.c \
core/box/free_publish_box.h core/hakmem_tiny_superslab.h \
core/superslab/superslab_types.h core/hakmem_tiny_superslab_constants.h \
core/superslab/superslab_inline.h core/superslab/superslab_types.h \
core/tiny_debug_ring.h core/tiny_remote.h core/tiny_debug_ring.h \
core/tiny_remote.h core/hakmem_tiny_superslab_constants.h \
core/hakmem_tiny.h core/hakmem_build_flags.h core/hakmem_trace.h \
core/hakmem_tiny_mini_mag.h core/tiny_route.h core/tiny_ready.h \
core/hakmem_tiny.h core/box/mailbox_box.h
core/box/free_publish_box.h:
core/hakmem_tiny_superslab.h:
core/superslab/superslab_types.h:
core/hakmem_tiny_superslab_constants.h:
core/superslab/superslab_inline.h:
core/superslab/superslab_types.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/hakmem_tiny_superslab_constants.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:
core/tiny_route.h:
core/tiny_ready.h:
core/hakmem_tiny.h:
core/box/mailbox_box.h:

View File

@ -0,0 +1,24 @@
core/box/free_remote_box.o: core/box/free_remote_box.c \
core/box/free_remote_box.h core/hakmem_tiny_superslab.h \
core/superslab/superslab_types.h core/hakmem_tiny_superslab_constants.h \
core/superslab/superslab_inline.h core/superslab/superslab_types.h \
core/tiny_debug_ring.h core/tiny_remote.h core/tiny_debug_ring.h \
core/tiny_remote.h core/hakmem_tiny_superslab_constants.h \
core/box/free_publish_box.h core/hakmem_tiny.h core/hakmem_build_flags.h \
core/hakmem_trace.h core/hakmem_tiny_mini_mag.h
core/box/free_remote_box.h:
core/hakmem_tiny_superslab.h:
core/superslab/superslab_types.h:
core/hakmem_tiny_superslab_constants.h:
core/superslab/superslab_inline.h:
core/superslab/superslab_types.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/hakmem_tiny_superslab_constants.h:
core/box/free_publish_box.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:

11
core/box/front_gate_box.d Normal file
View File

@ -0,0 +1,11 @@
core/box/front_gate_box.o: core/box/front_gate_box.c \
core/box/front_gate_box.h core/hakmem_tiny.h core/hakmem_build_flags.h \
core/hakmem_trace.h core/hakmem_tiny_mini_mag.h \
core/tiny_alloc_fast_sfc.inc.h core/hakmem_tiny.h
core/box/front_gate_box.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:
core/tiny_alloc_fast_sfc.inc.h:
core/hakmem_tiny.h:

View File

@ -6,6 +6,19 @@
#include "../pool_tls.h"
#endif
// Centralized OS mapping boundary to keep syscalls in one place
static inline void* hak_os_map_boundary(size_t size, uintptr_t site_id) {
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_mmap);
#endif
void* p = hak_alloc_mmap_impl(size);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_SYSCALL_MMAP, t_mmap);
#endif
(void)site_id; // reserved for future accounting/learning
return p;
}
__attribute__((always_inline))
inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
#if HAKMEM_DEBUG_TIMING
@ -144,33 +157,24 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
//
// Solution: Use mmap for gap when ACE failed (ACE disabled or OOM)
// Track final fallback mmaps globally
extern _Atomic uint64_t g_final_fallback_mmap_count;
void* ptr;
if (size >= threshold) {
// Large allocation (>= 2MB default): use mmap
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_mmap);
#endif
ptr = hak_alloc_mmap_impl(size);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_SYSCALL_MMAP, t_mmap);
#endif
// Large allocation (>= 2MB default): descend via single boundary
atomic_fetch_add(&g_final_fallback_mmap_count, 1);
ptr = hak_os_map_boundary(size, site_id);
} else if (size >= TINY_MAX_SIZE) {
// Mid-range allocation (1KB-2MB): try mmap as final fallback
// This handles the gap when ACE is disabled or failed
atomic_fetch_add(&g_final_fallback_mmap_count, 1);
static _Atomic int gap_alloc_count = 0;
int count = atomic_fetch_add(&gap_alloc_count, 1);
#if HAKMEM_DEBUG_VERBOSE
if (count < 3) {
fprintf(stderr, "[HAKMEM] INFO: Using mmap for mid-range size=%zu (ACE disabled or failed)\n", size);
}
if (count < 3) fprintf(stderr, "[HAKMEM] INFO: mid-gap fallback size=%zu\n", size);
#endif
#if HAKMEM_DEBUG_TIMING
HKM_TIME_START(t_mmap);
#endif
ptr = hak_alloc_mmap_impl(size);
#if HAKMEM_DEBUG_TIMING
HKM_TIME_END(HKM_CAT_SYSCALL_MMAP, t_mmap);
#endif
ptr = hak_os_map_boundary(size, site_id);
} else {
// Should never reach here (size <= TINY_MAX_SIZE should be handled by Tiny)
static _Atomic int oom_count = 0;

View File

@ -117,6 +117,39 @@ static void hak_init_impl(void) {
HAKMEM_LOG("Sampling rate: 1/%d\n", SAMPLING_RATE);
HAKMEM_LOG("Max sites: %d\n", MAX_SITES);
// Build banner (one-shot)
do {
const char* bf = "UNKNOWN";
#ifdef HAKMEM_BUILD_RELEASE
bf = "RELEASE";
#elif defined(HAKMEM_BUILD_DEBUG)
bf = "DEBUG";
#endif
HAKMEM_LOG("[Build] Flavor=%s Flags: HEADER_CLASSIDX=%d, AGGRESSIVE_INLINE=%d, POOL_TLS_PHASE1=%d, POOL_TLS_PREWARM=%d\n",
bf,
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
1,
#else
0,
#endif
#ifdef HAKMEM_TINY_AGGRESSIVE_INLINE
1,
#else
0,
#endif
#ifdef HAKMEM_POOL_TLS_PHASE1
1,
#else
0,
#endif
#ifdef HAKMEM_POOL_TLS_PREWARM
1
#else
0
#endif
);
} while (0);
// Bench preset: Tiny-only (disable non-essential subsystems)
{
char* bt = getenv("HAKMEM_BENCH_TINY_ONLY");

23
core/box/mailbox_box.d Normal file
View File

@ -0,0 +1,23 @@
core/box/mailbox_box.o: core/box/mailbox_box.c core/box/mailbox_box.h \
core/hakmem_tiny_superslab.h core/superslab/superslab_types.h \
core/hakmem_tiny_superslab_constants.h core/superslab/superslab_inline.h \
core/superslab/superslab_types.h core/tiny_debug_ring.h \
core/tiny_remote.h core/tiny_debug_ring.h core/tiny_remote.h \
core/hakmem_tiny_superslab_constants.h core/hakmem_tiny.h \
core/hakmem_build_flags.h core/hakmem_trace.h \
core/hakmem_tiny_mini_mag.h
core/box/mailbox_box.h:
core/hakmem_tiny_superslab.h:
core/superslab/superslab_types.h:
core/hakmem_tiny_superslab_constants.h:
core/superslab/superslab_inline.h:
core/superslab/superslab_types.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/tiny_debug_ring.h:
core/tiny_remote.h:
core/hakmem_tiny_superslab_constants.h:
core/hakmem_tiny.h:
core/hakmem_build_flags.h:
core/hakmem_trace.h:
core/hakmem_tiny_mini_mag.h:

View File

@ -3,15 +3,54 @@
#define POOL_API_INC_H
void* hak_pool_try_alloc(size_t size, uintptr_t site_id) {
// Debug: IMMEDIATE output to verify function is called
static int first_call = 1;
if (first_call) {
fprintf(stderr, "[Pool] hak_pool_try_alloc FIRST CALL EVER!\n");
first_call = 0;
}
if (size == 40960) { // Exactly 40KB
fprintf(stderr, "[Pool] hak_pool_try_alloc called with 40KB (Bridge class 5)\n");
}
hak_pool_init(); // pthread_once() ensures thread-safe init (no data race!)
// Debug for 33-41KB allocations
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] hak_pool_try_alloc: size=%zu (after init)\n", size);
}
// P1.7 approach: Avoid using pool during ALL wrapper calls (conservative but safe)
extern int hak_in_wrapper(void);
if (hak_in_wrapper() && !g_wrap_l2_enabled) return NULL;
if (!hak_pool_is_poolable(size)) return NULL;
if (hak_in_wrapper() && !g_wrap_l2_enabled) {
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] REJECTED: in_wrapper=%d, wrap_l2=%d\n",
hak_in_wrapper(), g_wrap_l2_enabled);
}
return NULL;
}
if (!hak_pool_is_poolable(size)) {
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] REJECTED: not poolable (min=%d, max=%d)\n",
POOL_MIN_SIZE, POOL_MAX_SIZE);
}
return NULL;
}
// Get class and shard indices
int class_idx = hak_pool_get_class_index(size);
if (class_idx < 0) return NULL;
if (class_idx < 0) {
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] REJECTED: class_idx=%d (size=%zu not mapped)\n",
class_idx, size);
}
return NULL;
}
if (size >= 33000 && size <= 41000) {
fprintf(stderr, "[Pool] ACCEPTED: class_idx=%d, proceeding with allocation\n", class_idx);
}
// MF2: Per-Page Sharding path
if (g_mf2_enabled) {

View File

@ -5,7 +5,14 @@
// Thread-safe initialization using pthread_once
static pthread_once_t hak_pool_init_once_control = PTHREAD_ONCE_INIT;
static void hak_pool_init_impl(void) {
fprintf(stderr, "[Pool] hak_pool_init_impl() EXECUTING - Bridge class fix applied\n");
const FrozenPolicy* pol = hkm_policy_get();
// Phase 6.21 CRITICAL FIX: Bridge classes are hardcoded in g_class_sizes,
// NOT from Policy. DO NOT overwrite them with 0!
// The code below was disabling Bridge classes by setting them to 0
// because Policy returns mid_dyn1_bytes=0 and mid_dyn2_bytes=0.
/*
if (pol && pol->mid_dyn1_bytes >= POOL_MIN_SIZE && pol->mid_dyn1_bytes <= POOL_MAX_SIZE) {
g_class_sizes[5] = pol->mid_dyn1_bytes;
} else {
@ -16,6 +23,8 @@ static void hak_pool_init_impl(void) {
} else {
g_class_sizes[6] = 0;
}
*/
// Bridge classes remain as initialized: 40KB and 52KB
for (int c = 0; c < POOL_NUM_CLASSES; c++) {
for (int s = 0; s < POOL_NUM_SHARDS; s++) {
g_pool.freelist[c][s] = NULL;
@ -82,20 +91,65 @@ static void hak_pool_init_impl(void) {
HAKMEM_LOG("[MF2] max_queues=%d, lease_ms=%d, idle_threshold_us=%d\n", g_mf2_max_queues, g_mf2_lease_ms, g_mf2_idle_threshold_us);
}
g_pool.initialized = 1;
fprintf(stderr, "[Pool] Initialized (L2 Hybrid Pool) - Bridge classes SHOULD be enabled\n");
fprintf(stderr, "[Pool] Class 5 (40KB): %zu\n", g_class_sizes[5]);
fprintf(stderr, "[Pool] Class 6 (52KB): %zu\n", g_class_sizes[6]);
HAKMEM_LOG("[Pool] Initialized (L2 Hybrid Pool)\n");
if (g_class_sizes[5] != 0 || g_class_sizes[6] != 0) {
HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB%s%s%s\n",
g_class_sizes[5] ? ", dyn1=" : "",
g_class_sizes[5] ? "" : (g_class_sizes[6]?",":""),
(g_class_sizes[5]||g_class_sizes[6]) ? "" : "");
} else {
HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB\n");
#ifdef HAKMEM_DEBUG_VERBOSE
// Debug: Show actual class sizes after initialization
HAKMEM_LOG("[Pool] Class configuration:\n");
for (int i = 0; i < POOL_NUM_CLASSES; i++) {
if (g_class_sizes[i] != 0) {
HAKMEM_LOG(" Class %d: %zu KB (ENABLED)\n", i, g_class_sizes[i]/1024);
} else {
HAKMEM_LOG(" Class %d: DISABLED\n", i);
}
}
#endif
HAKMEM_LOG("[Pool] Page size: %d KB\n", POOL_PAGE_SIZE / 1024);
HAKMEM_LOG("[Pool] Shards: %d (site-based)\n", POOL_NUM_SHARDS);
// ACE Performance Fix: Pre-allocate pages for Bridge classes to avoid cold start
// This ensures ACE can serve Mid-Large allocations (33KB) immediately without mmap fallback
extern int refill_freelist(int class_idx, int shard_idx);
int prewarm_pages = 4; // Pre-allocate 4 pages per shard for hot classes
// Pre-warm Bridge class 5 (40KB) - Critical for 33KB allocations
if (g_class_sizes[5] != 0) {
int allocated = 0;
for (int s = 0; s < prewarm_pages && s < POOL_NUM_SHARDS; s++) {
if (refill_freelist(5, s) != 0) { // FIX: Check for SUCCESS (1), not FAILURE (0)
allocated++;
}
}
fprintf(stderr, "[Pool] Pre-allocated %d pages for Bridge class 5 (%zu KB) - Critical for 33KB allocs\n",
allocated, g_class_sizes[5]/1024);
} else {
fprintf(stderr, "[Pool] WARNING: Bridge class 5 (40KB) is DISABLED - 33KB allocations will fail!\n");
}
// Pre-warm Bridge class 6 (52KB)
if (g_class_sizes[6] != 0) {
int allocated = 0;
for (int s = 0; s < prewarm_pages && s < POOL_NUM_SHARDS; s++) {
if (refill_freelist(6, s) != 0) { // FIX: Check for SUCCESS (1), not FAILURE (0)
allocated++;
}
}
fprintf(stderr, "[Pool] Pre-allocated %d pages for Bridge class 6 (%zu KB)\n",
allocated, g_class_sizes[6]/1024);
}
}
void hak_pool_init(void) { pthread_once(&hak_pool_init_once_control, hak_pool_init_impl); }
void hak_pool_init(void) {
// Always print this to see if it's being called
static int called = 0;
if (called++ == 0) {
fprintf(stderr, "[Pool] hak_pool_init() called for the first time\n");
}
pthread_once(&hak_pool_init_once_control, hak_pool_init_impl);
}
static void mf2_print_debug_stats(void) {
if (!g_mf2_enabled) return;

View File

@ -1,3 +1,4 @@
#include <stdio.h>
#include "hakmem_ace.h"
#include "hakmem_pool.h"
#include "hakmem_l25_pool.h"
@ -50,9 +51,24 @@ void* hkm_ace_alloc(size_t size, uintptr_t site_id, const FrozenPolicy* pol) {
double wmax_large = (pol ? pol->w_max_large : 1.25);
// MidPool: 252KiB (Phase 6.21: with Bridge classes for W_MAX rounding)
if (size >= 33000 && size <= 34000) {
fprintf(stderr, "[ACE] Processing 33KB: size=%zu, POOL_MAX_SIZE=%d\n", size, POOL_MAX_SIZE);
}
if (size <= POOL_MAX_SIZE) {
size_t r = round_to_mid_class(size, wmax_mid, pol);
if (size >= 33000 && size <= 34000) {
fprintf(stderr, "[ACE] round_to_mid_class returned: %zu (0 means no valid class)\n", r);
}
if (r != 0) {
// Debug: Log 33KB allocation routing (only in debug builds)
#ifdef HAKMEM_DEBUG_VERBOSE
if (size >= 33000 && size <= 34000) {
HAKMEM_LOG("[ACE] 33KB alloc: size=%zu → rounded=%zu (class 5: 40KB)\n", size, r);
}
#endif
if (size >= 33000 && size <= 34000) {
fprintf(stderr, "[ACE] Calling hak_pool_try_alloc with size=%zu\n", r);
}
HKM_TIME_START(t_mid_get);
void* p = hak_pool_try_alloc(r, site_id);
HKM_TIME_END(HKM_CAT_POOL_GET, t_mid_get);
@ -74,7 +90,7 @@ void* hkm_ace_alloc(size_t size, uintptr_t site_id, const FrozenPolicy* pol) {
}
} else if (size > POOL_MAX_SIZE && size < L25_MIN_SIZE) {
// Gap 32-64KiB: try rounding up to 64KiB if permitted
size_t r = round_to_large_class(L25_MIN_SIZE, wmax_large); // check 64KiB vs size
// size_t r = round_to_large_class(L25_MIN_SIZE, wmax_large); // check 64KiB vs size (unused)
if ((double)L25_MIN_SIZE <= wmax_large * (double)size) {
HKM_TIME_START(t_l25_get2);
void* p = hak_l25_pool_try_alloc(L25_MIN_SIZE, site_id);

View File

@ -237,6 +237,21 @@ SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) {
int scan_limit = tiny_reg_scan_max();
if (scan_limit > reg_size) scan_limit = reg_size;
uint32_t self_tid = tiny_self_u32();
// Local helper (mirror adopt_bind_if_safe) to avoid including alloc inline here
auto int adopt_bind_if_safe_local(TinyTLSSlab* tls_l, SuperSlab* ss, int slab_idx, int class_idx_l) {
uint32_t self_tid = tiny_self_u32();
SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
if (!slab_is_valid(&h)) return 0;
slab_drain_remote_full(&h);
if (__builtin_expect(slab_is_safe_to_bind(&h), 1)) {
tiny_tls_bind_slab(tls_l, h.ss, h.slab_idx);
slab_release(&h);
return 1;
}
slab_release(&h);
return 0;
}
for (int i = 0; i < scan_limit; i++) {
SuperSlab* cand = g_super_reg_by_class[class_idx][i];
if (!(cand && cand->magic == SUPERSLAB_MAGIC)) continue;
@ -248,25 +263,16 @@ SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) {
}
if (mask == 0) continue; // No visible freelists in this SS
int cap = ss_slabs_capacity(cand);
// Iterate set bits only
while (mask) {
int sidx = __builtin_ctz(mask);
mask &= (mask - 1); // clear lowest set bit
mask &= (mask - 1);
if (sidx >= cap) continue;
SlabHandle h = slab_try_acquire(cand, sidx, self_tid);
if (!slab_is_valid(&h)) continue;
if (slab_remote_pending(&h)) {
slab_drain_remote_full(&h);
}
if (slab_is_safe_to_bind(&h)) {
tiny_tls_bind_slab(tls, h.ss, h.slab_idx);
if (adopt_bind_if_safe_local(tls, cand, sidx, class_idx)) {
g_adopt_gate_success[class_idx]++;
g_reg_scan_hits[class_idx]++;
ROUTE_MARK(14); ROUTE_COMMIT(class_idx, 0x07);
slab_release(&h);
return h.ss;
return cand;
}
slab_release(&h);
}
}
return NULL;
@ -1455,7 +1461,7 @@ static inline int ultra_batch_for_class(int class_idx) {
case 1: return 96; // 16B: best in A/B tests
case 2: return 96; // 32B: best in A/B tests
case 3: return 224; // 64B: best in A/B tests
case 4: return 64; // 128B
case 4: return 96; // 128B (promote front refill a bit)
case 5: return 64; // 256B (promote front refill)
case 6: return 64; // 512B (promote front refill)
default: return 32; // 1024B and others

View File

@ -23,7 +23,7 @@ int hak_is_initializing(void);
#define TINY_NUM_CLASSES 8
#define TINY_SLAB_SIZE (64 * 1024) // 64KB per slab
#define TINY_MAX_SIZE 1024 // Maximum allocation size (1KB)
#define TINY_MAX_SIZE 1536 // Maximum allocation size (1.5KB, accommodate 1024B + header)
// ============================================================================
// Size Classes
@ -244,12 +244,14 @@ void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold);
static inline int hak_tiny_size_to_class(size_t size) {
if (size == 0 || size > TINY_MAX_SIZE) return -1;
#if HAKMEM_TINY_HEADER_CLASSIDX
// Phase 7 CRITICAL FIX (2025-11-08): Add 1-byte header overhead BEFORE class lookup
// Bug: 64B request was mapped to class 3 (64B blocks), leaving only 63B usable → BUS ERROR
// Fix: 64B request → alloc_size=65 → class 4 (128B blocks) → 127B usable ✓
size_t alloc_size = size + 1; // Add header overhead
if (alloc_size > TINY_MAX_SIZE) return -1; // 1024B request becomes 1025B, reject to Mid
return g_size_to_class_lut_1k[alloc_size]; // Look up with header-adjusted size
// Phase 7 header adds +1 byte. Special-case 1024B to remain in Tiny (no header).
// Rationale: Avoid forcing 1024B to Mid/OS which causes frequent mmap/madvise.
if (size == TINY_MAX_SIZE) {
return g_size_to_class_lut_1k[size]; // class 7 (1024B blocks)
}
size_t alloc_size = size + 1; // Add header for other sizes
if (alloc_size > TINY_MAX_SIZE) return -1;
return g_size_to_class_lut_1k[alloc_size];
#else
return g_size_to_class_lut_1k[size]; // 1..1024: single load
#endif

View File

@ -414,6 +414,10 @@ void hak_tiny_init(void) {
char* m = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
if (m) { int v = atoi(m); if (v < 0) v = 0; if (v > 256) v = 256; g_refill_count_mid = v; }
}
// Sensible default for class 7 (1024B): favor larger refill to reduce refills/syscalls
if (g_refill_count_class[7] == 0) {
g_refill_count_class[7] = 64; // can be overridden by env HAKMEM_TINY_REFILL_COUNT_C7
}
{
char* fast_env = getenv("HAKMEM_TINY_FAST");
if (fast_env && atoi(fast_env) == 0) g_fast_enable = 0;

View File

@ -204,14 +204,20 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
TinySlabMeta* meta = tls->meta;
if (!meta) return 0;
// Class 5/6/7 special-case: simple batch refill (favor linear carve, minimal branching)
if (__builtin_expect(class_idx >= 5, 0)) {
// Class 4/5/6/7 special-case: simple batch refill (favor linear carve, minimal branching)
// Optional gate for class3 via env: HAKMEM_TINY_SIMPLE_REFILL_C3=1
static int g_simple_c3 = -1;
if (__builtin_expect(g_simple_c3 == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_SIMPLE_REFILL_C3");
g_simple_c3 = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(class_idx >= 4 || (class_idx == 3 && g_simple_c3), 0)) {
uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
if (room <= 0) return 0;
int take = max_take < room ? max_take : room;
int taken = 0;
size_t bs = g_tiny_class_sizes[class_idx];
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
for (; taken < take;) {
// Linear first (LIKELY for class7)
if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
@ -251,7 +257,7 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
int take = max_take < room ? max_take : room;
int taken = 0;
size_t bs = g_tiny_class_sizes[class_idx];
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
while (taken < take) {
void* p = NULL;
if (__builtin_expect(meta->freelist != NULL, 0)) {
@ -311,7 +317,7 @@ static inline void* superslab_tls_bump_fast(int class_idx) {
uint32_t avail = (uint32_t)cap - (uint32_t)used;
uint32_t chunk = (g_bump_chunk > 0 ? (uint32_t)g_bump_chunk : 1u);
if (chunk > avail) chunk = avail;
size_t bs = g_tiny_class_sizes[tls->ss->size_class];
size_t bs = g_tiny_class_sizes[tls->ss->size_class] + ((tls->ss->size_class != 7) ? 1 : 0);
uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
uint8_t* start = base + ((size_t)used * bs);
// Reserve the chunk once in header (keeps remote-free accounting valid)
@ -412,7 +418,7 @@ static inline void ultra_refill_sll(int class_idx) {
}
}
if (slab) {
size_t bs = g_tiny_class_sizes[class_idx];
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
int remaining = need;
while (remaining > 0 && slab->free_count > 0) {
if ((int)g_tls_sll_count[class_idx] >= sll_cap) break;

View File

@ -90,7 +90,8 @@ static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
return 0;
}
size_t bs = g_tiny_class_sizes[class_idx];
// Effective stride: class block size + 1-byte header for classes 0..6
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
int total_taken = 0;
// === P0 Batch Carving Loop ===

View File

@ -184,8 +184,13 @@ static void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err) {
g_hakmem_lock_depth--; // Now safe to restore (all libc calls complete)
}
// Global counters for debugging (non-static for external access)
_Atomic uint64_t g_ss_mmap_count = 0;
_Atomic uint64_t g_final_fallback_mmap_count = 0;
static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate) {
void* ptr = NULL;
static int log_count = 0;
#ifdef MAP_ALIGNED_SUPER
int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER;
@ -199,6 +204,7 @@ static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask
map_flags,
-1, 0);
if (ptr != MAP_FAILED) {
atomic_fetch_add(&g_ss_mmap_count, 1);
if (((uintptr_t)ptr & ss_mask) == 0) {
ss_stats_os_alloc(size_class, ss_size);
return ptr;
@ -221,6 +227,14 @@ static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask
PROT_READ | PROT_WRITE,
flags,
-1, 0);
if (raw != MAP_FAILED) {
uint64_t count = atomic_fetch_add(&g_ss_mmap_count, 1) + 1;
if (log_count < 10) {
fprintf(stderr, "[SUPERSLAB_MMAP] #%lu: class=%d size=%zu (total SuperSlab mmaps so far)\n",
(unsigned long)count, size_class, ss_size);
log_count++;
}
}
if (raw == MAP_FAILED) {
log_superslab_oom_once(ss_size, alloc_size, errno);
return NULL;
@ -717,15 +731,22 @@ void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_
//
// Phase 6-2.5: Use constants from hakmem_tiny_superslab_constants.h
size_t usable_size = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE;
int capacity = (int)(usable_size / block_size);
// Header-aware stride: include 1-byte header for classes 0-6 when enabled
size_t stride = block_size;
#if HAKMEM_TINY_HEADER_CLASSIDX
if (__builtin_expect(ss->size_class != 7, 1)) {
stride += 1;
}
#endif
int capacity = (int)(usable_size / stride);
// Diagnostic: Verify capacity for class 7 slab 0 (one-shot)
if (ss->size_class == 7 && slab_idx == 0) {
static _Atomic int g_cap_log_printed = 0;
if (atomic_load(&g_cap_log_printed) == 0 &&
atomic_exchange(&g_cap_log_printed, 1) == 0) {
fprintf(stderr, "[SUPERSLAB_INIT] class 7 slab 0: usable_size=%zu block_size=%zu capacity=%d\n",
usable_size, block_size, capacity);
fprintf(stderr, "[SUPERSLAB_INIT] class 7 slab 0: usable_size=%zu stride=%zu capacity=%d\n",
usable_size, stride, capacity);
fprintf(stderr, "[SUPERSLAB_INIT] Expected: 63488 / 1024 = 62 blocks\n");
if (capacity != 62) {
fprintf(stderr, "[SUPERSLAB_INIT] WARNING: capacity=%d (expected 62!)\n", capacity);

View File

@ -25,6 +25,7 @@
#include "tiny_debug_ring.h"
#include "tiny_remote.h"
#include "hakmem_tiny_superslab_constants.h" // Phase 6-2.5: Centralized layout constants
#include "hakmem_build_flags.h"
// Debug instrumentation flags (defined in hakmem_tiny.c)
extern int g_debug_remote_guard;
@ -33,6 +34,31 @@ extern _Atomic uint64_t g_ss_active_dec_calls;
uint32_t tiny_remote_drain_threshold(void);
// ============================================================================
// Tiny block stride helper (Phase 7 header-aware)
// ============================================================================
// Returns the effective per-block stride used for linear carving within slabs.
// When header-based class indexing is enabled, classes 0-6 reserve an extra
// byte per block for the header. Class 7 (1024B) remains headerless by design.
static inline size_t tiny_block_stride_for_class(int class_idx) {
size_t bs = g_tiny_class_sizes[class_idx];
#if HAKMEM_TINY_HEADER_CLASSIDX
if (__builtin_expect(class_idx != 7, 1)) bs += 1;
#endif
#if !HAKMEM_BUILD_RELEASE
// One-shot debug: confirm stride behavior at runtime for class 0
static _Atomic int g_stride_dbg = 0;
if (class_idx == 0) {
int exp = 0;
if (atomic_compare_exchange_strong(&g_stride_dbg, &exp, 1)) {
fprintf(stderr, "[STRIDE_DBG] HEADER_CLASSIDX=%d class=%d stride=%zu\n",
(int)HAKMEM_TINY_HEADER_CLASSIDX, class_idx, bs);
}
}
#endif
return bs;
}
// ============================================================================
// Phase 2a: Dynamic Expansion - Global per-class SuperSlabHeads
// ============================================================================

View File

@ -0,0 +1,105 @@
#include "pool_refill.h"
#include "pool_tls.h"
#include <sys/mman.h>
#include <stdint.h>
#include <errno.h>
// Get refill count from Box 1
extern int pool_get_refill_count(int class_idx);
// Refill and return first block
void* pool_refill_and_alloc(int class_idx) {
int count = pool_get_refill_count(class_idx);
if (count <= 0) return NULL;
// Batch allocate from existing Pool backend
void* chain = backend_batch_carve(class_idx, count);
if (!chain) return NULL; // OOM
// Pop first block for return
void* ret = chain;
chain = *(void**)chain;
count--;
#if POOL_USE_HEADERS
// Write header for the block we're returning
*((uint8_t*)ret - POOL_HEADER_SIZE) = POOL_MAGIC | class_idx;
#endif
// Install rest in TLS (if any)
if (count > 0 && chain) {
pool_install_chain(class_idx, chain, count);
}
return ret;
}
// Backend batch carve - Phase 1: Direct mmap allocation
void* backend_batch_carve(int class_idx, int count) {
if (class_idx < 0 || class_idx >= POOL_SIZE_CLASSES || count <= 0) {
return NULL;
}
// Get the class size
size_t block_size = POOL_CLASS_SIZES[class_idx];
// For Phase 1: Allocate a single large chunk via mmap
// and carve it into blocks
#if POOL_USE_HEADERS
size_t total_block_size = block_size + POOL_HEADER_SIZE;
#else
size_t total_block_size = block_size;
#endif
// Allocate enough for all requested blocks
size_t total_size = total_block_size * count;
// Round up to page size
size_t page_size = 4096;
total_size = (total_size + page_size - 1) & ~(page_size - 1);
// Allocate memory via mmap
void* chunk = mmap(NULL, total_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (chunk == MAP_FAILED) {
return NULL;
}
// Carve into blocks and chain them
void* head = NULL;
void* tail = NULL;
char* ptr = (char*)chunk;
for (int i = 0; i < count; i++) {
#if POOL_USE_HEADERS
// Skip header space - user data starts after header
void* user_ptr = ptr + POOL_HEADER_SIZE;
#else
void* user_ptr = ptr;
#endif
// Chain the blocks
if (!head) {
head = user_ptr;
tail = user_ptr;
} else {
*(void**)tail = user_ptr;
tail = user_ptr;
}
// Move to next block
ptr += total_block_size;
// Stop if we'd go past the allocated chunk
if ((ptr + total_block_size) > ((char*)chunk + total_size)) {
break;
}
}
// Terminate chain
if (tail) {
*(void**)tail = NULL;
}
return head;
}

View File

@ -2,6 +2,14 @@
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/syscall.h>
#include <unistd.h>
#include "pool_tls_registry.h"
static inline pid_t gettid_cached(void){
static __thread pid_t t=0; if (__builtin_expect(t==0,0)) t=(pid_t)syscall(SYS_gettid); return t;
}
#include <stdio.h>
// Class sizes: 8KB, 16KB, 24KB, 32KB, 40KB, 48KB, 52KB
const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES] = {
@ -12,11 +20,27 @@ const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES] = {
__thread void* g_tls_pool_head[POOL_SIZE_CLASSES];
__thread uint32_t g_tls_pool_count[POOL_SIZE_CLASSES];
// Phase 1.5b: Lazy pre-warm flag (per-thread)
#ifdef HAKMEM_POOL_TLS_PREWARM
__thread int g_tls_pool_prewarmed = 0;
#endif
// Fixed refill counts (Phase 1: no learning)
static const uint32_t DEFAULT_REFILL_COUNT[POOL_SIZE_CLASSES] = {
64, 48, 32, 32, 24, 16, 16 // Larger classes = smaller refill
};
// Pre-warm counts optimized for memory usage (Phase 1.5b)
// Total memory: ~1.6MB per thread
// Hot classes (8-24KB): 16 blocks - common in real workloads
// Warm classes (32-40KB): 8 blocks
// Cold classes (48-52KB): 4 blocks - rare
static const int PREWARM_COUNTS[POOL_SIZE_CLASSES] = {
16, 16, 12, // Hot: 8KB, 16KB, 24KB
8, 8, // Warm: 32KB, 40KB
4, 4 // Cold: 48KB, 52KB
};
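// Sanity check on the ~1.6MB figure (assuming the 8/16/24/32/40/48/52 KB
// class sizes in POOL_CLASS_SIZES above):
//   16*8 + 16*16 + 12*24 + 8*32 + 8*40 + 4*48 + 4*52
// = 128 + 256 + 288 + 256 + 320 + 192 + 208 = 1648 KB per thread.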
// Forward declare refill function (from Box 2)
extern void* pool_refill_and_alloc(int class_idx);
@ -36,12 +60,34 @@ static inline int pool_size_to_class(size_t size) {
// Ultra-fast allocation (5-6 cycles)
void* pool_alloc(size_t size) {
// Phase 1.5b: Lazy pre-warm on first allocation per thread
#ifdef HAKMEM_POOL_TLS_PREWARM
if (__builtin_expect(!g_tls_pool_prewarmed, 0)) {
g_tls_pool_prewarmed = 1; // Set flag FIRST to prevent recursion!
pool_tls_prewarm(); // Pre-populate TLS caches
}
#endif
// Quick bounds check
if (size < 8192 || size > 53248) return NULL;
int class_idx = pool_size_to_class(size);
if (class_idx < 0) return NULL;
// Drain a small batch of remote frees for this class
extern int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
void* chain = NULL;
int drained = pool_remote_pop_chain(class_idx, 32, &chain);
if (drained > 0 && chain) {
// Splice into TLS freelist
void* tail = chain;
int n = 1;
while (*(void**)tail) { tail = *(void**)tail; n++; }
*(void**)tail = g_tls_pool_head[class_idx];
g_tls_pool_head[class_idx] = chain;
g_tls_pool_count[class_idx] += n;
}
void* head = g_tls_pool_head[class_idx];
if (__builtin_expect(head != NULL, 1)) { // LIKELY
@ -54,6 +100,17 @@ void* pool_alloc(size_t size) {
*((uint8_t*)head - POOL_HEADER_SIZE) = POOL_MAGIC | class_idx;
#endif
// Low-water integration: if TLS count is low, opportunistically drain remotes
if (g_tls_pool_count[class_idx] < 4) {
extern int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
void* chain2 = NULL; int got = pool_remote_pop_chain(class_idx, 32, &chain2);
if (got > 0 && chain2) {
void* tail = chain2; while (*(void**)tail) tail = *(void**)tail;
*(void**)tail = g_tls_pool_head[class_idx];
g_tls_pool_head[class_idx] = chain2;
g_tls_pool_count[class_idx] += got;
}
}
return head;
}
@ -78,8 +135,18 @@ void pool_free(void* ptr) {
// Need registry lookup (slower fallback) - not implemented in Phase 1
return;
#endif
// Owner resolution via page registry
pid_t owner_tid=0; int reg_cls=-1;
if (pool_reg_lookup(ptr, &owner_tid, &reg_cls)){
pid_t me = gettid_cached();
if (owner_tid != me){
extern int pool_remote_push(int class_idx, void* ptr, int owner_tid);
(void)pool_remote_push(class_idx, ptr, owner_tid);
return;
}
}
// Push to freelist (2-3 instructions)
// Same-thread: Push to TLS freelist (2-3 instructions)
*(void**)ptr = g_tls_pool_head[class_idx];
g_tls_pool_head[class_idx] = ptr;
g_tls_pool_count[class_idx]++;
@ -109,4 +176,25 @@ void pool_thread_init(void) {
void pool_thread_cleanup(void) {
// Phase 1: No cleanup (keep it simple)
// TODO: Drain back to global pool
}
}
// Pre-warm TLS cache (Phase 1.5b optimization)
// Eliminates cold-start penalty by pre-populating TLS freelists
// Expected improvement: +180-740% (based on Phase 7 Task 3 success)
void pool_tls_prewarm(void) {
// Forward declare refill function (from Box 2)
extern void* backend_batch_carve(int class_idx, int count);
for (int class_idx = 0; class_idx < POOL_SIZE_CLASSES; class_idx++) {
int count = PREWARM_COUNTS[class_idx];
// Directly refill TLS cache (bypass alloc/free during init)
// This avoids issues with g_initializing=1 affecting routing
void* chain = backend_batch_carve(class_idx, count);
if (chain) {
// Install entire chain directly into TLS
pool_install_chain(class_idx, chain, count);
}
// If OOM, continue with other classes (graceful degradation)
}
}

View File

@ -14,10 +14,17 @@ void pool_free(void* ptr);
void pool_thread_init(void);
void pool_thread_cleanup(void);
// Pre-warm TLS cache (Phase 1.5b - call once at thread init)
void pool_tls_prewarm(void);
// Internal API (for Box 2 only)
void pool_install_chain(int class_idx, void* chain, int count);
int pool_get_refill_count(int class_idx);
// Remote queue (cross-thread free) API — Phase 1.5c
int pool_remote_push(int class_idx, void* ptr, int owner_tid);
int pool_remote_drain_light(int class_idx);
// Feature flags
#define POOL_USE_HEADERS 1 // 1-byte headers for O(1) free
@ -26,4 +33,4 @@ int pool_get_refill_count(int class_idx);
#define POOL_HEADER_SIZE 1
#endif
#endif // POOL_TLS_H
#endif // POOL_TLS_H

172
core/pool_tls_arena.c Normal file
View File

@ -0,0 +1,172 @@
#include "pool_tls_arena.h"
#include "pool_tls.h" // For POOL_HEADER_SIZE, POOL_USE_HEADERS
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
// TLS storage (automatically zero-initialized)
__thread PoolChunk g_tls_arena[POOL_SIZE_CLASSES];
int g_arena_max_growth_level = 3; // 0:1MB,1:2MB,2:4MB,3:8MB
size_t g_arena_initial_chunk_size = (size_t)1 << 20; // 1MB
static pthread_once_t g_arena_cfg_once = PTHREAD_ONCE_INIT;
static void arena_read_env(void){
const char* s_init = getenv("HAKMEM_POOL_TLS_ARENA_MB_INIT");
const char* s_max = getenv("HAKMEM_POOL_TLS_ARENA_MB_MAX");
const char* s_gl = getenv("HAKMEM_POOL_TLS_ARENA_GROWTH_LEVELS");
if (s_init){ long v = atol(s_init); if (v>=1 && v<=64) g_arena_initial_chunk_size = (size_t)v << 20; }
if (s_max){ long v = atol(s_max); if (v>=1 && v<=1024){
size_t max_bytes = (size_t)v << 20; size_t sz = g_arena_initial_chunk_size; int lvl=0; while (sz < max_bytes && lvl<30){ sz <<= 1; lvl++; }
g_arena_max_growth_level = lvl; if (g_arena_max_growth_level<0) g_arena_max_growth_level=0; }
}
if (s_gl){ long v = atol(s_gl); if (v>=0 && v<=30) g_arena_max_growth_level = (int)v; }
}
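// Worked example for the env parsing above (assumed values, not defaults):
// HAKMEM_POOL_TLS_ARENA_MB_INIT=2 and HAKMEM_POOL_TLS_ARENA_MB_MAX=16 yield
// g_arena_initial_chunk_size = 2MB and g_arena_max_growth_level = 3, so
// chunk_ensure() grows chunks 2MB -> 4MB -> 8MB -> 16MB and then stays at
// 16MB. HAKMEM_POOL_TLS_ARENA_GROWTH_LEVELS, when set, overrides that cap.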
// External imports (from pool config)
extern const size_t POOL_CLASS_SIZES[POOL_SIZE_CLASSES];
// Debug stats
#ifdef POOL_TLS_ARENA_DEBUG
static __thread struct {
uint64_t mmap_calls;
uint64_t total_carved;
uint64_t chunk_exhaustions;
} g_arena_stats;
#endif
// Ensure chunk has space for at least 'needed' bytes
// Returns 0 on success, -1 on mmap failure
static int chunk_ensure(PoolChunk* chunk, size_t needed) {
// Check if current chunk has space
if (chunk->chunk_base && (chunk->offset + needed <= chunk->chunk_size)) {
return 0; // Space available
}
// Need new chunk - calculate size with exponential growth
pthread_once(&g_arena_cfg_once, arena_read_env);
size_t new_size;
if (chunk->growth_level >= g_arena_max_growth_level) {
new_size = g_arena_initial_chunk_size << g_arena_max_growth_level;
} else {
new_size = g_arena_initial_chunk_size << chunk->growth_level;
chunk->growth_level++;
}
// CRITICAL FIX: DO NOT munmap old chunk!
// Reason: Live allocations may still point into it. Arena chunks are kept
// alive for the thread's lifetime and only freed at thread exit.
// This is standard arena behavior - grow but never shrink.
//
// REMOVED BUGGY CODE:
// if (chunk->chunk_base) {
// munmap(chunk->chunk_base, chunk->chunk_size); // ← SEGV! Live ptrs exist!
// }
//
// OLD CHUNK IS LEAKED INTENTIONALLY - it contains live allocations
#ifdef POOL_TLS_ARENA_DEBUG
if (chunk->chunk_base) {
g_arena_stats.chunk_exhaustions++;
}
#endif
// Allocate new chunk
void* new_base = mmap(NULL, new_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (new_base == MAP_FAILED) {
return -1; // OOM
}
#ifdef POOL_TLS_ARENA_DEBUG
g_arena_stats.mmap_calls++;
#endif
// Register range for owner resolution
pid_t tid = (pid_t)syscall(SYS_gettid);
pool_reg_register(new_base, new_size, tid, -1); // class-less at arena level
chunk->chunk_base = new_base;
chunk->chunk_size = new_size;
chunk->offset = 0;
return 0;
}
// Carve blocks from TLS Arena
int arena_batch_carve(int class_idx, void** out_blocks, int count) {
if (class_idx < 0 || class_idx >= POOL_SIZE_CLASSES) {
return 0; // Invalid class
}
PoolChunk* chunk = &g_tls_arena[class_idx];
size_t block_size = POOL_CLASS_SIZES[class_idx];
// Calculate allocation size with header space
#if POOL_USE_HEADERS
size_t alloc_size = block_size + POOL_HEADER_SIZE;
#else
size_t alloc_size = block_size;
#endif
// Ensure chunk has space for all blocks
size_t needed = alloc_size * count;
if (chunk_ensure(chunk, needed) != 0) {
return 0; // OOM
}
// Carve blocks from chunk
int carved = 0;
for (int i = 0; i < count; i++) {
if (chunk->offset + alloc_size > chunk->chunk_size) {
break; // Chunk exhausted (shouldn't happen after ensure)
}
// Return pointer AFTER header space
out_blocks[i] = (char*)chunk->chunk_base + chunk->offset
#if POOL_USE_HEADERS
+ POOL_HEADER_SIZE
#endif
;
chunk->offset += alloc_size;
carved++;
#ifdef POOL_TLS_ARENA_DEBUG
g_arena_stats.total_carved++;
#endif
}
return carved;
}
// Thread cleanup
static void __attribute__((destructor)) arena_cleanup(void) {
arena_cleanup_thread();
}
void arena_cleanup_thread(void) {
for (int i = 0; i < POOL_SIZE_CLASSES; i++) {
PoolChunk* chunk = &g_tls_arena[i];
if (chunk->chunk_base) {
pid_t tid = (pid_t)syscall(SYS_gettid);
pool_reg_unregister(chunk->chunk_base, chunk->chunk_size, tid);
munmap(chunk->chunk_base, chunk->chunk_size);
chunk->chunk_base = NULL;
}
}
}
#ifdef POOL_TLS_ARENA_DEBUG
#include <stdio.h>
void arena_print_stats(void) {
printf("[Pool TLS Arena Stats]\n");
printf(" mmap calls: %lu\n", g_arena_stats.mmap_calls);
printf(" blocks carved: %lu\n", g_arena_stats.total_carved);
printf(" chunk exhaustions: %lu\n", g_arena_stats.chunk_exhaustions);
}
#endif

4
core/pool_tls_arena.d Normal file
View File

@ -0,0 +1,4 @@
core/pool_tls_arena.o: core/pool_tls_arena.c core/pool_tls_arena.h \
core/pool_tls.h
core/pool_tls_arena.h:
core/pool_tls.h:

31
core/pool_tls_arena.h Normal file
View File

@ -0,0 +1,31 @@
#ifndef HAKMEM_POOL_TLS_ARENA_H
#define HAKMEM_POOL_TLS_ARENA_H
#include <stddef.h>
// Configuration
#define POOL_SIZE_CLASSES 7
extern int g_arena_max_growth_level; // 0..N (3 => 8MB cap)
extern size_t g_arena_initial_chunk_size; // bytes (default 1MB)
// TLS Arena Chunk
typedef struct {
void* chunk_base; // mmap base address (page-aligned)
size_t chunk_size; // Current chunk size (1/2/4/8 MB)
size_t offset; // Next carve offset
int growth_level; // 0=1MB, 1=2MB, 2=4MB, 3=8MB
} PoolChunk;
// API
// Carve 'count' blocks from TLS Arena for 'class_idx'
// Returns number of blocks carved (0 on OOM)
int arena_batch_carve(int class_idx, void** out_blocks, int count);
// Thread cleanup (munmap all chunks)
void arena_cleanup_thread(void);
#ifdef POOL_TLS_ARENA_DEBUG
void arena_print_stats(void);
#endif
#endif // HAKMEM_POOL_TLS_ARENA_H

68
core/pool_tls_registry.c Normal file
View File

@ -0,0 +1,68 @@
#include "pool_tls_registry.h"
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
typedef struct RegEntry {
void* base;
void* end;
pid_t tid;
int class_idx;
struct RegEntry* next;
} RegEntry;
#define REG_BUCKETS 1024
static RegEntry* g_buckets[REG_BUCKETS];
static pthread_mutex_t g_locks[REG_BUCKETS];
static pthread_once_t g_init_once = PTHREAD_ONCE_INIT;
static void reg_init(void){
for (int i=0;i<REG_BUCKETS;i++) pthread_mutex_init(&g_locks[i], NULL);
}
static inline uint64_t hash_ptr(void* p){
uintptr_t x=(uintptr_t)p; x ^= x>>33; x*=0xff51afd7ed558ccdULL; x ^= x>>33; x*=0xc4ceb9fe1a85ec53ULL; x ^= x>>33; return x;
}
void pool_reg_register(void* base, size_t size, pid_t tid, int class_idx){
pthread_once(&g_init_once, reg_init);
void* end = (void*)((char*)base + size);
uint64_t h = hash_ptr(base) & (REG_BUCKETS-1);
pthread_mutex_lock(&g_locks[h]);
RegEntry* e = (RegEntry*)malloc(sizeof(RegEntry));
e->base = base; e->end = end; e->tid = tid; e->class_idx = class_idx; e->next = g_buckets[h];
g_buckets[h] = e;
pthread_mutex_unlock(&g_locks[h]);
}
void pool_reg_unregister(void* base, size_t size, pid_t tid){
pthread_once(&g_init_once, reg_init);
uint64_t h = hash_ptr(base) & (REG_BUCKETS-1);
pthread_mutex_lock(&g_locks[h]);
RegEntry** pp = &g_buckets[h];
while (*pp){
RegEntry* e = *pp;
if (e->base == base && e->tid == tid){
*pp = e->next; free(e); break;
}
pp = &e->next;
}
pthread_mutex_unlock(&g_locks[h]);
}
int pool_reg_lookup(void* ptr, pid_t* tid_out, int* class_idx_out){
pthread_once(&g_init_once, reg_init);
uint64_t h = hash_ptr(ptr) & (REG_BUCKETS-1);
pthread_mutex_lock(&g_locks[h]);
for (RegEntry* e = g_buckets[h]; e; e=e->next){
if (ptr >= e->base && ptr < e->end){
if (tid_out) *tid_out = e->tid;
if (class_idx_out) *class_idx_out = e->class_idx;
pthread_mutex_unlock(&g_locks[h]);
return 1;
}
}
pthread_mutex_unlock(&g_locks[h]);
return 0;
}

16
core/pool_tls_registry.h Normal file
View File

@ -0,0 +1,16 @@
#ifndef HAKMEM_POOL_TLS_REGISTRY_H
#define HAKMEM_POOL_TLS_REGISTRY_H
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
// Register an arena chunk range with owner thread id and class index
void pool_reg_register(void* base, size_t size, pid_t tid, int class_idx);
// Unregister a previously registered chunk
void pool_reg_unregister(void* base, size_t size, pid_t tid);
// Lookup owner for a pointer; returns 1 if found, 0 otherwise
int pool_reg_lookup(void* ptr, pid_t* tid_out, int* class_idx_out);
#endif

72
core/pool_tls_remote.c Normal file
View File

@ -0,0 +1,72 @@
#include "pool_tls_remote.h"
#include <pthread.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>
#define REMOTE_BUCKETS 256
typedef struct RemoteRec {
int tid;
void* head[7];
int count[7];
struct RemoteRec* next;
} RemoteRec;
static RemoteRec* g_buckets[REMOTE_BUCKETS];
static pthread_mutex_t g_locks[REMOTE_BUCKETS];
static pthread_once_t g_once = PTHREAD_ONCE_INIT;
static void rq_init(void){
for (int i=0;i<REMOTE_BUCKETS;i++) pthread_mutex_init(&g_locks[i], NULL);
}
static inline unsigned hb(int tid){ return (unsigned)tid & (REMOTE_BUCKETS-1); }
int pool_remote_push(int class_idx, void* ptr, int owner_tid){
if (class_idx < 0 || class_idx > 6 || ptr == NULL) return 0;
pthread_once(&g_once, rq_init);
unsigned b = hb(owner_tid);
pthread_mutex_lock(&g_locks[b]);
RemoteRec* r = g_buckets[b];
while (r && r->tid != owner_tid) r = r->next;
if (!r){
r = (RemoteRec*)calloc(1, sizeof(RemoteRec));
r->tid = owner_tid; r->next = g_buckets[b]; g_buckets[b] = r;
}
*(void**)ptr = r->head[class_idx];
r->head[class_idx] = ptr;
r->count[class_idx]++;
pthread_mutex_unlock(&g_locks[b]);
return 1;
}
// Drain up to a small batch for this thread and class
int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain){
if (class_idx < 0 || class_idx > 6 || out_chain==NULL) return 0;
pthread_once(&g_once, rq_init);
int mytid = (int)syscall(SYS_gettid);
unsigned b = hb(mytid);
pthread_mutex_lock(&g_locks[b]);
RemoteRec* r = g_buckets[b];
while (r && r->tid != mytid) r = r->next;
int drained = 0;
if (r){
// Pop up to max_take nodes and return chain
void* head = r->head[class_idx];
int batch = 0; if (max_take <= 0) max_take = 32;
void* chain = NULL; void* tail = NULL;
while (head && batch < max_take){
void* nxt = *(void**)head;
if (!chain){ chain = head; tail = head; }
else { *(void**)tail = head; tail = head; }
head = nxt; batch++;
}
r->head[class_idx] = head;
r->count[class_idx] -= batch;
drained = batch;
*out_chain = chain;
}
pthread_mutex_unlock(&g_locks[b]);
return drained;
}

9
core/pool_tls_remote.h Normal file
View File

@ -0,0 +1,9 @@
#ifndef POOL_TLS_REMOTE_H
#define POOL_TLS_REMOTE_H
#include <stdint.h>
int pool_remote_push(int class_idx, void* ptr, int owner_tid);
int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain);
#endif

View File

@ -336,6 +336,8 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
// Previous: Complex precedence logic on every miss (5-10 cycles overhead)
// Now: Simple TLS cache lookup (1-2 cycles)
static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
// Simple adaptive booster: bump per-class refill size when refills are frequent.
static __thread uint8_t s_refill_calls[TINY_NUM_CLASSES] = {0};
int cnt = s_refill_count[class_idx];
if (__builtin_expect(cnt == 0, 0)) {
// First miss: Initialize from globals (parsed at init time)
@ -375,6 +377,26 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
// Note: g_rf_hit_slab counter is incremented inside sll_refill_small_from_ss()
int refilled = sll_refill_small_from_ss(class_idx, cnt);
// Lightweight adaptation: if refills keep happening, increase per-class refill.
// Focus on class 7 (1024B) to reduce mmap/refill frequency under Tiny-heavy loads.
if (refilled > 0) {
uint8_t c = ++s_refill_calls[class_idx];
if (class_idx == 7) {
// Every 4 refills, increase target by +16 up to 128 (unless overridden).
if ((c & 0x03u) == 0) {
int target = s_refill_count[class_idx];
if (target < 128) {
target += 16;
if (target > 128) target = 128;
s_refill_count[class_idx] = target;
}
}
}
} else {
// No refill performed (capacity full): slowly decay the counter.
if (s_refill_calls[class_idx] > 0) s_refill_calls[class_idx]--;
}
// Phase 2b: Track refill and adapt cache size
if (refilled > 0) {
track_refill_for_adaptation(class_idx);

View File

@ -60,7 +60,8 @@ static inline void trc_splice_to_sll(int class_idx, TinyRefillChain* c,
// CORRUPTION DEBUG: Validate chain before splicing
if (__builtin_expect(trc_refill_guard_enabled(), 0)) {
extern const size_t g_tiny_class_sizes[];
size_t blk = g_tiny_class_sizes[class_idx];
// Validate alignment using effective stride (include header for classes 0..6)
size_t blk = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
fprintf(stderr, "[SPLICE_TO_SLL] cls=%d head=%p tail=%p count=%u\n",
class_idx, c->head, c->tail, c->count);
@ -187,7 +188,13 @@ static inline uint32_t trc_linear_carve(uint8_t* base, size_t bs,
}
// FIX: Use carved counter (monotonic) instead of used (which decrements on free)
uint8_t* cursor = base + ((size_t)meta->carved * bs);
// Effective stride: account for Tiny header when enabled (classes 0..6)
#if HAKMEM_TINY_HEADER_CLASSIDX
size_t stride = (bs == 1024 ? bs : (bs + 1));
#else
size_t stride = bs;
#endif
uint8_t* cursor = base + ((size_t)meta->carved * stride);
void* head = (void*)cursor;
// CORRUPTION DEBUG: Log carve operation
@ -197,7 +204,7 @@ static inline uint32_t trc_linear_carve(uint8_t* base, size_t bs,
}
for (uint32_t i = 1; i < batch; i++) {
uint8_t* next = cursor + bs;
uint8_t* next = cursor + stride;
*(void**)cursor = (void*)next;
cursor = next;
}

View File

@ -44,17 +44,18 @@
static inline void* tiny_region_id_write_header(void* base, int class_idx) {
if (!base) return base;
// Special-case class 7 (1024B blocks): return full block without header.
// Rationale: 1024B requests must not pay an extra 1-byte header (would overflow)
// and routing them to Mid/OS causes excessive mmap/madvise. We keep Tiny owner
// and let free() take the slow path (headerless → slab lookup).
if (__builtin_expect(class_idx == 7, 0)) {
return base; // no header written; user gets full 1024B
}
// Write header at block start
uint8_t* header_ptr = (uint8_t*)base;
// CRITICAL (Phase 7-1.3): ALWAYS write magic byte for safety
// Reason: Free path ALWAYS validates magic (even in release) to detect
// non-Tiny allocations. Without magic, all frees would fail validation.
// Performance: Magic write is FREE (same 1-byte write, just different value)
*header_ptr = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
// Return user pointer (skip header)
return header_ptr + 1;
return header_ptr + 1; // skip header for user pointer
}
// ========== Read Header (Free) ==========

View File

@ -13,6 +13,7 @@
// ============================================================================
// Phase 6.24: Allocate from SuperSlab slab (lazy freelist + linear allocation)
#include "hakmem_tiny_superslab_constants.h"
static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
TinySlabMeta* meta = &ss->slabs[slab_idx];
@ -70,13 +71,36 @@ static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
// This avoids the 4000-8000 cycle cost of building freelist on init
if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
// Linear allocation: use canonical tiny_slab_base_for() only
size_t block_size = g_tiny_class_sizes[ss->size_class];
size_t unit_sz = g_tiny_class_sizes[ss->size_class]
#if HAKMEM_TINY_HEADER_CLASSIDX
+ ((ss->size_class != 7) ? 1 : 0)
#endif
;
uint8_t* base = tiny_slab_base_for(ss, slab_idx);
void* block = (void*)(base + ((size_t)meta->used * block_size));
void* block_base = (void*)(base + ((size_t)meta->used * unit_sz));
#if !HAKMEM_BUILD_RELEASE
// Debug safety: Ensure we never carve past slab usable region (capacity mismatch guard)
size_t dbg_usable = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE;
uintptr_t dbg_off = (uintptr_t)((uint8_t*)block_base - base);
if (__builtin_expect(dbg_off + unit_sz > dbg_usable, 0)) {
fprintf(stderr, "[TINY_ALLOC_BOUNDS] cls=%u slab=%d used=%u cap=%u unit=%zu off=%lu usable=%zu\n",
ss->size_class, slab_idx, meta->used, meta->capacity, unit_sz,
(unsigned long)dbg_off, dbg_usable);
return NULL;
}
#endif
meta->used++;
tiny_remote_track_on_alloc(ss, slab_idx, block, "linear_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, block, "linear_alloc_ret", 0);
return block; // Fast path: O(1) pointer arithmetic
void* user =
#if HAKMEM_TINY_HEADER_CLASSIDX
tiny_region_id_write_header(block_base, ss->size_class);
#else
block_base;
#endif
if (__builtin_expect(g_debug_remote_guard, 0)) {
tiny_remote_track_on_alloc(ss, slab_idx, user, "linear_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, user, "linear_alloc_ret", 0);
}
return user; // Fast path: O(1) pointer arithmetic
}
// Freelist mode (after first free())
@ -125,8 +149,10 @@ static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
}
}
tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0);
if (__builtin_expect(g_debug_remote_guard, 0)) {
tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0);
}
return block;
}