Files
hakmem/core/pool_tls_remote.c

75 lines
2.3 KiB
C
Raw Normal View History

#include "pool_tls_remote.h"
#include <pthread.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>
Front-Direct implementation: SS→FC direct refill + SLL complete bypass ## Summary Implemented Front-Direct architecture with complete SLL bypass: - Direct SuperSlab → FastCache refill (1-hop, bypasses SLL) - SLL-free allocation/free paths when Front-Direct enabled - Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only) ## New Modules - core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point - Remote drain → Freelist → Carve priority - Header restoration for C1-C6 (NOT C0/C7) - ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN - core/front/fast_cache.h: FastCache (L1) type definition - core/front/quick_slot.h: QuickSlot (L0) type definition ## Allocation Path (core/tiny_alloc_fast.inc.h) - Added s_front_direct_alloc TLS flag (lazy ENV check) - SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc - Refill dispatch: - Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop) - Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only) - SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in) ## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h) - FC priority: Try fastcache_push() first (same-thread free) - tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable - Fallback: Magazine/slow path (safe, bypasses SLL) ## Legacy Sealing - SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1) - Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak - Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry ## ENV Controls - HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct) - HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name) - HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct) - HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF) - HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE) ## Benchmarks (Front-Direct Enabled) ```bash ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1 HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1 HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96 HAKMEM_TINY_BUMP_CHUNK=256 bench_random_mixed (16-1040B random, 200K iter): 256 slots: 1.44M ops/s (STABLE, 0 SEGV) 128 slots: 1.44M ops/s (STABLE, 0 SEGV) bench_fixed_size (fixed size, 200K iter): 256B: 4.06M ops/s (has debug logs, expected >10M without logs) 128B: Similar (debug logs affect) ``` ## Verification - TRACE_RING test (10K iter): **0 SLL events** detected ✅ - Complete SLL bypass confirmed when Front-Direct=1 - Stable execution: 200K iterations × multiple sizes, 0 SEGV ## Next Steps - Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range) - Re-benchmark with clean Release build (target: 10-15M ops/s) - 128/256B shortcut path optimization (FC hit rate improvement) Co-Authored-By: ChatGPT <chatgpt@openai.com> Suggested-By: ultrathink
2025-11-14 05:41:49 +09:00
#include "box/tiny_next_ptr_box.h" // Box API: preserve header by using class-aware next offset
#define REMOTE_BUCKETS 256
typedef struct RemoteRec {
int tid;
void* head[7];
int count[7];
struct RemoteRec* next;
} RemoteRec;
static RemoteRec* g_buckets[REMOTE_BUCKETS];
static pthread_mutex_t g_locks[REMOTE_BUCKETS];
static pthread_once_t g_once = PTHREAD_ONCE_INIT;
static void rq_init(void){
for (int i=0;i<REMOTE_BUCKETS;i++) pthread_mutex_init(&g_locks[i], NULL);
}
static inline unsigned hb(int tid){ return (unsigned)tid & (REMOTE_BUCKETS-1); }
int pool_remote_push(int class_idx, void* ptr, int owner_tid){
if (class_idx < 0 || class_idx > 6 || ptr == NULL) return 0;
pthread_once(&g_once, rq_init);
unsigned b = hb(owner_tid);
pthread_mutex_lock(&g_locks[b]);
RemoteRec* r = g_buckets[b];
while (r && r->tid != owner_tid) r = r->next;
if (!r){
r = (RemoteRec*)calloc(1, sizeof(RemoteRec));
r->tid = owner_tid; r->next = g_buckets[b]; g_buckets[b] = r;
}
Front-Direct implementation: SS→FC direct refill + SLL complete bypass ## Summary Implemented Front-Direct architecture with complete SLL bypass: - Direct SuperSlab → FastCache refill (1-hop, bypasses SLL) - SLL-free allocation/free paths when Front-Direct enabled - Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only) ## New Modules - core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point - Remote drain → Freelist → Carve priority - Header restoration for C1-C6 (NOT C0/C7) - ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN - core/front/fast_cache.h: FastCache (L1) type definition - core/front/quick_slot.h: QuickSlot (L0) type definition ## Allocation Path (core/tiny_alloc_fast.inc.h) - Added s_front_direct_alloc TLS flag (lazy ENV check) - SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc - Refill dispatch: - Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop) - Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only) - SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in) ## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h) - FC priority: Try fastcache_push() first (same-thread free) - tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable - Fallback: Magazine/slow path (safe, bypasses SLL) ## Legacy Sealing - SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1) - Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak - Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry ## ENV Controls - HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct) - HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name) - HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct) - HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF) - HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE) ## Benchmarks (Front-Direct Enabled) ```bash ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1 HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1 HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96 HAKMEM_TINY_BUMP_CHUNK=256 bench_random_mixed (16-1040B random, 200K iter): 256 slots: 1.44M ops/s (STABLE, 0 SEGV) 128 slots: 1.44M ops/s (STABLE, 0 SEGV) bench_fixed_size (fixed size, 200K iter): 256B: 4.06M ops/s (has debug logs, expected >10M without logs) 128B: Similar (debug logs affect) ``` ## Verification - TRACE_RING test (10K iter): **0 SLL events** detected ✅ - Complete SLL bypass confirmed when Front-Direct=1 - Stable execution: 200K iterations × multiple sizes, 0 SEGV ## Next Steps - Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range) - Re-benchmark with clean Release build (target: 10-15M ops/s) - 128/256B shortcut path optimization (FC hit rate improvement) Co-Authored-By: ChatGPT <chatgpt@openai.com> Suggested-By: ultrathink
2025-11-14 05:41:49 +09:00
// Use Box next-pointer API to avoid clobbering header (classes 1-6 store next at base+1)
tiny_next_write(class_idx, ptr, r->head[class_idx]);
r->head[class_idx] = ptr;
r->count[class_idx]++;
pthread_mutex_unlock(&g_locks[b]);
return 1;
}
// Drain up to a small batch for this thread and class
int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain){
if (class_idx < 0 || class_idx > 6 || out_chain==NULL) return 0;
pthread_once(&g_once, rq_init);
int mytid = (int)syscall(SYS_gettid);
unsigned b = hb(mytid);
pthread_mutex_lock(&g_locks[b]);
RemoteRec* r = g_buckets[b];
while (r && r->tid != mytid) r = r->next;
int drained = 0;
if (r){
// Pop up to max_take nodes and return chain
void* head = r->head[class_idx];
int batch = 0; if (max_take <= 0) max_take = 32;
void* chain = NULL; void* tail = NULL;
while (head && batch < max_take){
Front-Direct implementation: SS→FC direct refill + SLL complete bypass ## Summary Implemented Front-Direct architecture with complete SLL bypass: - Direct SuperSlab → FastCache refill (1-hop, bypasses SLL) - SLL-free allocation/free paths when Front-Direct enabled - Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only) ## New Modules - core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point - Remote drain → Freelist → Carve priority - Header restoration for C1-C6 (NOT C0/C7) - ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN - core/front/fast_cache.h: FastCache (L1) type definition - core/front/quick_slot.h: QuickSlot (L0) type definition ## Allocation Path (core/tiny_alloc_fast.inc.h) - Added s_front_direct_alloc TLS flag (lazy ENV check) - SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc - Refill dispatch: - Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop) - Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only) - SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in) ## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h) - FC priority: Try fastcache_push() first (same-thread free) - tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable - Fallback: Magazine/slow path (safe, bypasses SLL) ## Legacy Sealing - SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1) - Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak - Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry ## ENV Controls - HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct) - HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name) - HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct) - HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF) - HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE) ## Benchmarks (Front-Direct Enabled) ```bash ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1 HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1 HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96 HAKMEM_TINY_BUMP_CHUNK=256 bench_random_mixed (16-1040B random, 200K iter): 256 slots: 1.44M ops/s (STABLE, 0 SEGV) 128 slots: 1.44M ops/s (STABLE, 0 SEGV) bench_fixed_size (fixed size, 200K iter): 256B: 4.06M ops/s (has debug logs, expected >10M without logs) 128B: Similar (debug logs affect) ``` ## Verification - TRACE_RING test (10K iter): **0 SLL events** detected ✅ - Complete SLL bypass confirmed when Front-Direct=1 - Stable execution: 200K iterations × multiple sizes, 0 SEGV ## Next Steps - Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range) - Re-benchmark with clean Release build (target: 10-15M ops/s) - 128/256B shortcut path optimization (FC hit rate improvement) Co-Authored-By: ChatGPT <chatgpt@openai.com> Suggested-By: ultrathink
2025-11-14 05:41:49 +09:00
void* nxt = tiny_next_read(class_idx, head);
if (!chain){ chain = head; tail = head; }
Front-Direct implementation: SS→FC direct refill + SLL complete bypass ## Summary Implemented Front-Direct architecture with complete SLL bypass: - Direct SuperSlab → FastCache refill (1-hop, bypasses SLL) - SLL-free allocation/free paths when Front-Direct enabled - Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only) ## New Modules - core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point - Remote drain → Freelist → Carve priority - Header restoration for C1-C6 (NOT C0/C7) - ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN - core/front/fast_cache.h: FastCache (L1) type definition - core/front/quick_slot.h: QuickSlot (L0) type definition ## Allocation Path (core/tiny_alloc_fast.inc.h) - Added s_front_direct_alloc TLS flag (lazy ENV check) - SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc - Refill dispatch: - Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop) - Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only) - SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in) ## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h) - FC priority: Try fastcache_push() first (same-thread free) - tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable - Fallback: Magazine/slow path (safe, bypasses SLL) ## Legacy Sealing - SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1) - Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak - Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry ## ENV Controls - HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct) - HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name) - HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct) - HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF) - HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE) ## Benchmarks (Front-Direct Enabled) ```bash ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1 HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1 HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96 HAKMEM_TINY_BUMP_CHUNK=256 bench_random_mixed (16-1040B random, 200K iter): 256 slots: 1.44M ops/s (STABLE, 0 SEGV) 128 slots: 1.44M ops/s (STABLE, 0 SEGV) bench_fixed_size (fixed size, 200K iter): 256B: 4.06M ops/s (has debug logs, expected >10M without logs) 128B: Similar (debug logs affect) ``` ## Verification - TRACE_RING test (10K iter): **0 SLL events** detected ✅ - Complete SLL bypass confirmed when Front-Direct=1 - Stable execution: 200K iterations × multiple sizes, 0 SEGV ## Next Steps - Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range) - Re-benchmark with clean Release build (target: 10-15M ops/s) - 128/256B shortcut path optimization (FC hit rate improvement) Co-Authored-By: ChatGPT <chatgpt@openai.com> Suggested-By: ultrathink
2025-11-14 05:41:49 +09:00
else { tiny_next_write(class_idx, tail, head); tail = head; }
head = nxt; batch++;
}
r->head[class_idx] = head;
r->count[class_idx] -= batch;
drained = batch;
*out_chain = chain;
}
pthread_mutex_unlock(&g_locks[b]);
return drained;
}