#include "pool_tls_remote.h"

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>   // uint32_t (used by RemoteQueue::count)
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>
Front-Direct implementation: SS→FC direct refill + complete SLL bypass
## Summary
Implemented the Front-Direct architecture with a complete SLL bypass:
- Direct SuperSlab → FastCache refill (1-hop, bypasses the SLL)
- SLL-free allocation/free paths when Front-Direct is enabled
- Legacy paths sealed (SLL inline is opt-in, SFC cascade is ENV-only)
## New Modules
- core/refill/ss_refill_fc.h (236 lines): standard SS→FC refill entry point
  - Priority: remote drain → freelist → carve (see the sketch after this list)
  - Header restoration for C1-C6 (NOT C0/C7)
  - ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN
- core/front/fast_cache.h: FastCache (L1) type definition
- core/front/quick_slot.h: QuickSlot (L0) type definition
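A minimal sketch of that refill priority; the three helpers are hypothetical stand-ins for the real internals, not actual functions in this change:
```c
// Sketch only: drain_remote_to_fc / pop_freelist_to_fc / carve_to_fc are
// illustrative names, not the real helpers inside ss_refill_fc.h.
extern int drain_remote_to_fc(int class_idx, int want);   // 1) remote drain
extern int pop_freelist_to_fc(int class_idx, int want);   // 2) local freelist
extern int carve_to_fc(int class_idx, int want);          // 3) carve fresh blocks

static int ss_refill_fc_fill_sketch(int class_idx, int want) {
    int got = drain_remote_to_fc(class_idx, want);
    if (got < want) got += pop_freelist_to_fc(class_idx, want - got);
    if (got < want) got += carve_to_fc(class_idx, want - got);
    return got;  // refilled blocks land directly in the FastCache (L1)
}
```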
## Allocation Path (core/tiny_alloc_fast.inc.h)
- Added the s_front_direct_alloc TLS flag (lazy ENV check)
- SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc
- Refill dispatch (see the sketch below):
  - Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop)
  - Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B testing only)
- SLL inline pop sealed (requires the HAKMEM_TINY_INLINE_SLL=1 opt-in)
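A sketch of that dispatch: the entry-point names come from this change, but the surrounding glue and the simplified signatures are assumptions:
```c
// Assumed control flow around the named entry points (signatures simplified).
void* p = fastcache_pop(class_idx);                  // L1 hit is the fast path
if (!p) {
    if (s_front_direct_alloc) {
        ss_refill_fc_fill(class_idx);                // Front-Direct: SS -> FC (1-hop)
    } else {
        sll_refill_batch_from_ss(class_idx);         // Legacy: SS -> SLL -> FC (2-hop)
    }
    p = fastcache_pop(class_idx);                    // retry after refill
}
```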
## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h)
- FC priority: try fastcache_push() first (same-thread free); see the sketch below
- tiny_fast_push() bypass: returns 0 when s_front_direct_free || !g_tls_sll_enable
- Fallback: magazine/slow path (safe, bypasses the SLL)
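A sketch of the free fast path under Front-Direct; fastcache_push() and tiny_fast_push() are named in this change, while magazine_or_slow_free() is a hypothetical stand-in for the fallback:
```c
// Assumed free-path ordering (signatures simplified).
if (fastcache_push(class_idx, ptr)) return;   // L1: same-thread free
if (tiny_fast_push(class_idx, ptr)) return;   // returns 0 under Front-Direct (SLL bypassed)
magazine_or_slow_free(class_idx, ptr);        // hypothetical fallback entry
```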
## Legacy Sealing
- SFC cascade: default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1)
- Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak
- Documentation: ss_refill_fc_fill() promoted to the CANONICAL refill entry point
## ENV Controls
- HAKMEM_TINY_FRONT_DIRECT=1: enable Front-Direct (SS→FC direct)
- HAKMEM_TINY_P0_DIRECT_FC_ALL=1: same as above (alternate name)
- HAKMEM_TINY_REFILL_BATCH=1: enable batch refill (also enables Front-Direct)
- HAKMEM_TINY_SFC_CASCADE=1: enable the SFC cascade (default OFF)
- HAKMEM_TINY_INLINE_SLL=1: enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE)
## Benchmarks (Front-Direct enabled)
```bash
ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1
     HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1
     HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96
     HAKMEM_TINY_BUMP_CHUNK=256

bench_random_mixed (16-1040B random, 200K iter):
  256 slots: 1.44M ops/s (stable, 0 SEGV)
  128 slots: 1.44M ops/s (stable, 0 SEGV)

bench_fixed_size (fixed size, 200K iter):
  256B: 4.06M ops/s (debug logs enabled; expected >10M without them)
  128B: similar (also throttled by debug logs)
```
## Verification
- TRACE_RING test (10K iter): **0 SLL events** detected ✅
- Complete SLL bypass confirmed when Front-Direct=1
- Stable execution: 200K iterations × multiple sizes, 0 SEGV
## Next Steps
- Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range)
- Re-benchmark with clean Release build (target: 10-15M ops/s)
- 128/256B shortcut path optimization (FC hit rate improvement)
Co-Authored-By: ChatGPT <chatgpt@openai.com>
Suggested-By: ultrathink
#include "box/tiny_next_ptr_box.h" // Box API: class-aware next offset preserves the block header

#define REMOTE_BUCKETS 256

// Lock-free MPSC queue, one per size class
typedef struct RemoteQueue {
    _Atomic(void*)   head;   // Atomic head for lock-free push (LIFO)
    _Atomic uint32_t count;  // Approximate count (relaxed updates)
} RemoteQueue;

typedef struct RemoteRec {
    int tid;                 // Owning thread id (gettid)
    RemoteQueue queues[7];   // One queue per class (lock-free)
    struct RemoteRec* next;  // Hash-bucket chain
} RemoteRec;

static RemoteRec* g_buckets[REMOTE_BUCKETS];
static pthread_mutex_t g_locks[REMOTE_BUCKETS]; // Only for RemoteRec creation
static pthread_once_t g_once = PTHREAD_ONCE_INIT;

static void rq_init(void){
    for (int i = 0; i < REMOTE_BUCKETS; i++) pthread_mutex_init(&g_locks[i], NULL);
}

// Hash a thread id into a bucket (REMOTE_BUCKETS is a power of two)
static inline unsigned hb(int tid){ return (unsigned)tid & (REMOTE_BUCKETS - 1); }

int pool_remote_push(int class_idx, void* ptr, int owner_tid){
    if (class_idx < 0 || class_idx > 6 || ptr == NULL) return 0;
    pthread_once(&g_once, rq_init);
    unsigned b = hb(owner_tid);

    // Find or create the RemoteRec (only creation needs the mutex)
    RemoteRec* r = g_buckets[b];
    while (r && r->tid != owner_tid) r = r->next;
    if (!r){
        pthread_mutex_lock(&g_locks[b]);
        // Double-check after acquiring the lock
        r = g_buckets[b];
        while (r && r->tid != owner_tid) r = r->next;
        if (!r){
            r = (RemoteRec*)calloc(1, sizeof(RemoteRec));
            if (!r){ pthread_mutex_unlock(&g_locks[b]); return 0; } // OOM: report failure
            r->tid = owner_tid;
            r->next = g_buckets[b];
            g_buckets[b] = r;
        }
        pthread_mutex_unlock(&g_locks[b]);
    }

    // Lock-free push using CAS (this is the hot path!)
    RemoteQueue* q = &r->queues[class_idx];
    void* old_head = atomic_load_explicit(&q->head, memory_order_relaxed);
    do {
        // Link the new node to the current head via the Box API (preserves the header)
        tiny_next_write(class_idx, ptr, old_head);
    } while (!atomic_compare_exchange_weak_explicit(
                 &q->head, &old_head, ptr,
                 memory_order_release, memory_order_relaxed));

    atomic_fetch_add_explicit(&q->count, 1, memory_order_relaxed);
    return 1;
}
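
/* Note on ABA: the push CAS above is safe without version tags. A pusher only
 * swings head from the value it observed to a brand-new node whose next was
 * just written, and the consumer steals the whole chain with atomic_exchange
 * in pool_remote_pop_chain(), so a "same pointer, different history" head
 * still yields a correctly linked stack. */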

// Drain up to a small batch for this thread and class
int pool_remote_pop_chain(int class_idx, int max_take, void** out_chain){
    if (class_idx < 0 || class_idx > 6 || out_chain == NULL) return 0;
    pthread_once(&g_once, rq_init);
    int mytid = (int)syscall(SYS_gettid);
    unsigned b = hb(mytid);

    // Find my RemoteRec (no lock needed for reading)
    RemoteRec* r = g_buckets[b];
    while (r && r->tid != mytid) r = r->next;
    if (!r) return 0; // No remote queue for this thread

    // Lock-free pop: steal the entire chain with one atomic exchange
    RemoteQueue* q = &r->queues[class_idx];
    void* head = atomic_exchange_explicit(&q->head, NULL, memory_order_acquire);
    if (!head) return 0; // Queue was empty

    // Take up to max_take nodes from the stolen chain
    if (max_take <= 0) max_take = 32;
    void* chain = NULL;
    void* tail = NULL;
    int batch = 0;

    while (head && batch < max_take){
        void* nxt = tiny_next_read(class_idx, head);

        // Append to the output chain in traversal order (newest-first, LIFO)
        if (!chain){
            chain = head;
            tail = head;
        } else {
            tiny_next_write(class_idx, tail, head);
            tail = head;
        }

        head = nxt;
        batch++;
    }
// If we didn't take all nodes, push remainder back (lock-free)
|
|
|
|
|
if (head){
|
|
|
|
|
void* old_head = atomic_load_explicit(&q->head, memory_order_relaxed);
|
|
|
|
|
do {
|
|
|
|
|
// Find tail of remainder chain
|
|
|
|
|
void* remainder_tail = head;
|
|
|
|
|
while (tiny_next_read(class_idx, remainder_tail)) {
|
|
|
|
|
remainder_tail = tiny_next_read(class_idx, remainder_tail);
|
|
|
|
|
}
|
|
|
|
|
// Link remainder to current head
|
|
|
|
|
tiny_next_write(class_idx, remainder_tail, old_head);
|
|
|
|
|
} while (!atomic_compare_exchange_weak_explicit(
|
|
|
|
|
&q->head, &old_head, head,
|
|
|
|
|
memory_order_release, memory_order_relaxed));
|
|
|
|
|
}

    atomic_fetch_sub_explicit(&q->count, batch, memory_order_relaxed);
    *out_chain = chain;
    return batch;
}
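
/*
 * Usage sketch (hypothetical caller; assumes each block records its owner's
 * gettid() at allocation time, and recycle_block() is an illustrative name):
 *
 *   // Another thread frees a block it does not own:
 *   pool_remote_push(class_idx, ptr, owner_tid);
 *
 *   // The owner periodically drains its remote queue:
 *   void* chain = NULL;
 *   int n = pool_remote_pop_chain(class_idx, 32, &chain);
 *   for (void* p = chain; p != NULL; ) {   // chain is NULL-terminated, n nodes
 *       void* nxt = tiny_next_read(class_idx, p);
 *       recycle_block(class_idx, p);       // hypothetical local recycler
 *       p = nxt;
 *   }
 */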