Problem: pool_remote_push mutex contention (67% of syscall time in futex)

Solution: Lock-free MPSC queue using atomic CAS operations

Changes:
1. core/pool_tls_remote.c - Lock-free MPSC queue
   - Push: atomic_compare_exchange_weak (CAS loop, no locks!)
   - Pop: atomic_exchange (steal entire chain)
   - Mutex only for RemoteRec creation (rare, first-push-to-thread)
2. core/pool_tls_registry.c - Lock-free lookup
   - Buckets and next pointers now atomic: _Atomic(RegEntry*)
   - Lookup uses memory_order_acquire loads (no locks on hot path)
   - Registration/unregistration still use mutex (rare operations)

Results:
- futex calls: 209 → 7 (97% reduction!)
- Throughput: 0.97M → 1.0M ops/s (+3%)
- Remaining gap: 5.8x slower than System malloc (5.8M ops/s)

Key Finding:
- futex was NOT the primary bottleneck (only a small % of total runtime)
- True bottleneck: 8% cache miss rate + registry lookup overhead

Thread Safety:
- MPSC: Multi-producer (CAS), Single-consumer (owner thread)
- Memory ordering: release/acquire for correctness
- No ABA problem (pointers used once, no reuse)

Next: P0 registry lookup elimination via POOL_TLS_BIND_BOX

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
92 lines
2.9 KiB
C
92 lines
2.9 KiB
C
#include "pool_tls_registry.h"
|
|
#include <pthread.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <stdatomic.h>
|
|
|
|
/*
 * One registered address range, chained into a hash bucket.
 *
 * pool_reg_register() fills the plain fields before it publishes the entry
 * with a release store; only `next` is accessed through atomic operations.
 */
typedef struct RegEntry {
    void *base;     /* first byte of the range (inclusive) */
    void *end;      /* base + size, i.e. one past the range (exclusive) */
    pid_t tid;      /* thread id passed in at registration */
    int class_idx;  /* class index passed in at registration */
    /* Next entry in the same bucket; atomic so pool_reg_lookup() can walk
     * the chain without holding the bucket mutex. */
    _Atomic(struct RegEntry *) next;
} RegEntry;
|
|
|
|
#define REG_BUCKETS 1024
|
|
static _Atomic(RegEntry*) g_buckets[REG_BUCKETS]; // Atomic buckets for lock-free reads
|
|
static pthread_mutex_t g_locks[REG_BUCKETS]; // Only for registration/unregistration
|
|
static pthread_once_t g_init_once = PTHREAD_ONCE_INIT;
|
|
|
|
static void reg_init(void){
|
|
for (int i=0;i<REG_BUCKETS;i++) {
|
|
pthread_mutex_init(&g_locks[i], NULL);
|
|
atomic_store_explicit(&g_buckets[i], NULL, memory_order_relaxed);
|
|
}
|
|
}
|
|
|
|
/*
 * Scramble a pointer value into a 64-bit hash.
 *
 * The shift/multiply sequence and both constants are the 64-bit finalizer of
 * MurmurHash3 (fmix64). The pointer is widened to uint64_t before mixing so
 * that the 33-bit shifts and 64-bit multipliers stay well defined even where
 * uintptr_t is narrower than 64 bits.
 *
 * p: pointer whose numeric value is hashed (it is never dereferenced).
 * Returns the mixed 64-bit value; callers reduce it to a bucket index.
 */
static inline uint64_t hash_ptr(void* p)
{
    uint64_t x = (uint64_t)(uintptr_t)p;

    x ^= x >> 33;
    x *= 0xff51afd7ed558ccdULL;
    x ^= x >> 33;
    x *= 0xc4ceb9fe1a85ec53ULL;
    x ^= x >> 33;
    return x;
}
|
|
|
|
void pool_reg_register(void* base, size_t size, pid_t tid, int class_idx){
|
|
pthread_once(&g_init_once, reg_init);
|
|
void* end = (void*)((char*)base + size);
|
|
uint64_t h = hash_ptr(base) & (REG_BUCKETS-1);
|
|
pthread_mutex_lock(&g_locks[h]);
|
|
RegEntry* e = (RegEntry*)malloc(sizeof(RegEntry));
|
|
e->base = base; e->end = end; e->tid = tid; e->class_idx = class_idx;
|
|
RegEntry* old_head = atomic_load_explicit(&g_buckets[h], memory_order_relaxed);
|
|
atomic_store_explicit(&e->next, old_head, memory_order_relaxed);
|
|
atomic_store_explicit(&g_buckets[h], e, memory_order_release);
|
|
pthread_mutex_unlock(&g_locks[h]);
|
|
}
|
|
|
|
void pool_reg_unregister(void* base, size_t size, pid_t tid){
|
|
pthread_once(&g_init_once, reg_init);
|
|
uint64_t h = hash_ptr(base) & (REG_BUCKETS-1);
|
|
pthread_mutex_lock(&g_locks[h]);
|
|
|
|
// Need to carefully update atomic pointers
|
|
_Atomic(RegEntry*)* pp = &g_buckets[h];
|
|
RegEntry* e = atomic_load_explicit(pp, memory_order_relaxed);
|
|
RegEntry* prev = NULL;
|
|
|
|
while (e){
|
|
if (e->base == base && e->tid == tid){
|
|
RegEntry* next = atomic_load_explicit(&e->next, memory_order_relaxed);
|
|
if (prev == NULL) {
|
|
atomic_store_explicit(&g_buckets[h], next, memory_order_release);
|
|
} else {
|
|
atomic_store_explicit(&prev->next, next, memory_order_release);
|
|
}
|
|
free(e);
|
|
break;
|
|
}
|
|
prev = e;
|
|
e = atomic_load_explicit(&e->next, memory_order_relaxed);
|
|
}
|
|
pthread_mutex_unlock(&g_locks[h]);
|
|
}
|
|
|
|
int pool_reg_lookup(void* ptr, pid_t* tid_out, int* class_idx_out){
|
|
pthread_once(&g_init_once, reg_init);
|
|
uint64_t h = hash_ptr(ptr) & (REG_BUCKETS-1);
|
|
|
|
// Lock-free lookup! No mutex needed for reads
|
|
RegEntry* e = atomic_load_explicit(&g_buckets[h], memory_order_acquire);
|
|
while (e) {
|
|
// Load entry fields (they're stable after registration)
|
|
void* base = e->base;
|
|
void* end = e->end;
|
|
|
|
if (ptr >= base && ptr < end){
|
|
if (tid_out) *tid_out = e->tid;
|
|
if (class_idx_out) *class_idx_out = e->class_idx;
|
|
return 1;
|
|
}
|
|
e = atomic_load_explicit(&e->next, memory_order_acquire);
|
|
}
|
|
return 0;
|
|
}
|
|
|