Files
hakmem/core/pool_tls_registry.c
Moe Charm (CI) f40be1a5ba Pool TLS: Lock-free MPSC remote queue implementation
Problem: pool_remote_push mutex contention (67% of syscall time in futex)
Solution: Lock-free MPSC queue using atomic CAS operations

Changes:
1. core/pool_tls_remote.c - Lock-free MPSC queue (sketch after this list)
   - Push: atomic_compare_exchange_weak (CAS loop, no locks!)
   - Pop: atomic_exchange (steal entire chain)
   - Mutex only for RemoteRec creation (rare, first-push-to-thread)

2. core/pool_tls_registry.c - Lock-free lookup
   - Buckets and next pointers now atomic: _Atomic(RegEntry*)
   - Lookup uses memory_order_acquire loads (no locks on hot path)
   - Registration/unregistration still use mutex (rare operations)
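
A minimal sketch of the MPSC scheme from item 1, using hypothetical
RemoteNode/RemoteQueue names (the actual core/pool_tls_remote.c layout
may differ):

#include <stdatomic.h>
#include <stddef.h>

typedef struct RemoteNode {
    struct RemoteNode* next;   // written before publish; read only by the consumer
} RemoteNode;

typedef struct RemoteQueue {
    _Atomic(RemoteNode*) head; // Treiber-style LIFO head
} RemoteQueue;

// Multi-producer push: CAS loop, no locks.
static void remote_push(RemoteQueue* q, RemoteNode* n){
    RemoteNode* old = atomic_load_explicit(&q->head, memory_order_relaxed);
    do {
        n->next = old; // link before publishing
    } while (!atomic_compare_exchange_weak_explicit(
                 &q->head, &old, n,
                 memory_order_release,    // publish n (and n->next) to the consumer
                 memory_order_relaxed));  // on failure, old is refreshed; retry

}

// Single-consumer pop: steal the entire chain with one atomic exchange;
// the owner thread then walks the detached list privately.
static RemoteNode* remote_pop_all(RemoteQueue* q){
    return atomic_exchange_explicit(&q->head, NULL, memory_order_acquire);
}

Because pop detaches the whole chain and nodes are not re-pushed while in
flight, the head pointer never sees a recycled value mid-CAS, which is the
"no ABA" property noted under Thread Safety below.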

Results:
- futex calls: 209 → 7 (97% reduction!)
- Throughput: 0.97M → 1.0M ops/s (+3%)
- Remaining gap: 5.8x slower than System malloc (5.8M ops/s)

Key Finding:
- futex was NOT the primary bottleneck (only a small % of total runtime)
- True bottleneck: 8% cache miss rate + registry lookup overhead

Thread Safety:
- MPSC: Multi-producer (CAS), Single-consumer (owner thread)
- Memory ordering: release/acquire for correctness
- No ABA problem (pointers used once, no reuse)

Next: P0 registry lookup elimination via POOL_TLS_BIND_BOX

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-14 14:29:05 +09:00

92 lines
2.9 KiB
C

#include "pool_tls_registry.h"
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>
typedef struct RegEntry {
    void* base;
    void* end;
    pid_t tid;
    int class_idx;
    _Atomic(struct RegEntry*) next; // Atomic for lock-free reads
} RegEntry;

#define REG_BUCKETS 1024

static _Atomic(RegEntry*) g_buckets[REG_BUCKETS]; // Atomic buckets for lock-free reads
static pthread_mutex_t g_locks[REG_BUCKETS];      // Only for registration/unregistration
static pthread_once_t g_init_once = PTHREAD_ONCE_INIT;
static void reg_init(void){
    for (int i = 0; i < REG_BUCKETS; i++) {
        pthread_mutex_init(&g_locks[i], NULL);
        atomic_store_explicit(&g_buckets[i], NULL, memory_order_relaxed);
    }
}
// MurmurHash3 fmix64-style mix: spreads nearby addresses across buckets.
// Widen to uint64_t first so the 33-bit shifts are well-defined on 32-bit targets.
static inline uint64_t hash_ptr(void* p){
    uint64_t x = (uint64_t)(uintptr_t)p;
    x ^= x >> 33; x *= 0xff51afd7ed558ccdULL;
    x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL;
    x ^= x >> 33;
    return x;
}
void pool_reg_register(void* base, size_t size, pid_t tid, int class_idx){
    pthread_once(&g_init_once, reg_init);
    void* end = (void*)((char*)base + size);
    uint64_t h = hash_ptr(base) & (REG_BUCKETS - 1);
    pthread_mutex_lock(&g_locks[h]);
    RegEntry* e = (RegEntry*)malloc(sizeof(RegEntry));
    if (!e){ // allocation failure: leave the registry unchanged
        pthread_mutex_unlock(&g_locks[h]);
        return;
    }
    e->base = base; e->end = end; e->tid = tid; e->class_idx = class_idx;
    RegEntry* old_head = atomic_load_explicit(&g_buckets[h], memory_order_relaxed);
    atomic_store_explicit(&e->next, old_head, memory_order_relaxed);
    // Release store publishes the fully initialized entry to lock-free readers
    atomic_store_explicit(&g_buckets[h], e, memory_order_release);
    pthread_mutex_unlock(&g_locks[h]);
}
void pool_reg_unregister(void* base, size_t size, pid_t tid){
    (void)size; // size is not needed to locate the entry
    pthread_once(&g_init_once, reg_init);
    uint64_t h = hash_ptr(base) & (REG_BUCKETS - 1);
    pthread_mutex_lock(&g_locks[h]);
    // Writers serialize on the bucket mutex; the atomic stores below keep
    // the chain consistent for concurrent lock-free readers.
    RegEntry* e = atomic_load_explicit(&g_buckets[h], memory_order_relaxed);
    RegEntry* prev = NULL;
    while (e){
        if (e->base == base && e->tid == tid){
            RegEntry* next = atomic_load_explicit(&e->next, memory_order_relaxed);
            if (prev == NULL){
                atomic_store_explicit(&g_buckets[h], next, memory_order_release);
            } else {
                atomic_store_explicit(&prev->next, next, memory_order_release);
            }
            // NOTE: freeing immediately assumes no lock-free reader can still
            // hold this entry (e.g., the region is quiesced before
            // unregistration); otherwise deferred reclamation is required.
            free(e);
            break;
        }
        prev = e;
        e = atomic_load_explicit(&e->next, memory_order_relaxed);
    }
    pthread_mutex_unlock(&g_locks[h]);
}
int pool_reg_lookup(void* ptr, pid_t* tid_out, int* class_idx_out){
    pthread_once(&g_init_once, reg_init);
    // NOTE: buckets are keyed by hash_ptr(base) at registration, so this
    // lookup finds ptr only if it hashes to the same bucket as its region's
    // base (e.g., callers pass the registered base pointer).
    uint64_t h = hash_ptr(ptr) & (REG_BUCKETS - 1);
    // Lock-free lookup: no mutex needed on the read path
    RegEntry* e = atomic_load_explicit(&g_buckets[h], memory_order_acquire);
    while (e){
        // Entry fields are immutable after the release-store publish
        void* base = e->base;
        void* end  = e->end;
        if (ptr >= base && ptr < end){
            if (tid_out) *tid_out = e->tid;
            if (class_idx_out) *class_idx_out = e->class_idx;
            return 1;
        }
        e = atomic_load_explicit(&e->next, memory_order_acquire);
    }
    return 0;
}
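
For reference, a minimal usage sketch of the three entry points above; the
region, class index, and id values are illustrative only:

#include "pool_tls_registry.h"
#include <stdio.h>
#include <unistd.h>

int main(void){
    static char region[4096];
    pid_t me = getpid(); // stand-in for the owning thread's id
    pool_reg_register(region, sizeof(region), me, /*class_idx=*/3);

    pid_t tid; int cls;
    // Look up by the registered base (see the bucket-keying note in lookup)
    if (pool_reg_lookup(region, &tid, &cls))
        printf("owner=%d class=%d\n", (int)tid, cls);

    pool_reg_unregister(region, sizeof(region), me);
    return 0;
}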