# HAKMEM Tiny Allocator リファクタリング実装ガイド

## クイックスタート

このドキュメントは、REFACTOR_PLAN.md の実装手順を段階的に説明します。

---

## Priority 1: Fast Path リファクタリング (Week 1)

### Phase 1.1: tiny_atomic.h (新規作成, 80行)

**目的**: Atomic操作の統一インターフェース

**ファイル**: `core/tiny_atomic.h`

```c
#ifndef HAKMEM_TINY_ATOMIC_H
#define HAKMEM_TINY_ATOMIC_H

#include <stdatomic.h>

// ============================================================================
// TINY_ATOMIC: unified interface for atomics with explicit memory ordering
// ============================================================================

// View an object pointer as a pointer to the _Atomic-qualified pointee type.
// __typeof__ (rather than typeof) keeps the header usable with -std=c11 on
// GCC/Clang, and the argument is parenthesized so that expressions such as
// `base + i` expand correctly.
#define TINY_ATOMIC_PTR(ptr) ((_Atomic __typeof__(*(ptr)) *)(ptr))

/**
 * tiny_atomic_load - Load with a caller-supplied memory order
 * @ptr: pointer to atomic variable
 * @order: memory_order_relaxed, _consume, _acquire or _seq_cst
 *         (C11 does not allow _release or _acq_rel on a load);
 *         use tiny_atomic_load_acq() for the usual acquire default
 *
 * Returns: Loaded value
 */
#define tiny_atomic_load(ptr, order) \
    atomic_load_explicit(TINY_ATOMIC_PTR(ptr), (order))

#define tiny_atomic_load_acq(ptr) \
    atomic_load_explicit(TINY_ATOMIC_PTR(ptr), memory_order_acquire)

// Deprecated: a "release load" does not exist in C11 (passing
// memory_order_release to atomic_load_explicit is undefined behavior).
// The name is kept for source compatibility and mapped to seq_cst, the
// strongest ordering that is valid for a load.
#define tiny_atomic_load_rel(ptr) \
    atomic_load_explicit(TINY_ATOMIC_PTR(ptr), memory_order_seq_cst)

#define tiny_atomic_load_relax(ptr) \
    atomic_load_explicit(TINY_ATOMIC_PTR(ptr), memory_order_relaxed)

/**
 * tiny_atomic_store - Store with a caller-supplied memory order
 * @ptr: pointer to atomic variable
 * @val: value to store
 * @order: memory_order_relaxed, _release or _seq_cst
 *         (C11 does not allow _consume, _acquire or _acq_rel on a store);
 *         use tiny_atomic_store_rel() for the usual release default
 */
#define tiny_atomic_store(ptr, val, order) \
    atomic_store_explicit(TINY_ATOMIC_PTR(ptr), (val), (order))

#define tiny_atomic_store_rel(ptr, val) \
    atomic_store_explicit(TINY_ATOMIC_PTR(ptr), (val), memory_order_release)

// Deprecated: an "acquire store" does not exist in C11 (passing
// memory_order_acquire to atomic_store_explicit is undefined behavior).
// The name is kept for source compatibility and mapped to seq_cst, the
// strongest ordering that is valid for a store.
#define tiny_atomic_store_acq(ptr, val) \
    atomic_store_explicit(TINY_ATOMIC_PTR(ptr), (val), memory_order_seq_cst)

#define tiny_atomic_store_relax(ptr, val) \
    atomic_store_explicit(TINY_ATOMIC_PTR(ptr), (val), memory_order_relaxed)

/**
 * tiny_atomic_cas - Compare and swap with seq_cst semantics
 * @ptr: pointer to atomic variable
 * @expected: expected value (in/out; overwritten with the current value
 *            when the exchange fails)
 * @desired: desired value
 *
 * Success order is seq_cst, failure order is relaxed.
 *
 * Returns: true if successful
 */
#define tiny_atomic_cas(ptr, expected, desired) \
    atomic_compare_exchange_strong_explicit( \
        TINY_ATOMIC_PTR(ptr), (expected), (desired), \
        memory_order_seq_cst, memory_order_relaxed)

/**
 * tiny_atomic_cas_weak - Weak CAS for loops
 *
 * May fail spuriously, so it must only be used inside a retry loop.
 */
#define tiny_atomic_cas_weak(ptr, expected, desired) \
    atomic_compare_exchange_weak_explicit( \
        TINY_ATOMIC_PTR(ptr), (expected), (desired), \
        memory_order_seq_cst, memory_order_relaxed)

/**
 * tiny_atomic_exchange - Atomic exchange (seq_cst)
 *
 * Returns: the previous value
 */
#define tiny_atomic_exchange(ptr, desired) \
    atomic_exchange_explicit(TINY_ATOMIC_PTR(ptr), (desired), \
                             memory_order_seq_cst)

/**
 * tiny_atomic_fetch_add - Fetch and add (seq_cst)
 *
 * Returns: the value held before the addition
 */
#define tiny_atomic_fetch_add(ptr, val) \
    atomic_fetch_add_explicit(TINY_ATOMIC_PTR(ptr), (val), \
                              memory_order_seq_cst)

/**
 * tiny_atomic_increment - Increment (returns new value)
 */
#define tiny_atomic_increment(ptr) \
    (atomic_fetch_add_explicit(TINY_ATOMIC_PTR(ptr), 1, \
                               memory_order_seq_cst) + 1)

#endif // HAKMEM_TINY_ATOMIC_H
```

**テスト**:

```c
// test_tiny_atomic.c
#include <assert.h>
#include <stdbool.h>
#include "tiny_atomic.h"

void test_tiny_atomic_load_store(void) {
    _Atomic int x = 0;
    tiny_atomic_store(&x, 42, memory_order_release);
    assert(tiny_atomic_load(&x, memory_order_acquire) == 42);
}

void test_tiny_atomic_cas(void) {
    _Atomic int x = 1;
    int expected = 1;
    assert(tiny_atomic_cas(&x, &expected, 2) == true);
    assert(tiny_atomic_load(&x, memory_order_relaxed) == 2);
}
```

---

### Phase 1.2: tiny_alloc_fast.inc.h (新規作成, 250行)

**目的**: 3-4命令のfast path allocation

**ファイル**: `core/tiny_alloc_fast.inc.h`

```c
#ifndef HAKMEM_TINY_ALLOC_FAST_INC_H
#define HAKMEM_TINY_ALLOC_FAST_INC_H

#include "tiny_atomic.h"

// ============================================================================
// TINY_ALLOC_FAST: Ultra-simple fast path (3-4 instructions)
// ============================================================================

// TLS storage (defined in hakmem_tiny.c)
// NOTE(review): these externs carry no thread-local specifier. If the
// definitions in hakmem_tiny.c use __thread / _Thread_local, the declarations
// here must use the same specifier - confirm against the definitions.
extern void* g_tls_alloc_cache[TINY_NUM_CLASSES];
extern int g_tls_alloc_count[TINY_NUM_CLASSES];
extern int g_tls_alloc_cap[TINY_NUM_CLASSES];

/**
 * tiny_alloc_fast_pop - Pop from TLS cache (3-4 instructions)
 *
 * Fast path for allocation:
 * 1.
Load head from TLS cache
 * 2. Check if non-NULL
 * 3. Pop: head = head->next
 * 4. Return ptr
 *
 * The first pointer-sized word of every cached block holds the link to the
 * next cached block.
 *
 * Returns: Pointer if cache hit, NULL if miss (go to slow path)
 */
static inline void* tiny_alloc_fast_pop(int class_idx) {
    void* ptr = g_tls_alloc_cache[class_idx];
    if (__builtin_expect(ptr != NULL, 1)) {
        // Pop: the new head is the next pointer stored inside the block
        g_tls_alloc_cache[class_idx] = *(void**)ptr;
        // Update count (optional, can be batched)
        g_tls_alloc_count[class_idx]--;
        return ptr;
    }
    return NULL;  // Cache miss → slow path
}

/**
 * tiny_alloc_fast_push - Push to TLS cache
 *
 * @ptr must point to writable storage of at least sizeof(void*) bytes,
 * because the link to the previous head is written into the block itself.
 *
 * Returns: 1 if success, 0 if cache full (go to spill logic)
 */
static inline int tiny_alloc_fast_push(int class_idx, void* ptr) {
    int cnt = g_tls_alloc_count[class_idx];
    int cap = g_tls_alloc_cap[class_idx];
    if (__builtin_expect(cnt < cap, 1)) {
        // Push: ptr->next = head
        *(void**)ptr = g_tls_alloc_cache[class_idx];
        g_tls_alloc_cache[class_idx] = ptr;
        g_tls_alloc_count[class_idx]++;
        return 1;
    }
    return 0;  // Cache full → slow path
}

/**
 * tiny_alloc_fast - Fast allocation entry (public API for fast path)
 *
 * Equivalent to:
 *   void* ptr = tiny_alloc_fast_pop(class_idx);
 *   if (!ptr) ptr = tiny_alloc_slow(class_idx);
 *   return ptr;
 */
static inline void* tiny_alloc_fast(int class_idx) {
    void* ptr = tiny_alloc_fast_pop(class_idx);
    if (__builtin_expect(ptr != NULL, 1)) {
        return ptr;
    }
    // Slow path call will be added in hakmem_tiny.c
    return NULL;  // Placeholder
}

#endif // HAKMEM_TINY_ALLOC_FAST_INC_H
```

**テスト**:

```c
// test_tiny_alloc_fast.c
void test_tiny_alloc_fast_empty() {
    g_tls_alloc_cache[0] = NULL;
    g_tls_alloc_count[0] = 0;
    assert(tiny_alloc_fast_pop(0) == NULL);
}

void test_tiny_alloc_fast_push_pop() {
    // push() writes the next-link through ptr, so ptr must reference real,
    // pointer-sized storage (a made-up address such as 0x12345678 would crash).
    void* block = NULL;
    void* ptr = &block;
    g_tls_alloc_cache[0] = NULL;  // start from an empty list
    g_tls_alloc_count[0] = 0;
    g_tls_alloc_cap[0] = 100;
    assert(tiny_alloc_fast_push(0, ptr) == 1);
    assert(g_tls_alloc_count[0] == 1);
    assert(tiny_alloc_fast_pop(0) == ptr);
    assert(g_tls_alloc_count[0] == 0);
}
```

---

### Phase 1.3: tiny_free_fast.inc.h (新規作成, 200行)

**目的**: Same-thread fast free
path

**ファイル**: `core/tiny_free_fast.inc.h`

```c
#ifndef HAKMEM_TINY_FREE_FAST_INC_H
#define HAKMEM_TINY_FREE_FAST_INC_H

#include <stdint.h>  // uint32_t
#include "tiny_atomic.h"
#include "tiny_alloc_fast.inc.h"

// ============================================================================
// TINY_FREE_FAST: Same-thread fast free (15-20 instructions)
// ============================================================================

/**
 * tiny_free_fast - Fast free for same-thread ownership
 *
 * Ownership check:
 * 1. Get self TID (uint32_t)
 * 2. Lookup slab owner_tid
 * 3. Compare: if owner_tid == self_tid → same thread → push to cache
 * 4. Otherwise: slow path (remote queue)
 *
 * Returns: 1 if successfully freed to cache, 0 if slow path needed
 */
static inline int tiny_free_fast(void* ptr, int class_idx) {
    // Step 1: Get self TID
    uint32_t self_tid = tiny_self_u32();

    // Step 2: Owner lookup (O(1) via slab_handle.h)
    TinySlab* slab = hak_tiny_owner_slab(ptr);
    if (__builtin_expect(slab == NULL, 0)) {
        return 0;  // Not owned by Tiny → slow path
    }

    // Step 3: Compare owner.
    // owner_tid is written with atomic stores (tiny_owner.inc.h), so it is
    // read atomically here as well instead of with a plain load. Relaxed
    // order is presumably sufficient: a match means the value was stored by
    // this same thread.
    if (__builtin_expect(tiny_atomic_load_relax(&slab->owner_tid) != self_tid, 0)) {
        return 0;  // Cross-thread → slow path (remote queue)
    }

    // Step 4: Same-thread → cache push
    return tiny_alloc_fast_push(class_idx, ptr);
}

/**
 * tiny_free_main_entry - Main free entry point
 *
 * Dispatches:
 * - tiny_free_fast() for same-thread
 * - tiny_free_remote() for cross-thread
 * - tiny_free_guard() for validation
 */
static inline void tiny_free_main_entry(void* ptr) {
    if (__builtin_expect(ptr == NULL, 0)) {
        return;  // NULL is safe
    }
    // Fast path: lookup class and owner in one step
    // (This requires pre-computing or O(1) lookup)
    // For now, we'll delegate to existing tiny_free()
    // which will be refactored to call tiny_free_fast()
}

#endif // HAKMEM_TINY_FREE_FAST_INC_H
```

---

### Phase 1.4: hakmem_tiny_free.inc Refactoring (削減)

**目的**: Free.inc から fast path を抽出し、500行削減

**手順**:

1. Lines 1-558 (Free パス) → tiny_free_fast.inc.h + tiny_free_remote.inc.h へ分割
2.
Lines 559-998 (SuperSlab Alloc) → tiny_alloc_slow.inc.h へ移動
3. Lines 999-1369 (SuperSlab Free) → tiny_free_remote.inc.h + Box 4 へ移動
4. Lines 1371-1434 (Query, commented) → 削除
5. Lines 1435-1464 (Shutdown) → tiny_lifecycle_shutdown.inc.h へ移動

**結果**: hakmem_tiny_free.inc: 1470行 → 300行以下

---

## Priority 1: Implementation Checklist

### Week 1 Checklist

- [ ] Box 1: tiny_atomic.h 作成
  - [ ] Unit tests
  - [ ] Integration with tiny_free_fast
- [ ] Box 5.1: tiny_alloc_fast.inc.h 作成
  - [ ] Pop/push functions
  - [ ] Unit tests
  - [ ] Benchmark (cache hit rate)
- [ ] Box 6.1: tiny_free_fast.inc.h 作成
  - [ ] Same-thread check
  - [ ] Cache push
  - [ ] Unit tests
- [ ] Extract from hakmem_tiny_free.inc
  - [ ] Remove fast path (lines 1-558)
  - [ ] Remove shutdown (lines 1435-1464)
  - [ ] Verify compilation
- [ ] Benchmark
  - [ ] Measure fast path latency (should be <5 cycles)
  - [ ] Measure cache hit rate (target: >80%)
  - [ ] Measure throughput (target: >100M ops/sec for 16-64B)

---

## Priority 2: Remote Queue & Ownership (Week 2)

### Phase 2.1: tiny_remote_queue.inc.h (新規作成, 300行)

**出処**: hakmem_tiny_free.inc の remote queue logic を抽出

**責務**: MPSC remote queue operations

```c
// tiny_remote_queue.inc.h
#ifndef HAKMEM_TINY_REMOTE_QUEUE_INC_H
#define HAKMEM_TINY_REMOTE_QUEUE_INC_H

#include <stdint.h>  // uintptr_t
#include "tiny_atomic.h"

// ============================================================================
// TINY_REMOTE_QUEUE: MPSC stack for cross-thread free
// ============================================================================

/**
 * tiny_remote_queue_push - Push ptr to remote queue
 *
 * Multiple producers (threads that do not own the slab) push onto
 * remote_heads[slab_idx]
 * Single consumer (the owner thread) drains the stack with
 * tiny_remote_queue_pop_all()
 *
 * MPSC = Many Producers, Single Consumer
 */
static inline void tiny_remote_queue_push(SuperSlab* ss, int slab_idx, void* ptr) {
    if (__builtin_expect(!ss || slab_idx < 0, 0)) {
        return;
    }

    uintptr_t cur_head = tiny_atomic_load_acq(&ss->remote_heads[slab_idx]);
    do {
        // Link: ptr->next = head. Redone on every retry because a failed CAS
        // overwrites cur_head with the head that is currently installed.
        *(uintptr_t*)ptr = cur_head;
        // CAS: if head == cur_head, head = ptr. The weak form is used since
        // this is already a retry loop and spurious failures are harmless.
    } while (!tiny_atomic_cas_weak(&ss->remote_heads[slab_idx], &cur_head,
                                   (uintptr_t)ptr));
}

/**
 * tiny_remote_queue_pop_all - Pop entire chain from remote queue
 *
 * Owner thread pops all pending frees
 * Returns: head of chain (or NULL if empty)
 */
static inline void* tiny_remote_queue_pop_all(SuperSlab* ss, int slab_idx) {
    if (__builtin_expect(!ss || slab_idx < 0, 0)) {
        return NULL;
    }
    uintptr_t head = tiny_atomic_exchange(&ss->remote_heads[slab_idx], 0);
    return (void*)head;
}

/**
 * tiny_remote_queue_contains_guard - Guard check (security)
 *
 * Verify ptr is in remote queue chain (sentinel check)
 *
 * Returns: 1 if target is found, or if the chain is still not terminated
 * after 8192 nodes (fail-safe: treated as a duplicate); 0 if the chain ends
 * without containing target.
 */
static inline int tiny_remote_queue_contains_guard(SuperSlab* ss, int slab_idx, void* target) {
    if (!ss || slab_idx < 0) return 0;

    uintptr_t cur = tiny_atomic_load_acq(&ss->remote_heads[slab_idx]);
    int limit = 8192;  // Prevent infinite loop
    while (cur && limit > 0) {
        if ((void*)cur == target) {
            return 1;
        }
        cur = *(uintptr_t*)cur;
        limit--;
    }
    // A chain that terminated (cur == 0) does not contain target, even when
    // it was exactly `limit` nodes long. Only a chain that is still going
    // after the budget is spent is reported as a duplicate.
    return cur ? 1 : 0;  // Fail-safe: treat unbounded as duplicate
}

#endif // HAKMEM_TINY_REMOTE_QUEUE_INC_H
```

---

### Phase 2.2: tiny_owner.inc.h (新規作成, 120行)

**責務**: Owner TID management

```c
// tiny_owner.inc.h
#ifndef HAKMEM_TINY_OWNER_INC_H
#define HAKMEM_TINY_OWNER_INC_H

#include <stdint.h>  // uint32_t
#include "tiny_atomic.h"

// ============================================================================
// TINY_OWNER: Ownership tracking (owner_tid)
// ============================================================================

/**
 * tiny_owner_acquire - Acquire ownership of slab
 *
 * Call when thread takes ownership of a TinySlab
 */
static inline void tiny_owner_acquire(TinySlab* slab, uint32_t tid) {
    if (__builtin_expect(!slab, 0)) return;
    tiny_atomic_store_rel(&slab->owner_tid, tid);
}

/**
 * tiny_owner_release - Release ownership of slab
 *
 * Call when thread releases a TinySlab (e.g., spill, shutdown)
 * An owner_tid of 0 marks the slab as unowned.
 */
static inline void tiny_owner_release(TinySlab* slab) {
    if (__builtin_expect(!slab, 0)) return;
    tiny_atomic_store_rel(&slab->owner_tid, 0);
}

/**
 * tiny_owner_check - Check if self owns slab
 *
 * Returns: 1 if self owns, 0 otherwise
 */
static inline int tiny_owner_check(TinySlab* slab, uint32_t self_tid) {
    if (__builtin_expect(!slab, 0)) return 0;
    return tiny_atomic_load_acq(&slab->owner_tid) == self_tid;
}

#endif // HAKMEM_TINY_OWNER_INC_H
```

---

## Testing Framework

### Unit Test Template

```c
// tests/test_tiny_<component>.c
#include <assert.h>
#include <stdio.h>
#include "hakmem.h"
#include "tiny_atomic.h"
#include "tiny_alloc_fast.inc.h"
#include "tiny_free_fast.inc.h"

static void test_<name>() {
    // Setup
    // Action
    // Assert
    printf("✅ test_<name> passed\n");
}

int main() {
    test_<name>();
    // ...
// more tests
    printf("\n✨ All tests passed!\n");
    return 0;
}
```

### Integration Test

```c
// tests/test_tiny_alloc_free_cycle.c
void test_alloc_free_single_thread_100k() {
    // 1000 rounds x 100 blocks = 100k alloc/free pairs, as the name promises
    enum { BATCH = 100, ROUNDS = 1000 };
    void* ptrs[BATCH];

    for (int round = 0; round < ROUNDS; round++) {
        for (int i = 0; i < BATCH; i++) {
            ptrs[i] = hak_tiny_alloc(16);
            assert(ptrs[i] != NULL);
        }
        for (int i = 0; i < BATCH; i++) {
            hak_tiny_free(ptrs[i]);
        }
    }
    printf("✅ test_alloc_free_single_thread_100k passed\n");
}

void test_alloc_free_cross_thread() {
    void* ptrs[100] = {0};

    // Thread A: allocate
    pthread_t tid;
    int rc = pthread_create(&tid, NULL, allocator_thread, ptrs);
    assert(rc == 0);

    // Wait for allocs: joining synchronizes with everything allocator_thread
    // wrote into ptrs[]. (A sleep inside the free loop neither guarantees
    // that nor finishes quickly: 100 x sleep(10) is over 16 minutes.)
    pthread_join(tid, NULL);

    // Main: free (cross-thread)
    for (int i = 0; i < 100; i++) {
        hak_tiny_free(ptrs[i]);
    }
    printf("✅ test_alloc_free_cross_thread passed\n");
}
```

---

## Performance Validation

### Assembly Check (fast path)

```bash
# Compile with -S to generate assembly
gcc -S -O3 -c core/hakmem_tiny.c -o /tmp/tiny.s

# Count instructions in fast path
# NOTE: tiny_alloc_fast_pop is static inline, so at -O3 it normally has no
# label of its own; inspect the calling function (or build this check with
# -fno-inline). grep -A20 also always prints at most 21 lines, so read the
# listing rather than relying on the count alone.
grep -A20 "tiny_alloc_fast_pop:" /tmp/tiny.s | wc -l
# Expected: <= 8 instructions (3-4 ideal)

# Check branch mispredicts
# NOTE: __builtin_expect only influences block layout; the words
# "likely"/"unlikely" do not normally appear in the generated assembly, so
# verify the hot path is the fall-through path by reading the listing.
grep "likely\|unlikely" /tmp/tiny.s | wc -l
# Expected: cache hits have likely, misses have unlikely
```

### Benchmark (larson)

```bash
# Baseline
./larson_hakmem 16 1 1000 1000 0

# With new fast path
./larson_hakmem 16 1 1000 1000 0

# Expected improvement: +10-15% throughput
```

---

## Compilation & Integration

### Makefile Changes

```makefile
# Add new files to dependencies
TINY_HEADERS = \
    core/tiny_atomic.h \
    core/tiny_alloc_fast.inc.h \
    core/tiny_free_fast.inc.h \
    core/tiny_owner.inc.h \
    core/tiny_remote_queue.inc.h

# Rebuild if any header changes
libhakmem.so: $(TINY_HEADERS) core/hakmem_tiny.c
```

### Include Order (hakmem_tiny.c)

```c
// At the top of hakmem_tiny.c, after hakmem_tiny_config.h:

// ============================================================
// LAYER 0: Atomic + Ownership (lowest)
// ============================================================
#include "tiny_atomic.h"
#include "tiny_owner.inc.h"
#include "slab_handle.h"

// ... rest of includes
```

---

## Rollback Plan

If performance regresses or compilation fails:

1. **Keep old files**: hakmem_tiny_free.inc is not deleted, only refactored
2. **Git revert**: Can revert specific commits per Box
3. **Feature flags**: Add HAKMEM_TINY_NEW_FAST_PATH=0 to disable new code path
4. **Benchmark first**: Always run larson before and after each change

---

## Success Metrics

### Performance

- [ ] Fast path: 3-4 instructions (assembly review)
- [ ] Throughput: +10-15% on 16-64B allocations
- [ ] Cache hit rate: >80%

### Code Quality

- [ ] All files <= 500 lines
- [ ] Zero cyclic dependencies (verified by include analysis)
- [ ] No compilation warnings

### Testing

- [ ] Unit tests: 100% pass
- [ ] Integration tests: 100% pass
- [ ] Larson benchmark: baseline + 10-15%

---

## Contact & Questions

Refer to REFACTOR_PLAN.md for high-level strategy and timeline.

For specific implementation details, see the corresponding .inc.h files.