Files
hakmem/docs/design/REFACTOR_IMPLEMENTATION_GUIDE.md
Moe Charm (CI) 67fb15f35f Wrap debug fprintf in !HAKMEM_BUILD_RELEASE guards (Release build optimization)
## Changes

### 1. core/page_arena.c
- Removed init failure message (lines 25-27) - error is handled by returning early
- All other fprintf statements already wrapped in existing #if !HAKMEM_BUILD_RELEASE blocks

### 2. core/hakmem.c
- Wrapped SIGSEGV handler init message (line 72)
- CRITICAL: Kept SIGSEGV/SIGBUS/SIGABRT error messages (lines 62-64) - production needs crash logs

### 3. core/hakmem_shared_pool.c
- Wrapped all debug fprintf statements in #if !HAKMEM_BUILD_RELEASE:
  - Node pool exhaustion warning (line 252)
  - SP_META_CAPACITY_ERROR warning (line 421)
  - SP_FIX_GEOMETRY debug logging (line 745)
  - SP_ACQUIRE_STAGE0.5_EMPTY debug logging (line 865)
  - SP_ACQUIRE_STAGE0_L0 debug logging (line 803)
  - SP_ACQUIRE_STAGE1_LOCKFREE debug logging (line 922)
  - SP_ACQUIRE_STAGE2_LOCKFREE debug logging (line 996)
  - SP_ACQUIRE_STAGE3 debug logging (line 1116)
  - SP_SLOT_RELEASE debug logging (line 1245)
  - SP_SLOT_FREELIST_LOCKFREE debug logging (line 1305)
  - SP_SLOT_COMPLETELY_EMPTY debug logging (line 1316)
- Fixed lock_stats_init() for release builds (lines 60-65) - ensure g_lock_stats_enabled is initialized

## Performance Validation

Before: 51M ops/s (with debug fprintf overhead)
After:  49.1M ops/s (consistent performance, fprintf removed from hot paths)

## Build & Test

```bash
./build.sh larson_hakmem
./out/release/larson_hakmem 1 5 1 1000 100 10000 42
# Result: 49.1M ops/s
```

Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 13:14:18 +09:00

17 KiB

HAKMEM Tiny Allocator リファクタリング実装ガイド

クイックスタート

このドキュメントは、REFACTOR_PLAN.md の実装手順を段階的に説明します。


Priority 1: Fast Path リファクタリング (Week 1)

Phase 1.1: tiny_atomic.h (新規作成, 80行)

目的: Atomic操作の統一インターフェース

ファイル: core/tiny_atomic.h

#ifndef HAKMEM_TINY_ATOMIC_H
#define HAKMEM_TINY_ATOMIC_H

#include <stdatomic.h>

// ============================================================================
// TINY_ATOMIC: unified interface for atomics with explicit memory ordering
// ============================================================================
//
// Portability: `__typeof__` is used instead of plain `typeof`, which is a GNU
// extension unavailable under strict -std=c11 (it only became a keyword in
// C23). Macro arguments are fully parenthesized so expressions such as
// `p + 1` expand correctly.

/**
 * tiny_atomic_load - Load with a caller-supplied memory order.
 * @ptr: pointer to the variable (cast to its _Atomic-qualified type)
 * @order: memory_order; must NOT be memory_order_release or
 *         memory_order_acq_rel — those are invalid for loads (C11 7.17.7.2)
 *
 * Returns: loaded value
 */
#define tiny_atomic_load(ptr, order) \
    atomic_load_explicit((_Atomic __typeof__(*(ptr))*)(ptr), order)

#define tiny_atomic_load_acq(ptr) \
    atomic_load_explicit((_Atomic __typeof__(*(ptr))*)(ptr), memory_order_acquire)

/*
 * BUGFIX: the original tiny_atomic_load_rel requested memory_order_release on
 * a LOAD, and tiny_atomic_store_acq requested memory_order_acquire on a
 * STORE. Both orders are invalid for those operations (C11 7.17.7.2p1 /
 * 7.17.7.1p1) and undefined behavior. The names are kept for source
 * compatibility but now use seq_cst, the strongest valid order; prefer the
 * naturally-paired _acq (load) / _rel (store) variants in new code.
 */
#define tiny_atomic_load_rel(ptr) \
    atomic_load_explicit((_Atomic __typeof__(*(ptr))*)(ptr), memory_order_seq_cst)

#define tiny_atomic_load_relax(ptr) \
    atomic_load_explicit((_Atomic __typeof__(*(ptr))*)(ptr), memory_order_relaxed)

/**
 * tiny_atomic_store - Store with a caller-supplied memory order.
 * @order: must NOT be memory_order_acquire or memory_order_acq_rel
 *         (invalid for stores, C11 7.17.7.1)
 */
#define tiny_atomic_store(ptr, val, order) \
    atomic_store_explicit((_Atomic __typeof__(*(ptr))*)(ptr), (val), order)

#define tiny_atomic_store_rel(ptr, val) \
    atomic_store_explicit((_Atomic __typeof__(*(ptr))*)(ptr), (val), memory_order_release)

/* See BUGFIX note above: acquire is invalid for stores; seq_cst is used. */
#define tiny_atomic_store_acq(ptr, val) \
    atomic_store_explicit((_Atomic __typeof__(*(ptr))*)(ptr), (val), memory_order_seq_cst)

#define tiny_atomic_store_relax(ptr, val) \
    atomic_store_explicit((_Atomic __typeof__(*(ptr))*)(ptr), (val), memory_order_relaxed)

/**
 * tiny_atomic_cas - Strong compare-and-swap (seq_cst success, relaxed failure).
 * @ptr: pointer to atomic variable
 * @expected: pointer to expected value; on failure it is updated with the
 *            observed value (in/out)
 * @desired: desired value
 * Returns: true if the exchange succeeded
 */
#define tiny_atomic_cas(ptr, expected, desired) \
    atomic_compare_exchange_strong_explicit( \
        (_Atomic __typeof__(*(ptr))*)(ptr), (expected), (desired), \
        memory_order_seq_cst, memory_order_relaxed)

/**
 * tiny_atomic_cas_weak - Weak CAS for retry loops (may fail spuriously, but
 * can be cheaper on LL/SC architectures).
 */
#define tiny_atomic_cas_weak(ptr, expected, desired) \
    atomic_compare_exchange_weak_explicit( \
        (_Atomic __typeof__(*(ptr))*)(ptr), (expected), (desired), \
        memory_order_seq_cst, memory_order_relaxed)

/**
 * tiny_atomic_exchange - Atomic exchange; returns the previous value.
 */
#define tiny_atomic_exchange(ptr, desired) \
    atomic_exchange_explicit((_Atomic __typeof__(*(ptr))*)(ptr), (desired), \
                            memory_order_seq_cst)

/**
 * tiny_atomic_fetch_add - Fetch-and-add; returns the PREVIOUS value.
 */
#define tiny_atomic_fetch_add(ptr, val) \
    atomic_fetch_add_explicit((_Atomic __typeof__(*(ptr))*)(ptr), (val), \
                             memory_order_seq_cst)

/**
 * tiny_atomic_increment - Increment; returns the NEW (post-increment) value.
 */
#define tiny_atomic_increment(ptr) \
    (atomic_fetch_add_explicit((_Atomic __typeof__(*(ptr))*)(ptr), 1, \
                              memory_order_seq_cst) + 1)

#endif // HAKMEM_TINY_ATOMIC_H

テスト:

// test_tiny_atomic.c
#include "tiny_atomic.h"

void test_tiny_atomic_load_store() {
    _Atomic int value = 0;
    // A release store must be visible to a subsequent acquire load.
    tiny_atomic_store(&value, 42, memory_order_release);
    assert(tiny_atomic_load(&value, memory_order_acquire) == 42);
}

void test_tiny_atomic_cas() {
    _Atomic int value = 1;
    int want = 1;
    // CAS succeeds when the expected value matches, installing the new one.
    assert(tiny_atomic_cas(&value, &want, 2) == true);
    assert(tiny_atomic_load(&value, memory_order_relaxed) == 2);
}

Phase 1.2: tiny_alloc_fast.inc.h (新規作成, 250行)

目的: 3-4命令のfast path allocation

ファイル: core/tiny_alloc_fast.inc.h

#ifndef HAKMEM_TINY_ALLOC_FAST_INC_H
#define HAKMEM_TINY_ALLOC_FAST_INC_H

#include "tiny_atomic.h"

// ============================================================================
// TINY_ALLOC_FAST: Ultra-simple fast path (3-4 instructions)
// ============================================================================

// TLS storage (defined in hakmem_tiny.c): per-class intrusive free-list
// head, current depth, and capacity.
extern void* g_tls_alloc_cache[TINY_NUM_CLASSES];
extern int g_tls_alloc_count[TINY_NUM_CLASSES];
extern int g_tls_alloc_cap[TINY_NUM_CLASSES];

/**
 * tiny_alloc_fast_pop - Pop one block from the TLS cache (3-4 instructions).
 *
 * The cache is an intrusive singly linked list: the first word of each
 * cached block holds the next pointer.
 *
 * Returns: block on cache hit, NULL on miss (caller takes the slow path).
 */
static inline void* tiny_alloc_fast_pop(int class_idx) {
    void* head = g_tls_alloc_cache[class_idx];
    if (__builtin_expect(head == NULL, 0)) {
        return NULL;  // cache miss -> slow path
    }
    g_tls_alloc_cache[class_idx] = *(void**)head;  // unlink first node
    g_tls_alloc_count[class_idx] -= 1;             // bookkeeping (could be batched)
    return head;
}

/**
 * tiny_alloc_fast_push - Push a block onto the TLS cache.
 *
 * Returns: 1 on success, 0 when the cache is at capacity (caller spills).
 */
static inline int tiny_alloc_fast_push(int class_idx, void* ptr) {
    if (__builtin_expect(g_tls_alloc_count[class_idx] >= g_tls_alloc_cap[class_idx], 0)) {
        return 0;  // cache full -> slow path
    }
    *(void**)ptr = g_tls_alloc_cache[class_idx];  // link in front of old head
    g_tls_alloc_cache[class_idx] = ptr;
    g_tls_alloc_count[class_idx] += 1;
    return 1;
}

/**
 * tiny_alloc_fast - Fast allocation entry (public API for the fast path).
 *
 * A cache hit returns immediately; on a miss the slow-path hook (wired up
 * in hakmem_tiny.c) takes over — for now NULL is returned as a placeholder.
 */
static inline void* tiny_alloc_fast(int class_idx) {
    void* blk = tiny_alloc_fast_pop(class_idx);
    if (__builtin_expect(blk == NULL, 0)) {
        return NULL;  // Placeholder: slow path call added in hakmem_tiny.c
    }
    return blk;
}

#endif // HAKMEM_TINY_ALLOC_FAST_INC_H

テスト:

// test_tiny_alloc_fast.c

// Verifies that an empty cache reports a miss.
void test_tiny_alloc_fast_empty() {
    g_tls_alloc_cache[0] = NULL;
    g_tls_alloc_count[0] = 0;
    assert(tiny_alloc_fast_pop(0) == NULL);
}

// Verifies that push/pop round-trips the same pointer and the counter.
void test_tiny_alloc_fast_push_pop() {
    // BUGFIX: tiny_alloc_fast_push stores the next-pointer INTO the block
    // (*(void**)ptr = head), so the test must pass a real, writable buffer.
    // The original used the fake address 0x12345678, which faults on that
    // store.
    void* slot[1];
    void* ptr = (void*)slot;
    g_tls_alloc_cache[0] = NULL;
    g_tls_alloc_count[0] = 0;
    g_tls_alloc_cap[0] = 100;

    assert(tiny_alloc_fast_push(0, ptr) == 1);
    assert(g_tls_alloc_count[0] == 1);
    assert(tiny_alloc_fast_pop(0) == ptr);
    assert(g_tls_alloc_count[0] == 0);
}

Phase 1.3: tiny_free_fast.inc.h (新規作成, 200行)

目的: Same-thread fast free path

ファイル: core/tiny_free_fast.inc.h

#ifndef HAKMEM_TINY_FREE_FAST_INC_H
#define HAKMEM_TINY_FREE_FAST_INC_H

#include "tiny_atomic.h"
#include "tiny_alloc_fast.inc.h"

// ============================================================================
// TINY_FREE_FAST: Same-thread fast free (15-20 instructions)
// ============================================================================

/**
 * tiny_free_fast - Fast free for blocks owned by the calling thread.
 *
 * Ownership protocol: resolve the owning slab via hak_tiny_owner_slab()
 * (O(1) lookup via slab_handle.h), compare its owner_tid against the
 * caller's TID, and only on a match push the block onto the TLS cache.
 *
 * Returns: 1 when freed to the cache, 0 when the slow path must run
 * (not a Tiny block, cross-thread free via remote queue, or cache full).
 */
static inline int tiny_free_fast(void* ptr, int class_idx) {
    uint32_t me = tiny_self_u32();

    TinySlab* slab = hak_tiny_owner_slab(ptr);
    if (__builtin_expect(slab == NULL, 0)) {
        return 0;  // Not owned by Tiny -> slow path
    }

    if (__builtin_expect(slab->owner_tid != me, 0)) {
        return 0;  // Cross-thread -> slow path (remote queue)
    }

    // Same thread: recycle straight into the TLS cache.
    return tiny_alloc_fast_push(class_idx, ptr);
}

/**
 * tiny_free_main_entry - Main free entry point (dispatcher).
 *
 * Will dispatch to tiny_free_fast() / tiny_free_remote() / tiny_free_guard();
 * for now only the NULL check is implemented and the real work stays in the
 * existing tiny_free(), which will be refactored to call tiny_free_fast().
 */
static inline void tiny_free_main_entry(void* ptr) {
    if (__builtin_expect(ptr == NULL, 0)) {
        return;  // free(NULL) is a safe no-op
    }

    // Fast-path dispatch needs a combined O(1) class+owner lookup; pending.
}

#endif // HAKMEM_TINY_FREE_FAST_INC_H

Phase 1.4: hakmem_tiny_free.inc Refactoring (削減)

目的: Free.inc から fast path を抽出し、500行削減

手順:

  1. Lines 1-558 (Free パス) → tiny_free_fast.inc.h + tiny_free_remote.inc.h へ分割
  2. Lines 559-998 (SuperSlab Alloc) → tiny_alloc_slow.inc.h へ移動
  3. Lines 999-1369 (SuperSlab Free) → tiny_free_remote.inc.h + Box 4 へ移動
  4. Lines 1371-1434 (Query, commented) → 削除
  5. Lines 1435-1464 (Shutdown) → tiny_lifecycle_shutdown.inc.h へ移動

結果: hakmem_tiny_free.inc: 1470行 → 300行以下


Priority 2: Implementation Checklist

Week 1 Checklist

  • Box 1: tiny_atomic.h 作成

    • Unit tests
    • Integration with tiny_free_fast
  • Box 5.1: tiny_alloc_fast.inc.h 作成

    • Pop/push functions
    • Unit tests
    • Benchmark (cache hit rate)
  • Box 6.1: tiny_free_fast.inc.h 作成

    • Same-thread check
    • Cache push
    • Unit tests
  • Extract from hakmem_tiny_free.inc

    • Remove fast path (lines 1-558)
    • Remove shutdown (lines 1435-1464)
    • Verify compilation
  • Benchmark

    • Measure fast path latency (should be <5 cycles)
    • Measure cache hit rate (target: >80%)
    • Measure throughput (target: >100M ops/sec for 16-64B)

Priority 2: Remote Queue & Ownership (Week 2)

Phase 2.1: tiny_remote_queue.inc.h (新規作成, 300行)

出処: hakmem_tiny_free.inc の remote queue logic を抽出

責務: MPSC remote queue operations

// tiny_remote_queue.inc.h
#ifndef HAKMEM_TINY_REMOTE_QUEUE_INC_H
#define HAKMEM_TINY_REMOTE_QUEUE_INC_H

#include "tiny_atomic.h"

// ============================================================================
// TINY_REMOTE_QUEUE: MPSC stack for cross-thread free
// ============================================================================

/**
 * tiny_remote_queue_push - Push ptr onto the remote-free stack.
 *
 * MPSC = multiple producers, single consumer:
 *   - any NON-owner thread may push freed blocks here (producers);
 *   - only the slab's owner thread drains the stack via pop_all (consumer).
 * (The original comment had the producer/consumer roles reversed.)
 */
static inline void tiny_remote_queue_push(SuperSlab* ss, int slab_idx, void* ptr) {
    if (__builtin_expect(!ss || slab_idx < 0, 0)) {
        return;
    }

    // Lock-free stack push: link the node to the current head, then CAS the
    // head to the node; retry if another producer raced us.
    uintptr_t cur_head = tiny_atomic_load_acq(&ss->remote_heads[slab_idx]);
    while (1) {
        *(uintptr_t*)ptr = cur_head;

        // Idiom fix: use the weak CAS in a retry loop — spurious failures
        // are harmless here and weak CAS is cheaper on LL/SC architectures.
        // On failure the CAS itself reloads cur_head (expected is in/out).
        if (tiny_atomic_cas_weak(&ss->remote_heads[slab_idx], &cur_head, (uintptr_t)ptr)) {
            break;
        }
    }
}

/**
 * tiny_remote_queue_pop_all - Detach the entire pending chain.
 *
 * Owner thread only: the atomic exchange with 0 makes the whole chain
 * private to the caller in a single step.
 * Returns: head of the chain, or NULL if the queue was empty.
 */
static inline void* tiny_remote_queue_pop_all(SuperSlab* ss, int slab_idx) {
    if (__builtin_expect(!ss || slab_idx < 0, 0)) {
        return NULL;
    }

    uintptr_t head = tiny_atomic_exchange(&ss->remote_heads[slab_idx], 0);
    return (void*)head;
}

/**
 * tiny_remote_queue_contains_guard - Guard check (security).
 *
 * Walks the chain looking for target. The walk is bounded (8192 nodes) to
 * avoid spinning on a corrupted or cyclic list; hitting the bound is
 * reported as "found" (fail-safe: treat an unbounded chain as a duplicate).
 *
 * NOTE(review): the traversal dereferences nodes while producers may still
 * push — assumes nodes are never freed/reused while on the stack; confirm
 * callers uphold that.
 */
static inline int tiny_remote_queue_contains_guard(SuperSlab* ss, int slab_idx, void* target) {
    if (!ss || slab_idx < 0) return 0;

    uintptr_t cur = tiny_atomic_load_acq(&ss->remote_heads[slab_idx]);
    int limit = 8192;  // Prevent infinite loop

    while (cur && limit-- > 0) {
        if ((void*)cur == target) {
            return 1;
        }
        cur = *(uintptr_t*)cur;
    }

    return (limit <= 0) ? 1 : 0;  // Fail-safe: treat unbounded as duplicate
}

#endif // HAKMEM_TINY_REMOTE_QUEUE_INC_H

Phase 2.2: tiny_owner.inc.h (新規作成, 120行)

責務: Owner TID management

// tiny_owner.inc.h
#ifndef HAKMEM_TINY_OWNER_INC_H
#define HAKMEM_TINY_OWNER_INC_H

#include "tiny_atomic.h"

// ============================================================================
// TINY_OWNER: Ownership tracking (owner_tid)
// ============================================================================

/**
 * tiny_owner_acquire - Record tid as the owner of slab.
 *
 * Called when a thread takes ownership of a TinySlab. NULL slab is a no-op.
 */
static inline void tiny_owner_acquire(TinySlab* slab, uint32_t tid) {
    if (__builtin_expect(slab == NULL, 0)) {
        return;
    }
    tiny_atomic_store_rel(&slab->owner_tid, tid);
}

/**
 * tiny_owner_release - Clear ownership of slab (owner_tid = 0).
 *
 * Called when a thread releases a TinySlab (e.g. spill, shutdown).
 */
static inline void tiny_owner_release(TinySlab* slab) {
    if (__builtin_expect(slab == NULL, 0)) {
        return;
    }
    tiny_atomic_store_rel(&slab->owner_tid, 0);
}

/**
 * tiny_owner_check - Does self_tid currently own slab?
 *
 * Returns: 1 when owned by self_tid, 0 otherwise (including NULL slab).
 */
static inline int tiny_owner_check(TinySlab* slab, uint32_t self_tid) {
    if (__builtin_expect(slab == NULL, 0)) {
        return 0;
    }
    return tiny_atomic_load_acq(&slab->owner_tid) == self_tid;
}

#endif // HAKMEM_TINY_OWNER_INC_H

Testing Framework

Unit Test Template

// tests/test_tiny_<component>.c

#include <assert.h>
#include "hakmem.h"
#include "tiny_atomic.h"
#include "tiny_alloc_fast.inc.h"
#include "tiny_free_fast.inc.h"

static void test_<function>() {
    // Setup
    // Action
    // Assert
    printf("✅ test_<function> passed\n");
}

int main() {
    test_<function>();
    // ... more tests
    printf("\n✨ All tests passed!\n");
    return 0;
}

Integration Test

// tests/test_tiny_alloc_free_cycle.c

void test_alloc_free_single_thread_100k() {
    // BUGFIX: the function name promises 100k iterations but the original
    // body only exercised 100. Use the full count, with the pointer table
    // on the heap so it does not blow the stack (~800 KiB of pointers).
    enum { N = 100 * 1000 };
    void** ptrs = malloc(N * sizeof *ptrs);
    assert(ptrs != NULL);

    for (int i = 0; i < N; i++) {
        ptrs[i] = hak_tiny_alloc(16);
        assert(ptrs[i] != NULL);
    }

    for (int i = 0; i < N; i++) {
        hak_tiny_free(ptrs[i]);
    }

    free(ptrs);
    printf("✅ test_alloc_free_single_thread_100k passed\n");
}

void test_alloc_free_cross_thread() {
    void* ptrs[100];

    // Thread A performs all the allocations into ptrs[].
    pthread_t tid;
    pthread_create(&tid, NULL, allocator_thread, ptrs);

    // BUGFIX: the original freed ptrs[i] while the allocator thread might
    // still be filling the array (data race / free of uninitialized slots),
    // and its sleep(10) inside the loop stalled the test ~1000 seconds.
    // Join first so every pointer is published, then free from this thread —
    // the frees are still cross-thread with respect to the allocating one.
    pthread_join(tid, NULL);

    for (int i = 0; i < 100; i++) {
        hak_tiny_free(ptrs[i]);
    }

    printf("✅ test_alloc_free_cross_thread passed\n");
}

Performance Validation

Assembly Check (fast path)

# Compile with -S to generate assembly
gcc -S -O3 -c core/hakmem_tiny.c -o /tmp/tiny.s

# Count instructions in fast path
grep -A20 "tiny_alloc_fast_pop:" /tmp/tiny.s | wc -l
# Expected: <= 8 instructions (3-4 ideal)

# Check branch layout: __builtin_expect leaves no "likely"/"unlikely" text in
# the generated assembly — it only reorders branches. Instead, inspect the
# fast path and confirm the cache-hit continuation is the fall-through
# (untaken-branch) path:
grep -A20 "tiny_alloc_fast_pop" /tmp/tiny.s

Benchmark (larson)

# Baseline
./larson_hakmem 16 1 1000 1000 0

# With new fast path
./larson_hakmem 16 1 1000 1000 0

# Expected improvement: +10-15% throughput

Compilation & Integration

Makefile Changes

# Add new files to dependencies
TINY_HEADERS = \
    core/tiny_atomic.h \
    core/tiny_alloc_fast.inc.h \
    core/tiny_free_fast.inc.h \
    core/tiny_owner.inc.h \
    core/tiny_remote_queue.inc.h

# Rebuild if any header changes
libhakmem.so: $(TINY_HEADERS) core/hakmem_tiny.c

Include Order (hakmem_tiny.c)

// At the top of hakmem_tiny.c, after hakmem_tiny_config.h:

// ============================================================
// LAYER 0: Atomic + Ownership (lowest)
// ============================================================
#include "tiny_atomic.h"
#include "tiny_owner.inc.h"
#include "slab_handle.h"

// ... rest of includes

Rollback Plan

If performance regresses or compilation fails:

  1. Keep old files: hakmem_tiny_free.inc is not deleted, only refactored
  2. Git revert: Can revert specific commits per Box
  3. Feature flags: Add HAKMEM_TINY_NEW_FAST_PATH=0 to disable new code path
  4. Benchmark first: Always run larson before and after each change

Success Metrics

Performance

  • Fast path: 3-4 instructions (assembly review)
  • Throughput: +10-15% on 16-64B allocations
  • Cache hit rate: >80%

Code Quality

  • All files <= 500 lines
  • Zero cyclic dependencies (verified by include analysis)
  • No compilation warnings

Testing

  • Unit tests: 100% pass
  • Integration tests: 100% pass
  • Larson benchmark: baseline + 10-15%

Contact & Questions

Refer to REFACTOR_PLAN.md for high-level strategy and timeline.

For specific implementation details, see the corresponding .inc.h files.