hakmem/docs/design/REFACTOR_IMPLEMENTATION_GUIDE.md
2025-11-26 13:14:18 +09:00

# HAKMEM Tiny Allocator Refactoring Implementation Guide
## Quick Start
This document walks through the implementation steps from REFACTOR_PLAN.md stage by stage.
---
## Priority 1: Fast Path リファクタリング (Week 1)
### Phase 1.1: tiny_atomic.h (new file, ~80 lines)
**Purpose**: A unified interface for atomic operations
**File**: `core/tiny_atomic.h`
```c
#ifndef HAKMEM_TINY_ATOMIC_H
#define HAKMEM_TINY_ATOMIC_H
#include <stdatomic.h>
// ============================================================================
// TINY_ATOMIC: Unified interface for atomics with explicit memory ordering
// ============================================================================
/**
* tiny_atomic_load - Load with an explicit memory order
* @ptr: pointer to atomic variable
* @order: memory_order (typically memory_order_acquire)
*
* Returns: Loaded value
*/
#define tiny_atomic_load(ptr, order) \
atomic_load_explicit((_Atomic typeof(*ptr)*)ptr, order)
#define tiny_atomic_load_acq(ptr) \
atomic_load_explicit((_Atomic typeof(*ptr)*)ptr, memory_order_acquire)
#define tiny_atomic_load_relax(ptr) \
atomic_load_explicit((_Atomic typeof(*ptr)*)ptr, memory_order_relaxed)
/**
* tiny_atomic_store - Store with an explicit memory order
*/
#define tiny_atomic_store(ptr, val, order) \
atomic_store_explicit((_Atomic typeof(*ptr)*)ptr, val, order)
#define tiny_atomic_store_rel(ptr, val) \
atomic_store_explicit((_Atomic typeof(*ptr)*)ptr, val, memory_order_release)
#define tiny_atomic_store_relax(ptr, val) \
atomic_store_explicit((_Atomic typeof(*ptr)*)ptr, val, memory_order_relaxed)
/**
* tiny_atomic_cas - Compare and swap with seq_cst semantics
* @ptr: pointer to atomic variable
* @expected: expected value (in/out)
* @desired: desired value
* Returns: true if successful
*/
#define tiny_atomic_cas(ptr, expected, desired) \
atomic_compare_exchange_strong_explicit( \
(_Atomic typeof(*ptr)*)ptr, expected, desired, \
memory_order_seq_cst, memory_order_relaxed)
/**
* tiny_atomic_cas_weak - Weak CAS for loops
*/
#define tiny_atomic_cas_weak(ptr, expected, desired) \
atomic_compare_exchange_weak_explicit( \
(_Atomic typeof(*ptr)*)ptr, expected, desired, \
memory_order_seq_cst, memory_order_relaxed)
/**
* tiny_atomic_exchange - Atomic exchange
*/
#define tiny_atomic_exchange(ptr, desired) \
atomic_exchange_explicit((_Atomic typeof(*ptr)*)ptr, desired, \
memory_order_seq_cst)
/**
* tiny_atomic_fetch_add - Fetch and add
*/
#define tiny_atomic_fetch_add(ptr, val) \
atomic_fetch_add_explicit((_Atomic typeof(*ptr)*)ptr, val, \
memory_order_seq_cst)
/**
* tiny_atomic_increment - Increment (returns new value)
*/
#define tiny_atomic_increment(ptr) \
(atomic_fetch_add_explicit((_Atomic typeof(*ptr)*)ptr, 1, \
memory_order_seq_cst) + 1)
#endif // HAKMEM_TINY_ATOMIC_H
```
**Tests**:
```c
// test_tiny_atomic.c
#include <assert.h>
#include "tiny_atomic.h"
void test_tiny_atomic_load_store() {
_Atomic int x = 0;
tiny_atomic_store(&x, 42, memory_order_release);
assert(tiny_atomic_load(&x, memory_order_acquire) == 42);
}
void test_tiny_atomic_cas() {
_Atomic int x = 1;
int expected = 1;
assert(tiny_atomic_cas(&x, &expected, 2) == true);
assert(tiny_atomic_load(&x, memory_order_relaxed) == 2);
}
```
---
### Phase 1.2: tiny_alloc_fast.inc.h (new file, ~250 lines)
**Purpose**: A 3-4 instruction fast-path allocation
**File**: `core/tiny_alloc_fast.inc.h`
```c
#ifndef HAKMEM_TINY_ALLOC_FAST_INC_H
#define HAKMEM_TINY_ALLOC_FAST_INC_H
#include "tiny_atomic.h"
// ============================================================================
// TINY_ALLOC_FAST: Ultra-simple fast path (3-4 instructions)
// ============================================================================
// TLS storage (defined in hakmem_tiny.c)
extern void* g_tls_alloc_cache[TINY_NUM_CLASSES];
extern int g_tls_alloc_count[TINY_NUM_CLASSES];
extern int g_tls_alloc_cap[TINY_NUM_CLASSES];
/**
* tiny_alloc_fast_pop - Pop from TLS cache (3-4 instructions)
*
* Fast path for allocation:
* 1. Load head from TLS cache
* 2. Check if non-NULL
* 3. Pop: head = head->next
* 4. Return ptr
*
* Returns: Pointer if cache hit, NULL if miss (go to slow path)
*/
static inline void* tiny_alloc_fast_pop(int class_idx) {
void* ptr = g_tls_alloc_cache[class_idx];
if (__builtin_expect(ptr != NULL, 1)) {
// Pop: store next pointer
g_tls_alloc_cache[class_idx] = *(void**)ptr;
// Update count (optional, can be batched)
g_tls_alloc_count[class_idx]--;
return ptr;
}
return NULL; // Cache miss → slow path
}
/**
* tiny_alloc_fast_push - Push to TLS cache
*
* Returns: 1 if success, 0 if cache full (go to spill logic)
*/
static inline int tiny_alloc_fast_push(int class_idx, void* ptr) {
int cnt = g_tls_alloc_count[class_idx];
int cap = g_tls_alloc_cap[class_idx];
if (__builtin_expect(cnt < cap, 1)) {
// Push: ptr->next = head
*(void**)ptr = g_tls_alloc_cache[class_idx];
g_tls_alloc_cache[class_idx] = ptr;
g_tls_alloc_count[class_idx]++;
return 1;
}
return 0; // Cache full → slow path
}
/**
* tiny_alloc_fast - Fast allocation entry (public API for fast path)
*
* Equivalent to:
* void* ptr = tiny_alloc_fast_pop(class_idx);
* if (!ptr) ptr = tiny_alloc_slow(class_idx);
* return ptr;
*/
static inline void* tiny_alloc_fast(int class_idx) {
void* ptr = tiny_alloc_fast_pop(class_idx);
if (__builtin_expect(ptr != NULL, 1)) {
return ptr;
}
// Slow path call will be added in hakmem_tiny.c
return NULL; // Placeholder
}
#endif // HAKMEM_TINY_ALLOC_FAST_INC_H
```
**Tests**:
```c
// test_tiny_alloc_fast.c
#include <assert.h>
#include "tiny_alloc_fast.inc.h"
void test_tiny_alloc_fast_empty() {
g_tls_alloc_cache[0] = NULL;
g_tls_alloc_count[0] = 0;
assert(tiny_alloc_fast_pop(0) == NULL);
}
void test_tiny_alloc_fast_push_pop() {
void* ptr = (void*)0x12345678;
g_tls_alloc_count[0] = 0;
g_tls_alloc_cap[0] = 100;
assert(tiny_alloc_fast_push(0, ptr) == 1);
assert(g_tls_alloc_count[0] == 1);
assert(tiny_alloc_fast_pop(0) == ptr);
assert(g_tls_alloc_count[0] == 0);
}
```
---
### Phase 1.3: tiny_free_fast.inc.h (new file, ~200 lines)
**Purpose**: Same-thread fast free path
**File**: `core/tiny_free_fast.inc.h`
```c
#ifndef HAKMEM_TINY_FREE_FAST_INC_H
#define HAKMEM_TINY_FREE_FAST_INC_H
#include "tiny_atomic.h"
#include "tiny_alloc_fast.inc.h"
// ============================================================================
// TINY_FREE_FAST: Same-thread fast free (15-20 instructions)
// ============================================================================
/**
* tiny_free_fast - Fast free for same-thread ownership
*
* Ownership check:
* 1. Get self TID (uint32_t)
* 2. Lookup slab owner_tid
* 3. Compare: if owner_tid == self_tid → same thread → push to cache
* 4. Otherwise: slow path (remote queue)
*
* Returns: 1 if successfully freed to cache, 0 if slow path needed
*/
static inline int tiny_free_fast(void* ptr, int class_idx) {
// Step 1: Get self TID
uint32_t self_tid = tiny_self_u32();
// Step 2: Owner lookup (O(1) via slab_handle.h)
TinySlab* slab = hak_tiny_owner_slab(ptr);
if (__builtin_expect(slab == NULL, 0)) {
return 0; // Not owned by Tiny → slow path
}
// Step 3: Compare owner
if (__builtin_expect(slab->owner_tid != self_tid, 0)) {
return 0; // Cross-thread → slow path (remote queue)
}
// Step 4: Same-thread → cache push
return tiny_alloc_fast_push(class_idx, ptr);
}
/**
* tiny_free_main_entry - Main free entry point
*
* Dispatches:
* - tiny_free_fast() for same-thread
* - tiny_free_remote() for cross-thread
* - tiny_free_guard() for validation
*/
static inline void tiny_free_main_entry(void* ptr) {
if (__builtin_expect(ptr == NULL, 0)) {
return; // NULL is safe
}
// Fast path: lookup class and owner in one step
// (This requires pre-computing or O(1) lookup)
// For now, we'll delegate to existing tiny_free()
// which will be refactored to call tiny_free_fast()
}
#endif // HAKMEM_TINY_FREE_FAST_INC_H
```
---
### Phase 1.4: hakmem_tiny_free.inc Refactoring (size reduction)
**Purpose**: Extract the fast path from hakmem_tiny_free.inc, cutting roughly 500 lines
**Steps**:
1. Lines 1-558 (free path) → split into tiny_free_fast.inc.h + tiny_free_remote.inc.h
2. Lines 559-998 (SuperSlab alloc) → move to tiny_alloc_slow.inc.h
3. Lines 999-1369 (SuperSlab free) → move to tiny_free_remote.inc.h + Box 4
4. Lines 1371-1434 (query code, already commented out) → delete
5. Lines 1435-1464 (shutdown) → move to tiny_lifecycle_shutdown.inc.h
**Result**: hakmem_tiny_free.inc: 1470 lines → 300 lines or fewer
---
## Priority 2: Implementation Checklist
### Week 1 Checklist
- [ ] Box 1: Create tiny_atomic.h
- [ ] Unit tests
- [ ] Integration with tiny_free_fast
- [ ] Box 5.1: Create tiny_alloc_fast.inc.h
- [ ] Pop/push functions
- [ ] Unit tests
- [ ] Benchmark (cache hit rate)
- [ ] Box 6.1: Create tiny_free_fast.inc.h
- [ ] Same-thread check
- [ ] Cache push
- [ ] Unit tests
- [ ] Extract from hakmem_tiny_free.inc
- [ ] Remove fast path (lines 1-558)
- [ ] Remove shutdown (lines 1435-1464)
- [ ] Verify compilation
- [ ] Benchmark
- [ ] Measure fast path latency (should be <5 cycles)
- [ ] Measure cache hit rate (target: >80%)
- [ ] Measure throughput (target: >100M ops/sec for 16-64B)
---
## Priority 2: Remote Queue & Ownership (Week 2)
### Phase 2.1: tiny_remote_queue.inc.h (new file, ~300 lines)
**Source**: Extracted from the remote-queue logic in hakmem_tiny_free.inc
**Responsibility**: MPSC remote-queue operations
```c
// tiny_remote_queue.inc.h
#ifndef HAKMEM_TINY_REMOTE_QUEUE_INC_H
#define HAKMEM_TINY_REMOTE_QUEUE_INC_H
#include "tiny_atomic.h"
// ============================================================================
// TINY_REMOTE_QUEUE: MPSC stack for cross-thread free
// ============================================================================
/**
* tiny_remote_queue_push - Push ptr to remote queue
*
* Multiple producer threads push onto remote_heads[slab_idx];
* only the owner thread pops (see tiny_remote_queue_pop_all).
*
* MPSC = Multiple Producers, Single Consumer
*/
static inline void tiny_remote_queue_push(SuperSlab* ss, int slab_idx, void* ptr) {
if (__builtin_expect(!ss || slab_idx < 0, 0)) {
return;
}
// Link: ptr->next = head, then publish ptr as the new head
uintptr_t cur_head = tiny_atomic_load_acq(&ss->remote_heads[slab_idx]);
do {
*(uintptr_t*)ptr = cur_head;
// Weak CAS: cheap to retry in a loop; cur_head is reloaded on failure
} while (!tiny_atomic_cas_weak(&ss->remote_heads[slab_idx], &cur_head, (uintptr_t)ptr));
}
/**
* tiny_remote_queue_pop_all - Pop entire chain from remote queue
*
* Owner thread pops all pending frees
* Returns: head of chain (or NULL if empty)
*/
static inline void* tiny_remote_queue_pop_all(SuperSlab* ss, int slab_idx) {
if (__builtin_expect(!ss || slab_idx < 0, 0)) {
return NULL;
}
uintptr_t head = tiny_atomic_exchange(&ss->remote_heads[slab_idx], 0);
return (void*)head;
}
/**
* tiny_remote_queue_contains_guard - Guard check (security)
*
* Verify ptr is in remote queue chain (sentinel check)
*/
static inline int tiny_remote_queue_contains_guard(SuperSlab* ss, int slab_idx, void* target) {
if (!ss || slab_idx < 0) return 0;
uintptr_t cur = tiny_atomic_load_acq(&ss->remote_heads[slab_idx]);
int limit = 8192; // Prevent infinite loop
while (cur && limit-- > 0) {
if ((void*)cur == target) {
return 1;
}
cur = *(uintptr_t*)cur;
}
return (limit <= 0) ? 1 : 0; // Fail-safe: treat unbounded as duplicate
}
#endif // HAKMEM_TINY_REMOTE_QUEUE_INC_H
```
---
### Phase 2.2: tiny_owner.inc.h (new file, ~120 lines)
**Responsibility**: Owner TID management
```c
// tiny_owner.inc.h
#ifndef HAKMEM_TINY_OWNER_INC_H
#define HAKMEM_TINY_OWNER_INC_H
#include "tiny_atomic.h"
// ============================================================================
// TINY_OWNER: Ownership tracking (owner_tid)
// ============================================================================
/**
* tiny_owner_acquire - Acquire ownership of slab
*
* Call when thread takes ownership of a TinySlab
*/
static inline void tiny_owner_acquire(TinySlab* slab, uint32_t tid) {
if (__builtin_expect(!slab, 0)) return;
tiny_atomic_store_rel(&slab->owner_tid, tid);
}
/**
* tiny_owner_release - Release ownership of slab
*
* Call when thread releases a TinySlab (e.g., spill, shutdown)
*/
static inline void tiny_owner_release(TinySlab* slab) {
if (__builtin_expect(!slab, 0)) return;
tiny_atomic_store_rel(&slab->owner_tid, 0);
}
/**
* tiny_owner_check - Check if self owns slab
*
* Returns: 1 if self owns, 0 otherwise
*/
static inline int tiny_owner_check(TinySlab* slab, uint32_t self_tid) {
if (__builtin_expect(!slab, 0)) return 0;
return tiny_atomic_load_acq(&slab->owner_tid) == self_tid;
}
#endif // HAKMEM_TINY_OWNER_INC_H
```
---
## Testing Framework
### Unit Test Template
```c
// tests/test_tiny_<component>.c
#include <assert.h>
#include "hakmem.h"
#include "tiny_atomic.h"
#include "tiny_alloc_fast.inc.h"
#include "tiny_free_fast.inc.h"
static void test_<function>() {
// Setup
// Action
// Assert
printf("✅ test_<function> passed\n");
}
int main() {
test_<function>();
// ... more tests
printf("\n✨ All tests passed!\n");
return 0;
}
```
### Integration Test
```c
// tests/test_tiny_alloc_free_cycle.c
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include "hakmem.h"
void test_alloc_free_single_thread() {
void* ptrs[100];
for (int i = 0; i < 100; i++) {
ptrs[i] = hak_tiny_alloc(16);
assert(ptrs[i] != NULL);
}
for (int i = 0; i < 100; i++) {
hak_tiny_free(ptrs[i]);
}
printf("✅ test_alloc_free_single_thread passed\n");
}
void test_alloc_free_cross_thread() {
void* ptrs[100];
// Thread A: allocate (allocator_thread fills ptrs[0..99])
pthread_t tid;
pthread_create(&tid, NULL, allocator_thread, ptrs);
pthread_join(tid, NULL); // Wait until all allocations are done and visible
// Main thread: free (cross-thread → exercises the remote queue)
for (int i = 0; i < 100; i++) {
hak_tiny_free(ptrs[i]);
}
printf("✅ test_alloc_free_cross_thread passed\n");
}
```
---
## Performance Validation
### Assembly Check (fast path)
```bash
# Compile with -S to generate assembly
gcc -S -O3 -c core/hakmem_tiny.c -o /tmp/tiny.s
# Count instructions in fast path
grep -A20 "tiny_alloc_fast_pop:" /tmp/tiny.s | wc -l
# Expected: <= 8 instructions (3-4 ideal)
# __builtin_expect leaves no textual marker in the .s file; verify branch layout
# by eye (the cache-hit path should fall through, the miss path should jump away),
# or measure mispredicts at runtime:
perf stat -e branches,branch-misses ./larson_hakmem 16 1 1000 1000 0
```
### Benchmark (larson)
```bash
# Baseline (binary built before the refactor)
./larson_hakmem 16 1 1000 1000 0
# With the new fast path (same arguments, rebuilt binary)
./larson_hakmem 16 1 1000 1000 0
# Expected improvement: +10-15% throughput
```
---
## Compilation & Integration
### Makefile Changes
```makefile
# Add new files to dependencies
TINY_HEADERS = \
core/tiny_atomic.h \
core/tiny_alloc_fast.inc.h \
core/tiny_free_fast.inc.h \
core/tiny_owner.inc.h \
core/tiny_remote_queue.inc.h
# Rebuild if any header changes
libhakmem.so: $(TINY_HEADERS) core/hakmem_tiny.c
```
### Include Order (hakmem_tiny.c)
```c
// At the top of hakmem_tiny.c, after hakmem_tiny_config.h:
// ============================================================
// LAYER 0: Atomic + Ownership (lowest)
// ============================================================
#include "tiny_atomic.h"
#include "tiny_owner.inc.h"
#include "slab_handle.h"
// ... rest of includes
```
---
## Rollback Plan
If performance regresses or compilation fails:
1. **Keep old files**: hakmem_tiny_free.inc is not deleted, only refactored
2. **Git revert**: Can revert specific commits per Box
3. **Feature flags**: Add HAKMEM_TINY_NEW_FAST_PATH=0 to disable new code path
4. **Benchmark first**: Always run larson before and after each change
---
## Success Metrics
### Performance
- [ ] Fast path: 3-4 instructions (assembly review)
- [ ] Throughput: +10-15% on 16-64B allocations
- [ ] Cache hit rate: >80%
### Code Quality
- [ ] All files <= 500 lines
- [ ] Zero cyclic dependencies (verified by include analysis)
- [ ] No compilation warnings
### Testing
- [ ] Unit tests: 100% pass
- [ ] Integration tests: 100% pass
- [ ] Larson benchmark: baseline + 10-15%
---
## Contact & Questions
Refer to REFACTOR_PLAN.md for high-level strategy and timeline.
For specific implementation details, see the corresponding .inc.h files.