diff --git a/core/hakmem_super_registry.c b/core/hakmem_super_registry.c
index f98c1e4e..897fe93b 100644
--- a/core/hakmem_super_registry.c
+++ b/core/hakmem_super_registry.c
@@ -8,13 +8,21 @@ SuperRegEntry g_super_reg[SUPER_REG_SIZE];
 pthread_mutex_t g_super_reg_lock = PTHREAD_MUTEX_INITIALIZER;
 int g_super_reg_initialized = 0;
 
+// Per-class registry storage (Phase 6: Registry Optimization)
+SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
+int g_super_reg_class_size[TINY_NUM_CLASSES];
+
 // Initialize registry (call once at startup)
 void hak_super_registry_init(void) {
     if (g_super_reg_initialized) return;
 
-    // Zero-initialize all entries
+    // Zero-initialize all entries (hash table)
     memset(g_super_reg, 0, sizeof(g_super_reg));
 
+    // Zero-initialize per-class registry (Phase 6: Registry Optimization)
+    memset(g_super_reg_by_class, 0, sizeof(g_super_reg_by_class));
+    memset(g_super_reg_class_size, 0, sizeof(g_super_reg_class_size));
+
     // Memory fence to ensure initialization is visible to all threads
     atomic_thread_fence(memory_order_release);
 
@@ -25,6 +33,7 @@ void hak_super_registry_init(void) {
 // CRITICAL: Call AFTER SuperSlab is fully initialized
 // Publish order: ss init → release fence → base write
 // Phase 8.3: ACE - lg_size aware registration
+// Phase 6: Registry Optimization - Also add to per-class registry for fast refill scan
 int hak_super_register(uintptr_t base, SuperSlab* ss) {
     if (!g_super_reg_initialized) {
         hak_super_registry_init();
@@ -35,7 +44,8 @@ int hak_super_register(uintptr_t base, SuperSlab* ss) {
     int lg = ss->lg_size;  // Phase 8.3: Get lg_size from SuperSlab
     int h = hak_super_hash(base, lg);
 
-    // Linear probing to find empty slot
+    // Step 1: Register in hash table (for address → SuperSlab lookup)
+    int hash_registered = 0;
     for (int i = 0; i < SUPER_MAX_PROBE; i++) {
         SuperRegEntry* e = &g_super_reg[(h + i) & SUPER_REG_MASK];
 
@@ -52,33 +62,67 @@ int hak_super_register(uintptr_t base, SuperSlab* ss) {
             atomic_store_explicit((_Atomic uintptr_t*)&e->base, base,
                                  memory_order_release);
 
-            pthread_mutex_unlock(&g_super_reg_lock);
-            return 1;
+            hash_registered = 1;
+            break;
         }
 
         if (e->base == base && e->lg_size == lg) {
             // Already registered (duplicate registration)
-            pthread_mutex_unlock(&g_super_reg_lock);
-            return 1;
+            hash_registered = 1;
+            break;
+        }
+    }
+
+    if (!hash_registered) {
+        // Hash table full (probing limit reached)
+        pthread_mutex_unlock(&g_super_reg_lock);
+        fprintf(stderr, "HAKMEM: SuperSlab registry full! Increase SUPER_REG_SIZE\n");
+        return 0;
+    }
+
+    // Step 2: Register in per-class registry (Phase 6: Registry Optimization)
+    // Purpose: Enable O(class_size) refill scan instead of O(262K)
+    int class_idx = ss->size_class;
+    if (class_idx >= 0 && class_idx < TINY_NUM_CLASSES) {
+        int size = g_super_reg_class_size[class_idx];
+        if (size < SUPER_REG_PER_CLASS) {
+            // Check for duplicate registration
+            int already_in_class = 0;
+            for (int i = 0; i < size; i++) {
+                if (g_super_reg_by_class[class_idx][i] == ss) {
+                    already_in_class = 1;
+                    break;
+                }
+            }
+
+            if (!already_in_class) {
+                // Add to per-class registry
+                g_super_reg_by_class[class_idx][size] = ss;
+                g_super_reg_class_size[class_idx]++;
+            }
+        } else {
+            // Per-class registry full (should be rare)
+            fprintf(stderr, "HAKMEM: Per-class registry full for class %d! "
+                           "Increase SUPER_REG_PER_CLASS\n", class_idx);
         }
     }
 
-    // Registry full (probing limit reached)
     pthread_mutex_unlock(&g_super_reg_lock);
-    fprintf(stderr, "HAKMEM: SuperSlab registry full! Increase SUPER_REG_SIZE\n");
-    return 0;
+    return 1;
 }
 
 // Unregister SuperSlab (mutex-protected)
 // CRITICAL: Call BEFORE munmap to prevent reader segfault
 // Unpublish order: base = 0 (release) → munmap outside this function
 // Phase 8.3: ACE - Try both lg_sizes (we don't know which one was used)
+// Phase 6: Registry Optimization - Also remove from per-class registry
 void hak_super_unregister(uintptr_t base) {
     if (!g_super_reg_initialized) return;
 
     pthread_mutex_lock(&g_super_reg_lock);
 
-    // Try both 1MB (20) and 2MB (21) alignments
+    // Step 1: Find and remove from hash table
+    SuperSlab* ss = NULL;  // Save SuperSlab pointer for per-class removal
     for (int lg = 20; lg <= 21; lg++) {
         int h = hak_super_hash(base, lg);
 
@@ -88,6 +132,9 @@ void hak_super_unregister(uintptr_t base) {
 
             if (e->base == base && e->lg_size == lg) {
                 // Found entry to remove
+                // Save SuperSlab pointer BEFORE clearing (for per-class removal)
+                ss = atomic_load_explicit(&e->ss, memory_order_acquire);
+
                 // Step 1: Clear SuperSlab pointer (atomic, prevents TOCTOU race)
                 atomic_store_explicit(&e->ss, NULL, memory_order_release);
 
@@ -98,8 +145,8 @@ void hak_super_unregister(uintptr_t base) {
                 // Step 3: Clear lg_size (optional cleanup)
                 e->lg_size = 0;
 
-                pthread_mutex_unlock(&g_super_reg_lock);
-                return;
+                // Found in hash table, continue to per-class removal
+                goto hash_removed;
             }
 
             if (e->base == 0) {
@@ -109,6 +156,32 @@ void hak_super_unregister(uintptr_t base) {
         }
     }
 
+hash_removed:
+    // Step 2: Remove from per-class registry (Phase 6: Registry Optimization)
+    if (ss && ss->magic == SUPERSLAB_MAGIC) {
+        int class_idx = ss->size_class;
+        if (class_idx >= 0 && class_idx < TINY_NUM_CLASSES) {
+            int size = g_super_reg_class_size[class_idx];
+
+            // Linear scan to find and remove SuperSlab from per-class array
+            for (int i = 0; i < size; i++) {
+                if (g_super_reg_by_class[class_idx][i] == ss) {
+                    // Found: Remove by shifting last element to this position
+                    g_super_reg_class_size[class_idx]--;
+                    int new_size = g_super_reg_class_size[class_idx];
+
+                    // Swap with last element (O(1) removal, order doesn't matter)
+                    if (i != new_size) {
+                        g_super_reg_by_class[class_idx][i] =
+                            g_super_reg_by_class[class_idx][new_size];
+                    }
+                    g_super_reg_by_class[class_idx][new_size] = NULL;
+                    break;
+                }
+            }
+        }
+    }
+
     pthread_mutex_unlock(&g_super_reg_lock);
     // Not found is not an error (could be duplicate unregister)
 }
diff --git a/core/hakmem_super_registry.h b/core/hakmem_super_registry.h
index d38bee8e..5f50b1c3 100644
--- a/core/hakmem_super_registry.h
+++ b/core/hakmem_super_registry.h
@@ -27,6 +27,13 @@
 #define SUPER_REG_MASK      (SUPER_REG_SIZE - 1)
 #define SUPER_MAX_PROBE     8      // Linear probing limit
 
+// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
+// Purpose: Avoid 262K linear scan by indexing SuperSlabs by size class
+// - Each class has 16384 slots (total: 8 classes × 16384 = 128K entries)
+// - Refill scan: O(class_size) instead of O(262144)
+// - Expected speedup: +200-300% for Larson (2.59M → 7.8M ops/s)
+#define SUPER_REG_PER_CLASS 16384  // Per-class registry capacity (increased for high-churn workloads)
+
 // Registry entry: base address → SuperSlab pointer mapping
 typedef struct {
     uintptr_t base;           // Aligned base address (1MB or 2MB, 0 = empty slot)
@@ -40,6 +47,17 @@ extern SuperRegEntry g_super_reg[SUPER_REG_SIZE];
 extern pthread_mutex_t g_super_reg_lock;
 extern int g_super_reg_initialized;
 
+// Per-class registry for fast refill scan (Phase 6: Registry Optimization)
+// Note: TINY_NUM_CLASSES is defined in hakmem_tiny.h (typically 8 for 16B-1KB)
+// - g_super_reg_by_class[class][i] = SuperSlab pointer (NULL = empty slot)
+// - g_super_reg_class_size[class] = number of active SuperSlabs for this class
+// - Protected by g_super_reg_lock (shared with main registry)
+#ifndef TINY_NUM_CLASSES
+#define TINY_NUM_CLASSES 8  // Fallback if hakmem_tiny.h not included yet
+#endif
+extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
+extern int g_super_reg_class_size[TINY_NUM_CLASSES];
+
 // Initialize registry (call once at startup)
 void hak_super_registry_init(void);
 
diff --git a/core/hakmem_tiny_free.inc b/core/hakmem_tiny_free.inc
index 6d9fce92..6553601e 100644
--- a/core/hakmem_tiny_free.inc
+++ b/core/hakmem_tiny_free.inc
@@ -910,37 +910,39 @@ static SuperSlab* superslab_refill(int class_idx) {
 
     // Try to adopt a partial SuperSlab from registry (one-shot, cheap scan)
     // This reduces pressure to allocate new SS when other threads freed blocks.
+    // Phase 6: Registry Optimization - Use per-class registry for O(class_size) scan
     if (!tls->ss) {
-        // Best-effort: scan a small window of registry for our class
-        extern SuperRegEntry g_super_reg[];
-        int scanned = 0;
+        // Phase 6: Use per-class registry (262K → ~10-100 entries per class!)
+        extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
+        extern int g_super_reg_class_size[TINY_NUM_CLASSES];
+
         const int scan_max = tiny_reg_scan_max();
-        for (int i = 0; i < SUPER_REG_SIZE && scanned < scan_max; i++) {
-            SuperRegEntry* e = &g_super_reg[i];
-            uintptr_t base = atomic_load_explicit((_Atomic uintptr_t*)&e->base, memory_order_acquire);
-            if (base == 0) continue;
-            SuperSlab* ss = atomic_load_explicit(&e->ss, memory_order_acquire);
+        int reg_size = g_super_reg_class_size[class_idx];
+        int scan_limit = (scan_max < reg_size) ? scan_max : reg_size;
+
+        for (int i = 0; i < scan_limit; i++) {
+            SuperSlab* ss = g_super_reg_by_class[class_idx][i];
             if (!ss || ss->magic != SUPERSLAB_MAGIC) continue;
-            if ((int)ss->size_class != class_idx) { scanned++; continue; }
+            // Note: class_idx check is not needed (per-class registry!)
+
             // Pick first slab with freelist (Box 4: 所有権取得 + remote check)
-        int reg_cap = ss_slabs_capacity(ss);
-        uint32_t self_tid = tiny_self_u32();
-        for (int s = 0; s < reg_cap; s++) {
-            if (ss->slabs[s].freelist) {
-                SlabHandle h = slab_try_acquire(ss, s, self_tid);
-                if (slab_is_valid(&h)) {
-                    slab_drain_remote_full(&h);
-                    if (slab_is_safe_to_bind(&h)) {
-                        tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx);
-                        tiny_tls_bind_slab(tls, ss, s);
-                        return ss;
+            int reg_cap = ss_slabs_capacity(ss);
+            uint32_t self_tid = tiny_self_u32();
+            for (int s = 0; s < reg_cap; s++) {
+                if (ss->slabs[s].freelist) {
+                    SlabHandle h = slab_try_acquire(ss, s, self_tid);
+                    if (slab_is_valid(&h)) {
+                        slab_drain_remote_full(&h);
+                        if (slab_is_safe_to_bind(&h)) {
+                            tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx);
+                            tiny_tls_bind_slab(tls, ss, s);
+                            return ss;
+                        }
+                        slab_release(&h);
                     }
-                    slab_release(&h);
                 }
             }
         }
-            scanned++;
-        }
     }
 
     // Must-adopt-before-mmap gate: attempt sticky/hot/bench/mailbox/registry small-window
diff --git a/core/tiny_mmap_gate.h b/core/tiny_mmap_gate.h
index d5f8dd91..4b89d5d2 100644
--- a/core/tiny_mmap_gate.h
+++ b/core/tiny_mmap_gate.h
@@ -40,17 +40,22 @@ static inline SuperSlab* tiny_must_adopt_gate(int class_idx, TinyTLSSlab* tls) {
     if (ss) return ss;
 
     // Registry small-window adopt (one pass, limited scan)
+    // Phase 6: Registry Optimization - Use per-class registry for O(class_size) scan
     {
+        // Phase 6: Use per-class registry (262K → ~10-100 entries per class!)
+        extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
+        extern int g_super_reg_class_size[TINY_NUM_CLASSES];
+
         uint32_t self_tid = tiny_self_u32();
-        int scanned = 0;
         const int scan_max = tiny_reg_scan_max();
-        for (int i = 0; i < SUPER_REG_SIZE && scanned < scan_max; i++) {
-            SuperRegEntry* e = &g_super_reg[i];
-            uintptr_t base = atomic_load_explicit((_Atomic uintptr_t*)&e->base, memory_order_acquire);
-            if (base == 0) continue;
-            SuperSlab* cand = atomic_load_explicit(&e->ss, memory_order_acquire);
+        int reg_size = g_super_reg_class_size[class_idx];
+        int scan_limit = (scan_max < reg_size) ? scan_max : reg_size;
+
+        for (int i = 0; i < scan_limit; i++) {
+            SuperSlab* cand = g_super_reg_by_class[class_idx][i];
             if (!cand || cand->magic != SUPERSLAB_MAGIC) continue;
-            if ((int)cand->size_class != class_idx) { scanned++; continue; }
+            // Note: class_idx check is not needed (per-class registry!)
+
             int cap = ss_slabs_capacity(cand);
             for (int s = 0; s < cap; s++) {
                 // Box: Try to acquire ownership
@@ -67,7 +72,6 @@ static inline SuperSlab* tiny_must_adopt_gate(int class_idx, TinyTLSSlab* tls) {
                     slab_release(&h);
                 }
             }
-            scanned++;
         }
     }
     return NULL;