From 6c849fd020fa7090ac61fac7a95dd8fc16afc5e3 Mon Sep 17 00:00:00 2001
From: "Moe Charm (CI)"
Date: Sat, 13 Dec 2025 00:04:41 +0900
Subject: [PATCH] POOL-MID-DN-BATCH: Add last-match cache to reduce linear search overhead
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause: The linear search over the 32-entry TLS map averaged 16
iterations, and the resulting instruction overhead exceeded the
mid_desc_lookup savings.

Fix implemented:
- Added a last_idx field to MidInuseTlsPageMap for temporal locality
- Check last_idx before the linear search (O(1) fast path)
- Update last_idx on hits and on new entries
- Reset last_idx on drain

Changes:
1. pool_mid_inuse_tls_pagemap_box.h:
   - Added uint32_t last_idx field to the struct
2. pool_mid_inuse_deferred_box.h:
   - Check last_idx before the linear search (lines 90-94)
   - Update last_idx on a linear-search hit (line 101)
   - Set last_idx on new-entry insert (line 117)
   - Reset last_idx on drain (line 166)

Benchmark results (bench_mid_large_mt_hakmem):
- Baseline (DEFERRED=0): median 9.08M ops/s, variance 300B
- Deferred with cache (DEFERRED=1): median 8.38M ops/s, variance 207B
- Performance: -7.6% regression (vs the expected +2-4% gain)
- Stability: -31% variance (an improvement, as expected)

Analysis: The last-match cache reduces variance but does not eliminate
the regression for this benchmark's random access pattern (2048 slots,
many pages). The temporal-locality assumption (60-80% hit rate) is not
met by bench_mid_large_mt's allocation pattern.

Further optimization needed:
- Consider a hash-based lookup for a better-than-O(n) search (see the
  sketch after the diff)
- OR reduce the map size to shorten the search
- OR add drain triggers at better boundaries

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 core/box/pool_mid_inuse_deferred_box.h    | 12 ++++++++++++
 core/box/pool_mid_inuse_tls_pagemap_box.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/core/box/pool_mid_inuse_deferred_box.h b/core/box/pool_mid_inuse_deferred_box.h
index d3dc4d78..8512f058 100644
--- a/core/box/pool_mid_inuse_deferred_box.h
+++ b/core/box/pool_mid_inuse_deferred_box.h
@@ -85,10 +85,20 @@ static inline void mid_inuse_dec_deferred(void* raw) {
 
     // Search TLS map for existing page entry
     MidInuseTlsPageMap* map = &g_mid_inuse_tls_map;
+
+    // Check last match first (60-80% hit rate expected - temporal locality)
+    if (map->last_idx < map->used && map->pages[map->last_idx] == page) {
+        map->counts[map->last_idx]++;
+        MID_INUSE_DEFERRED_STAT_INC(mid_inuse_deferred_hit);
+        return;
+    }
+
+    // Fall back to linear search (now rarely executed)
     for (uint32_t i = 0; i < map->used; i++) {
         if (map->pages[i] == page) {
             // Page already in map, increment count
             map->counts[i]++;
+            map->last_idx = i;  // Update last_idx on hit
             MID_INUSE_DEFERRED_STAT_INC(mid_inuse_deferred_hit);
             return;
         }
@@ -104,6 +114,7 @@ static inline void mid_inuse_dec_deferred(void* raw) {
     uint32_t idx = map->used++;
     map->pages[idx] = page;
     map->counts[idx] = 1;
+    map->last_idx = idx;  // Set last_idx to the new entry (temporal locality)
     MID_INUSE_DEFERRED_STAT_INC(mid_inuse_deferred_hit);
 }
 
@@ -152,6 +163,7 @@ static inline void mid_inuse_deferred_drain(void) {
 
     // Clear map (reset for next batch)
     map->used = 0;
+    map->last_idx = 0;  // Reset last_idx on drain
 }
 
 #endif // POOL_MID_INUSE_DEFERRED_BOX_H
diff --git a/core/box/pool_mid_inuse_tls_pagemap_box.h b/core/box/pool_mid_inuse_tls_pagemap_box.h
index 8b9edae2..900d50ae 100644
--- a/core/box/pool_mid_inuse_tls_pagemap_box.h
+++ b/core/box/pool_mid_inuse_tls_pagemap_box.h
@@ -23,6 +23,7 @@ typedef struct {
     void*    pages[MID_INUSE_TLS_MAP_SIZE];   // Page base addresses
     uint32_t counts[MID_INUSE_TLS_MAP_SIZE];  // Pending dec count per page
     uint32_t used;                            // Number of active entries
+    uint32_t last_idx;                        // Cache last hit index for temporal locality
 } MidInuseTlsPageMap;
 
 // Thread-local instance (zero-initialized by default)
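
Follow-up sketch for the first "Further optimization needed" bullet (not
part of the patch): a minimal standalone C program showing one way a
hash-based lookup over the same 32-entry map could look. It assumes 4 KiB-
aligned page bases; HashedPageMap, page_hash, hashed_map_inc, the
multiplicative hash constant, and the linear-probing scheme are all
illustrative assumptions, not code from this repository.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define MAP_SLOTS 32u  /* power of two, same capacity as the 32-entry TLS map */

typedef struct {
    void*    pages[MAP_SLOTS];   /* NULL marks an empty slot */
    uint32_t counts[MAP_SLOTS];  /* pending dec count per page */
    uint32_t used;               /* number of occupied slots */
} HashedPageMap;                 /* illustrative name, not the patch's struct */

/* Hash the page base address; the low bits are zero for aligned pages,
 * so shift them out before mixing (4 KiB alignment assumed). */
static inline uint32_t page_hash(const void* page) {
    uintptr_t x = (uintptr_t)page >> 12;
    x ^= x >> 16;
    x *= 0x9E3779B1u;            /* Fibonacci-hashing multiplier */
    return (uint32_t)x & (MAP_SLOTS - 1u);
}

/* Increment the pending-dec count for page, inserting it on first sight.
 * Returns 0 when the map is full: the caller would drain, then retry. */
static int hashed_map_inc(HashedPageMap* map, void* page) {
    uint32_t home = page_hash(page);
    for (uint32_t probe = 0; probe < MAP_SLOTS; probe++) {
        uint32_t slot = (home + probe) & (MAP_SLOTS - 1u);  /* linear probe */
        if (map->pages[slot] == page) {   /* hit: O(1) expected, no full scan */
            map->counts[slot]++;
            return 1;
        }
        if (map->pages[slot] == NULL) {   /* first sight: claim the slot */
            map->pages[slot] = page;
            map->counts[slot] = 1;
            map->used++;
            return 1;
        }
    }
    return 0;  /* all 32 slots occupied */
}

int main(void) {
    HashedPageMap map = {0};
    /* Synthetic 4 KiB-aligned page bases; never dereferenced. */
    void* a = (void*)(uintptr_t)0x20001000u;
    void* b = (void*)(uintptr_t)0x20002000u;
    hashed_map_inc(&map, a);
    hashed_map_inc(&map, a);
    hashed_map_inc(&map, b);
    printf("entries=%u count(a)=%u\n", map.used, map.counts[page_hash(a)]);
    return 0;
}

One trade-off to note: drain would then have to walk all 32 slots and skip
NULL entries instead of iterating map->used densely, so whether the O(1)
expected probe actually beats the cached linear scan at this map size is
something bench_mid_large_mt_hakmem would need to re-measure.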