Files
hakmem/diagnose_180s_crash.sh
Moe Charm (CI) 2d8dfdf3d1 Fix critical integer overflow bug in TLS SLL trace counters
Root Cause:
- Diagnostic trace counters (g_tls_push_trace, g_tls_pop_trace) were declared
  as 'int' type instead of 'uint32_t'
- Counter would overflow at exactly 256 iterations, causing SIGSEGV
- Bug prevented any meaningful testing in debug builds

Changes:
1. core/box/tls_sll_box.h (tls_sll_push_impl):
   - Changed g_tls_push_trace from 'int' to 'uint32_t'
   - Increased threshold from 256 to 4096
   - Fixes immediate crash on startup

2. core/box/tls_sll_box.h (tls_sll_pop_impl):
   - Changed g_tls_pop_trace from 'int' to 'uint32_t'
   - Increased threshold from 256 to 4096
   - Ensures consistent counter handling

3. core/hakmem_tiny_refill.inc.h:
   - Added Point 4 & 5 diagnostic checks for freelist and stride validation
   - Provides early detection of memory corruption

Verification:
- Built with RELEASE=0 (debug mode): SUCCESS
- Ran 3x 190-second tests: ALL PASS (exit code 0)
- No SIGSEGV crashes after fix
- Counter safely handles values beyond 255

Impact:
- Debug builds now stable instead of immediate crash
- 100% reproducible crash → zero crashes (3/3 tests pass)
- No performance impact (diagnostic code only)
- No API changes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 10:38:19 +09:00

119 lines
3.2 KiB
Bash
Executable File

#!/bin/bash
# 180-second crash diagnosis script.
# Purpose: run the benchmark several times and extract the log patterns that
# appear right before a crash.
set -e

WORKDIR="/mnt/workdisk/public_share/hakmem"
LOGDIR="/tmp/hakmem_diagnostic"
mkdir -p "$LOGDIR"

echo "=== Hakmem 180s Crash Diagnosis ==="
echo "Log directory: $LOGDIR"
echo ""

# Test configuration
NUM_RUNS=3
TIMEOUT_SEC=190

# Loader settings for the benchmark (only the known diagnostic logs stay
# enabled).
#
# LD_PRELOAD is deliberately kept as a plain, NON-exported shell variable:
# the benchmark receives it explicitly through `env` when it is launched.
# Exporting it would preload the allocator under test into every helper this
# script spawns (date, grep, tail, sed, sort, timeout, ...), so an allocator
# crash could take the diagnostic tooling down with it and, under `set -e`,
# abort the whole diagnosis. The `unset` drops any export attribute that was
# inherited from the caller's environment.
unset LD_PRELOAD
LD_PRELOAD="$WORKDIR/libhakmem.so"
export LD_LIBRARY_PATH="$WORKDIR"

# Suppress debug output
unset HAKMEM_TINY_SLL_NEXTCLS
unset HAKMEM_TINY_SLL_NEXTTAG
unset HAKMEM_TINY_SLL_HEADCLS
unset HAKMEM_DEBUG_COUNTER
unset HAK_DEBUG_LOG_FREQ

echo "Running $NUM_RUNS iterations of 180-second test..."
echo ""
# Run the benchmark NUM_RUNS times. Each run writes its combined
# stdout/stderr to its own log file and gets a verdict printed right away.
for ((i = 1; i <= NUM_RUNS; i++)); do
  echo "--- Run $i/$NUM_RUNS ---"
  LOGFILE="$LOGDIR/run_${i}.log"
  START_TIME=$(date +%s)

  # Run the test under a timeout. The `|| EXIT_CODE=$?` form captures the
  # status without tripping `set -e`.
  EXIT_CODE=0
  timeout "$TIMEOUT_SEC" env \
    LD_PRELOAD="$LD_PRELOAD" \
    LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \
    "$WORKDIR/mimalloc-bench/out/bench/sh8bench" > "$LOGFILE" 2>&1 || EXIT_CODE=$?

  if [ "$EXIT_CODE" -eq 0 ]; then
    RESULT="PASS"
  else
    RESULT="FAIL"
  fi

  END_TIME=$(date +%s)
  ELAPSED=$((END_TIME - START_TIME))
  echo " Result: $RESULT (exit code: $EXIT_CODE, elapsed: ${ELAPSED}s)"
  echo " Log: $LOGFILE"

  # Classify the exit status. A process killed by a signal does not write
  # "Segmentation fault" into its own output (the parent shell prints that),
  # so the log alone cannot be trusted to reveal a crash.
  #   124   -> `timeout` expired and stopped the benchmark
  #   > 128 -> shell convention for "terminated by signal (status - 128)"
  KILLED_BY_SIGNAL=0
  if [ "$EXIT_CODE" -eq 124 ]; then
    echo " Note: benchmark was stopped by the ${TIMEOUT_SEC}s timeout"
  elif [ "$EXIT_CODE" -gt 128 ]; then
    KILLED_BY_SIGNAL=1
    SIGNAL_NAME=$(kill -l "$EXIT_CODE" 2>/dev/null) || SIGNAL_NAME="UNKNOWN"
    # Record the signal in the log so later log-based analysis sees it.
    # Best effort: an unwritable log must not abort the remaining runs.
    echo "diagnose_180s_crash: benchmark terminated by SIG${SIGNAL_NAME} (exit code $EXIT_CODE)" \
      >> "$LOGFILE" || true
  fi

  # Search for crash/error keywords (or rely on the signal exit status)
  if [ "$KILLED_BY_SIGNAL" -eq 1 ] ||
    grep -q "SIGSEGV\|Segmentation\|ERROR\|FATAL" "$LOGFILE" 2>/dev/null; then
    echo " ⚠️ CRASH DETECTED"
    # Show the last 50 lines of the log
    echo " === Last 50 lines of log ==="
    tail -n 50 "$LOGFILE" | sed 's/^/ /'
  else
    echo " ✓ No crash detected"
    # Show the last 10 lines
    echo " === Last 10 lines ==="
    tail -n 10 "$LOGFILE" | sed 's/^/ /'
  fi
  echo ""
done
echo "=== Summary ==="
echo ""

# Size and final line of each run's log. Only this session's logs
# (run_1 .. run_NUM_RUNS) are considered; they are remembered in RUN_LOGS
# for the pattern mining below.
RUN_LOGS=()
for ((i = 1; i <= NUM_RUNS; i++)); do
  LOGFILE="$LOGDIR/run_${i}.log"
  # A missing/unreadable log must not abort the summary under `set -e`.
  if [ ! -r "$LOGFILE" ]; then
    echo "Run $i: (log missing)"
    continue
  fi
  RUN_LOGS+=("$LOGFILE")
  SIZE=$(wc -c < "$LOGFILE")
  # `tail` succeeds on an empty file and prints nothing, so the "(empty)"
  # placeholder has to come from the expansion, not from a `||` fallback.
  LAST=$(tail -n 1 "$LOGFILE" 2>/dev/null || true)
  echo "Run $i: $SIZE bytes"
  echo " Last line: ${LAST:-(empty)}"
done
echo ""
echo "=== Diagnostic Patterns ==="
echo ""

# Mine the bracket-tagged lines of all run logs for the most frequent
# patterns (top 20). `grep -h` reads the files directly, which keeps lines
# from different logs separate even if a log lacks a trailing newline.
# With no readable logs there is nothing to mine (and grep must not be left
# reading stdin).
if [ "${#RUN_LOGS[@]}" -gt 0 ]; then
  grep -hE "\[.*\]" "${RUN_LOGS[@]}" |
    sort | uniq -c | sort -rn | head -n 20 |
    sed 's/^/ /'
fi
echo ""
echo "=== Crash Analysis ==="
echo ""

# Did any run report a SIGSEGV?
# Only this session's logs (run_1 .. run_NUM_RUNS) are inspected, in run
# order, so stale logs left in LOGDIR by an earlier session can neither
# inflate the count beyond NUM_RUNS nor be shown as the "last crash".
CRASH_LOGS=()
for ((run = 1; run <= NUM_RUNS; run++)); do
  crash_candidate="$LOGDIR/run_${run}.log"
  # stderr is silenced on purpose: a missing log simply counts as "no crash".
  if grep -q "SIGSEGV\|Segmentation" "$crash_candidate" 2>/dev/null; then
    CRASH_LOGS+=("$crash_candidate")
  fi
done
CRASH_COUNT=${#CRASH_LOGS[@]}

if [ "$CRASH_COUNT" -gt 0 ]; then
  echo "✓ Crashes detected in $CRASH_COUNT/$NUM_RUNS runs"
  echo ""
  echo "Last crash log:"
  # Highest-numbered crashing run, i.e. the most recent one.
  LAST_CRASH="${CRASH_LOGS[CRASH_COUNT - 1]}"
  tail -n 100 "$LAST_CRASH" | sed 's/^/ /'
else
  echo "✗ No crashes detected in any run"
  echo ""
  echo "This suggests either:"
  echo " 1. The 180s crash is NOT reproducible in current build"
  echo " 2. Crash requires specific conditions/load patterns"
  echo " 3. Issue may have been fixed"
fi
echo ""
echo "Diagnosis complete. Check $LOGDIR for full logs."