feat: add comprehensive hang detection for pytest CI debugging

- Add Python faulthandler integration with signal-triggered stack dumps
- Implement periodic stack dumps at the 5-minute and 10-minute marks
- Add external process monitoring with SIGUSR1 signal on hang detection
- Use debug_pytest.py wrapper to capture exact hang location in C++ cleanup
- Enhance CPU stability monitoring to trigger precise stack traces

This addresses the persistent pytest hanging issue in Ubuntu 22.04 CI by
providing detailed stack traces that identify the exact code location where
the hang occurs during the test cleanup phase.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Andy Lee
2025-08-12 12:42:16 -07:00
parent 2d8a1ac328
commit 8d06aa99f4

View File

@@ -253,6 +253,55 @@ jobs:
if [[ "${{ matrix.os }}" == "ubuntu-22.04" ]]; then
echo "🔍 [HANG DEBUG] Ubuntu 22.04 detected - enabling enhanced process monitoring"
# Create Python script to inject stack trace dumping into pytest
cat > debug_pytest.py << 'EOF'
import signal
import faulthandler
import threading
import time
import sys
import traceback
def setup_hang_detection(dump_interval=300):
    """Install faulthandler-based stack dumping in the calling process.

    Three mechanisms are armed, all of which write to ``sys.stderr``:

    * fatal-signal tracebacks via ``faulthandler.enable()``;
    * an on-demand dump of every thread when the process receives
      ``SIGUSR1`` or ``SIGUSR2`` (e.g. ``kill -USR1 <pid>``);
    * a repeating dump of every thread each ``dump_interval`` seconds.

    Args:
        dump_interval: Seconds between periodic dumps. Defaults to 300, so
            dumps land at the 5-minute and 10-minute marks of a run.

    Raises:
        ValueError: If ``dump_interval`` is not strictly positive (raised by
            ``faulthandler.dump_traceback_later``).
    """
    # Dump tracebacks automatically on fatal signals (SIGSEGV, SIGABRT, ...).
    faulthandler.enable()

    # Use faulthandler.register() rather than signal.signal(): a handler
    # installed with signal.signal() is only invoked once the main thread is
    # back to executing Python bytecode, so it never fires while the process
    # is stuck inside native (C/C++) code -- which is precisely the hang this
    # script is meant to diagnose. faulthandler's handler writes the
    # traceback from the low-level signal handler itself.
    # chain is left at its default (False) so the signal's default action,
    # which would terminate the process, is not invoked afterwards.
    for signum in (signal.SIGUSR1, signal.SIGUSR2):
        faulthandler.register(signum, all_threads=True)

    # The periodic dump is driven by faulthandler's own watchdog thread
    # instead of a Python-level thread, so it does not rely on the
    # interpreter being able to schedule Python code during the hang.
    faulthandler.dump_traceback_later(dump_interval, repeat=True)
if __name__ == "__main__":
    setup_hang_detection()
    # Run pytest inside THIS interpreter instead of spawning a child with
    # subprocess.run(). The hang-detection hooks installed above belong to
    # the current process only; a separate pytest child would have none of
    # them, so every dump would just show this wrapper waiting on the child,
    # and a SIGUSR1 delivered to the child would hit the default action and
    # terminate it.
    import runpy

    # Equivalent to `python -m pytest <args>`: pytest reads its arguments
    # from sys.argv[1:], and its __main__ module exits via SystemExit, which
    # propagates pytest's exit code as this script's exit status.
    runpy.run_module("pytest", run_name="__main__", alter_sys=True)
EOF
# Pre-test state
echo "📊 [HANG DEBUG] Pre-test process state:"
ps aux | grep -E "(python|embedding|zmq)" | grep -v grep || echo "No relevant processes"
@@ -307,6 +356,8 @@ jobs:
stable_count=$((stable_count + 1))
if [ $stable_count -ge 6 ]; then # 30 seconds of low CPU
echo "⚠️ [EXTERNAL] $(date): Process appears hung - CPU stable at ${current_cpu}% for 30s"
echo "🔍 [EXTERNAL] $(date): Sending SIGUSR1 to dump stack traces..."
kill -USR1 $pytest_pid 2>/dev/null || echo "Failed to send signal"
fi
else
stable_count=0
@@ -341,7 +392,7 @@ jobs:
timeout --preserve-status --signal=TERM --kill-after=30 600 bash -c '
echo "▶️ [HANG DEBUG] Pytest starting at: $(date)"
# Force unbuffered output and immediate flush
stdbuf -o0 -e0 pytest tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
stdbuf -o0 -e0 python debug_pytest.py tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
printf "%s [PYTEST] %s\n" "$(date +"%H:%M:%S")" "$line"
# Force flush after each line
sync