From 8d06aa99f4074e30c9f5f2367ae22bb65f3f1a20 Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Tue, 12 Aug 2025 12:42:16 -0700
Subject: [PATCH] feat: add comprehensive hang detection for pytest CI debugging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add Python faulthandler integration with signal-triggered stack dumps
- Implement periodic stack dumps every 5 minutes while pytest is running
- Add external process monitoring with SIGUSR1 signal on hang detection
- Use debug_pytest.py wrapper to run pytest in-process so the dumps capture
  the exact hang location in C++ cleanup
- Enhance CPU stability monitoring to trigger precise stack traces

This addresses the persistent pytest hanging issue in Ubuntu 22.04 CI by
providing detailed stack traces to identify the exact code location where
the hang occurs during test cleanup phase.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
Note: line breaks and indentation in this patch were reconstructed from a
whitespace-collapsed copy, and the debug_pytest.py hunk was revised
afterwards, so the blob ids on the `index` line predate that revision.
If the context lines do not match the target file exactly, apply with
`git apply --ignore-whitespace`.

 .github/workflows/build-reusable.yml | 47 +++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml
index e6a381e..8c0f31d 100644
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -253,6 +253,49 @@ jobs:
           if [[ "${{ matrix.os }}" == "ubuntu-22.04" ]]; then
             echo "🔍 [HANG DEBUG] Ubuntu 22.04 detected - enabling enhanced process monitoring"
 
+            # Create a Python wrapper that runs pytest in-process with hang diagnostics.
+            # The heredoc body and its EOF terminator must start at column 0 of the script.
+            cat > debug_pytest.py << 'EOF'
+          """Run pytest in-process with faulthandler-based hang diagnostics."""
+          import faulthandler
+          import signal
+          import sys
+
+          DUMP_INTERVAL_SECONDS = 300
+
+
+          def setup_hang_detection():
+              """Install C-level stack dumpers for every thread in this process.
+
+              faulthandler's handlers are implemented in C, so they still fire
+              when the main thread is blocked inside native (C/C++) code, where a
+              Python-level signal.signal() callback or Python thread may never run.
+              """
+              # Dump all threads on fatal errors (SIGSEGV, SIGABRT, ...).
+              faulthandler.enable(file=sys.stderr, all_threads=True)
+
+              # SIGUSR1/SIGUSR2 let the external monitor request a dump on demand.
+              for signum in (signal.SIGUSR1, signal.SIGUSR2):
+                  faulthandler.register(signum, file=sys.stderr, all_threads=True)
+
+              # Watchdog: dump every DUMP_INTERVAL_SECONDS until the interpreter exits.
+              faulthandler.dump_traceback_later(
+                  DUMP_INTERVAL_SECONDS, repeat=True, file=sys.stderr
+              )
+
+
+          if __name__ == "__main__":
+              setup_hang_detection()
+              # Run pytest in THIS process: handlers installed here are not
+              # inherited by a child started via subprocess, so a child pytest
+              # would never be dumped and SIGUSR1 would simply terminate it.
+              import pytest
+
+              # Disable pytest's own faulthandler plugin so it cannot reset or
+              # cancel the handlers configured above.
+              sys.exit(pytest.main(["-p", "no:faulthandler", *sys.argv[1:]]))
+          EOF
+
             # Pre-test state
             echo "📊 [HANG DEBUG] Pre-test process state:"
             ps aux | grep -E "(python|embedding|zmq)" | grep -v grep || echo "No relevant processes"
@@ -307,6 +350,8 @@ jobs:
                   stable_count=$((stable_count + 1))
                   if [ $stable_count -ge 6 ]; then  # 30 seconds of low CPU
                     echo "⚠️ [EXTERNAL] $(date): Process appears hung - CPU stable at ${current_cpu}% for 30s"
+                    echo "🔍 [EXTERNAL] $(date): Sending SIGUSR1 to dump stack traces..."
+                    kill -USR1 $pytest_pid 2>/dev/null || echo "Failed to send signal"
                   fi
                 else
                   stable_count=0
@@ -341,7 +386,7 @@ jobs:
             timeout --preserve-status --signal=TERM --kill-after=30 600 bash -c '
               echo "▶️ [HANG DEBUG] Pytest starting at: $(date)"
               # Force unbuffered output and immediate flush
-              stdbuf -o0 -e0 pytest tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
+              stdbuf -o0 -e0 python debug_pytest.py tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
                 printf "%s [PYTEST] %s\n" "$(date +"%H:%M:%S")" "$line"
                 # Force flush after each line
                 sync