From c1d39eead85d2f42b0ab79a6df846864da1af05f Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Tue, 12 Aug 2025 13:12:27 -0700
Subject: [PATCH] CI: move pytest hang-debug script into
 scripts/ci_debug_pytest.py; sort imports and apply ruff suggestion; update
 workflow to call the script

---
Reviewer notes (text in this section is ignored by `git am`):

- The line structure of this patch was restored from a copy that had been
  collapsed onto three lines. Python indentation follows from the syntax;
  the indentation of the workflow (YAML/shell) lines is reconstructed, not
  recovered -- confirm it against the repository before applying.
- The emoji in the echo/print strings were already mis-encoded in that copy
  and are left exactly as found.
- main() installs the signal handlers and the periodic-dump thread in the
  wrapper process and then runs pytest through subprocess.run(), so the
  dumps describe the wrapper, not the pytest child process. Consider
  running pytest in-process, or exporting PYTHONFAULTHANDLER=1 so the child
  enables faulthandler itself.

 .github/workflows/build-reusable.yml | 51 +------------------------
 scripts/ci_debug_pytest.py           | 56 ++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 49 deletions(-)
 create mode 100644 scripts/ci_debug_pytest.py

diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml
index 8c0f31d..947ee2f 100644
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -253,54 +253,7 @@ jobs:
           if [[ "${{ matrix.os }}" == "ubuntu-22.04" ]]; then
             echo "šŸ” [HANG DEBUG] Ubuntu 22.04 detected - enabling enhanced process monitoring"
 
-            # Create Python script to inject stack trace dumping into pytest
-            cat > debug_pytest.py << 'EOF'
-          import signal
-          import faulthandler
-          import threading
-          import time
-          import sys
-          import traceback
-
-          def setup_hang_detection():
-              """Setup signal handlers and faulthandler for hang detection"""
-              # Enable faulthandler for automatic stack dumps
-              faulthandler.enable()
-
-              def dump_all_stacks(signum, frame):
-                  print(f"\nšŸ”„ [HANG DEBUG] SIGNAL {signum} - DUMPING ALL THREAD STACKS:")
-                  faulthandler.dump_traceback()
-                  # Also dump current frames manually
-                  for thread_id, frame in sys._current_frames().items():
-                      print(f"\nšŸ“ Thread {thread_id}:")
-                      traceback.print_stack(frame)
-
-              def periodic_stack_dump():
-                  """Periodically dump stacks to catch hang location"""
-                  time.sleep(300)  # Wait 5 minutes
-                  print(f"\nā° [HANG DEBUG] Periodic stack dump at {time.time()}:")
-                  for thread_id, frame in sys._current_frames().items():
-                      print(f"\nšŸ“ Thread {thread_id}:")
-                      traceback.print_stack(frame)
-                  time.sleep(300)  # Wait another 5 minutes if still running
-                  print(f"\nāš ļø [HANG DEBUG] Final stack dump at {time.time()} (likely hanging):")
-                  faulthandler.dump_traceback()
-
-              # Register signal handlers for external debugging
-              signal.signal(signal.SIGUSR1, dump_all_stacks)
-              signal.signal(signal.SIGUSR2, dump_all_stacks)
-
-              # Start periodic dumping thread
-              dump_thread = threading.Thread(target=periodic_stack_dump, daemon=True)
-              dump_thread.start()
-
-          if __name__ == "__main__":
-              setup_hang_detection()
-              # Re-exec pytest with debugging enabled
-              import subprocess
-              result = subprocess.run([sys.executable, "-m", "pytest"] + sys.argv[1:])
-              sys.exit(result.returncode)
-          EOF
+            # Create debug runner script exists in repo: scripts/ci_debug_pytest.py
 
             # Pre-test state
             echo "šŸ“Š [HANG DEBUG] Pre-test process state:"
@@ -392,7 +345,7 @@ jobs:
             timeout --preserve-status --signal=TERM --kill-after=30 600 bash -c '
               echo "ā–¶ļø [HANG DEBUG] Pytest starting at: $(date)"
               # Force unbuffered output and immediate flush
-              stdbuf -o0 -e0 python debug_pytest.py tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
+              stdbuf -o0 -e0 python scripts/ci_debug_pytest.py tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
                 printf "%s [PYTEST] %s\n" "$(date +"%H:%M:%S")" "$line"
                 # Force flush after each line
                 sync
diff --git a/scripts/ci_debug_pytest.py b/scripts/ci_debug_pytest.py
new file mode 100644
index 0000000..3d3ca24
--- /dev/null
+++ b/scripts/ci_debug_pytest.py
@@ -0,0 +1,56 @@
+import faulthandler
+import signal
+import subprocess
+import sys
+import threading
+import time
+import traceback
+
+
+def setup_hang_detection() -> None:
+    """Setup signal handlers and periodic dumps to help debug hangs in CI.
+
+    - Enables faulthandler to dump Python stack traces on fatal signals
+    - Installs handlers for SIGUSR1/2 to dump all thread stacks on demand
+    - Starts a background thread that periodically dumps stacks
+    """
+    # Enable faulthandler for automatic stack dumps
+    faulthandler.enable()
+
+    def dump_all_stacks(signum, frame):  # type: ignore[no-redef]
+        print(f"\nšŸ”„ [HANG DEBUG] SIGNAL {signum} - DUMPING ALL THREAD STACKS:")
+        faulthandler.dump_traceback()
+        # Also dump current frames manually for completeness
+        for thread_id, thread_frame in sys._current_frames().items():
+            print(f"\nšŸ“ Thread {thread_id}:")
+            traceback.print_stack(thread_frame)
+
+    def periodic_stack_dump() -> None:
+        """Periodically dump stacks to catch where the process is stuck."""
+        time.sleep(300)  # Wait 5 minutes
+        print(f"\nā° [HANG DEBUG] Periodic stack dump at {time.time()}:")
+        for thread_id, thread_frame in sys._current_frames().items():
+            print(f"\nšŸ“ Thread {thread_id}:")
+            traceback.print_stack(thread_frame)
+        time.sleep(300)  # Wait another 5 minutes if still running
+        print(f"\nāš ļø [HANG DEBUG] Final stack dump at {time.time()} (likely hanging):")
+        faulthandler.dump_traceback()
+
+    # Register signal handlers for external debugging
+    signal.signal(signal.SIGUSR1, dump_all_stacks)
+    signal.signal(signal.SIGUSR2, dump_all_stacks)
+
+    # Start periodic dumping thread
+    dump_thread = threading.Thread(target=periodic_stack_dump, daemon=True)
+    dump_thread.start()
+
+
+def main(argv: list[str]) -> int:
+    setup_hang_detection()
+    # Re-exec pytest with debugging enabled
+    result = subprocess.run([sys.executable, "-m", "pytest", *argv])
+    return result.returncode
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))