diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml
index e6a381e..8c0f31d 100644
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -253,6 +253,69 @@ jobs:
           if [[ "${{ matrix.os }}" == "ubuntu-22.04" ]]; then
             echo "šŸ” [HANG DEBUG] Ubuntu 22.04 detected - enabling enhanced process monitoring"
 
+            # Create a pytest launcher that installs hang diagnostics and then runs
+            # pytest in the same process, so SIGUSR1/SIGUSR2 and the periodic dumps
+            # describe the test threads instead of an idle wrapper process.
+            # NOTE: the heredoc body and its EOF terminator must sit at the run
+            # block's base indentation so they land at column 0 in the shell script.
+            cat > debug_pytest.py << 'EOF'
+          """Run pytest in-process with hang diagnostics installed."""
+          import faulthandler
+          import signal
+          import sys
+          import threading
+          import time
+          import traceback
+
+          import pytest
+
+
+          def _log(message):
+              """Write to stderr and flush so banners stay next to their traces."""
+              print(message, file=sys.stderr, flush=True)
+
+
+          def _dump_python_stacks():
+              """Print a source-level stack for every live thread."""
+              for thread_id, stack in sys._current_frames().items():
+                  _log(f"\nšŸ“ Thread {thread_id}:")
+                  traceback.print_stack(stack)
+
+
+          def setup_hang_detection():
+              """Install faulthandler, SIGUSR1/SIGUSR2 dump handlers and a dump timer."""
+              # Dump tracebacks automatically on fatal errors (SIGSEGV, SIGABRT, ...).
+              faulthandler.enable()
+
+              def dump_all_stacks(signum, _frame):
+                  _log(f"\nšŸ”„ [HANG DEBUG] SIGNAL {signum} - DUMPING ALL THREAD STACKS:")
+                  faulthandler.dump_traceback()
+                  _dump_python_stacks()
+
+              def periodic_stack_dump():
+                  """Dump stacks after 5 minutes and once more after 10 minutes."""
+                  time.sleep(300)
+                  _log(f"\nā° [HANG DEBUG] Periodic stack dump at {time.time()}:")
+                  _dump_python_stacks()
+                  time.sleep(300)
+                  _log(f"\nāš ļø [HANG DEBUG] Final stack dump at {time.time()} (likely hanging):")
+                  faulthandler.dump_traceback()
+
+              # Let the external monitor request a dump with `kill -USR1 <pid>`.
+              signal.signal(signal.SIGUSR1, dump_all_stacks)
+              signal.signal(signal.SIGUSR2, dump_all_stacks)
+
+              # Daemon thread: it must never keep the process alive after pytest exits.
+              threading.Thread(target=periodic_stack_dump, daemon=True).start()
+
+
+          if __name__ == "__main__":
+              setup_hang_detection()
+              # Signal handlers and faulthandler state are not inherited by a freshly
+              # exec'd child interpreter, so pytest has to run inside this process.
+              sys.exit(pytest.main(sys.argv[1:]))
+          EOF
+
             # Pre-test state
             echo "šŸ“Š [HANG DEBUG] Pre-test process state:"
             ps aux | grep -E "(python|embedding|zmq)" | grep -v grep || echo "No relevant processes"
@@ -307,6 +370,8 @@ jobs:
                     stable_count=$((stable_count + 1))
                     if [ $stable_count -ge 6 ]; then  # 30 seconds of low CPU
                       echo "āš ļø [EXTERNAL] $(date): Process appears hung - CPU stable at ${current_cpu}% for 30s"
+                      echo "šŸ” [EXTERNAL] $(date): Sending SIGUSR1 to dump stack traces..."
+                      kill -USR1 "$pytest_pid" 2>/dev/null || echo "Failed to send signal"
                     fi
                   else
                     stable_count=0
@@ -341,7 +406,7 @@ jobs:
             timeout --preserve-status --signal=TERM --kill-after=30 600 bash -c '
               echo "ā–¶ļø [HANG DEBUG] Pytest starting at: $(date)"
               # Force unbuffered output and immediate flush
-              stdbuf -o0 -e0 pytest tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
+              stdbuf -o0 -e0 python -u debug_pytest.py tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
                 printf "%s [PYTEST] %s\n" "$(date +"%H:%M:%S")" "$line"
                 # Force flush after each line
                 sync