From f2c5355c739aaed48b4ec15141bc8a43e77d1ebc Mon Sep 17 00:00:00 2001
From: Andy Lee
Date: Fri, 8 Aug 2025 21:25:58 -0700
Subject: [PATCH] feat: add comprehensive debugging capabilities with tmate integration

1. Tmate SSH Debugging:
   - Added manual workflow_dispatch trigger with debug_enabled option
   - Integrated mxschmitt/action-tmate@v3 for SSH access to CI runner
   - Pre-test session is enabled manually via the debug_enabled input
   - Detached mode with 30min timeout, limited to actor only
   - On test failure, a session also opens when debug_enabled is set or
     the commit message contains [debug]

2. Enhanced Pytest Output:
   - Added --capture=no to see real-time output
   - Added --log-cli-level=DEBUG for maximum verbosity on Linux
     (--log-cli-level=INFO on macOS/Windows)
   - Added --tb=short for cleaner tracebacks
   - Pipe output to tee for both display and logging (pytest.log, Linux only)
   - Show last 20 lines of output on completion

3. Environment Diagnostics:
   - Export PYTHONUNBUFFERED=1 for immediate output
   - Show Python/Pytest versions at start
   - Display relevant environment variables
   - Check network ports before/after tests

4. Diagnostic Script:
   - Hooked in scripts/diagnose_hang.sh for comprehensive system checks
     (the script itself is not included in this patch)
   - Shows processes, network, file descriptors, memory, ZMQ status
   - Runs automatically on timeout, when the script is present, for
     detailed debugging info

This allows debugging CI hangs via SSH when needed while providing
extensive logging by default.
--- .github/workflows/build-and-publish.yml | 9 ++++ .github/workflows/build-reusable.yml | 58 ++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml index bf076d3..058b0a5 100644 --- a/.github/workflows/build-and-publish.yml +++ b/.github/workflows/build-and-publish.yml @@ -5,7 +5,16 @@ on: branches: [ main ] pull_request: branches: [ main ] + workflow_dispatch: + inputs: + debug_enabled: + type: boolean + description: 'Run with tmate debugging enabled (SSH access to runner)' + required: false + default: false jobs: build: uses: ./.github/workflows/build-reusable.yml + with: + debug_enabled: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled || false }} diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml index 033e4eb..99781e5 100644 --- a/.github/workflows/build-reusable.yml +++ b/.github/workflows/build-reusable.yml @@ -8,6 +8,11 @@ on: required: false type: string default: '' + debug_enabled: + description: 'Enable tmate debugging session for troubleshooting' + required: false + type: boolean + default: false jobs: lint: @@ -250,6 +255,15 @@ jobs: fi fi + # Enable tmate debugging session if requested + - name: Setup tmate session for debugging + if: ${{ inputs.debug_enabled }} + uses: mxschmitt/action-tmate@v3 + with: + detached: true + timeout-minutes: 30 + limit-access-to-actor: true + - name: Run tests with pytest env: CI: true # Mark as CI environment to skip memory-intensive tests @@ -309,11 +323,21 @@ jobs: echo "===== COMPREHENSIVE DIAGNOSTICS END =====" } + # Enable verbose logging for debugging + export PYTHONUNBUFFERED=1 + export PYTEST_CURRENT_TEST=1 + # Run all tests with extensive logging if [[ "$RUNNER_OS" == "Linux" ]]; then echo "🚀 Starting Linux test execution with timeout..." 
echo "Current time: $(date)" echo "Shell PID: $$" + echo "Python: $(python --version)" + echo "Pytest: $(pytest --version)" + + # Show environment variables for debugging + echo "📦 Environment variables:" + env | grep -E "PYTHON|PYTEST|CI|RUNNER" | sort # Set trap for diagnostics trap diag INT TERM EXIT @@ -321,12 +345,22 @@ jobs: echo "📋 Pre-test diagnostics:" ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test" + # Check for any listening ports before test + echo "🔌 Pre-test network state:" + ss -ltn 2>/dev/null | grep -E "555[0-9]|556[0-9]" || echo "No embedding server ports open" + echo "🏃 Running pytest with 180s timeout..." timeout --preserve-status --signal=INT --kill-after=10 180 bash -c ' echo "⏱️ Pytest starting at: $(date)" - pytest tests/ -vv --maxfail=3 - PYTEST_EXIT=$? + echo "Running command: pytest tests/ -vv --maxfail=3 --tb=short --capture=no" + + # Run pytest with maximum verbosity and no output capture + pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=DEBUG 2>&1 | tee pytest.log + PYTEST_EXIT=${PIPESTATUS[0]} + echo "✅ Pytest finished at: $(date) with exit code: $PYTEST_EXIT" + echo "Last 20 lines of pytest output:" + tail -20 pytest.log || true # Immediately check for leftover processes echo "🔍 Post-pytest process check:" @@ -347,11 +381,17 @@ jobs: EXIT_CODE=$? echo "🔚 Timeout command exited with code: $EXIT_CODE" - if [ $EXIT_CODE -eq 124 ]; then + if [ $EXIT_CODE -eq 124 ]; then echo "⚠️ TIMEOUT TRIGGERED - Tests took more than 180 seconds!" echo "📸 Capturing full diagnostics..." diag + # Run diagnostic script if available + if [ -f scripts/diagnose_hang.sh ]; then + echo "🔍 Running diagnostic script..." + bash scripts/diagnose_hang.sh || true + fi + # More aggressive cleanup echo "💀 Killing all Python processes owned by runner..." pkill -9 -u runner python || true @@ -362,7 +402,7 @@ jobs: echo "✅ All tests passed!" 
fi - # Always show final state + # Always show final state echo "📍 Final state check:" ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining" @@ -370,9 +410,17 @@ jobs: else # For macOS/Windows, run without GNU timeout echo "🚀 Running tests on $RUNNER_OS..." - pytest tests/ -vv --maxfail=3 + pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=INFO fi + # Provide tmate session on test failure for debugging + - name: Setup tmate session on failure + if: ${{ failure() && (inputs.debug_enabled || contains(github.event.head_commit.message, '[debug]')) }} + uses: mxschmitt/action-tmate@v3 + with: + timeout-minutes: 30 + limit-access-to-actor: true + - name: Run sanity checks (optional) run: | # Activate virtual environment