feat: add comprehensive debugging capabilities with tmate integration
1. Tmate SSH Debugging:
   - Added a manual workflow_dispatch trigger with a debug_enabled option
   - Integrated mxschmitt/action-tmate@v3 for SSH access to the CI runner
   - Can be triggered manually or by adding [debug] to the commit message
   - Detached mode with a 30-minute timeout, limited to the actor only
   - Also triggers on test failure when debug is enabled

2. Enhanced Pytest Output:
   - Added --capture=no to see real-time output
   - Added --log-cli-level=DEBUG for maximum verbosity
   - Added --tb=short for cleaner tracebacks
   - Pipe output to tee for both display and logging
   - Show the last 20 lines of output on completion

3. Environment Diagnostics:
   - Export PYTHONUNBUFFERED=1 for immediate output
   - Show Python/Pytest versions at start
   - Display relevant environment variables
   - Check network ports before/after tests

4. Diagnostic Script:
   - Created scripts/diagnose_hang.sh for comprehensive system checks
   - Shows processes, network, file descriptors, memory, and ZMQ status
   - Automatically runs on timeout for detailed debugging info

This allows debugging CI hangs via SSH when needed, while providing extensive logging by default.
This commit is contained in:
9
.github/workflows/build-and-publish.yml
vendored
9
.github/workflows/build-and-publish.yml
vendored
@@ -5,7 +5,16 @@ on:
|
|||||||
branches: [ main ]
|
branches: [ main ]
|
||||||
pull_request:
|
pull_request:
|
||||||
branches: [ main ]
|
branches: [ main ]
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
debug_enabled:
|
||||||
|
type: boolean
|
||||||
|
description: 'Run with tmate debugging enabled (SSH access to runner)'
|
||||||
|
required: false
|
||||||
|
default: false
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
uses: ./.github/workflows/build-reusable.yml
|
uses: ./.github/workflows/build-reusable.yml
|
||||||
|
with:
|
||||||
|
debug_enabled: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled || false }}
|
||||||
|
|||||||
58
.github/workflows/build-reusable.yml
vendored
58
.github/workflows/build-reusable.yml
vendored
@@ -8,6 +8,11 @@ on:
|
|||||||
required: false
|
required: false
|
||||||
type: string
|
type: string
|
||||||
default: ''
|
default: ''
|
||||||
|
debug_enabled:
|
||||||
|
description: 'Enable tmate debugging session for troubleshooting'
|
||||||
|
required: false
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
lint:
|
lint:
|
||||||
@@ -250,6 +255,15 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Enable tmate debugging session if requested
|
||||||
|
- name: Setup tmate session for debugging
|
||||||
|
if: ${{ inputs.debug_enabled }}
|
||||||
|
uses: mxschmitt/action-tmate@v3
|
||||||
|
with:
|
||||||
|
detached: true
|
||||||
|
timeout-minutes: 30
|
||||||
|
limit-access-to-actor: true
|
||||||
|
|
||||||
- name: Run tests with pytest
|
- name: Run tests with pytest
|
||||||
env:
|
env:
|
||||||
CI: true # Mark as CI environment to skip memory-intensive tests
|
CI: true # Mark as CI environment to skip memory-intensive tests
|
||||||
@@ -309,11 +323,21 @@ jobs:
|
|||||||
echo "===== COMPREHENSIVE DIAGNOSTICS END ====="
|
echo "===== COMPREHENSIVE DIAGNOSTICS END ====="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Enable verbose logging for debugging
|
||||||
|
export PYTHONUNBUFFERED=1
|
||||||
|
export PYTEST_CURRENT_TEST=1
|
||||||
|
|
||||||
# Run all tests with extensive logging
|
# Run all tests with extensive logging
|
||||||
if [[ "$RUNNER_OS" == "Linux" ]]; then
|
if [[ "$RUNNER_OS" == "Linux" ]]; then
|
||||||
echo "🚀 Starting Linux test execution with timeout..."
|
echo "🚀 Starting Linux test execution with timeout..."
|
||||||
echo "Current time: $(date)"
|
echo "Current time: $(date)"
|
||||||
echo "Shell PID: $$"
|
echo "Shell PID: $$"
|
||||||
|
echo "Python: $(python --version)"
|
||||||
|
echo "Pytest: $(pytest --version)"
|
||||||
|
|
||||||
|
# Show environment variables for debugging
|
||||||
|
echo "📦 Environment variables:"
|
||||||
|
env | grep -E "PYTHON|PYTEST|CI|RUNNER" | sort
|
||||||
|
|
||||||
# Set trap for diagnostics
|
# Set trap for diagnostics
|
||||||
trap diag INT TERM EXIT
|
trap diag INT TERM EXIT
|
||||||
@@ -321,12 +345,22 @@ jobs:
|
|||||||
echo "📋 Pre-test diagnostics:"
|
echo "📋 Pre-test diagnostics:"
|
||||||
ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test"
|
ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test"
|
||||||
|
|
||||||
|
# Check for any listening ports before test
|
||||||
|
echo "🔌 Pre-test network state:"
|
||||||
|
ss -ltn 2>/dev/null | grep -E "555[0-9]|556[0-9]" || echo "No embedding server ports open"
|
||||||
|
|
||||||
echo "🏃 Running pytest with 180s timeout..."
|
echo "🏃 Running pytest with 180s timeout..."
|
||||||
timeout --preserve-status --signal=INT --kill-after=10 180 bash -c '
|
timeout --preserve-status --signal=INT --kill-after=10 180 bash -c '
|
||||||
echo "⏱️ Pytest starting at: $(date)"
|
echo "⏱️ Pytest starting at: $(date)"
|
||||||
pytest tests/ -vv --maxfail=3
|
echo "Running command: pytest tests/ -vv --maxfail=3 --tb=short --capture=no"
|
||||||
PYTEST_EXIT=$?
|
|
||||||
|
# Run pytest with maximum verbosity and no output capture
|
||||||
|
pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=DEBUG 2>&1 | tee pytest.log
|
||||||
|
PYTEST_EXIT=${PIPESTATUS[0]}
|
||||||
|
|
||||||
echo "✅ Pytest finished at: $(date) with exit code: $PYTEST_EXIT"
|
echo "✅ Pytest finished at: $(date) with exit code: $PYTEST_EXIT"
|
||||||
|
echo "Last 20 lines of pytest output:"
|
||||||
|
tail -20 pytest.log || true
|
||||||
|
|
||||||
# Immediately check for leftover processes
|
# Immediately check for leftover processes
|
||||||
echo "🔍 Post-pytest process check:"
|
echo "🔍 Post-pytest process check:"
|
||||||
@@ -347,11 +381,17 @@ jobs:
|
|||||||
EXIT_CODE=$?
|
EXIT_CODE=$?
|
||||||
echo "🔚 Timeout command exited with code: $EXIT_CODE"
|
echo "🔚 Timeout command exited with code: $EXIT_CODE"
|
||||||
|
|
||||||
if [ $EXIT_CODE -eq 124 ]; then
|
if [ $EXIT_CODE -eq 124 ]; then
|
||||||
echo "⚠️ TIMEOUT TRIGGERED - Tests took more than 180 seconds!"
|
echo "⚠️ TIMEOUT TRIGGERED - Tests took more than 180 seconds!"
|
||||||
echo "📸 Capturing full diagnostics..."
|
echo "📸 Capturing full diagnostics..."
|
||||||
diag
|
diag
|
||||||
|
|
||||||
|
# Run diagnostic script if available
|
||||||
|
if [ -f scripts/diagnose_hang.sh ]; then
|
||||||
|
echo "🔍 Running diagnostic script..."
|
||||||
|
bash scripts/diagnose_hang.sh || true
|
||||||
|
fi
|
||||||
|
|
||||||
# More aggressive cleanup
|
# More aggressive cleanup
|
||||||
echo "💀 Killing all Python processes owned by runner..."
|
echo "💀 Killing all Python processes owned by runner..."
|
||||||
pkill -9 -u runner python || true
|
pkill -9 -u runner python || true
|
||||||
@@ -362,7 +402,7 @@ jobs:
|
|||||||
echo "✅ All tests passed!"
|
echo "✅ All tests passed!"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Always show final state
|
# Always show final state
|
||||||
echo "📍 Final state check:"
|
echo "📍 Final state check:"
|
||||||
ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining"
|
ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining"
|
||||||
|
|
||||||
@@ -370,9 +410,17 @@ jobs:
|
|||||||
else
|
else
|
||||||
# For macOS/Windows, run without GNU timeout
|
# For macOS/Windows, run without GNU timeout
|
||||||
echo "🚀 Running tests on $RUNNER_OS..."
|
echo "🚀 Running tests on $RUNNER_OS..."
|
||||||
pytest tests/ -vv --maxfail=3
|
pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=INFO
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Provide tmate session on test failure for debugging
|
||||||
|
- name: Setup tmate session on failure
|
||||||
|
if: ${{ failure() && (inputs.debug_enabled || contains(github.event.head_commit.message, '[debug]')) }}
|
||||||
|
uses: mxschmitt/action-tmate@v3
|
||||||
|
with:
|
||||||
|
timeout-minutes: 30
|
||||||
|
limit-access-to-actor: true
|
||||||
|
|
||||||
- name: Run sanity checks (optional)
|
- name: Run sanity checks (optional)
|
||||||
run: |
|
run: |
|
||||||
# Activate virtual environment
|
# Activate virtual environment
|
||||||
|
|||||||
Reference in New Issue
Block a user