From 341141cf8bd6d9c4c0d4d3241c749496f4c029df Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Tue, 12 Aug 2025 00:31:27 -0700 Subject: [PATCH] refactor: remove upterm/tmate debug code and clean CI workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove all upterm/tmate SSH debugging infrastructure - Restore clean CI workflow from main branch - Remove diagnostic script that was only for SSH debugging - Keep valuable DiskANN and HNSW backend improvements This provides a clean base to add targeted pytest hang debugging without the complexity of SSH sessions. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .github/workflows/build-reusable.yml | 219 +-------------------------- scripts/diagnose_hang.sh | 103 ------------- 2 files changed, 4 insertions(+), 318 deletions(-) delete mode 100755 scripts/diagnose_hang.sh diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml index cb1b20e..06ac31a 100644 --- a/.github/workflows/build-reusable.yml +++ b/.github/workflows/build-reusable.yml @@ -8,11 +8,6 @@ on: required: false type: string default: '' - debug_enabled: - description: 'Enable tmate debugging session for troubleshooting' - required: false - type: boolean - default: false jobs: lint: @@ -33,7 +28,7 @@ jobs: - name: Install ruff run: | - uv tool install ruff==0.12.7 + uv tool install ruff - name: Run ruff check run: | @@ -190,15 +185,10 @@ jobs: fi cd ../.. 
- # Repair DiskANN wheel - use show first to debug + # Repair DiskANN wheel cd packages/leann-backend-diskann if [ -d dist ]; then - echo "Checking DiskANN wheel contents before repair:" - unzip -l dist/*.whl | grep -E "\.so|\.pyd|_diskannpy" || echo "No .so files found" - auditwheel show dist/*.whl || echo "auditwheel show failed" auditwheel repair dist/*.whl -w dist_repaired - echo "Checking DiskANN wheel contents after repair:" - unzip -l dist_repaired/*.whl | grep -E "\.so|\.pyd|_diskannpy" || echo "No .so files found after repair" rm -rf dist mv dist_repaired dist fi @@ -246,12 +236,6 @@ jobs: # Install test dependencies using extras uv pip install -e ".[test]" - # Debug: Check if _diskannpy module is installed correctly - echo "Checking installed DiskANN module structure:" - python -c "import leann_backend_diskann; print('leann_backend_diskann location:', leann_backend_diskann.__file__)" || echo "Failed to import leann_backend_diskann" - python -c "from leann_backend_diskann import _diskannpy; print('_diskannpy imported successfully')" || echo "Failed to import _diskannpy" - ls -la $(python -c "import leann_backend_diskann; import os; print(os.path.dirname(leann_backend_diskann.__file__))" 2>/dev/null) 2>/dev/null || echo "Failed to list module directory" - - name: Run tests with pytest env: CI: true # Mark as CI environment to skip memory-intensive tests @@ -265,203 +249,8 @@ jobs: # Activate virtual environment source .venv/bin/activate || source .venv/Scripts/activate - # Debug: Show debug_enabled value - echo "๐Ÿ” DEBUG_ENABLED value: '${{ inputs.debug_enabled }}'" - echo "๐Ÿ” EVENT NAME: '${{ github.event_name }}'" - echo "๐Ÿ” COMMIT MESSAGE: '${{ github.event.head_commit.message }}'" - echo "๐Ÿ” Contains [debug]: '${{ contains(github.event.head_commit.message, '[debug]') }}'" - echo "๐Ÿ” GITHUB REF: '${{ github.ref }}'" - echo "๐Ÿ” GITHUB HEAD_REF: '${{ github.head_ref }}'" - - # Start tmate session INSIDE the test step if debug enabled - # FORCE 
DEBUG MODE - Always enable on this debug branch - DEBUG_MODE=true - echo "โœ… DEBUG MODE FORCED ON - Investigation branch" - - if [[ "$DEBUG_MODE" == "true" ]]; then - echo "๐Ÿ”ง DEBUG MODE: Starting tmate session before tests..." - # Install tmate if not available - if ! command -v tmate &> /dev/null; then - if [[ "$RUNNER_OS" == "Linux" ]]; then - sudo apt-get update && sudo apt-get install -y tmate - elif [[ "$RUNNER_OS" == "macOS" ]]; then - brew install tmate - fi - fi - - # Start tmate session in background - echo "Starting tmate session..." - tmate -S debug-session new-session -d - - # Wait for tmate to initialize and get connection info - echo "Waiting for tmate to initialize..." - sleep 5 - - # Try multiple times to get connection info - for i in {1..10}; do - SSH_INFO=$(tmate -S debug-session display -p '#{tmate_ssh}' 2>/dev/null || echo "") - WEB_INFO=$(tmate -S debug-session display -p '#{tmate_web}' 2>/dev/null || echo "") - - if [[ -n "$SSH_INFO" && "$SSH_INFO" != "connecting..." ]]; then - echo "๐Ÿ”— SSH: $SSH_INFO" - echo "๐Ÿ”— Web: $WEB_INFO" - break - fi - - echo "Attempt $i: Still connecting... (SSH: '$SSH_INFO')" - sleep 2 - done - echo "โฑ๏ธ Session will timeout after 30 minutes" - echo "๐Ÿ’ก You can now SSH in and run: pytest tests/ -vv --capture=no" - echo "๐Ÿ’ก Or run diagnostics: bash scripts/diagnose_hang.sh" - echo "" - echo "Waiting 60 seconds for you to connect..." 
- sleep 60 - fi - - # Define comprehensive diagnostic function - diag() { - echo "===== COMPREHENSIVE DIAGNOSTICS BEGIN =====" - date - echo "" - echo "### Current Shell Info ###" - echo "Shell PID: $$" - echo "Shell PPID: $PPID" - echo "Current directory: $(pwd)" - echo "" - - echo "### Process Tree (full) ###" - pstree -ap 2>/dev/null || ps auxf || true - echo "" - - echo "### All Python/Pytest Processes ###" - ps -ef | grep -E 'python|pytest' | grep -v grep || true - echo "" - - echo "### Embedding Server Processes ###" - ps -ef | grep -E 'embedding|zmq|diskann' | grep -v grep || true - echo "" - - echo "### Network Listeners ###" - ss -ltnp 2>/dev/null || netstat -ltn 2>/dev/null || true - echo "" - - echo "### Open File Descriptors (lsof) ###" - lsof -p $$ 2>/dev/null | head -20 || true - echo "" - - echo "### Zombie Processes ###" - ps aux | grep '' || echo "No zombie processes" - echo "" - - echo "### Current Jobs ###" - jobs -l || true - echo "" - - echo "### /proc/PID/fd for current shell ###" - ls -la /proc/$$/fd 2>/dev/null || true - echo "" - - echo "===== COMPREHENSIVE DIAGNOSTICS END =====" - } - - # Enable verbose logging for debugging - export PYTHONUNBUFFERED=1 - export PYTEST_CURRENT_TEST=1 - - # Run all tests with extensive logging - if [[ "$RUNNER_OS" == "Linux" ]]; then - echo "๐Ÿš€ Starting Linux test execution with timeout..." 
- echo "Current time: $(date)" - echo "Shell PID: $$" - echo "Python: $(python --version)" - echo "Pytest: $(pytest --version)" - - # Show environment variables for debugging - echo "๐Ÿ“ฆ Environment variables:" - env | grep -E "PYTHON|PYTEST|CI|RUNNER" | sort - - # Set trap for diagnostics - trap diag INT TERM EXIT - - echo "๐Ÿ“‹ Pre-test diagnostics:" - ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test" - - # Check for any listening ports before test - echo "๐Ÿ”Œ Pre-test network state:" - ss -ltn 2>/dev/null | grep -E "555[0-9]|556[0-9]" || echo "No embedding server ports open" - - echo "๐Ÿƒ Running pytest with 180s timeout..." - timeout --preserve-status --signal=INT --kill-after=10 180 bash -c ' - echo "โฑ๏ธ Pytest starting at: $(date)" - echo "Running command: pytest tests/ -vv --maxfail=3 --tb=short --capture=no" - - # Run pytest with maximum verbosity and no output capture - pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=DEBUG 2>&1 | tee pytest.log - PYTEST_EXIT=${PIPESTATUS[0]} - - echo "โœ… Pytest finished at: $(date) with exit code: $PYTEST_EXIT" - echo "Last 20 lines of pytest output:" - tail -20 pytest.log || true - - # Immediately check for leftover processes - echo "๐Ÿ” Post-pytest process check:" - ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "No leftover processes" - - # Clean up any children before exit - echo "๐Ÿงน Cleaning up child processes..." - pkill -TERM -P $$ 2>/dev/null || true - sleep 0.5 - pkill -KILL -P $$ 2>/dev/null || true - - echo "๐Ÿ“Š Final check before exit:" - ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "All clean" - - exit $PYTEST_EXIT - ' - - EXIT_CODE=$? - echo "๐Ÿ”š Timeout command exited with code: $EXIT_CODE" - - if [ $EXIT_CODE -eq 124 ]; then - echo "โš ๏ธ TIMEOUT TRIGGERED - Tests took more than 180 seconds!" - echo "๐Ÿ“ธ Capturing full diagnostics..." 
- diag - - # Run diagnostic script if available - if [ -f scripts/diagnose_hang.sh ]; then - echo "๐Ÿ” Running diagnostic script..." - bash scripts/diagnose_hang.sh || true - fi - - # More aggressive cleanup - echo "๐Ÿ’€ Killing all Python processes owned by runner..." - pkill -9 -u runner python || true - pkill -9 -u runner pytest || true - elif [ $EXIT_CODE -ne 0 ]; then - echo "โŒ Tests failed with exit code: $EXIT_CODE" - else - echo "โœ… All tests passed!" - fi - - # Always show final state - echo "๐Ÿ“ Final state check:" - ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining" - - exit $EXIT_CODE - else - # For macOS/Windows, run without GNU timeout - echo "๐Ÿš€ Running tests on $RUNNER_OS..." - pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=INFO - fi - - # Provide tmate session on test failure for debugging - - name: Setup tmate session on failure - if: ${{ failure() && (inputs.debug_enabled || contains(github.event.head_commit.message, '[debug]')) }} - uses: mxschmitt/action-tmate@v3 - with: - timeout-minutes: 30 - limit-access-to-actor: true + # Run all tests + pytest tests/ - name: Run sanity checks (optional) run: | diff --git a/scripts/diagnose_hang.sh b/scripts/diagnose_hang.sh deleted file mode 100755 index 7d8d830..0000000 --- a/scripts/diagnose_hang.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/bash -# Diagnostic script for debugging CI hangs - -echo "=========================================" -echo " CI HANG DIAGNOSTIC SCRIPT" -echo "=========================================" -echo "" - -echo "๐Ÿ“… Current time: $(date)" -echo "๐Ÿ–ฅ๏ธ Hostname: $(hostname)" -echo "๐Ÿ‘ค User: $(whoami)" -echo "๐Ÿ“‚ Working directory: $(pwd)" -echo "" - -echo "=== PYTHON ENVIRONMENT ===" -python --version 2>&1 || echo "Python not found" -pip list 2>&1 | head -20 || echo "pip not available" -echo "" - -echo "=== PROCESS INFORMATION ===" -echo "Current shell PID: $$" -echo "Parent PID: $PPID" -echo "" - 
-echo "All Python processes:" -ps aux | grep -E "[p]ython" || echo "No Python processes" -echo "" - -echo "All pytest processes:" -ps aux | grep -E "[p]ytest" || echo "No pytest processes" -echo "" - -echo "Embedding server processes:" -ps aux | grep -E "[e]mbedding_server" || echo "No embedding server processes" -echo "" - -echo "Zombie processes:" -ps aux | grep "" || echo "No zombie processes" -echo "" - -echo "=== NETWORK INFORMATION ===" -echo "Network listeners on typical embedding server ports:" -ss -ltn 2>/dev/null | grep -E ":555[0-9]|:556[0-9]" || netstat -ltn 2>/dev/null | grep -E ":555[0-9]|:556[0-9]" || echo "No listeners on embedding ports" -echo "" - -echo "All network listeners:" -ss -ltn 2>/dev/null | head -20 || netstat -ltn 2>/dev/null | head -20 || echo "Cannot get network info" -echo "" - -echo "=== FILE DESCRIPTORS ===" -echo "Open files for current shell:" -lsof -p $$ 2>/dev/null | head -20 || echo "lsof not available" -echo "" - -if [ -d "/proc/$$" ]; then - echo "File descriptors for current shell (/proc/$$/fd):" - ls -la /proc/$$/fd 2>/dev/null | head -20 || echo "Cannot access /proc/$$/fd" - echo "" -fi - -echo "=== SYSTEM RESOURCES ===" -echo "Memory usage:" -free -h 2>/dev/null || vm_stat 2>/dev/null || echo "Cannot get memory info" -echo "" - -echo "Disk usage:" -df -h . 
2>/dev/null || echo "Cannot get disk info" -echo "" - -echo "CPU info:" -nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "Cannot get CPU info" -echo "" - -echo "=== PYTHON SPECIFIC CHECKS ===" -python -c " -import sys -import os -print(f'Python executable: {sys.executable}') -print(f'Python path: {sys.path[:3]}...') -print(f'Environment PYTHONPATH: {os.environ.get(\"PYTHONPATH\", \"Not set\")}') -print(f'Site packages: {[p for p in sys.path if \"site-packages\" in p][:2]}') -" 2>&1 || echo "Cannot run Python diagnostics" -echo "" - -echo "=== ZMQ SPECIFIC CHECKS ===" -python -c " -try: - import zmq - print(f'ZMQ version: {zmq.zmq_version()}') - print(f'PyZMQ version: {zmq.pyzmq_version()}') - ctx = zmq.Context.instance() - print(f'ZMQ context instance: {ctx}') -except Exception as e: - print(f'ZMQ check failed: {e}') -" 2>&1 || echo "Cannot check ZMQ" -echo "" - -echo "=== PYTEST CHECK ===" -pytest --version 2>&1 || echo "pytest not found" -echo "" - -echo "=== END OF DIAGNOSTICS ===" -echo "Generated at: $(date)"