From 439debbd3f569591692080cda2ababe0bfdc31e4 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Fri, 8 Aug 2025 18:55:50 -0700 Subject: [PATCH] fix: add extensive logging and fix subprocess PIPE blocking 1. CI Logging Enhancements: - Added comprehensive diagnostics with process tree, network listeners, file descriptors - Added timestamps at every stage (before/during/after pytest) - Added trap EXIT to always show diagnostics - Added immediate process checks after pytest finishes - Added sub-shell execution with immediate cleanup 2. Fixed Subprocess PIPE Blocking: - Changed Colab mode from PIPE to DEVNULL to prevent blocking - PIPE without reading can cause parent process to wait indefinitely 3. Pytest Session Hooks: - Added pytest_sessionstart to log initial state - Added pytest_sessionfinish for aggressive cleanup before exit - Shows all child processes and their status This should reveal exactly where the hang is happening. --- .github/workflows/build-reusable.yml | 125 ++++++++++++++---- .../src/leann/embedding_server_manager.py | 7 +- tests/conftest.py | 66 +++++++++ 3 files changed, 168 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build-reusable.yml b/.github/workflows/build-reusable.yml index 80cfc8e..033e4eb 100644 --- a/.github/workflows/build-reusable.yml +++ b/.github/workflows/build-reusable.yml @@ -263,42 +263,113 @@ jobs: # Activate virtual environment source .venv/bin/activate || source .venv/Scripts/activate - # Define diagnostic function for debugging hangs + # Define comprehensive diagnostic function diag() { - echo "===== DIAG BEGIN =====" + echo "===== COMPREHENSIVE DIAGNOSTICS BEGIN =====" date - echo "# pstree (current shell group)" - pstree -ap $$ 2>/dev/null || true - echo "# python/pytest processes" - ps -ef | grep -E 'python|pytest|embedding|zmq|diskann' | grep -v grep || true - echo "# network listeners" + echo "" + echo "### Current Shell Info ###" + echo "Shell PID: $$" + echo "Shell PPID: $PPID" + echo "Current directory: $(pwd)" + echo "" + + echo "### Process Tree (full) ###" + pstree -ap 2>/dev/null || ps auxf || true + echo "" + + echo "### All Python/Pytest Processes ###" + ps -ef | grep -E 'python|pytest' | grep -v grep || true + echo "" + + echo "### Embedding Server Processes ###" + ps -ef | grep -E 'embedding|zmq|diskann' | grep -v grep || true + echo "" + + echo "### Network Listeners ###" ss -ltnp 2>/dev/null || netstat -ltn 2>/dev/null || true - echo "# pytest PIDs" - pgrep -fa pytest || true - echo "===== DIAG END =====" + echo "" + + echo "### Open File Descriptors (lsof) ###" + lsof -p $$ 2>/dev/null | head -20 || true + echo "" + + echo "### Zombie Processes ###" + ps aux | grep '' || echo "No zombie processes" + echo "" + + echo "### Current Jobs ###" + jobs -l || true + echo "" + + echo "### /proc/PID/fd for current shell ###" + ls -la /proc/$$/fd 2>/dev/null || true + echo "" + + echo "===== COMPREHENSIVE DIAGNOSTICS END =====" } - # Run all tests with timeout on Linux to prevent hanging + # Run all tests with extensive logging if [[ "$RUNNER_OS" == "Linux" ]]; then - echo "Running tests with timeout (Linux)..." - # Set trap for diagnostics - trap diag INT TERM + echo "๐Ÿš€ Starting Linux test execution with timeout..." + echo "Current time: $(date)" + echo "Shell PID: $$" - timeout --signal=INT 180 pytest tests/ -vv --maxfail=3 || { - EXIT_CODE=$? - if [ $EXIT_CODE -eq 124 ]; then - echo "โš ๏ธ Tests timed out after 180 seconds - dumping diagnostics..." - diag - # Try to clean up any leftover processes - pkill -TERM -P $$ || true - sleep 1 - pkill -KILL -P $$ || true - fi - exit $EXIT_CODE - } + # Set trap for diagnostics + trap diag INT TERM EXIT + + echo "๐Ÿ“‹ Pre-test diagnostics:" + ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test" + + echo "๐Ÿƒ Running pytest with 180s timeout..." + timeout --preserve-status --signal=INT --kill-after=10 180 bash -c ' + echo "โฑ๏ธ Pytest starting at: $(date)" + pytest tests/ -vv --maxfail=3 + PYTEST_EXIT=$? + echo "โœ… Pytest finished at: $(date) with exit code: $PYTEST_EXIT" + + # Immediately check for leftover processes + echo "๐Ÿ” Post-pytest process check:" + ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "No leftover processes" + + # Clean up any children before exit + echo "๐Ÿงน Cleaning up child processes..." + pkill -TERM -P $$ 2>/dev/null || true + sleep 0.5 + pkill -KILL -P $$ 2>/dev/null || true + + echo "๐Ÿ“Š Final check before exit:" + ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "All clean" + + exit $PYTEST_EXIT + ' + + EXIT_CODE=$? + echo "๐Ÿ”š Timeout command exited with code: $EXIT_CODE" + + if [ $EXIT_CODE -eq 124 ]; then + echo "โš ๏ธ TIMEOUT TRIGGERED - Tests took more than 180 seconds!" + echo "๐Ÿ“ธ Capturing full diagnostics..." + diag + + # More aggressive cleanup + echo "๐Ÿ’€ Killing all Python processes owned by runner..." + pkill -9 -u runner python || true + pkill -9 -u runner pytest || true + elif [ $EXIT_CODE -ne 0 ]; then + echo "โŒ Tests failed with exit code: $EXIT_CODE" + else + echo "โœ… All tests passed!" + fi + + # Always show final state + echo "๐Ÿ“ Final state check:" + ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining" + + exit $EXIT_CODE else # For macOS/Windows, run without GNU timeout - echo "Running tests ($RUNNER_OS)..." + echo "๐Ÿš€ Running tests on $RUNNER_OS..." pytest tests/ -vv --maxfail=3 fi diff --git a/packages/leann-core/src/leann/embedding_server_manager.py b/packages/leann-core/src/leann/embedding_server_manager.py index 9ded6e5..427b8b1 100644 --- a/packages/leann-core/src/leann/embedding_server_manager.py +++ b/packages/leann-core/src/leann/embedding_server_manager.py @@ -402,11 +402,12 @@ class EmbeddingServerManager: """Launch the server process with Colab-specific settings.""" logger.info(f"Colab Command: {' '.join(command)}") - # In Colab, we need to be more careful about process management + # In Colab, redirect to DEVNULL to avoid pipe blocking + # PIPE without reading can cause hangs self.server_process = subprocess.Popen( command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, text=True, ) self.server_port = port diff --git a/tests/conftest.py b/tests/conftest.py index f278669..b8219bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -184,3 +184,69 @@ def pytest_configure(config): # Set default timeout method to thread if not specified if not config.getoption("--timeout-method", None): config.option.timeout_method = "thread" + + # Add more logging + print(f"๐Ÿ”ง Pytest configured at {time.strftime('%Y-%m-%d %H:%M:%S')}") + print(f" Python version: {os.sys.version}") + print(f" Platform: {os.sys.platform}") + + +def pytest_sessionstart(session): + """Called after the Session object has been created.""" + print(f"๐Ÿ Pytest session starting at {time.strftime('%Y-%m-%d %H:%M:%S')}") + print(f" Session ID: {id(session)}") + + # Show initial process state + try: + import psutil + + current = psutil.Process() + print(f" Current PID: {current.pid}") + print(f" Parent PID: {current.ppid()}") + children = current.children(recursive=True) + if children: + print(f" โš ๏ธ Already have {len(children)} child processes at start!") + except Exception: + pass + + +def pytest_sessionfinish(session, exitstatus): + """Called after whole test run finished.""" + print(f"๐Ÿ Pytest session finishing at {time.strftime('%Y-%m-%d %H:%M:%S')}") + print(f" Exit status: {exitstatus}") + + # Aggressive cleanup before pytest exits + print("๐Ÿงน Starting aggressive cleanup...") + + try: + import psutil + + current = psutil.Process() + children = current.children(recursive=True) + + if children: + print(f" Found {len(children)} child processes to clean up:") + for child in children: + try: + print(f" - PID {child.pid}: {child.name()} (status: {child.status()})") + child.terminate() + except Exception as e: + print(f" - Failed to terminate {child.pid}: {e}") + + # Wait briefly then kill + time.sleep(0.5) + _, alive = psutil.wait_procs(children, timeout=1) + + for child in alive: + try: + print(f" - Force killing {child.pid}") + child.kill() + except Exception: + pass + else: + print(" No child processes found") + + except Exception as e: + print(f" Cleanup error: {e}") + + print(f"โœ… Pytest exiting at {time.strftime('%Y-%m-%d %H:%M:%S')}")