fix: add extensive logging and fix subprocess PIPE blocking

1. CI Logging Enhancements: - Added comprehensive diagnostics with process tree, network listeners, file descriptors - Added timestamps at every stage (before/during/after pytest) - Added trap EXIT to always show diagnostics - Added immediate process checks after pytest finishes - Added sub-shell execution with immediate cleanup 2. Fixed Subprocess PIPE Blocking: - Changed Colab mode from PIPE to DEVNULL to prevent blocking - PIPE without reading can cause parent process to wait indefinitely 3. Pytest Session Hooks: - Added pytest_sessionstart to log initial state - Added pytest_sessionfinish for aggressive cleanup before exit - Shows all child processes and their status This should reveal exactly where the hang is happening.
2025-08-08 18:55:50 -07:00
parent a35bfb0354
commit 439debbd3f
3 changed files with 168 additions and 30 deletions
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -263,42 +263,113 @@ jobs:
          # Activate virtual environment
          source .venv/bin/activate || source .venv/Scripts/activate
-          # Define diagnostic function for debugging hangs
+          # Define comprehensive diagnostic function
          diag() {
-            echo "===== DIAG BEGIN ====="
+            echo "===== COMPREHENSIVE DIAGNOSTICS BEGIN ====="
            date
-            echo "# pstree (current shell group)"
+            echo ""
-            pstree -ap $$ 2>/dev/null || true
+            echo "### Current Shell Info ###"
-            echo "# python/pytest processes"
+            echo "Shell PID: $$"
-            ps -ef | grep -E 'python|pytest|embedding|zmq|diskann' | grep -v grep || true
+            echo "Shell PPID: $PPID"
-            echo "# network listeners"
+            echo "Current directory: $(pwd)"
            echo ""
            echo "### Process Tree (full) ###"
            pstree -ap 2>/dev/null || ps auxf || true
            echo ""
            echo "### All Python/Pytest Processes ###"
            ps -ef | grep -E 'python|pytest' | grep -v grep || true
            echo ""
            echo "### Embedding Server Processes ###"
            ps -ef | grep -E 'embedding|zmq|diskann' | grep -v grep || true
            echo ""
            echo "### Network Listeners ###"
            ss -ltnp 2>/dev/null || netstat -ltn 2>/dev/null || true
-            echo "# pytest PIDs"
+            echo ""
-            pgrep -fa pytest || true
+
-            echo "===== DIAG END ====="
+            echo "### Open File Descriptors (lsof) ###"
            lsof -p $$ 2>/dev/null | head -20 || true
            echo ""
            echo "### Zombie Processes ###"
            ps aux | grep '<defunct>' || echo "No zombie processes"
            echo ""
            echo "### Current Jobs ###"
            jobs -l || true
            echo ""
            echo "### /proc/PID/fd for current shell ###"
            ls -la /proc/$$/fd 2>/dev/null || true
            echo ""
            echo "===== COMPREHENSIVE DIAGNOSTICS END ====="
          }
-          # Run all tests with timeout on Linux to prevent hanging
+          # Run all tests with extensive logging
          if [[ "$RUNNER_OS" == "Linux" ]]; then
-            echo "Running tests with timeout (Linux)..."
+            echo "🚀 Starting Linux test execution with timeout..."
-            # Set trap for diagnostics
+            echo "Current time: $(date)"
-            trap diag INT TERM
+            echo "Shell PID: $$"
-            timeout --signal=INT 180 pytest tests/ -vv --maxfail=3 || {
+            # Set trap for diagnostics
-              EXIT_CODE=$?
+            trap diag INT TERM EXIT
-              if [ $EXIT_CODE -eq 124 ]; then
+
-                echo "⚠️ Tests timed out after 180 seconds - dumping diagnostics..."
+            echo "📋 Pre-test diagnostics:"
-                diag
+            ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test"
-                # Try to clean up any leftover processes
+
-                pkill -TERM -P $$ || true
+            echo "🏃 Running pytest with 180s timeout..."
-                sleep 1
+            timeout --preserve-status --signal=INT --kill-after=10 180 bash -c '
-                pkill -KILL -P $$ || true
+              echo "⏱️ Pytest starting at: $(date)"
-              fi
+              pytest tests/ -vv --maxfail=3
-              exit $EXIT_CODE
+              PYTEST_EXIT=$?
-            }
+              echo "✅ Pytest finished at: $(date) with exit code: $PYTEST_EXIT"
              # Immediately check for leftover processes
              echo "🔍 Post-pytest process check:"
              ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "No leftover processes"
              # Clean up any children before exit
              echo "🧹 Cleaning up child processes..."
              pkill -TERM -P $$ 2>/dev/null || true
              sleep 0.5
              pkill -KILL -P $$ 2>/dev/null || true
              echo "📊 Final check before exit:"
              ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "All clean"
              exit $PYTEST_EXIT
            '
            EXIT_CODE=$?
            echo "🔚 Timeout command exited with code: $EXIT_CODE"
            if [ $EXIT_CODE -eq 124 ]; then
              echo "⚠️ TIMEOUT TRIGGERED - Tests took more than 180 seconds!"
              echo "📸 Capturing full diagnostics..."
              diag
              # More aggressive cleanup
              echo "💀 Killing all Python processes owned by runner..."
              pkill -9 -u runner python || true
              pkill -9 -u runner pytest || true
            elif [ $EXIT_CODE -ne 0 ]; then
              echo "❌ Tests failed with exit code: $EXIT_CODE"
            else
              echo "✅ All tests passed!"
            fi
            # Always show final state
            echo "📍 Final state check:"
            ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining"
            exit $EXIT_CODE
          else
            # For macOS/Windows, run without GNU timeout
-            echo "Running tests ($RUNNER_OS)..."
+            echo "🚀 Running tests on $RUNNER_OS..."
            pytest tests/ -vv --maxfail=3
          fi
--- a/packages/leann-core/src/leann/embedding_server_manager.py
+++ b/packages/leann-core/src/leann/embedding_server_manager.py
@@ -402,11 +402,12 @@ class EmbeddingServerManager:
        """Launch the server process with Colab-specific settings."""
        logger.info(f"Colab Command: {' '.join(command)}")
-        # In Colab, we need to be more careful about process management
+        # In Colab, redirect to DEVNULL to avoid pipe blocking
        # PIPE without reading can cause hangs
        self.server_process = subprocess.Popen(
            command,
-            stdout=subprocess.PIPE,
+            stdout=subprocess.DEVNULL,
-            stderr=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
            text=True,
        )
        self.server_port = port
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -184,3 +184,69 @@ def pytest_configure(config):
    # Set default timeout method to thread if not specified
    if not config.getoption("--timeout-method", None):
        config.option.timeout_method = "thread"
    # Add more logging
    print(f"🔧 Pytest configured at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"   Python version: {os.sys.version}")
    print(f"   Platform: {os.sys.platform}")
 def pytest_sessionstart(session):
    """Called after the Session object has been created."""
    print(f"🏁 Pytest session starting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"   Session ID: {id(session)}")
    # Show initial process state
    try:
        import psutil
        current = psutil.Process()
        print(f"   Current PID: {current.pid}")
        print(f"   Parent PID: {current.ppid()}")
        children = current.children(recursive=True)
        if children:
            print(f"   ⚠️ Already have {len(children)} child processes at start!")
    except Exception:
        pass
 def pytest_sessionfinish(session, exitstatus):
    """Called after whole test run finished."""
    print(f"🏁 Pytest session finishing at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"   Exit status: {exitstatus}")
    # Aggressive cleanup before pytest exits
    print("🧹 Starting aggressive cleanup...")
    try:
        import psutil
        current = psutil.Process()
        children = current.children(recursive=True)
        if children:
            print(f"   Found {len(children)} child processes to clean up:")
            for child in children:
                try:
                    print(f"     - PID {child.pid}: {child.name()} (status: {child.status()})")
                    child.terminate()
                except Exception as e:
                    print(f"     - Failed to terminate {child.pid}: {e}")
            # Wait briefly then kill
            time.sleep(0.5)
            _, alive = psutil.wait_procs(children, timeout=1)
            for child in alive:
                try:
                    print(f"     - Force killing {child.pid}")
                    child.kill()
                except Exception:
                    pass
        else:
            print("   No child processes found")
    except Exception as e:
        print(f"   Cleanup error: {e}")
    print(f"✅ Pytest exiting at {time.strftime('%Y-%m-%d %H:%M:%S')}")