fix: add extensive logging and fix subprocess PIPE blocking
1. CI Logging Enhancements:
   - Added comprehensive diagnostics with process tree, network listeners, file descriptors
   - Added timestamps at every stage (before/during/after pytest)
   - Added trap EXIT to always show diagnostics
   - Added immediate process checks after pytest finishes
   - Added sub-shell execution with immediate cleanup

2. Fixed Subprocess PIPE Blocking:
   - Changed Colab mode from PIPE to DEVNULL to prevent blocking
   - A PIPE that is never read can fill up and block the server process, which can cause the parent process to wait indefinitely

3. Pytest Session Hooks:
   - Added pytest_sessionstart to log initial state
   - Added pytest_sessionfinish for aggressive cleanup before exit
   - Shows all child processes and their status

This should reveal exactly where the hang is happening.
This commit is contained in:
125
.github/workflows/build-reusable.yml
vendored
125
.github/workflows/build-reusable.yml
vendored
@@ -263,42 +263,113 @@ jobs:
|
|||||||
# Activate virtual environment
|
# Activate virtual environment
|
||||||
source .venv/bin/activate || source .venv/Scripts/activate
|
source .venv/bin/activate || source .venv/Scripts/activate
|
||||||
|
|
||||||
# Define diagnostic function for debugging hangs
|
# Define comprehensive diagnostic function
|
||||||
diag() {
|
diag() {
|
||||||
echo "===== DIAG BEGIN ====="
|
echo "===== COMPREHENSIVE DIAGNOSTICS BEGIN ====="
|
||||||
date
|
date
|
||||||
echo "# pstree (current shell group)"
|
echo ""
|
||||||
pstree -ap $$ 2>/dev/null || true
|
echo "### Current Shell Info ###"
|
||||||
echo "# python/pytest processes"
|
echo "Shell PID: $$"
|
||||||
ps -ef | grep -E 'python|pytest|embedding|zmq|diskann' | grep -v grep || true
|
echo "Shell PPID: $PPID"
|
||||||
echo "# network listeners"
|
echo "Current directory: $(pwd)"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
echo "### Process Tree (full) ###"
|
||||||
|
pstree -ap 2>/dev/null || ps auxf || true
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
echo "### All Python/Pytest Processes ###"
|
||||||
|
ps -ef | grep -E 'python|pytest' | grep -v grep || true
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
echo "### Embedding Server Processes ###"
|
||||||
|
ps -ef | grep -E 'embedding|zmq|diskann' | grep -v grep || true
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
echo "### Network Listeners ###"
|
||||||
ss -ltnp 2>/dev/null || netstat -ltn 2>/dev/null || true
|
ss -ltnp 2>/dev/null || netstat -ltn 2>/dev/null || true
|
||||||
echo "# pytest PIDs"
|
echo ""
|
||||||
pgrep -fa pytest || true
|
|
||||||
echo "===== DIAG END ====="
|
echo "### Open File Descriptors (lsof) ###"
|
||||||
|
lsof -p $$ 2>/dev/null | head -20 || true
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
echo "### Zombie Processes ###"
|
||||||
|
ps aux | grep '<defunct>' || echo "No zombie processes"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
echo "### Current Jobs ###"
|
||||||
|
jobs -l || true
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
echo "### /proc/PID/fd for current shell ###"
|
||||||
|
ls -la /proc/$$/fd 2>/dev/null || true
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
echo "===== COMPREHENSIVE DIAGNOSTICS END ====="
|
||||||
}
|
}
|
||||||
|
|
||||||
# Run all tests with timeout on Linux to prevent hanging
|
# Run all tests with extensive logging
|
||||||
if [[ "$RUNNER_OS" == "Linux" ]]; then
|
if [[ "$RUNNER_OS" == "Linux" ]]; then
|
||||||
echo "Running tests with timeout (Linux)..."
|
echo "🚀 Starting Linux test execution with timeout..."
|
||||||
# Set trap for diagnostics
|
echo "Current time: $(date)"
|
||||||
trap diag INT TERM
|
echo "Shell PID: $$"
|
||||||
|
|
||||||
timeout --signal=INT 180 pytest tests/ -vv --maxfail=3 || {
|
# Set trap for diagnostics
|
||||||
EXIT_CODE=$?
|
trap diag INT TERM EXIT
|
||||||
if [ $EXIT_CODE -eq 124 ]; then
|
|
||||||
echo "⚠️ Tests timed out after 180 seconds - dumping diagnostics..."
|
echo "📋 Pre-test diagnostics:"
|
||||||
diag
|
ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test"
|
||||||
# Try to clean up any leftover processes
|
|
||||||
pkill -TERM -P $$ || true
|
echo "🏃 Running pytest with 180s timeout..."
|
||||||
sleep 1
|
timeout --preserve-status --signal=INT --kill-after=10 180 bash -c '
|
||||||
pkill -KILL -P $$ || true
|
echo "⏱️ Pytest starting at: $(date)"
|
||||||
fi
|
pytest tests/ -vv --maxfail=3
|
||||||
exit $EXIT_CODE
|
PYTEST_EXIT=$?
|
||||||
}
|
echo "✅ Pytest finished at: $(date) with exit code: $PYTEST_EXIT"
|
||||||
|
|
||||||
|
# Immediately check for leftover processes
|
||||||
|
echo "🔍 Post-pytest process check:"
|
||||||
|
ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "No leftover processes"
|
||||||
|
|
||||||
|
# Clean up any children before exit
|
||||||
|
echo "🧹 Cleaning up child processes..."
|
||||||
|
pkill -TERM -P $$ 2>/dev/null || true
|
||||||
|
sleep 0.5
|
||||||
|
pkill -KILL -P $$ 2>/dev/null || true
|
||||||
|
|
||||||
|
echo "📊 Final check before exit:"
|
||||||
|
ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "All clean"
|
||||||
|
|
||||||
|
exit $PYTEST_EXIT
|
||||||
|
'
|
||||||
|
|
||||||
|
EXIT_CODE=$?
|
||||||
|
echo "🔚 Timeout command exited with code: $EXIT_CODE"
|
||||||
|
|
||||||
|
if [ $EXIT_CODE -eq 124 ]; then
|
||||||
|
echo "⚠️ TIMEOUT TRIGGERED - Tests took more than 180 seconds!"
|
||||||
|
echo "📸 Capturing full diagnostics..."
|
||||||
|
diag
|
||||||
|
|
||||||
|
# More aggressive cleanup
|
||||||
|
echo "💀 Killing all Python processes owned by runner..."
|
||||||
|
pkill -9 -u runner python || true
|
||||||
|
pkill -9 -u runner pytest || true
|
||||||
|
elif [ $EXIT_CODE -ne 0 ]; then
|
||||||
|
echo "❌ Tests failed with exit code: $EXIT_CODE"
|
||||||
|
else
|
||||||
|
echo "✅ All tests passed!"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Always show final state
|
||||||
|
echo "📍 Final state check:"
|
||||||
|
ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining"
|
||||||
|
|
||||||
|
exit $EXIT_CODE
|
||||||
else
|
else
|
||||||
# For macOS/Windows, run without GNU timeout
|
# For macOS/Windows, run without GNU timeout
|
||||||
echo "Running tests ($RUNNER_OS)..."
|
echo "🚀 Running tests on $RUNNER_OS..."
|
||||||
pytest tests/ -vv --maxfail=3
|
pytest tests/ -vv --maxfail=3
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -402,11 +402,12 @@ class EmbeddingServerManager:
|
|||||||
"""Launch the server process with Colab-specific settings."""
|
"""Launch the server process with Colab-specific settings."""
|
||||||
logger.info(f"Colab Command: {' '.join(command)}")
|
logger.info(f"Colab Command: {' '.join(command)}")
|
||||||
|
|
||||||
# In Colab, we need to be more careful about process management
|
# In Colab, redirect to DEVNULL to avoid pipe blocking
|
||||||
|
# PIPE without reading can cause hangs
|
||||||
self.server_process = subprocess.Popen(
|
self.server_process = subprocess.Popen(
|
||||||
command,
|
command,
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.DEVNULL,
|
||||||
stderr=subprocess.PIPE,
|
stderr=subprocess.DEVNULL,
|
||||||
text=True,
|
text=True,
|
||||||
)
|
)
|
||||||
self.server_port = port
|
self.server_port = port
|
||||||
|
|||||||
@@ -184,3 +184,69 @@ def pytest_configure(config):
|
|||||||
# Set default timeout method to thread if not specified
|
# Set default timeout method to thread if not specified
|
||||||
if not config.getoption("--timeout-method", None):
|
if not config.getoption("--timeout-method", None):
|
||||||
config.option.timeout_method = "thread"
|
config.option.timeout_method = "thread"
|
||||||
|
|
||||||
|
# Add more logging
|
||||||
|
print(f"🔧 Pytest configured at {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||||
|
print(f" Python version: {os.sys.version}")
|
||||||
|
print(f" Platform: {os.sys.platform}")
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_sessionstart(session):
|
||||||
|
"""Called after the Session object has been created."""
|
||||||
|
print(f"🏁 Pytest session starting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||||
|
print(f" Session ID: {id(session)}")
|
||||||
|
|
||||||
|
# Show initial process state
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
current = psutil.Process()
|
||||||
|
print(f" Current PID: {current.pid}")
|
||||||
|
print(f" Parent PID: {current.ppid()}")
|
||||||
|
children = current.children(recursive=True)
|
||||||
|
if children:
|
||||||
|
print(f" ⚠️ Already have {len(children)} child processes at start!")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_sessionfinish(session, exitstatus):
|
||||||
|
"""Called after whole test run finished."""
|
||||||
|
print(f"🏁 Pytest session finishing at {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||||
|
print(f" Exit status: {exitstatus}")
|
||||||
|
|
||||||
|
# Aggressive cleanup before pytest exits
|
||||||
|
print("🧹 Starting aggressive cleanup...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
current = psutil.Process()
|
||||||
|
children = current.children(recursive=True)
|
||||||
|
|
||||||
|
if children:
|
||||||
|
print(f" Found {len(children)} child processes to clean up:")
|
||||||
|
for child in children:
|
||||||
|
try:
|
||||||
|
print(f" - PID {child.pid}: {child.name()} (status: {child.status()})")
|
||||||
|
child.terminate()
|
||||||
|
except Exception as e:
|
||||||
|
print(f" - Failed to terminate {child.pid}: {e}")
|
||||||
|
|
||||||
|
# Wait briefly then kill
|
||||||
|
time.sleep(0.5)
|
||||||
|
_, alive = psutil.wait_procs(children, timeout=1)
|
||||||
|
|
||||||
|
for child in alive:
|
||||||
|
try:
|
||||||
|
print(f" - Force killing {child.pid}")
|
||||||
|
child.kill()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
print(" No child processes found")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Cleanup error: {e}")
|
||||||
|
|
||||||
|
print(f"✅ Pytest exiting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||||
|
|||||||
Reference in New Issue
Block a user