The issue was that tmate was placed before the pytest step, but the hang occurs during pytest execution. Now tmate starts inside the test step and prints its connection info before the tests run.
429 lines
16 KiB
YAML
429 lines
16 KiB
YAML
# Reusable workflow: other workflows invoke it through `workflow_call`.
name: Reusable Build

on:
  workflow_call:
    inputs:
      # Passed to actions/checkout as `ref` in both jobs.
      ref:
        type: string
        required: false
        default: ''
        description: 'Git ref to build'
      # Gates the tmate debugging paths in the build job's test steps.
      debug_enabled:
        type: boolean
        required: false
        default: false
        description: 'Enable tmate debugging session for troubleshooting'
|
|
|
|
jobs:
|
|
# Static checks with ruff; the build job lists this job under `needs`.
lint:
  name: Lint and Format Check
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
      with:
        ref: ${{ inputs.ref }}

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'

    - name: Install uv
      uses: astral-sh/setup-uv@v4

    # ruff is pinned to an exact version.
    - name: Install ruff
      run: uv tool install ruff==0.12.7

    # Lint rules.
    - name: Run ruff check
      run: ruff check .

    # Formatting is only verified (`--check`), never rewritten.
    - name: Run ruff format check
      run: ruff format --check .
|
|
|
|
# Builds, repairs, installs and tests the wheels once per OS / Python pair.
build:
  needs: lint
  name: Build ${{ matrix.os }} Python ${{ matrix.python }}
  runs-on: ${{ matrix.os }}
  strategy:
    matrix:
      # The cross product of these two axes is the same ten combinations
      # that would otherwise be spelled out one by one under `include`.
      os:
        - ubuntu-22.04
        - macos-latest
      # Quoted so that 3.10 is not parsed as the float 3.1.
      python:
        - '3.9'
        - '3.10'
        - '3.11'
        - '3.12'
        - '3.13'
|
|
|
|
steps:
|
|
# Fetch the requested ref together with all git submodules.
- uses: actions/checkout@v4
  with:
    submodules: recursive
    ref: ${{ inputs.ref }}

# Interpreter version comes from the matrix entry of this job.
- name: Setup Python
  uses: actions/setup-python@v5
  with:
    python-version: ${{ matrix.python }}

# uv is used by the later steps for building and installing packages.
- name: Install uv
  uses: astral-sh/setup-uv@v4
|
|
|
|
# Linux only: apt packages plus Intel oneMKL, exported for later steps.
- name: Install system dependencies (Ubuntu)
  if: runner.os == 'Linux'
  run: |
    sudo apt-get update
    sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \
      pkg-config libopenblas-dev patchelf libabsl-dev libaio-dev libprotobuf-dev

    # Install Intel MKL for DiskANN
    mkl_installer=intel-onemkl-2025.0.0.940.sh
    wget -q "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/${mkl_installer}"
    sudo sh "${mkl_installer}" -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
    source /opt/intel/oneapi/setvars.sh

    # Entries appended to $GITHUB_ENV become environment variables of the
    # following steps; LD_LIBRARY_PATH is expanded here, after setvars.sh ran.
    mkl_root=/opt/intel/oneapi/mkl/latest
    {
      echo "MKLROOT=${mkl_root}"
      echo "LD_LIBRARY_PATH=${mkl_root}/lib/intel64:$LD_LIBRARY_PATH"
    } >> "$GITHUB_ENV"
|
|
|
|
# macOS only. Homebrew LLVM is intentionally not installed: the build step
# uses the system clang for better compatibility.
- name: Install system dependencies (macOS)
  if: runner.os == 'macOS'
  run: brew install libomp boost protobuf zeromq
|
|
|
|
# Python-side build tooling, plus the wheel-repair tool for this platform
# (auditwheel on Linux, delocate everywhere else).
- name: Install build dependencies
  run: |
    uv pip install --system scikit-build-core numpy swig Cython pybind11

    case "$RUNNER_OS" in
      Linux)
        uv pip install --system auditwheel
        ;;
      *)
        uv pip install --system delocate
        ;;
    esac
|
|
|
|
# Builds all four packages in order: core, HNSW backend, DiskANN backend,
# meta package. The macOS-specific compiler setup is keyed on $RUNNER_OS,
# like the other steps of this job, rather than on the literal runner label,
# so it applies to any macOS image the matrix may use.
- name: Build packages
  run: |
    # Build a package with plain `uv build` from inside its directory.
    #   $1 - package directory
    build_pure_package() {
      pushd "$1" > /dev/null
      uv build
      popd > /dev/null
    }

    # Build the wheel of a native backend from inside its directory.
    #   $1 - package directory
    #   $2 - MACOSX_DEPLOYMENT_TARGET to export when running on macOS
    build_backend_wheel() {
      if [[ "$RUNNER_OS" == "macOS" ]]; then
        # Use system clang instead of homebrew LLVM for better compatibility
        export CC=clang
        export CXX=clang++
        export MACOSX_DEPLOYMENT_TARGET="$2"
      fi
      pushd "$1" > /dev/null
      uv build --wheel --python python
      popd > /dev/null
    }

    # Build core (platform independent) on all platforms for consistency
    build_pure_package packages/leann-core

    # Build HNSW backend
    build_backend_wheel packages/leann-backend-hnsw 11.0

    # Build DiskANN backend
    # sgesdd_ is only available on macOS 13.3+
    build_backend_wheel packages/leann-backend-diskann 13.3

    # Build meta package (platform independent) on all platforms
    build_pure_package packages/leann
|
|
|
|
# Linux only: run auditwheel over both backend wheels and replace each
# package's dist/ with the repaired output.
- name: Repair wheels (Linux)
  if: runner.os == 'Linux'
  run: |
    # Swap the repaired wheels in place of the originals (run inside the
    # package directory).
    promote_repaired() {
      rm -rf dist
      mv dist_repaired dist
    }

    # Repair HNSW wheel
    pushd packages/leann-backend-hnsw > /dev/null
    if [ -d dist ]; then
      auditwheel repair dist/*.whl -w dist_repaired
      promote_repaired
    fi
    popd > /dev/null

    # Repair DiskANN wheel - use show first to debug
    pushd packages/leann-backend-diskann > /dev/null
    if [ -d dist ]; then
      echo "Checking DiskANN wheel contents before repair:"
      unzip -l dist/*.whl | grep -E "\.so|\.pyd|_diskannpy" || echo "No .so files found"
      auditwheel show dist/*.whl || echo "auditwheel show failed"
      auditwheel repair dist/*.whl -w dist_repaired
      echo "Checking DiskANN wheel contents after repair:"
      unzip -l dist_repaired/*.whl | grep -E "\.so|\.pyd|_diskannpy" || echo "No .so files found after repair"
      promote_repaired
    fi
    popd > /dev/null
|
|
|
|
# macOS only: run delocate over both backend wheels (HNSW first, then
# DiskANN) and replace each package's dist/ with the repaired output.
- name: Repair wheels (macOS)
  if: runner.os == 'macOS'
  run: |
    for backend in leann-backend-hnsw leann-backend-diskann; do
      pushd "packages/${backend}" > /dev/null
      # Packages without a dist/ directory are skipped silently.
      if [ -d dist ]; then
        delocate-wheel -w dist_repaired -v dist/*.whl
        rm -rf dist
        mv dist_repaired dist
      fi
      popd > /dev/null
    done
|
|
|
|
# Log every wheel and sdist found under packages/*/dist, sorted by path.
- name: List built packages
  run: |
    echo "📦 Built packages:"
    find packages/*/dist \( -name "*.whl" -o -name "*.tar.gz" \) | sort
|
|
|
|
# Creates .venv, installs the four locally built packages plus the test
# extras into it, then prints a few import diagnostics for DiskANN.
- name: Install built packages for testing
  run: |
    # Create a virtual environment with the correct Python version
    uv venv --python python${{ matrix.python }}
    source .venv/bin/activate || source .venv/Scripts/activate

    # Install the built wheels directly to ensure we use locally built packages
    # Use only locally built wheels on all platforms for full consistency
    find_links=()
    for pkg in leann-core leann leann-backend-hnsw leann-backend-diskann; do
      find_links+=(--find-links "packages/${pkg}/dist")
    done

    uv pip install leann-core leann leann-backend-hnsw leann-backend-diskann \
      "${find_links[@]}" --force-reinstall

    # Install test dependencies using extras
    uv pip install -e ".[test]"

    # Debug: Check if _diskannpy module is installed correctly
    # (every probe falls back to an echo, so none of them can fail the step)
    echo "Checking installed DiskANN module structure:"
    python -c "import leann_backend_diskann; print('leann_backend_diskann location:', leann_backend_diskann.__file__)" || echo "Failed to import leann_backend_diskann"
    python -c "from leann_backend_diskann import _diskannpy; print('_diskannpy imported successfully')" || echo "Failed to import _diskannpy"
    ls -la $(python -c "import leann_backend_diskann; import os; print(os.path.dirname(leann_backend_diskann.__file__))" 2>/dev/null) 2>/dev/null || echo "Failed to list module directory"
|
|
|
|
# Runs the test suite inside the .venv created by the install step.
#
# Flow:
#   1. Optionally (debug_enabled) start a background tmate session and wait
#      60 seconds so somebody can attach before the tests begin.
#   2. On Linux, run pytest under GNU `timeout` (180 s, SIGINT, then SIGKILL
#      10 s later) and dump process/network diagnostics if it times out.
#   3. On other runners, run pytest directly without a timeout.
#
# Notes on the Linux branch:
#   * `timeout` is used WITHOUT --preserve-status. With that flag a timed-out
#     run reports the child's own status, so the 124 check below could never
#     match.
#   * The default shell for `run` is `bash -e`, so the exit status is captured
#     with `|| EXIT_CODE=$?`; otherwise a failing run would abort the script
#     before any of the reporting below.
- name: Run tests with pytest
  env:
    CI: 'true'  # Mark as CI environment to skip memory-intensive tests
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    HF_HUB_DISABLE_SYMLINKS: '1'
    TOKENIZERS_PARALLELISM: 'false'
    PYTORCH_ENABLE_MPS_FALLBACK: '0'  # Disable MPS on macOS CI to avoid memory issues
    OMP_NUM_THREADS: '1'  # Disable OpenMP parallelism to avoid libomp crashes
    MKL_NUM_THREADS: '1'  # Single thread for MKL operations
    # Passed through the environment instead of being expanded into the script.
    DEBUG_ENABLED: ${{ inputs.debug_enabled }}
  run: |
    # Activate virtual environment
    source .venv/bin/activate || source .venv/Scripts/activate

    # Start tmate session INSIDE the test step if debug enabled
    if [[ "$DEBUG_ENABLED" == "true" ]]; then
      echo "🔧 DEBUG MODE: Starting tmate session before tests..."
      # Install tmate if not available
      if ! command -v tmate &> /dev/null; then
        if [[ "$RUNNER_OS" == "Linux" ]]; then
          sudo apt-get update && sudo apt-get install -y tmate
        elif [[ "$RUNNER_OS" == "macOS" ]]; then
          brew install tmate
        fi
      fi

      # Start tmate session in background
      tmate -S debug-session new-session -d
      # The connection strings are only populated once the session has
      # registered with the tmate server, so block until it reports ready.
      tmate -S debug-session wait tmate-ready
      echo "🔗 Tmate session created. Connection info:"
      tmate -S debug-session display -p '#{tmate_ssh}'
      echo "🔗 Web session: $(tmate -S debug-session display -p '#{tmate_web}')"
      echo "⏱️ Session will timeout after 30 minutes"
      echo "💡 You can now SSH in and run: pytest tests/ -vv --capture=no"
      echo "💡 Or run diagnostics: bash scripts/diagnose_hang.sh"
      echo ""
      echo "Waiting 60 seconds for you to connect..."
      sleep 60
    fi

    # Define comprehensive diagnostic function
    # Every probe is best-effort (`|| true` or a fallback), so the dump itself
    # can never fail the step.
    diag() {
      echo "===== COMPREHENSIVE DIAGNOSTICS BEGIN ====="
      date
      echo ""
      echo "### Current Shell Info ###"
      echo "Shell PID: $$"
      echo "Shell PPID: $PPID"
      echo "Current directory: $(pwd)"
      echo ""

      echo "### Process Tree (full) ###"
      pstree -ap 2>/dev/null || ps auxf || true
      echo ""

      echo "### All Python/Pytest Processes ###"
      ps -ef | grep -E 'python|pytest' | grep -v grep || true
      echo ""

      echo "### Embedding Server Processes ###"
      ps -ef | grep -E 'embedding|zmq|diskann' | grep -v grep || true
      echo ""

      echo "### Network Listeners ###"
      ss -ltnp 2>/dev/null || netstat -ltn 2>/dev/null || true
      echo ""

      echo "### Open File Descriptors (lsof) ###"
      lsof -p $$ 2>/dev/null | head -20 || true
      echo ""

      echo "### Zombie Processes ###"
      ps aux | grep '<defunct>' || echo "No zombie processes"
      echo ""

      echo "### Current Jobs ###"
      jobs -l || true
      echo ""

      echo "### /proc/PID/fd for current shell ###"
      ls -la /proc/$$/fd 2>/dev/null || true
      echo ""

      echo "===== COMPREHENSIVE DIAGNOSTICS END ====="
    }

    # Enable verbose logging for debugging
    export PYTHONUNBUFFERED=1
    export PYTEST_CURRENT_TEST=1

    # Run all tests with extensive logging
    if [[ "$RUNNER_OS" == "Linux" ]]; then
      echo "🚀 Starting Linux test execution with timeout..."
      echo "Current time: $(date)"
      echo "Shell PID: $$"
      echo "Python: $(python --version)"
      echo "Pytest: $(pytest --version)"

      # Show environment variables for debugging
      echo "📦 Environment variables:"
      env | grep -E "PYTHON|PYTEST|CI|RUNNER" | sort

      # Set trap for diagnostics
      trap diag INT TERM EXIT

      echo "📋 Pre-test diagnostics:"
      ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test"

      # Check for any listening ports before test
      echo "🔌 Pre-test network state:"
      ss -ltn 2>/dev/null | grep -E "555[0-9]|556[0-9]" || echo "No embedding server ports open"

      echo "🏃 Running pytest with 180s timeout..."
      # Exit status: 124 when the 180 s limit was hit, 137 when the SIGKILL
      # escalation was needed, otherwise the status of the inner script.
      EXIT_CODE=0
      timeout --signal=INT --kill-after=10 180 bash -c '
        echo "⏱️ Pytest starting at: $(date)"
        echo "Running command: pytest tests/ -vv --maxfail=3 --tb=short --capture=no"

        # Run pytest with maximum verbosity and no output capture
        pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=DEBUG 2>&1 | tee pytest.log
        PYTEST_EXIT=${PIPESTATUS[0]}

        echo "✅ Pytest finished at: $(date) with exit code: $PYTEST_EXIT"
        echo "Last 20 lines of pytest output:"
        tail -20 pytest.log || true

        # Immediately check for leftover processes
        echo "🔍 Post-pytest process check:"
        ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "No leftover processes"

        # Clean up any children before exit
        echo "🧹 Cleaning up child processes..."
        pkill -TERM -P $$ 2>/dev/null || true
        sleep 0.5
        pkill -KILL -P $$ 2>/dev/null || true

        echo "📊 Final check before exit:"
        ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "All clean"

        exit $PYTEST_EXIT
      ' || EXIT_CODE=$?

      echo "🔚 Timeout command exited with code: $EXIT_CODE"

      if [ "$EXIT_CODE" -eq 124 ] || [ "$EXIT_CODE" -eq 137 ]; then
        echo "⚠️ TIMEOUT TRIGGERED - Tests took more than 180 seconds!"
        echo "📸 Capturing full diagnostics..."
        diag
        # The dump above was taken while the hung processes were still alive;
        # disarm the trap so it is not repeated after they have been killed.
        trap - INT TERM EXIT

        # Run diagnostic script if available
        if [ -f scripts/diagnose_hang.sh ]; then
          echo "🔍 Running diagnostic script..."
          bash scripts/diagnose_hang.sh || true
        fi

        # More aggressive cleanup
        echo "💀 Killing all Python processes owned by runner..."
        pkill -9 -u runner python || true
        pkill -9 -u runner pytest || true
      elif [ "$EXIT_CODE" -ne 0 ]; then
        echo "❌ Tests failed with exit code: $EXIT_CODE"
      else
        echo "✅ All tests passed!"
      fi

      # Always show final state
      echo "📍 Final state check:"
      ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining"

      exit $EXIT_CODE
    else
      # For macOS/Windows, run without GNU timeout
      echo "🚀 Running tests on $RUNNER_OS..."
      pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=INFO
    fi
|
|
fi
|
|
|
|
# Provide tmate session on test failure for debugging
# Runs only when an earlier step failed AND debugging was requested, either
# through the `debug_enabled` input or a `[debug]` marker in the head commit
# message.
- name: Setup tmate session on failure
  if: ${{ failure() && (inputs.debug_enabled || contains(github.event.head_commit.message, '[debug]')) }}
  uses: mxschmitt/action-tmate@v3
  # `timeout-minutes` is a step-level key, not an input of the action, so it
  # has to sit here rather than under `with:` for the 30-minute limit to apply.
  timeout-minutes: 30
  with:
    # Only the user who triggered the run may connect to the session.
    limit-access-to-actor: true
|
|
|
|
# Optional extra check: skipped when the script is absent, and a failing
# script only prints a warning instead of failing the step.
- name: Run sanity checks (optional)
  run: |
    # Activate virtual environment
    source .venv/bin/activate || source .venv/Scripts/activate

    # Run distance function tests if available
    sanity_script=test/sanity_checks/test_distance_functions.py
    if [ -f "${sanity_script}" ]; then
      echo "Running distance function sanity checks..."
      python "${sanity_script}" || echo "⚠️ Distance function test failed, continuing..."
    fi
|
|
fi
|
|
|
|
# Publish every package's dist/ directory as one artifact per matrix entry;
# the OS and Python version in the name keep the artifact names distinct.
- name: Upload artifacts
  uses: actions/upload-artifact@v4
  with:
    path: packages/*/dist/
    name: packages-${{ matrix.os }}-py${{ matrix.python }}
|