Files
LEANN/.github/workflows/build-reusable.yml
Andy Lee eb6c9e0a32 fix: move tmate debug session inside pytest step to avoid hanging
The issue was that tmate was placed before pytest step, but the hang
occurs during pytest execution. Now tmate starts inside the test step
and provides connection info before running tests.
2025-08-11 22:19:34 +00:00

429 lines
16 KiB
YAML

name: Reusable Build
on:
workflow_call:
inputs:
ref:
description: 'Git ref to build'
required: false
type: string
default: ''
debug_enabled:
description: 'Enable tmate debugging session for troubleshooting'
required: false
type: boolean
default: false
jobs:
lint:
name: Lint and Format Check
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install uv
uses: astral-sh/setup-uv@v4
- name: Install ruff
run: |
uv tool install ruff==0.12.7
- name: Run ruff check
run: |
ruff check .
- name: Run ruff format check
run: |
ruff format --check .
build:
needs: lint
name: Build ${{ matrix.os }} Python ${{ matrix.python }}
strategy:
matrix:
include:
- os: ubuntu-22.04
python: '3.9'
- os: ubuntu-22.04
python: '3.10'
- os: ubuntu-22.04
python: '3.11'
- os: ubuntu-22.04
python: '3.12'
- os: ubuntu-22.04
python: '3.13'
- os: macos-latest
python: '3.9'
- os: macos-latest
python: '3.10'
- os: macos-latest
python: '3.11'
- os: macos-latest
python: '3.12'
- os: macos-latest
python: '3.13'
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
submodules: recursive
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
- name: Install uv
uses: astral-sh/setup-uv@v4
- name: Install system dependencies (Ubuntu)
if: runner.os == 'Linux'
run: |
sudo apt-get update
sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \
pkg-config libopenblas-dev patchelf libabsl-dev libaio-dev libprotobuf-dev
# Install Intel MKL for DiskANN
wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh
sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
source /opt/intel/oneapi/setvars.sh
echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
- name: Install system dependencies (macOS)
if: runner.os == 'macOS'
run: |
# Don't install LLVM, use system clang for better compatibility
brew install libomp boost protobuf zeromq
- name: Install build dependencies
run: |
uv pip install --system scikit-build-core numpy swig Cython pybind11
if [[ "$RUNNER_OS" == "Linux" ]]; then
uv pip install --system auditwheel
else
uv pip install --system delocate
fi
- name: Build packages
run: |
# Build core (platform independent) on all platforms for consistency
cd packages/leann-core
uv build
cd ../..
# Build HNSW backend
cd packages/leann-backend-hnsw
if [ "${{ matrix.os }}" == "macos-latest" ]; then
# Use system clang instead of homebrew LLVM for better compatibility
export CC=clang
export CXX=clang++
export MACOSX_DEPLOYMENT_TARGET=11.0
uv build --wheel --python python
else
uv build --wheel --python python
fi
cd ../..
# Build DiskANN backend
cd packages/leann-backend-diskann
if [ "${{ matrix.os }}" == "macos-latest" ]; then
# Use system clang instead of homebrew LLVM for better compatibility
export CC=clang
export CXX=clang++
# sgesdd_ is only available on macOS 13.3+
export MACOSX_DEPLOYMENT_TARGET=13.3
uv build --wheel --python python
else
uv build --wheel --python python
fi
cd ../..
# Build meta package (platform independent) on all platforms
cd packages/leann
uv build
cd ../..
- name: Repair wheels (Linux)
if: runner.os == 'Linux'
run: |
# Repair HNSW wheel
cd packages/leann-backend-hnsw
if [ -d dist ]; then
auditwheel repair dist/*.whl -w dist_repaired
rm -rf dist
mv dist_repaired dist
fi
cd ../..
# Repair DiskANN wheel - use show first to debug
cd packages/leann-backend-diskann
if [ -d dist ]; then
echo "Checking DiskANN wheel contents before repair:"
unzip -l dist/*.whl | grep -E "\.so|\.pyd|_diskannpy" || echo "No .so files found"
auditwheel show dist/*.whl || echo "auditwheel show failed"
auditwheel repair dist/*.whl -w dist_repaired
echo "Checking DiskANN wheel contents after repair:"
unzip -l dist_repaired/*.whl | grep -E "\.so|\.pyd|_diskannpy" || echo "No .so files found after repair"
rm -rf dist
mv dist_repaired dist
fi
cd ../..
- name: Repair wheels (macOS)
if: runner.os == 'macOS'
run: |
# Repair HNSW wheel
cd packages/leann-backend-hnsw
if [ -d dist ]; then
delocate-wheel -w dist_repaired -v dist/*.whl
rm -rf dist
mv dist_repaired dist
fi
cd ../..
# Repair DiskANN wheel
cd packages/leann-backend-diskann
if [ -d dist ]; then
delocate-wheel -w dist_repaired -v dist/*.whl
rm -rf dist
mv dist_repaired dist
fi
cd ../..
- name: List built packages
run: |
echo "📦 Built packages:"
find packages/*/dist -name "*.whl" -o -name "*.tar.gz" | sort
- name: Install built packages for testing
run: |
# Create a virtual environment with the correct Python version
uv venv --python python${{ matrix.python }}
source .venv/bin/activate || source .venv/Scripts/activate
# Install the built wheels directly to ensure we use locally built packages
# Use only locally built wheels on all platforms for full consistency
FIND_LINKS="--find-links packages/leann-core/dist --find-links packages/leann/dist"
FIND_LINKS="$FIND_LINKS --find-links packages/leann-backend-hnsw/dist --find-links packages/leann-backend-diskann/dist"
uv pip install leann-core leann leann-backend-hnsw leann-backend-diskann \
$FIND_LINKS --force-reinstall
# Install test dependencies using extras
uv pip install -e ".[test]"
# Debug: Check if _diskannpy module is installed correctly
echo "Checking installed DiskANN module structure:"
python -c "import leann_backend_diskann; print('leann_backend_diskann location:', leann_backend_diskann.__file__)" || echo "Failed to import leann_backend_diskann"
python -c "from leann_backend_diskann import _diskannpy; print('_diskannpy imported successfully')" || echo "Failed to import _diskannpy"
ls -la $(python -c "import leann_backend_diskann; import os; print(os.path.dirname(leann_backend_diskann.__file__))" 2>/dev/null) 2>/dev/null || echo "Failed to list module directory"
- name: Run tests with pytest
env:
CI: true # Mark as CI environment to skip memory-intensive tests
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
HF_HUB_DISABLE_SYMLINKS: 1
TOKENIZERS_PARALLELISM: false
PYTORCH_ENABLE_MPS_FALLBACK: 0 # Disable MPS on macOS CI to avoid memory issues
OMP_NUM_THREADS: 1 # Disable OpenMP parallelism to avoid libomp crashes
MKL_NUM_THREADS: 1 # Single thread for MKL operations
run: |
# Activate virtual environment
source .venv/bin/activate || source .venv/Scripts/activate
# Start tmate session INSIDE the test step if debug enabled
if [[ "${{ inputs.debug_enabled }}" == "true" ]]; then
echo "🔧 DEBUG MODE: Starting tmate session before tests..."
# Install tmate if not available
if ! command -v tmate &> /dev/null; then
if [[ "$RUNNER_OS" == "Linux" ]]; then
sudo apt-get update && sudo apt-get install -y tmate
elif [[ "$RUNNER_OS" == "macOS" ]]; then
brew install tmate
fi
fi
# Start tmate session in background
tmate -S debug-session new-session -d
echo "🔗 Tmate session created. Connection info:"
tmate -S debug-session display -p '#{tmate_ssh}'
echo "🔗 Web session: $(tmate -S debug-session display -p '#{tmate_web}')"
echo "⏱️ Session will timeout after 30 minutes"
echo "💡 You can now SSH in and run: pytest tests/ -vv --capture=no"
echo "💡 Or run diagnostics: bash scripts/diagnose_hang.sh"
echo ""
echo "Waiting 60 seconds for you to connect..."
sleep 60
fi
# Define comprehensive diagnostic function
diag() {
echo "===== COMPREHENSIVE DIAGNOSTICS BEGIN ====="
date
echo ""
echo "### Current Shell Info ###"
echo "Shell PID: $$"
echo "Shell PPID: $PPID"
echo "Current directory: $(pwd)"
echo ""
echo "### Process Tree (full) ###"
pstree -ap 2>/dev/null || ps auxf || true
echo ""
echo "### All Python/Pytest Processes ###"
ps -ef | grep -E 'python|pytest' | grep -v grep || true
echo ""
echo "### Embedding Server Processes ###"
ps -ef | grep -E 'embedding|zmq|diskann' | grep -v grep || true
echo ""
echo "### Network Listeners ###"
ss -ltnp 2>/dev/null || netstat -ltn 2>/dev/null || true
echo ""
echo "### Open File Descriptors (lsof) ###"
lsof -p $$ 2>/dev/null | head -20 || true
echo ""
echo "### Zombie Processes ###"
ps aux | grep '<defunct>' || echo "No zombie processes"
echo ""
echo "### Current Jobs ###"
jobs -l || true
echo ""
echo "### /proc/PID/fd for current shell ###"
ls -la /proc/$$/fd 2>/dev/null || true
echo ""
echo "===== COMPREHENSIVE DIAGNOSTICS END ====="
}
# Enable verbose logging for debugging
export PYTHONUNBUFFERED=1
export PYTEST_CURRENT_TEST=1
# Run all tests with extensive logging
if [[ "$RUNNER_OS" == "Linux" ]]; then
echo "🚀 Starting Linux test execution with timeout..."
echo "Current time: $(date)"
echo "Shell PID: $$"
echo "Python: $(python --version)"
echo "Pytest: $(pytest --version)"
# Show environment variables for debugging
echo "📦 Environment variables:"
env | grep -E "PYTHON|PYTEST|CI|RUNNER" | sort
# Set trap for diagnostics
trap diag INT TERM EXIT
echo "📋 Pre-test diagnostics:"
ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test"
# Check for any listening ports before test
echo "🔌 Pre-test network state:"
ss -ltn 2>/dev/null | grep -E "555[0-9]|556[0-9]" || echo "No embedding server ports open"
echo "🏃 Running pytest with 180s timeout..."
timeout --preserve-status --signal=INT --kill-after=10 180 bash -c '
echo "⏱️ Pytest starting at: $(date)"
echo "Running command: pytest tests/ -vv --maxfail=3 --tb=short --capture=no"
# Run pytest with maximum verbosity and no output capture
pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=DEBUG 2>&1 | tee pytest.log
PYTEST_EXIT=${PIPESTATUS[0]}
echo "✅ Pytest finished at: $(date) with exit code: $PYTEST_EXIT"
echo "Last 20 lines of pytest output:"
tail -20 pytest.log || true
# Immediately check for leftover processes
echo "🔍 Post-pytest process check:"
ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "No leftover processes"
# Clean up any children before exit
echo "🧹 Cleaning up child processes..."
pkill -TERM -P $$ 2>/dev/null || true
sleep 0.5
pkill -KILL -P $$ 2>/dev/null || true
echo "📊 Final check before exit:"
ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "All clean"
exit $PYTEST_EXIT
'
EXIT_CODE=$?
echo "🔚 Timeout command exited with code: $EXIT_CODE"
if [ $EXIT_CODE -eq 124 ]; then
echo "⚠️ TIMEOUT TRIGGERED - Tests took more than 180 seconds!"
echo "📸 Capturing full diagnostics..."
diag
# Run diagnostic script if available
if [ -f scripts/diagnose_hang.sh ]; then
echo "🔍 Running diagnostic script..."
bash scripts/diagnose_hang.sh || true
fi
# More aggressive cleanup
echo "💀 Killing all Python processes owned by runner..."
pkill -9 -u runner python || true
pkill -9 -u runner pytest || true
elif [ $EXIT_CODE -ne 0 ]; then
echo "❌ Tests failed with exit code: $EXIT_CODE"
else
echo "✅ All tests passed!"
fi
# Always show final state
echo "📍 Final state check:"
ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining"
exit $EXIT_CODE
else
# For macOS/Windows, run without GNU timeout
echo "🚀 Running tests on $RUNNER_OS..."
pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=INFO
fi
# Provide tmate session on test failure for debugging
- name: Setup tmate session on failure
if: ${{ failure() && (inputs.debug_enabled || contains(github.event.head_commit.message, '[debug]')) }}
uses: mxschmitt/action-tmate@v3
with:
timeout-minutes: 30
limit-access-to-actor: true
- name: Run sanity checks (optional)
run: |
# Activate virtual environment
source .venv/bin/activate || source .venv/Scripts/activate
# Run distance function tests if available
if [ -f test/sanity_checks/test_distance_functions.py ]; then
echo "Running distance function sanity checks..."
python test/sanity_checks/test_distance_functions.py || echo "⚠️ Distance function test failed, continuing..."
fi
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: packages-${{ matrix.os }}-py${{ matrix.python }}
path: packages/*/dist/