Files
LEANN/.github/workflows/build-reusable.yml
Andy Lee 8d06aa99f4 feat: add comprehensive hang detection for pytest CI debugging
- Add Python faulthandler integration with signal-triggered stack dumps
- Implement periodic stack dumps at 5min and 10min intervals
- Add external process monitoring with SIGUSR1 signal on hang detection
- Use debug_pytest.py wrapper to capture exact hang location in C++ cleanup
- Enhance CPU stability monitoring to trigger precise stack traces

This addresses the persistent pytest hanging issue in Ubuntu 22.04 CI by
providing detailed stack traces to identify the exact code location where
the hang occurs during test cleanup phase.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-12 12:42:16 -07:00

471 lines
19 KiB
YAML
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Reusable workflow: it has no triggers of its own and is started by other
# workflows through the `workflow_call` event.
name: Reusable Build
on:
workflow_call:
inputs:
# Optional git ref handed to actions/checkout in every job; the declared
# default is the empty string.
ref:
description: 'Git ref to build'
required: false
type: string
default: ''
# Two jobs follow: `lint`, then `build` (which declares `needs: lint`).
jobs:
# Lint job: runs ruff's linter and formatter check against the requested ref.
lint:
  name: Lint and Format Check
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
      with:
        ref: ${{ inputs.ref }}
    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.11'
    - name: Install uv
      uses: astral-sh/setup-uv@v4
    # ruff is installed as a uv tool and then invoked directly by the two
    # following steps.
    - name: Install ruff
      run: uv tool install ruff
    - name: Run ruff check
      run: ruff check .
    # --check makes the formatter report differences instead of rewriting files.
    - name: Run ruff format check
      run: ruff format --check .
# Build job: one run per (os, python) pair, started only after `lint` succeeds.
build:
  needs: lint
  name: Build ${{ matrix.os }} Python ${{ matrix.python }}
  strategy:
    matrix:
      # Full os x python grid (3 x 5) minus the single excluded pair below,
      # i.e. the same 14 combinations as an explicit include list.
      os:
        - ubuntu-22.04
        - macos-14
        - macos-13
      python:
        - '3.9'
        - '3.10'
        - '3.11'
        - '3.12'
        - '3.13'
      exclude:
        # Note: macos-13 + Python 3.13 excluded due to PyTorch compatibility
        # (PyTorch 2.5+ supports Python 3.13 but not Intel Mac x86_64)
        - os: macos-13
          python: '3.13'
  runs-on: ${{ matrix.os }}
  steps:
# Check out the ref passed in through the `ref` workflow input, including all
# git submodules (recursively).
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
submodules: recursive
# Install the interpreter version selected by the current matrix entry.
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
# Provide the `uv` command used by the later install and build steps.
- name: Install uv
uses: astral-sh/setup-uv@v4
# Linux only: apt packages for the native builds plus Intel oneMKL, whose
# location is exported to later steps through $GITHUB_ENV.
- name: Install system dependencies (Ubuntu)
  if: runner.os == 'Linux'
  run: |
    sudo apt-get update
    sudo apt-get install -y \
      libomp-dev \
      libboost-all-dev \
      protobuf-compiler \
      libzmq3-dev \
      pkg-config \
      libopenblas-dev \
      patchelf \
      libabsl-dev \
      libaio-dev \
      libprotobuf-dev

    # Install Intel MKL for DiskANN
    wget -q https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940.sh
    sudo sh intel-onemkl-2025.0.0.940.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
    source /opt/intel/oneapi/setvars.sh

    # Persist the MKL location for every subsequent step of this job.
    # LD_LIBRARY_PATH is expanded here, i.e. after setvars.sh has been sourced.
    {
      echo "MKLROOT=/opt/intel/oneapi/mkl/latest"
      echo "LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH"
    } >> "$GITHUB_ENV"
# macOS only: Homebrew libraries for the native builds.
# Don't install LLVM, use system clang for better compatibility
- name: Install system dependencies (macOS)
  if: runner.os == 'macOS'
  run: brew install libomp boost protobuf zeromq
# Python-side build tooling, installed into the runner's system interpreter.
- name: Install build dependencies
  run: |
    uv pip install --system scikit-build-core numpy swig Cython pybind11

    # Pick the wheel-repair tool for this platform: auditwheel on Linux,
    # delocate everywhere else.
    case "$RUNNER_OS" in
      Linux) wheel_repair_tool=auditwheel ;;
      *) wheel_repair_tool=delocate ;;
    esac
    uv pip install --system "$wheel_repair_tool"
# macOS only: export Homebrew/OpenMP locations to all later steps.
- name: Set macOS environment variables
  if: runner.os == 'macOS'
  run: |
    # Use brew --prefix to automatically detect Homebrew installation path
    brew_prefix="$(brew --prefix)"
    libomp_prefix="${brew_prefix}/opt/libomp"

    {
      echo "HOMEBREW_PREFIX=${brew_prefix}"
      echo "OpenMP_ROOT=${libomp_prefix}"
      # Set CMAKE_PREFIX_PATH to let CMake find all packages automatically
      echo "CMAKE_PREFIX_PATH=${brew_prefix}"
      # Set compiler flags for OpenMP (required for both backends)
      echo "LDFLAGS=-L${libomp_prefix}/lib"
      echo "CPPFLAGS=-I${libomp_prefix}/include"
    } >> "$GITHUB_ENV"
# Build all four packages in dependency order: core, the two native backends
# (which resolve leann-core from the local dist directory), then the meta
# package.
- name: Build packages
  run: |
    # Local directory holding the freshly built leann-core artifacts.
    core_dist="${GITHUB_WORKSPACE}/packages/leann-core/dist"

    # Build one native backend wheel.
    #   $1 - package directory name under packages/
    #   $2 - MACOSX_DEPLOYMENT_TARGET exported on macOS runners
    # The exports are deliberately not scoped to a subshell, so they stay in
    # effect for the commands that follow.
    build_backend_wheel() {
      cd "packages/$1"
      if [[ "${{ matrix.os }}" == macos-* ]]; then
        # Use system clang for better compatibility
        export CC=clang
        export CXX=clang++
        export MACOSX_DEPLOYMENT_TARGET="$2"
      fi
      uv build --wheel --python ${{ matrix.python }} --find-links "$core_dist"
      cd ../..
    }

    # Build core (platform independent)
    cd packages/leann-core
    uv build
    cd ../..

    # Build HNSW backend
    build_backend_wheel leann-backend-hnsw 11.0

    # Build DiskANN backend
    # DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
    build_backend_wheel leann-backend-diskann 13.3

    # Build meta package (platform independent)
    cd packages/leann
    uv build
    cd ../..
# Linux only: run `auditwheel repair` on each backend's wheels and replace the
# package's dist/ directory with the repaired output.
- name: Repair wheels (Linux)
  if: runner.os == 'Linux'
  run: |
    for backend in leann-backend-hnsw leann-backend-diskann; do
      cd "packages/${backend}"
      # Packages without a dist/ directory are skipped.
      if [ -d dist ]; then
        auditwheel repair dist/*.whl -w dist_repaired
        rm -rf dist
        mv dist_repaired dist
      fi
      cd ../..
    done
# macOS only: run `delocate-wheel` on each backend's wheels and replace the
# package's dist/ directory with the repaired output.
- name: Repair wheels (macOS)
  if: runner.os == 'macOS'
  run: |
    for backend in leann-backend-hnsw leann-backend-diskann; do
      cd "packages/${backend}"
      # Packages without a dist/ directory are skipped.
      if [ -d dist ]; then
        delocate-wheel -w dist_repaired -v dist/*.whl
        rm -rf dist
        mv dist_repaired dist
      fi
      cd ../..
    done
# Log every wheel and sdist produced so far, in sorted order.
- name: List built packages
  run: |
    echo "📦 Built packages:"
    find packages/*/dist \( -name "*.whl" -o -name "*.tar.gz" \) | sort
# Install the artifacts built above into a fresh virtual environment so the
# tests run against them.
- name: Install built packages for testing
  run: |
    # Create a virtual environment with the correct Python version
    uv venv --python ${{ matrix.python }}
    source .venv/bin/activate || source .venv/Scripts/activate

    # Install packages using --find-links to prioritize local builds.
    # Core: wheel first, falling back to the sdist if that install fails.
    uv pip install \
      --find-links packages/leann-core/dist \
      --find-links packages/leann-backend-hnsw/dist \
      --find-links packages/leann-backend-diskann/dist \
      packages/leann-core/dist/*.whl \
      || uv pip install --find-links packages/leann-core/dist packages/leann-core/dist/*.tar.gz

    # Backends: wheels only, resolving leann-core from the local dist.
    uv pip install --find-links packages/leann-core/dist packages/leann-backend-hnsw/dist/*.whl
    uv pip install --find-links packages/leann-core/dist packages/leann-backend-diskann/dist/*.whl

    # Meta package: wheel first, sdist as fallback.
    uv pip install packages/leann/dist/*.whl \
      || uv pip install packages/leann/dist/*.tar.gz

    # Install test dependencies using extras
    uv pip install -e ".[test]"
# Run the test suite. On ubuntu-22.04 pytest runs under a hang-debugging
# harness (in-process faulthandler hooks, a 600s `timeout`, and two background
# monitors); every other runner simply invokes pytest.
#
# The step's exit code is always pytest's own exit code, or `timeout`'s
# 124/137 when the run had to be terminated.
- name: Run tests with pytest
  env:
    CI: true # Mark as CI environment to skip memory-intensive tests
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    HF_HUB_DISABLE_SYMLINKS: 1
    TOKENIZERS_PARALLELISM: false
    PYTORCH_ENABLE_MPS_FALLBACK: 0 # Disable MPS on macOS CI to avoid memory issues
    OMP_NUM_THREADS: 1 # Disable OpenMP parallelism to avoid libomp crashes
    MKL_NUM_THREADS: 1 # Single thread for MKL operations
  run: |
    # Activate virtual environment
    source .venv/bin/activate || source .venv/Scripts/activate
    # Add targeted debugging for pytest hangs (especially Ubuntu 22.04)
    if [[ "${{ matrix.os }}" == "ubuntu-22.04" ]]; then
      echo "🔍 [HANG DEBUG] Ubuntu 22.04 detected - enabling enhanced process monitoring"

      # debug_pytest.py writes the PID of the real pytest process here, so the
      # external monitor inspects and signals pytest itself rather than the
      # `timeout` wrapper (which idles at ~0% CPU and dies on SIGUSR1).
      PYTEST_PIDFILE="debug_pytest.pid"
      rm -f "$PYTEST_PIDFILE"

      # Create the Python wrapper that runs pytest with stack-dump hooks.
      # NOTE: the heredoc body and its EOF terminator must stay at the base
      # indentation of this script (not the indentation of the enclosing
      # `if`), otherwise the terminator is not recognised and the generated
      # Python file starts with an unexpected indent.
      cat > debug_pytest.py << 'EOF'
    import faulthandler
    import os
    import signal
    import sys
    import threading
    import time
    import traceback


    def setup_hang_detection():
        """Install crash and hang diagnostics in the current process."""
        # Enable faulthandler for automatic stack dumps on fatal signals
        faulthandler.enable()

        # Dump all thread stacks on SIGUSR1/SIGUSR2. faulthandler.register()
        # installs a C-level handler; a handler set with signal.signal() only
        # runs between bytecodes on the main thread, so it would never fire
        # while the main thread is blocked inside native (C/C++) code.
        for signum in (signal.SIGUSR1, signal.SIGUSR2):
            faulthandler.register(signum, all_threads=True)

        def periodic_stack_dump():
            """Dump stacks after 5 and after 10 minutes of runtime."""
            time.sleep(300)  # Wait 5 minutes
            print(f"\n⏰ [HANG DEBUG] Periodic stack dump at {time.time()}:", flush=True)
            for thread_id, frame in sys._current_frames().items():
                print(f"\n📍 Thread {thread_id}:", flush=True)
                traceback.print_stack(frame)
            time.sleep(300)  # Wait another 5 minutes if still running
            print(f"\n⚠ [HANG DEBUG] Final stack dump at {time.time()} (likely hanging):", flush=True)
            faulthandler.dump_traceback()

        # Daemon thread, so it never keeps the interpreter alive on exit.
        threading.Thread(target=periodic_stack_dump, daemon=True).start()


    if __name__ == "__main__":
        # Publish our PID for the external monitor in the workflow step.
        with open("debug_pytest.pid", "w") as pid_file:
            pid_file.write(str(os.getpid()))

        setup_hang_detection()

        # Run pytest in THIS process. Spawning it as a subprocess would leave
        # the handlers above in a wrapper that only waits for the child, so
        # every dump would show the wrapper instead of the hanging test code.
        import pytest

        sys.exit(int(pytest.main(sys.argv[1:])))
    EOF

      # Pre-test state
      echo "📊 [HANG DEBUG] Pre-test process state:"
      ps aux | grep -E "(python|embedding|zmq)" | grep -v grep || echo "No relevant processes"
      echo "🔌 [HANG DEBUG] Pre-test network state:"
      ss -tulpn | grep -E "(555[0-9]|556[0-9])" || echo "No embedding server ports"

      # Log a process/port snapshot every 30 seconds while the tests run.
      monitor_processes() {
        while true; do
          sleep 30
          echo "⏰ [HANG DEBUG] $(date): Process check during test execution"
          ps aux | grep -E "(python|pytest|embedding)" | grep -v grep | head -10
          ss -tulpn | grep -E "(555[0-9]|556[0-9])" || echo "No ports"
        done
      }

      # Start background monitoring
      monitor_processes &
      MONITOR_PID=$!
      echo "🔍 [HANG DEBUG] Started background monitor (PID: $MONITOR_PID)"

      # Run pytest with enhanced real-time monitoring (no dependency on pytest logs)
      echo "🚀 [HANG DEBUG] Starting pytest with 600s timeout and external monitoring..."

      # Watch the pytest process from outside, independent of its output.
      #   $1 - PID of the `timeout` wrapper; used for liveness and the
      #        emergency kill. The process that is inspected and signalled is
      #        the Python process whose PID is read from $PYTEST_PIDFILE.
      external_monitor() {
        local wrapper_pid=$1
        local start_time
        start_time=$(date +%s)
        local last_cpu_check=0
        local stable_count=0
        local elapsed target_pid ps_info current_cpu state orphan_count

        while true; do
          sleep 5
          elapsed=$(( $(date +%s) - start_time ))

          # Check if the wrapper (and therefore the test run) still exists
          if ! kill -0 "$wrapper_pid" 2>/dev/null; then
            echo "📊 [EXTERNAL] $(date): Pytest process $wrapper_pid ended after ${elapsed}s"
            break
          fi

          # Resolve the real pytest PID; empty until debug_pytest.py started.
          target_pid=$(cat "$PYTEST_PIDFILE" 2>/dev/null || true)

          if [ -n "$target_pid" ] && kill -0 "$target_pid" 2>/dev/null; then
            # Get detailed process info
            ps_info=$(ps -p "$target_pid" -o pid,ppid,time,pcpu,pmem,state,comm 2>/dev/null || true)
            echo "📊 [EXTERNAL] $(date): Process $target_pid - ${ps_info}"

            # Extract the integer part of the CPU percentage
            current_cpu=$(echo "$ps_info" | tail -1 | awk '{print $4}' | cut -d. -f1)
            # Only compare when the field really is a number
            if [[ "$current_cpu" =~ ^[0-9]+$ ]] \
              && [ "$current_cpu" = "$last_cpu_check" ] \
              && [ "$current_cpu" -lt 5 ]; then
              stable_count=$((stable_count + 1))
              if [ "$stable_count" -ge 6 ]; then # 30 seconds of low CPU
                echo "⚠️ [EXTERNAL] $(date): Process appears hung - CPU stable at ${current_cpu}% for 30s"
                echo "🔍 [EXTERNAL] $(date): Sending SIGUSR1 to dump stack traces..."
                kill -USR1 "$target_pid" 2>/dev/null || echo "Failed to send signal"
                # Start a new 30s window instead of dumping every 5 seconds
                stable_count=0
              fi
            else
              stable_count=0
            fi
            last_cpu_check=$current_cpu

            # Check for zombie/stopped state
            state=$(echo "$ps_info" | tail -1 | awk '{print $6}')
            if [ "$state" = "Z" ] || [ "$state" = "T" ]; then
              echo "💀 [EXTERNAL] $(date): Process in abnormal state: $state"
            fi
          fi

          # Check for orphaned Python processes
          orphan_count=$(ps aux | grep -E "python.*pytest" | grep -v grep | wc -l)
          if [ "$orphan_count" -gt 1 ]; then
            echo "🔍 [EXTERNAL] $(date): Found $orphan_count pytest-related processes"
            ps aux | grep -E "python.*pytest" | grep -v grep || true
          fi

          # Emergency timeout (after `timeout` itself should have acted)
          if [ "$elapsed" -gt 650 ]; then
            echo "💥 [EXTERNAL] $(date): Emergency timeout reached, force killing pytest"
            if [ -n "$target_pid" ]; then
              kill -KILL "$target_pid" 2>/dev/null || true
            fi
            kill -KILL "$wrapper_pid" 2>/dev/null || true
            break
          fi
        done
      }

      # Run pytest in background so we can monitor it externally
      python -u -c "import sys, time; print(f'🔍 [REALTIME] {time.strftime(\"%H:%M:%S\")} Starting pytest...', flush=True)"
      # No --preserve-status: on expiry `timeout` must exit with 124 (137 if
      # the KILL follow-up was needed) so the timeout branch below can tell a
      # timeout apart from an ordinary test failure.
      timeout --signal=TERM --kill-after=30 600 bash -c '
        echo "▶️ [HANG DEBUG] Pytest starting at: $(date)"
        # Force unbuffered output and prefix every line with a timestamp
        stdbuf -o0 -e0 python debug_pytest.py tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
          printf "%s [PYTEST] %s\n" "$(date +"%H:%M:%S")" "$line"
        done
        PYTEST_RESULT=${PIPESTATUS[0]}
        echo "✅ [HANG DEBUG] Pytest completed at: $(date) with exit code: $PYTEST_RESULT"
        exit $PYTEST_RESULT
      ' &
      PYTEST_PID=$!
      echo "🔍 [HANG DEBUG] Pytest started with PID: $PYTEST_PID"

      # Start external monitoring
      external_monitor "$PYTEST_PID" &
      EXTERNAL_MONITOR_PID=$!

      # Wait for pytest to complete. The `||` keeps a non-zero status from
      # aborting the script under `bash -e` before cleanup has run.
      PYTEST_EXIT=0
      wait "$PYTEST_PID" || PYTEST_EXIT=$?
      echo "🏁 [HANG DEBUG] Pytest process ended with exit code: $PYTEST_EXIT"

      # Stop external monitor
      kill "$EXTERNAL_MONITOR_PID" 2>/dev/null || true

      # Final cleanup check (must not touch PYTEST_EXIT)
      echo "🧹 [HANG DEBUG] Final cleanup check..."
      REMAINING_PROCS=$(ps aux | grep -E "python.*pytest" | grep -v grep | wc -l)
      if [ "$REMAINING_PROCS" -gt 0 ]; then
        echo "⚠️ [HANG DEBUG] Found $REMAINING_PROCS remaining pytest processes after completion"
        ps aux | grep -E "python.*pytest" | grep -v grep || true
        echo "💀 [HANG DEBUG] Force killing remaining processes..."
        ps aux | grep -E "python.*pytest" | grep -v grep | awk "{print \$2}" | xargs -r kill -KILL || true
      else
        echo "✅ [HANG DEBUG] No remaining pytest processes found"
      fi

      # Stop background monitoring
      kill "$MONITOR_PID" 2>/dev/null || true
      echo "🔚 [HANG DEBUG] Pytest exit code: $PYTEST_EXIT"

      # 124: `timeout` expired; 137: the run had to be ended with SIGKILL
      if [ "$PYTEST_EXIT" -eq 124 ] || [ "$PYTEST_EXIT" -eq 137 ]; then
        echo "⚠️ [HANG DEBUG] TIMEOUT! Pytest hung for >600s"
        echo "🔍 [HANG DEBUG] Final process state:"
        ps aux | grep -E "(python|pytest|embedding)" | grep -v grep || true
        echo "🔍 [HANG DEBUG] Final network state:"
        ss -tulpn | grep -E "(555[0-9]|556[0-9])" || echo "No ports"
        echo "💀 [HANG DEBUG] Killing remaining processes..."
        # pkill patterns are extended regular expressions: alternation is a
        # plain `|` (a backslash-escaped one matches a literal pipe character).
        pkill -TERM -f "pytest|embedding_server|zmq" || true
        sleep 3
        pkill -KILL -f "pytest|embedding_server|zmq" || true
      fi
      exit "$PYTEST_EXIT"
    else
      # For non-Ubuntu or non-22.04, run normally
      echo "🚀 [HANG DEBUG] Running tests on ${{ matrix.os }} (normal mode)"
      pytest tests/ -v --tb=short
    fi
# Best-effort extra check: runs only when the script exists, and a failure is
# reported but does not fail the step.
- name: Run sanity checks (optional)
  run: |
    # Activate virtual environment
    source .venv/bin/activate || source .venv/Scripts/activate

    # Run distance function tests if available
    sanity_script=test/sanity_checks/test_distance_functions.py
    if [ -f "$sanity_script" ]; then
      echo "Running distance function sanity checks..."
      python "$sanity_script" || echo "⚠️ Distance function test failed, continuing..."
    fi
# Publish every package's dist/ directory as one artifact per matrix entry;
# the artifact name embeds the OS and Python version of the current run.
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: packages-${{ matrix.os }}-py${{ matrix.python }}
path: packages/*/dist/