- Add Python faulthandler integration with signal-triggered stack dumps
- Implement periodic stack dumps at 5min and 10min intervals
- Add external process monitoring with SIGUSR1 signal on hang detection
- Use debug_pytest.py wrapper to capture exact hang location in C++ cleanup
- Enhance CPU stability monitoring to trigger precise stack traces

This addresses the persistent pytest hanging issue in Ubuntu 22.04 CI by providing detailed stack traces to identify the exact code location where the hang occurs during the test cleanup phase.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
471 lines
19 KiB
YAML
471 lines
19 KiB
YAML
# Reusable workflow: other workflows invoke it through `workflow_call`.
name: Reusable Build

on:
  workflow_call:
    inputs:
      # Optional ref forwarded to the `ref` input of actions/checkout in each job.
      ref:
        description: "Git ref to build"
        required: false
        type: string
        default: ""
|
||
|
||
jobs:
|
||
# Static checks with ruff (lint rules, then formatting) on the requested ref.
lint:
  name: Lint and Format Check
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
      with:
        ref: ${{ inputs.ref }}

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"

    - name: Install uv
      uses: astral-sh/setup-uv@v4

    # ruff is installed as a standalone uv tool rather than into a project env.
    - name: Install ruff
      run: uv tool install ruff

    - name: Run ruff check
      run: ruff check .

    # `--check` only reports formatting differences; it does not rewrite files.
    - name: Run ruff format check
      run: ruff format --check .
|
||
|
||
# Builds, repairs and tests the wheels for every supported OS/Python pair.
# Runs only after the lint job has succeeded.
build:
  needs: lint
  name: Build ${{ matrix.os }} Python ${{ matrix.python }}
  strategy:
    matrix:
      # Full os x python product minus the single unsupported pair below.
      os:
        - ubuntu-22.04
        - macos-14
        - macos-13
      python:
        - "3.9"
        - "3.10"
        - "3.11"
        - "3.12"
        - "3.13"
      exclude:
        # Note: macos-13 + Python 3.13 excluded due to PyTorch compatibility
        # (PyTorch 2.5+ supports Python 3.13 but not Intel Mac x86_64)
        - os: macos-13
          python: "3.13"
  runs-on: ${{ matrix.os }}

  steps:
|
||
# Check out the requested ref together with all nested submodules.
- uses: actions/checkout@v4
  with:
    ref: ${{ inputs.ref }}
    submodules: recursive

# The interpreter version is taken from this job's matrix entry.
- name: Setup Python
  uses: actions/setup-python@v5
  with:
    python-version: ${{ matrix.python }}

# uv is used by the later steps for builds, venv creation and installs.
- name: Install uv
  uses: astral-sh/setup-uv@v4
|
||
|
||
- name: Install system dependencies (Ubuntu)
  if: runner.os == 'Linux'
  run: |
    # Native libraries and tools from the distribution archive.
    sudo apt-get update
    sudo apt-get install -y \
      libomp-dev \
      libboost-all-dev \
      protobuf-compiler \
      libzmq3-dev \
      pkg-config \
      libopenblas-dev \
      patchelf \
      libabsl-dev \
      libaio-dev \
      libprotobuf-dev

    # Install Intel MKL for DiskANN
    mkl_installer=intel-onemkl-2025.0.0.940.sh
    wget -q "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/${mkl_installer}"
    sudo sh "${mkl_installer}" -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s
    source /opt/intel/oneapi/setvars.sh

    # Persist the MKL location for the following steps. LD_LIBRARY_PATH is
    # expanded here, i.e. after setvars.sh has been sourced in this shell.
    {
      echo "MKLROOT=/opt/intel/oneapi/mkl/latest"
      echo "LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH"
    } >> $GITHUB_ENV
|
||
|
||
# LLVM is deliberately not installed: the build steps use the system clang
# for better compatibility.
- name: Install system dependencies (macOS)
  if: runner.os == 'macOS'
  run: brew install libomp boost protobuf zeromq
|
||
|
||
- name: Install build dependencies
  run: |
    # Build-time Python packages shared by every platform.
    uv pip install --system scikit-build-core numpy swig Cython pybind11

    # The wheel repair tool is platform specific: auditwheel on Linux,
    # delocate on every other runner.
    case "$RUNNER_OS" in
      Linux) repair_tool=auditwheel ;;
      *) repair_tool=delocate ;;
    esac
    uv pip install --system "$repair_tool"
|
||
|
||
- name: Set macOS environment variables
  if: runner.os == 'macOS'
  run: |
    # Use brew --prefix to automatically detect Homebrew installation path
    brew_prefix=$(brew --prefix)

    # Everything echoed inside the braces is appended to $GITHUB_ENV and is
    # therefore exported to the following steps.
    {
      echo "HOMEBREW_PREFIX=${brew_prefix}"
      echo "OpenMP_ROOT=${brew_prefix}/opt/libomp"

      # Set CMAKE_PREFIX_PATH to let CMake find all packages automatically
      echo "CMAKE_PREFIX_PATH=${brew_prefix}"

      # Set compiler flags for OpenMP (required for both backends)
      echo "LDFLAGS=-L${brew_prefix}/opt/libomp/lib"
      echo "CPPFLAGS=-I${brew_prefix}/opt/libomp/include"
    } >> $GITHUB_ENV
|
||
|
||
- name: Build packages
  run: |
    # build_backend <package-dir> <macos-deployment-target>
    # Builds one native backend wheel against the freshly built leann-core.
    # On macOS runners it first exports the system clang and the given
    # deployment target; the exports stay in effect for later commands.
    build_backend() {
      local pkg_dir=$1
      local mac_target=$2

      cd "packages/${pkg_dir}"
      if [[ "${{ matrix.os }}" == macos-* ]]; then
        # Use system clang for better compatibility
        export CC=clang
        export CXX=clang++
        export MACOSX_DEPLOYMENT_TARGET="${mac_target}"
      fi
      uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
      cd ../..
    }

    # Build core (platform independent)
    cd packages/leann-core
    uv build
    cd ../..

    # Build HNSW backend
    build_backend leann-backend-hnsw 11.0

    # Build DiskANN backend
    # DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
    build_backend leann-backend-diskann 13.3

    # Build meta package (platform independent)
    cd packages/leann
    uv build
    cd ../..
|
||
|
||
- name: Repair wheels (Linux)
  if: runner.os == 'Linux'
  run: |
    # Repair the HNSW wheel first, then the DiskANN wheel. For each backend
    # the repaired output replaces the original dist directory.
    for backend in leann-backend-hnsw leann-backend-diskann; do
      cd "packages/${backend}"
      if [ -d dist ]; then
        auditwheel repair dist/*.whl -w dist_repaired
        rm -rf dist
        mv dist_repaired dist
      fi
      cd ../..
    done
|
||
|
||
- name: Repair wheels (macOS)
  if: runner.os == 'macOS'
  run: |
    # Repair the HNSW wheel first, then the DiskANN wheel. For each backend
    # the repaired output replaces the original dist directory.
    for backend in leann-backend-hnsw leann-backend-diskann; do
      cd "packages/${backend}"
      if [ -d dist ]; then
        delocate-wheel -w dist_repaired -v dist/*.whl
        rm -rf dist
        mv dist_repaired dist
      fi
      cd ../..
    done
|
||
|
||
- name: List built packages
  run: |
    echo "📦 Built packages:"
    # Wheels and sdists from every package's dist directory, sorted by path.
    find packages/*/dist \( -name "*.whl" -o -name "*.tar.gz" \) | sort
|
||
|
||
|
||
- name: Install built packages for testing
  run: |
    # Create a virtual environment with the correct Python version
    uv venv --python ${{ matrix.python }}
    source .venv/bin/activate || source .venv/Scripts/activate

    # Install packages using --find-links to prioritize local builds.
    # Core: try the wheel first and fall back to the sdist.
    uv pip install \
      --find-links packages/leann-core/dist \
      --find-links packages/leann-backend-hnsw/dist \
      --find-links packages/leann-backend-diskann/dist \
      packages/leann-core/dist/*.whl \
      || uv pip install --find-links packages/leann-core/dist packages/leann-core/dist/*.tar.gz

    # Backends: HNSW first, then DiskANN, each resolved against the local core.
    for backend in hnsw diskann; do
      uv pip install --find-links packages/leann-core/dist packages/leann-backend-${backend}/dist/*.whl
    done

    # Meta package: wheel first, sdist as fallback.
    uv pip install packages/leann/dist/*.whl || uv pip install packages/leann/dist/*.tar.gz

    # Install test dependencies using extras
    uv pip install -e ".[test]"
|
||
|
||
# Runs the test suite. On ubuntu-22.04 the run is wrapped in hang diagnostics:
#   * debug_pytest.py runs pytest in-process with faulthandler armed, so stack
#     dumps come from the process that actually executes the tests;
#   * a background loop logs process/port state every 30s;
#   * an external monitor watches the real Python PID and requests a stack
#     dump (SIGUSR1) when its CPU usage looks idle;
#   * `timeout` bounds the run at 600s (+30s grace before SIGKILL).
# The step always exits with pytest's (or timeout's) real exit status.
- name: Run tests with pytest
  env:
    CI: true  # Mark as CI environment to skip memory-intensive tests
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    HF_HUB_DISABLE_SYMLINKS: 1
    TOKENIZERS_PARALLELISM: false
    PYTORCH_ENABLE_MPS_FALLBACK: 0  # Disable MPS on macOS CI to avoid memory issues
    OMP_NUM_THREADS: 1  # Disable OpenMP parallelism to avoid libomp crashes
    MKL_NUM_THREADS: 1  # Single thread for MKL operations
  run: |
    # Activate virtual environment
    source .venv/bin/activate || source .venv/Scripts/activate

    # Only Ubuntu 22.04 gets the hang instrumentation; every other runner
    # executes the plain test command (errexit propagates a failure).
    if [[ "${{ matrix.os }}" != "ubuntu-22.04" ]]; then
      echo "🚀 [HANG DEBUG] Running tests on ${{ matrix.os }} (normal mode)"
      pytest tests/ -v --tb=short
      exit 0
    fi

    echo "🔍 [HANG DEBUG] Ubuntu 22.04 detected - enabling enhanced process monitoring"

    # The wrapper records its own PID here so the monitor can target the real
    # Python process instead of the `timeout`/bash wrappers around it.
    rm -f debug_pytest.pid

    # Create Python script that runs pytest with stack trace dumping armed.
    # The heredoc body must start at column 0 of the script.
    cat > debug_pytest.py << 'EOF'
    """Run pytest in-process with faulthandler-based hang diagnostics."""
    import faulthandler
    import os
    import signal
    import sys

    import pytest

    PID_FILE = "debug_pytest.pid"


    def setup_hang_detection():
        """Arm stack dumps that keep working while the interpreter is stuck.

        faulthandler's handlers run at C level, so they fire even when the
        main thread is blocked inside native (C++) code; Python-level signal
        handlers and helper threads would not run in that situation.
        """
        # Dump tracebacks of all threads on fatal errors (SIGSEGV, SIGABRT, ...).
        faulthandler.enable(all_threads=True)

        # External debugging hooks: SIGUSR1/SIGUSR2 dump every thread's stack.
        for signum in (signal.SIGUSR1, signal.SIGUSR2):
            faulthandler.register(signum, all_threads=True, chain=False)

        # Periodic dumps every 5 minutes (i.e. at 5min and 10min of runtime).
        faulthandler.dump_traceback_later(300, repeat=True)


    if __name__ == "__main__":
        with open(PID_FILE, "w") as handle:
            handle.write(str(os.getpid()))
        setup_hang_detection()
        # Run pytest in this very process so the handlers above cover it.
        sys.exit(pytest.main(sys.argv[1:]))
    EOF

    # Pre-test state
    echo "📊 [HANG DEBUG] Pre-test process state:"
    ps aux | grep -E "(python|embedding|zmq)" | grep -v grep || echo "No relevant processes"

    echo "🔌 [HANG DEBUG] Pre-test network state:"
    ss -tulpn | grep -E "(555[0-9]|556[0-9])" || echo "No embedding server ports"

    # Logs matching processes and embedding-server ports every 30 seconds.
    monitor_processes() {
      while true; do
        sleep 30
        echo "⏰ [HANG DEBUG] $(date): Process check during test execution"
        ps aux | grep -E "(python|pytest|embedding)" | grep -v grep | head -10
        ss -tulpn | grep -E "(555[0-9]|556[0-9])" || echo "No ports"
      done
    }

    # Start background monitoring
    monitor_processes &
    MONITOR_PID=$!
    echo "🔍 [HANG DEBUG] Started background monitor (PID: $MONITOR_PID)"

    echo "🚀 [HANG DEBUG] Starting pytest with 600s timeout and external monitoring..."

    # external_monitor <wrapper-pid>
    # Polls every 5s until the wrapper (the `timeout` process) exits. CPU and
    # state checks, and SIGUSR1, target the Python PID read from
    # debug_pytest.pid; signalling `timeout` itself would terminate it because
    # it installs no SIGUSR1 handler.
    external_monitor() {
      local wrapper_pid=$1
      local start_time elapsed target_pid ps_info current_cpu state orphan_count
      local last_cpu_check=0
      local stable_count=0
      start_time=$(date +%s)

      while true; do
        sleep 5
        elapsed=$(( $(date +%s) - start_time ))

        # Check if the pytest wrapper still exists
        if ! kill -0 "$wrapper_pid" 2>/dev/null; then
          echo "📊 [EXTERNAL] $(date): Pytest process $wrapper_pid ended after ${elapsed}s"
          break
        fi

        # The PID file appears once the Python wrapper has started.
        target_pid=$(cat debug_pytest.pid 2>/dev/null || true)

        if [ -n "$target_pid" ] && ps_info=$(ps -p "$target_pid" -o pid,ppid,time,pcpu,pmem,state,comm 2>/dev/null); then
          echo "📊 [EXTERNAL] $(date): Process $target_pid - ${ps_info}"

          # Extract the integer part of %CPU; default to 0 if it is missing
          # so the numeric comparison below cannot fail.
          current_cpu=$(echo "$ps_info" | tail -1 | awk '{print $4}' | cut -d. -f1)
          current_cpu=${current_cpu:-0}
          if [ "$current_cpu" = "$last_cpu_check" ] && [ "$current_cpu" -lt 5 ]; then
            stable_count=$((stable_count + 1))
            if [ "$stable_count" -ge 6 ]; then  # 30 seconds of low CPU
              echo "⚠️ [EXTERNAL] $(date): Process appears hung - CPU stable at ${current_cpu}% for 30s"
              echo "🔍 [EXTERNAL] $(date): Sending SIGUSR1 to dump stack traces..."
              kill -USR1 "$target_pid" 2>/dev/null || echo "Failed to send signal"
              # Start a new window so dumps are requested at most every 30s.
              stable_count=0
            fi
          else
            stable_count=0
          fi
          last_cpu_check=$current_cpu

          # Check for zombie/stopped state
          state=$(echo "$ps_info" | tail -1 | awk '{print $6}')
          if [ "$state" = "Z" ] || [ "$state" = "T" ]; then
            echo "💀 [EXTERNAL] $(date): Process in abnormal state: $state"
          fi
        fi

        # Check for orphaned Python processes
        orphan_count=$(ps aux | grep -E "python.*pytest" | grep -v grep | wc -l)
        if [ "$orphan_count" -gt 1 ]; then
          echo "🔍 [EXTERNAL] $(date): Found $orphan_count pytest-related processes"
          ps aux | grep -E "python.*pytest" | grep -v grep || true
        fi

        # Emergency timeout: fires only if `timeout` (600s + 30s) failed.
        if [ "$elapsed" -gt 650 ]; then
          echo "💥 [EXTERNAL] $(date): Emergency timeout reached, force killing pytest"
          if [ -n "$target_pid" ]; then
            kill -KILL "$target_pid" 2>/dev/null || true
          fi
          kill -KILL "$wrapper_pid" 2>/dev/null || true
          break
        fi
      done
    }

    # Run pytest in background so we can monitor it externally
    python -u -c "import sys, time; print(f'🔍 [REALTIME] {time.strftime(\"%H:%M:%S\")} Starting pytest...', flush=True)"

    # No --preserve-status: a timeout must surface as exit code 124 (or 137
    # when the SIGKILL grace period was needed) so it can be detected below.
    timeout --signal=TERM --kill-after=30 600 bash -c '
      echo "▶️ [HANG DEBUG] Pytest starting at: $(date)"
      # python -u keeps stdout/stderr unbuffered; each line is timestamped.
      python -u debug_pytest.py tests/ -v --tb=short --maxfail=5 -x -s 2>&1 | while IFS= read -r line; do
        printf "%s [PYTEST] %s\n" "$(date +"%H:%M:%S")" "$line"
      done
      PYTEST_RESULT=${PIPESTATUS[0]}
      echo "✅ [HANG DEBUG] Pytest completed at: $(date) with exit code: $PYTEST_RESULT"
      exit $PYTEST_RESULT
    ' &
    PYTEST_PID=$!
    echo "🔍 [HANG DEBUG] Pytest started with PID: $PYTEST_PID"

    # Start external monitoring
    external_monitor $PYTEST_PID &
    EXTERNAL_MONITOR_PID=$!

    # Wait for pytest to complete. The `||` keeps errexit from aborting the
    # script on a non-zero status, so cleanup and diagnostics always run.
    PYTEST_EXIT=0
    wait $PYTEST_PID || PYTEST_EXIT=$?
    echo "🏁 [HANG DEBUG] Pytest process ended with exit code: $PYTEST_EXIT"

    # Stop external monitor
    kill $EXTERNAL_MONITOR_PID 2>/dev/null || true

    # Final cleanup check (must not touch PYTEST_EXIT)
    echo "🧹 [HANG DEBUG] Final cleanup check..."
    REMAINING_PROCS=$(ps aux | grep -E "python.*pytest" | grep -v grep | wc -l)
    if [ "$REMAINING_PROCS" -gt 0 ]; then
      echo "⚠️ [HANG DEBUG] Found $REMAINING_PROCS remaining pytest processes after completion"
      ps aux | grep -E "python.*pytest" | grep -v grep || true
      echo "💀 [HANG DEBUG] Force killing remaining processes..."
      ps aux | grep -E "python.*pytest" | grep -v grep | awk '{print $2}' | xargs -r kill -KILL || true
    else
      echo "✅ [HANG DEBUG] No remaining pytest processes found"
    fi

    # Stop background monitoring
    kill $MONITOR_PID 2>/dev/null || true

    echo "🔚 [HANG DEBUG] Pytest exit code: $PYTEST_EXIT"
    if [ "$PYTEST_EXIT" -eq 124 ] || [ "$PYTEST_EXIT" -eq 137 ]; then
      echo "⚠️ [HANG DEBUG] TIMEOUT! Pytest hung for >600s"
      echo "🔍 [HANG DEBUG] Final process state:"
      ps aux | grep -E "(python|pytest|embedding)" | grep -v grep || true
      echo "🔍 [HANG DEBUG] Final network state:"
      ss -tulpn | grep -E "(555[0-9]|556[0-9])" || echo "No ports"
      echo "💀 [HANG DEBUG] Killing remaining processes..."
      # pkill patterns are extended regexes: alternation is a bare `|`.
      pkill -TERM -f "pytest|embedding_server|zmq" || true
      sleep 3
      pkill -KILL -f "pytest|embedding_server|zmq" || true
    fi

    exit $PYTEST_EXIT
|
||
|
||
- name: Run sanity checks (optional)
  run: |
    # Activate virtual environment
    source .venv/bin/activate || source .venv/Scripts/activate

    # Run distance function tests if available. A failure is reported but
    # does not fail the step.
    # NOTE(review): this path is under `test/` while the pytest step uses
    # `tests/` - confirm the directory name is intended.
    sanity_script=test/sanity_checks/test_distance_functions.py
    if [ -f "$sanity_script" ]; then
      echo "Running distance function sanity checks..."
      python "$sanity_script" || echo "⚠️ Distance function test failed, continuing..."
    fi
|
||
|
||
# Publish every package's dist directory; the artifact name carries the
# matrix OS and Python version of the job that produced it.
- name: Upload artifacts
  uses: actions/upload-artifact@v4
  with:
    name: packages-${{ matrix.os }}-py${{ matrix.python }}
    path: packages/*/dist/
|