Compare commits

57 Commits: clean-stat...feature/gr

Commit SHA1s:

d9e5d5d6aa, a437f558a3, 742c9baabc, 60eef4b440, f2c5355c73, 439debbd3f, a35bfb0354, a6dad47280,
131f10b286, e3762458fc, 05e1efa00a, 6363fc5f83, 319dc34a24, 72a5993f02, 250272a3be, 042da1fe09,
2d9c183ebb, a8421c0475, 0ec00e1a60, 777b5fed01, 440ad6e816, 8714472cd8, c799d61a5a, e409933149,
bc31876a9f, e421c44b8b, af69aa0508, 575b354976, 65bbff1d93, df798d350d, 3fa6b2aa17, ba95554fe7,
677eb0bae3, 9cdfcec331, f30d1a2530, df69a49123, 65b54ff905, 4db3e94f35, a2568f3ddc, 45bdad4fa7,
8b538d1ef9, ada8bcbc70, 6061e8f2de, 9842ad8330, 7d920f9071, f28f15000c, 1d657fd9f6, d217adbe40,
f790ec634f, b8da9d7b12, 0cb0463929, b982241249, c66f197e1d, 4a1353761a, a72090d2ab, 669e622430,
77d7b60a61
`.github/workflows/build-and-publish.yml` (vendored): 9 changes
@@ -5,7 +5,16 @@ on:

    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:
    inputs:
      debug_enabled:
        type: boolean
        description: 'Run with tmate debugging enabled (SSH access to runner)'
        required: false
        default: false

jobs:
  build:
    uses: ./.github/workflows/build-reusable.yml
    with:
      debug_enabled: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled || false }}
`.github/workflows/build-reusable.yml` (vendored): 341 changes
@@ -8,6 +8,11 @@ on:

        required: false
        type: string
        default: ''
      debug_enabled:
        description: 'Enable tmate debugging session for troubleshooting'
        required: false
        type: boolean
        default: false

jobs:
  lint:

@@ -28,7 +33,7 @@ jobs:

      - name: Install ruff
        run: |
          uv tool install ruff
          uv tool install ruff==0.12.7

      - name: Run ruff check
        run: |
@@ -54,36 +59,16 @@ jobs:

          python: '3.12'
        - os: ubuntu-22.04
          python: '3.13'
        - os: macos-14
        - os: macos-latest
          python: '3.9'
        - os: macos-14
        - os: macos-latest
          python: '3.10'
        - os: macos-14
        - os: macos-latest
          python: '3.11'
        - os: macos-14
        - os: macos-latest
          python: '3.12'
        - os: macos-14
        - os: macos-latest
          python: '3.13'
        - os: macos-15
          python: '3.9'
        - os: macos-15
          python: '3.10'
        - os: macos-15
          python: '3.11'
        - os: macos-15
          python: '3.12'
        - os: macos-15
          python: '3.13'
        - os: macos-13
          python: '3.9'
        - os: macos-13
          python: '3.10'
        - os: macos-13
          python: '3.11'
        - os: macos-13
          python: '3.12'
        # Note: macos-13 + Python 3.13 excluded due to PyTorch compatibility
        # (PyTorch 2.5+ supports Python 3.13 but not Intel Mac x86_64)
    runs-on: ${{ matrix.os }}

    steps:
@@ -129,70 +114,41 @@ jobs:
|
||||
uv pip install --system delocate
|
||||
fi
|
||||
|
||||
- name: Set macOS environment variables
|
||||
if: runner.os == 'macOS'
|
||||
run: |
|
||||
# Use brew --prefix to automatically detect Homebrew installation path
|
||||
HOMEBREW_PREFIX=$(brew --prefix)
|
||||
echo "HOMEBREW_PREFIX=${HOMEBREW_PREFIX}" >> $GITHUB_ENV
|
||||
echo "OpenMP_ROOT=${HOMEBREW_PREFIX}/opt/libomp" >> $GITHUB_ENV
|
||||
|
||||
# Set CMAKE_PREFIX_PATH to let CMake find all packages automatically
|
||||
echo "CMAKE_PREFIX_PATH=${HOMEBREW_PREFIX}" >> $GITHUB_ENV
|
||||
|
||||
# Set compiler flags for OpenMP (required for both backends)
|
||||
echo "LDFLAGS=-L${HOMEBREW_PREFIX}/opt/libomp/lib" >> $GITHUB_ENV
|
||||
echo "CPPFLAGS=-I${HOMEBREW_PREFIX}/opt/libomp/include" >> $GITHUB_ENV
|
||||
|
||||
- name: Build packages
|
||||
run: |
|
||||
# Build core (platform independent)
|
||||
# Build core (platform independent) on all platforms for consistency
|
||||
cd packages/leann-core
|
||||
uv build
|
||||
cd ../..
|
||||
|
||||
# Build HNSW backend
|
||||
cd packages/leann-backend-hnsw
|
||||
if [[ "${{ matrix.os }}" == macos-* ]]; then
|
||||
# Use system clang for better compatibility
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
# Use system clang instead of homebrew LLVM for better compatibility
|
||||
export CC=clang
|
||||
export CXX=clang++
|
||||
# Homebrew libraries on each macOS version require matching minimum version
|
||||
if [[ "${{ matrix.os }}" == "macos-13" ]]; then
|
||||
export MACOSX_DEPLOYMENT_TARGET=13.0
|
||||
elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
|
||||
export MACOSX_DEPLOYMENT_TARGET=14.0
|
||||
elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
|
||||
export MACOSX_DEPLOYMENT_TARGET=15.0
|
||||
fi
|
||||
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
|
||||
export MACOSX_DEPLOYMENT_TARGET=11.0
|
||||
uv build --wheel --python python
|
||||
else
|
||||
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
|
||||
uv build --wheel --python python
|
||||
fi
|
||||
cd ../..
|
||||
|
||||
# Build DiskANN backend
|
||||
cd packages/leann-backend-diskann
|
||||
if [[ "${{ matrix.os }}" == macos-* ]]; then
|
||||
# Use system clang for better compatibility
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
# Use system clang instead of homebrew LLVM for better compatibility
|
||||
export CC=clang
|
||||
export CXX=clang++
|
||||
# DiskANN requires macOS 13.3+ for sgesdd_ LAPACK function
|
||||
# But Homebrew libraries on each macOS version require matching minimum version
|
||||
if [[ "${{ matrix.os }}" == "macos-13" ]]; then
|
||||
export MACOSX_DEPLOYMENT_TARGET=13.3
|
||||
elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
|
||||
export MACOSX_DEPLOYMENT_TARGET=14.0
|
||||
elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
|
||||
export MACOSX_DEPLOYMENT_TARGET=15.0
|
||||
fi
|
||||
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
|
||||
# sgesdd_ is only available on macOS 13.3+
|
||||
export MACOSX_DEPLOYMENT_TARGET=13.3
|
||||
uv build --wheel --python python
|
||||
else
|
||||
uv build --wheel --python ${{ matrix.python }} --find-links ${GITHUB_WORKSPACE}/packages/leann-core/dist
|
||||
uv build --wheel --python python
|
||||
fi
|
||||
cd ../..
|
||||
|
||||
# Build meta package (platform independent)
|
||||
# Build meta package (platform independent) on all platforms
|
||||
cd packages/leann
|
||||
uv build
|
||||
cd ../..
|
||||
@@ -209,10 +165,15 @@ jobs:

          fi
          cd ../..

          # Repair DiskANN wheel
          # Repair DiskANN wheel - use show first to debug
          cd packages/leann-backend-diskann
          if [ -d dist ]; then
            echo "Checking DiskANN wheel contents before repair:"
            unzip -l dist/*.whl | grep -E "\.so|\.pyd|_diskannpy" || echo "No .so files found"
            auditwheel show dist/*.whl || echo "auditwheel show failed"
            auditwheel repair dist/*.whl -w dist_repaired
            echo "Checking DiskANN wheel contents after repair:"
            unzip -l dist_repaired/*.whl | grep -E "\.so|\.pyd|_diskannpy" || echo "No .so files found after repair"
            rm -rf dist
            mv dist_repaired dist
          fi
@@ -221,24 +182,10 @@

      - name: Repair wheels (macOS)
        if: runner.os == 'macOS'
        run: |
          # Determine deployment target based on runner OS
          # Must match the Homebrew libraries for each macOS version
          if [[ "${{ matrix.os }}" == "macos-13" ]]; then
            HNSW_TARGET="13.0"
            DISKANN_TARGET="13.3"
          elif [[ "${{ matrix.os }}" == "macos-14" ]]; then
            HNSW_TARGET="14.0"
            DISKANN_TARGET="14.0"
          elif [[ "${{ matrix.os }}" == "macos-15" ]]; then
            HNSW_TARGET="15.0"
            DISKANN_TARGET="15.0"
          fi

          # Repair HNSW wheel
          cd packages/leann-backend-hnsw
          if [ -d dist ]; then
            export MACOSX_DEPLOYMENT_TARGET=$HNSW_TARGET
            delocate-wheel -w dist_repaired -v --require-target-macos-version $HNSW_TARGET dist/*.whl
            delocate-wheel -w dist_repaired -v dist/*.whl
            rm -rf dist
            mv dist_repaired dist
          fi

@@ -247,8 +194,7 @@

          # Repair DiskANN wheel
          cd packages/leann-backend-diskann
          if [ -d dist ]; then
            export MACOSX_DEPLOYMENT_TARGET=$DISKANN_TARGET
            delocate-wheel -w dist_repaired -v --require-target-macos-version $DISKANN_TARGET dist/*.whl
            delocate-wheel -w dist_repaired -v dist/*.whl
            rm -rf dist
            mv dist_repaired dist
          fi
@@ -259,23 +205,71 @@ jobs:
|
||||
echo "📦 Built packages:"
|
||||
find packages/*/dist -name "*.whl" -o -name "*.tar.gz" | sort
|
||||
|
||||
|
||||
- name: Install built packages for testing
|
||||
run: |
|
||||
# Create a virtual environment with the correct Python version
|
||||
uv venv --python ${{ matrix.python }}
|
||||
uv venv --python python${{ matrix.python }}
|
||||
source .venv/bin/activate || source .venv/Scripts/activate
|
||||
|
||||
# Install packages using --find-links to prioritize local builds
|
||||
uv pip install --find-links packages/leann-core/dist --find-links packages/leann-backend-hnsw/dist --find-links packages/leann-backend-diskann/dist packages/leann-core/dist/*.whl || uv pip install --find-links packages/leann-core/dist packages/leann-core/dist/*.tar.gz
|
||||
uv pip install --find-links packages/leann-core/dist packages/leann-backend-hnsw/dist/*.whl
|
||||
uv pip install --find-links packages/leann-core/dist packages/leann-backend-diskann/dist/*.whl
|
||||
uv pip install packages/leann/dist/*.whl || uv pip install packages/leann/dist/*.tar.gz
|
||||
# Install the built wheels directly to ensure we use locally built packages
|
||||
# Use only locally built wheels on all platforms for full consistency
|
||||
FIND_LINKS="--find-links packages/leann-core/dist --find-links packages/leann/dist"
|
||||
FIND_LINKS="$FIND_LINKS --find-links packages/leann-backend-hnsw/dist --find-links packages/leann-backend-diskann/dist"
|
||||
|
||||
uv pip install leann-core leann leann-backend-hnsw leann-backend-diskann \
|
||||
$FIND_LINKS --force-reinstall
|
||||
|
||||
# Install test dependencies using extras
|
||||
uv pip install -e ".[test]"
|
||||
|
||||
# Debug: Check if _diskannpy module is installed correctly
|
||||
echo "Checking installed DiskANN module structure:"
|
||||
python -c "import leann_backend_diskann; print('leann_backend_diskann location:', leann_backend_diskann.__file__)" || echo "Failed to import leann_backend_diskann"
|
||||
python -c "from leann_backend_diskann import _diskannpy; print('_diskannpy imported successfully')" || echo "Failed to import _diskannpy"
|
||||
ls -la $(python -c "import leann_backend_diskann; import os; print(os.path.dirname(leann_backend_diskann.__file__))" 2>/dev/null) 2>/dev/null || echo "Failed to list module directory"
|
||||
|
||||
# Extra debugging for Python 3.13
|
||||
if [[ "${{ matrix.python }}" == "3.13" ]]; then
|
||||
echo "=== Python 3.13 Debug Info ==="
|
||||
echo "Python version details:"
|
||||
python --version
|
||||
python -c "import sys; print(f'sys.version_info: {sys.version_info}')"
|
||||
|
||||
echo "Pytest version:"
|
||||
python -m pytest --version
|
||||
|
||||
echo "Testing basic pytest collection:"
|
||||
if [[ "$RUNNER_OS" == "Linux" ]]; then
|
||||
timeout --signal=INT 10 python -m pytest --collect-only tests/test_ci_minimal.py -v || echo "Collection timed out or failed"
|
||||
else
|
||||
# No timeout on macOS/Windows
|
||||
python -m pytest --collect-only tests/test_ci_minimal.py -v || echo "Collection failed"
|
||||
fi
|
||||
|
||||
echo "Testing single simple test:"
|
||||
if [[ "$RUNNER_OS" == "Linux" ]]; then
|
||||
timeout --signal=INT 10 python -m pytest tests/test_ci_minimal.py::test_package_imports --full-trace -v || echo "Simple test timed out or failed"
|
||||
else
|
||||
# No timeout on macOS/Windows
|
||||
python -m pytest tests/test_ci_minimal.py::test_package_imports --full-trace -v || echo "Simple test failed"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Enable tmate debugging session if requested
|
||||
- name: Setup tmate session for debugging
|
||||
if: ${{ inputs.debug_enabled }}
|
||||
uses: mxschmitt/action-tmate@v3
|
||||
with:
|
||||
detached: true
|
||||
timeout-minutes: 30
|
||||
limit-access-to-actor: true
|
||||
|
||||
- name: Run tests with pytest
|
||||
# Timeout hierarchy:
|
||||
# 1. Individual test timeout: 20s (see pyproject.toml markers)
|
||||
# 2. Pytest session timeout: 300s (see pyproject.toml [tool.pytest.ini_options])
|
||||
# 3. Outer shell timeout: 360s (300s + 60s buffer for cleanup)
|
||||
# 4. GitHub Actions job timeout: 6 hours (default)
|
||||
env:
|
||||
CI: true # Mark as CI environment to skip memory-intensive tests
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
@@ -288,8 +282,165 @@ jobs:
|
||||
# Activate virtual environment
|
||||
source .venv/bin/activate || source .venv/Scripts/activate
|
||||
|
||||
# Run tests
|
||||
pytest -v tests/
|
||||
# Define comprehensive diagnostic function
|
||||
diag() {
|
||||
echo "===== COMPREHENSIVE DIAGNOSTICS BEGIN ====="
|
||||
date
|
||||
echo ""
|
||||
echo "### Current Shell Info ###"
|
||||
echo "Shell PID: $$"
|
||||
echo "Shell PPID: $PPID"
|
||||
echo "Current directory: $(pwd)"
|
||||
echo ""
|
||||
|
||||
echo "### Process Tree (full) ###"
|
||||
pstree -ap 2>/dev/null || ps auxf || true
|
||||
echo ""
|
||||
|
||||
echo "### All Python/Pytest Processes ###"
|
||||
ps -ef | grep -E 'python|pytest' | grep -v grep || true
|
||||
echo ""
|
||||
|
||||
echo "### Embedding Server Processes ###"
|
||||
ps -ef | grep -E 'embedding|zmq|diskann' | grep -v grep || true
|
||||
echo ""
|
||||
|
||||
echo "### Network Listeners ###"
|
||||
ss -ltnp 2>/dev/null || netstat -ltn 2>/dev/null || true
|
||||
echo ""
|
||||
|
||||
echo "### Open File Descriptors (lsof) ###"
|
||||
lsof -p $$ 2>/dev/null | head -20 || true
|
||||
echo ""
|
||||
|
||||
echo "### Zombie Processes ###"
|
||||
ps aux | grep '<defunct>' || echo "No zombie processes"
|
||||
echo ""
|
||||
|
||||
echo "### Current Jobs ###"
|
||||
jobs -l || true
|
||||
echo ""
|
||||
|
||||
echo "### /proc/PID/fd for current shell ###"
|
||||
ls -la /proc/$$/fd 2>/dev/null || true
|
||||
echo ""
|
||||
|
||||
echo "===== COMPREHENSIVE DIAGNOSTICS END ====="
|
||||
}
|
||||
|
||||
# Enable verbose logging for debugging
|
||||
export PYTHONUNBUFFERED=1
|
||||
export PYTEST_CURRENT_TEST=1
|
||||
|
||||
# Run all tests with extensive logging
|
||||
if [[ "$RUNNER_OS" == "Linux" ]]; then
|
||||
echo "🚀 Starting Linux test execution with timeout..."
|
||||
echo "Current time: $(date)"
|
||||
echo "Shell PID: $$"
|
||||
echo "Python: $(python --version)"
|
||||
echo "Pytest: $(pytest --version)"
|
||||
|
||||
# Show environment variables for debugging
|
||||
echo "📦 Environment variables:"
|
||||
env | grep -E "PYTHON|PYTEST|CI|RUNNER" | sort
|
||||
|
||||
# Set trap for diagnostics
|
||||
trap diag INT TERM EXIT
|
||||
|
||||
echo "📋 Pre-test diagnostics:"
|
||||
ps -ef | grep -E 'python|pytest' | grep -v grep || echo "No python/pytest processes before test"
|
||||
|
||||
# Check for any listening ports before test
|
||||
echo "🔌 Pre-test network state:"
|
||||
ss -ltn 2>/dev/null | grep -E "555[0-9]|556[0-9]" || echo "No embedding server ports open"
|
||||
|
||||
# Set timeouts - outer must be larger than pytest's internal timeout
|
||||
# IMPORTANT: Keep PYTEST_TIMEOUT_SEC in sync with pyproject.toml [tool.pytest.ini_options] timeout
|
||||
PYTEST_TIMEOUT_SEC=${PYTEST_TIMEOUT_SEC:-300} # Default 300s, matches pyproject.toml
|
||||
BUFFER_SEC=${TIMEOUT_BUFFER_SEC:-60} # Buffer for cleanup after pytest timeout
|
||||
OUTER_TIMEOUT_SEC=${OUTER_TIMEOUT_SEC:-$((PYTEST_TIMEOUT_SEC + BUFFER_SEC))}
|
||||
|
||||
echo "⏰ Timeout configuration:"
|
||||
echo " - Pytest internal timeout: ${PYTEST_TIMEOUT_SEC}s (from pyproject.toml)"
|
||||
echo " - Cleanup buffer: ${BUFFER_SEC}s"
|
||||
echo " - Outer shell timeout: ${OUTER_TIMEOUT_SEC}s (${PYTEST_TIMEOUT_SEC}s + ${BUFFER_SEC}s buffer)"
|
||||
echo " - This ensures pytest can complete its own timeout handling and cleanup"
|
||||
|
||||
echo "🏃 Running pytest with ${OUTER_TIMEOUT_SEC}s outer timeout..."
|
||||
|
||||
# Export for inner shell
|
||||
export PYTEST_TIMEOUT_SEC OUTER_TIMEOUT_SEC BUFFER_SEC
|
||||
|
||||
timeout --preserve-status --signal=INT --kill-after=10 ${OUTER_TIMEOUT_SEC} bash -c '
|
||||
echo "⏱️ Pytest starting at: $(date)"
|
||||
echo "Running command: pytest tests/ -vv --maxfail=3 --tb=short --capture=no"
|
||||
|
||||
# Run pytest with maximum verbosity and no output capture
|
||||
pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=DEBUG 2>&1 | tee pytest.log
|
||||
PYTEST_EXIT=${PIPESTATUS[0]}
|
||||
|
||||
echo "✅ Pytest finished at: $(date) with exit code: $PYTEST_EXIT"
|
||||
echo "Last 20 lines of pytest output:"
|
||||
tail -20 pytest.log || true
|
||||
|
||||
# Immediately check for leftover processes
|
||||
echo "🔍 Post-pytest process check:"
|
||||
ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "No leftover processes"
|
||||
|
||||
# Clean up any children before exit
|
||||
echo "🧹 Cleaning up child processes..."
|
||||
pkill -TERM -P $$ 2>/dev/null || true
|
||||
sleep 0.5
|
||||
pkill -KILL -P $$ 2>/dev/null || true
|
||||
|
||||
echo "📊 Final check before exit:"
|
||||
ps -ef | grep -E "python|pytest|embedding" | grep -v grep || echo "All clean"
|
||||
|
||||
exit $PYTEST_EXIT
|
||||
'
|
||||
|
||||
EXIT_CODE=$?
|
||||
echo "🔚 Timeout command exited with code: $EXIT_CODE"
|
||||
|
||||
if [ $EXIT_CODE -eq 124 ]; then
|
||||
echo "⚠️ TIMEOUT TRIGGERED - Tests took more than ${OUTER_TIMEOUT_SEC} seconds!"
|
||||
echo "📸 Capturing full diagnostics..."
|
||||
diag
|
||||
|
||||
# Run diagnostic script if available
|
||||
if [ -f scripts/diagnose_hang.sh ]; then
|
||||
echo "🔍 Running diagnostic script..."
|
||||
bash scripts/diagnose_hang.sh || true
|
||||
fi
|
||||
|
||||
# More aggressive cleanup
|
||||
echo "💀 Killing all Python processes owned by runner..."
|
||||
pkill -9 -u runner python || true
|
||||
pkill -9 -u runner pytest || true
|
||||
elif [ $EXIT_CODE -ne 0 ]; then
|
||||
echo "❌ Tests failed with exit code: $EXIT_CODE"
|
||||
else
|
||||
echo "✅ All tests passed!"
|
||||
fi
|
||||
|
||||
# Always show final state
|
||||
echo "📍 Final state check:"
|
||||
ps -ef | grep -E 'python|pytest|embedding' | grep -v grep || echo "No Python processes remaining"
|
||||
|
||||
exit $EXIT_CODE
|
||||
else
|
||||
# For macOS/Windows, run without GNU timeout
|
||||
echo "🚀 Running tests on $RUNNER_OS..."
|
||||
pytest tests/ -vv --maxfail=3 --tb=short --capture=no --log-cli-level=INFO
|
||||
fi
|
||||
|
||||
# Provide tmate session on test failure for debugging
|
||||
- name: Setup tmate session on failure
|
||||
if: ${{ failure() && (inputs.debug_enabled || contains(github.event.head_commit.message, '[debug]')) }}
|
||||
uses: mxschmitt/action-tmate@v3
|
||||
with:
|
||||
timeout-minutes: 30
|
||||
limit-access-to-actor: true
|
||||
|
||||
- name: Run sanity checks (optional)
|
||||
run: |
|
||||
|
||||
@@ -1,6 +1,6 @@

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer

@@ -10,7 +10,7 @@ repos:

      - id: debug-statements

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.2.1
    rev: v0.12.7 # Fixed version to match pyproject.toml
    hooks:
      - id: ruff
      - id: ruff-format
`README.md`: 26 changes
@@ -3,11 +3,10 @@

</p>

<p align="center">
  <img src="https://img.shields.io/badge/Python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python Versions">
  <img src="https://github.com/yichuan-w/LEANN/actions/workflows/build-and-publish.yml/badge.svg" alt="CI Status">
  <img src="https://img.shields.io/badge/Platform-Ubuntu%20%7C%20macOS%20(ARM64%2FIntel)-lightgrey" alt="Platform">
  <img src="https://img.shields.io/badge/Python-3.9%2B-blue.svg" alt="Python 3.9+">
  <img src="https://img.shields.io/badge/License-MIT-green.svg" alt="MIT License">
  <img src="https://img.shields.io/badge/MCP-Native%20Integration-blue" alt="MCP Integration">
  <img src="https://img.shields.io/badge/Platform-Linux%20%7C%20macOS-lightgrey" alt="Platform">
  <img src="https://img.shields.io/badge/MCP-Native%20Integration-blue?style=flat-square" alt="MCP Integration">
</p>

<h2 align="center" tabindex="-1" class="heading-element" dir="auto">

@@ -71,8 +70,6 @@ source .venv/bin/activate

uv pip install leann
```

> Low-resource? See “Low-resource setups” in the [Configuration Guide](docs/configuration-guide.md#low-resource-setups).

<details>
<summary>
<strong>🔧 Build from Source (Recommended for development)</strong>

@@ -192,7 +189,7 @@ All RAG examples share these common parameters. **Interactive mode** is availabl

--force-rebuild # Force rebuild index even if it exists

# Embedding Parameters
--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text,mlx-community/Qwen3-Embedding-0.6B-8bit or nomic-embed-text
--embedding-model MODEL # e.g., facebook/contriever, text-embedding-3-small, nomic-embed-text, mlx-community/Qwen3-Embedding-0.6B-8bit or nomic-embed-text
--embedding-mode MODE # sentence-transformers, openai, mlx, or ollama

# LLM Parameters (Text generation models)

@@ -457,7 +454,7 @@ leann --help

**To make it globally available:**
```bash
# Install the LEANN CLI globally using uv tool
uv tool install leann
uv tool install leann-core

# Now you can use leann from anywhere without activating venv
leann --help

@@ -470,7 +467,7 @@ leann --help

### Usage Examples

```bash
# build from a specific directory, and my_docs is the index name(Here you can also build from multiple dict or multiple files)
# build from a specific directory, and my_docs is the index name
leann build my-docs --docs ./your_documents

# Search your documents

@@ -545,12 +542,16 @@ Options:

- **Dynamic batching:** Efficiently batch embedding computations for GPU utilization
- **Two-level search:** Smart graph traversal that prioritizes promising nodes

**Backends:** HNSW (default) for most use cases, with optional DiskANN support for billion-scale datasets.
**Backends:**
- **HNSW** (default): Ideal for most datasets with maximum storage savings through full recomputation
- **DiskANN**: Advanced option with superior search performance, using PQ-based graph traversal with real-time reranking for the best speed-accuracy trade-off

## Benchmarks

**[DiskANN vs HNSW Performance Comparison →](benchmarks/diskann_vs_hnsw_speed_comparison.py)** - Compare search performance between both backends

**[Simple Example: Compare LEANN vs FAISS →](benchmarks/compare_faiss_vs_leann.py)** - See storage savings in action

**[Simple Example: Compare LEANN vs FAISS →](benchmarks/compare_faiss_vs_leann.py)**
### 📊 Storage Comparison

| System | DPR (2.1M) | Wiki (60M) | Chat (400K) | Email (780K) | Browser (38K) |

@@ -609,9 +610,8 @@ We welcome more contributors! Feel free to open issues or submit PRs.

This work is done at [**Berkeley Sky Computing Lab**](https://sky.cs.berkeley.edu/).

## Star History
---

[](https://www.star-history.com/#yichuan-w/LEANN&Date)
<p align="center">
  <strong>⭐ Star us on GitHub if Leann is useful for your research or applications!</strong>
</p>

@@ -178,6 +178,9 @@ class BaseRAGExample(ABC):

            config["host"] = args.llm_host
        elif args.llm == "hf":
            config["model"] = args.llm_model or "Qwen/Qwen2.5-1.5B-Instruct"
        elif args.llm == "simulated":
            # Simulated LLM doesn't need additional configuration
            pass

        return config
@@ -1,9 +1,24 @@

# 🧪 Leann Sanity Checks
# 🧪 LEANN Benchmarks & Testing

This directory contains comprehensive sanity checks for the Leann system, ensuring all components work correctly across different configurations.
This directory contains performance benchmarks and comprehensive tests for the LEANN system, including backend comparisons and sanity checks across different configurations.

## 📁 Test Files

### `diskann_vs_hnsw_speed_comparison.py`
Performance comparison between DiskANN and HNSW backends:
- ✅ **Search latency** comparison with both backends using recompute
- ✅ **Index size** and **build time** measurements
- ✅ **Score validity** testing (ensures no -inf scores)
- ✅ **Configurable dataset sizes** for different scales

```bash
# Quick comparison with 500 docs, 10 queries
python benchmarks/diskann_vs_hnsw_speed_comparison.py

# Large-scale comparison with 2000 docs, 20 queries
python benchmarks/diskann_vs_hnsw_speed_comparison.py 2000 20
```
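For orientation, the latency numbers come from plain wall-clock timing around `LeannSearcher.search`. Below is a minimal sketch of that measurement loop; the index path and query list are placeholders, and the full script later in this diff handles index building, cleanup, and reporting:

```python
import time

import numpy as np
from leann.api import LeannSearcher

searcher = LeannSearcher("/tmp/benchmark_hnsw.leann")  # placeholder index path
queries = ["machine learning algorithms", "natural language processing"]

latencies = []
for query in queries:
    start = time.time()
    searcher.search(query, top_k=5)  # the call being benchmarked
    latencies.append(time.time() - start)

print(f"Average search time: {np.mean(latencies) * 1000:.1f}ms")
```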
### `test_distance_functions.py`
Tests all supported distance functions across DiskANN backend:
- ✅ **MIPS** (Maximum Inner Product Search)
`benchmarks/diskann_vs_hnsw_speed_comparison.py` (new file): 268 lines
@@ -0,0 +1,268 @@

#!/usr/bin/env python3
"""
DiskANN vs HNSW Search Performance Comparison

This benchmark compares search performance between DiskANN and HNSW backends:
- DiskANN: With graph partitioning enabled (is_recompute=True)
- HNSW: With recompute enabled (is_recompute=True)
- Tests performance across different dataset sizes
- Measures search latency, recall, and index size
"""

import gc
import tempfile
import time
from pathlib import Path
from typing import Any

import numpy as np
def create_test_texts(n_docs: int) -> list[str]:
|
||||
"""Create synthetic test documents for benchmarking."""
|
||||
np.random.seed(42)
|
||||
topics = [
|
||||
"machine learning and artificial intelligence",
|
||||
"natural language processing and text analysis",
|
||||
"computer vision and image recognition",
|
||||
"data science and statistical analysis",
|
||||
"deep learning and neural networks",
|
||||
"information retrieval and search engines",
|
||||
"database systems and data management",
|
||||
"software engineering and programming",
|
||||
"cybersecurity and network protection",
|
||||
"cloud computing and distributed systems",
|
||||
]
|
||||
|
||||
texts = []
|
||||
for i in range(n_docs):
|
||||
topic = topics[i % len(topics)]
|
||||
variation = np.random.randint(1, 100)
|
||||
text = (
|
||||
f"This is document {i} about {topic}. Content variation {variation}. "
|
||||
f"Additional information about {topic} with details and examples. "
|
||||
f"Technical discussion of {topic} including implementation aspects."
|
||||
)
|
||||
texts.append(text)
|
||||
|
||||
return texts
|
||||
|
||||
|
||||
def benchmark_backend(
|
||||
backend_name: str, texts: list[str], test_queries: list[str], backend_kwargs: dict[str, Any]
|
||||
) -> dict[str, float]:
|
||||
"""Benchmark a specific backend with the given configuration."""
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
print(f"\n🔧 Testing {backend_name.upper()} backend...")
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
index_path = str(Path(temp_dir) / f"benchmark_{backend_name}.leann")
|
||||
|
||||
# Build index
|
||||
print(f"📦 Building {backend_name} index with {len(texts)} documents...")
|
||||
start_time = time.time()
|
||||
|
||||
builder = LeannBuilder(
|
||||
backend_name=backend_name,
|
||||
embedding_model="facebook/contriever",
|
||||
embedding_mode="sentence-transformers",
|
||||
**backend_kwargs,
|
||||
)
|
||||
|
||||
for text in texts:
|
||||
builder.add_text(text)
|
||||
|
||||
builder.build_index(index_path)
|
||||
build_time = time.time() - start_time
|
||||
|
||||
# Measure index size
|
||||
index_dir = Path(index_path).parent
|
||||
index_files = list(index_dir.glob(f"{Path(index_path).stem}.*"))
|
||||
total_size = sum(f.stat().st_size for f in index_files if f.is_file())
|
||||
size_mb = total_size / (1024 * 1024)
|
||||
|
||||
print(f" ✅ Build completed in {build_time:.2f}s, index size: {size_mb:.1f}MB")
|
||||
|
||||
# Search benchmark
|
||||
print("🔍 Running search benchmark...")
|
||||
searcher = LeannSearcher(index_path)
|
||||
|
||||
search_times = []
|
||||
all_results = []
|
||||
|
||||
for query in test_queries:
|
||||
start_time = time.time()
|
||||
results = searcher.search(query, top_k=5)
|
||||
search_time = time.time() - start_time
|
||||
search_times.append(search_time)
|
||||
all_results.append(results)
|
||||
|
||||
avg_search_time = np.mean(search_times) * 1000 # Convert to ms
|
||||
print(f" ✅ Average search time: {avg_search_time:.1f}ms")
|
||||
|
||||
# Check for valid scores (detect -inf issues)
|
||||
all_scores = [
|
||||
result.score
|
||||
for results in all_results
|
||||
for result in results
|
||||
if result.score is not None
|
||||
]
|
||||
valid_scores = [
|
||||
score for score in all_scores if score != float("-inf") and score != float("inf")
|
||||
]
|
||||
score_validity_rate = len(valid_scores) / len(all_scores) if all_scores else 0
|
||||
|
||||
# Clean up
|
||||
try:
|
||||
if hasattr(searcher, "__del__"):
|
||||
searcher.__del__()
|
||||
del searcher
|
||||
del builder
|
||||
gc.collect()
|
||||
except Exception as e:
|
||||
print(f"⚠️ Warning: Resource cleanup error: {e}")
|
||||
|
||||
return {
|
||||
"build_time": build_time,
|
||||
"avg_search_time_ms": avg_search_time,
|
||||
"index_size_mb": size_mb,
|
||||
"score_validity_rate": score_validity_rate,
|
||||
}
|
||||
|
||||
|
||||
def run_comparison(n_docs: int = 500, n_queries: int = 10):
|
||||
"""Run performance comparison between DiskANN and HNSW."""
|
||||
print("🚀 Starting DiskANN vs HNSW Performance Comparison")
|
||||
print(f"📊 Dataset: {n_docs} documents, {n_queries} test queries")
|
||||
|
||||
# Create test data
|
||||
texts = create_test_texts(n_docs)
|
||||
test_queries = [
|
||||
"machine learning algorithms",
|
||||
"natural language processing",
|
||||
"computer vision techniques",
|
||||
"data analysis methods",
|
||||
"neural network architectures",
|
||||
"database query optimization",
|
||||
"software development practices",
|
||||
"security vulnerabilities",
|
||||
"cloud infrastructure",
|
||||
"distributed computing",
|
||||
][:n_queries]
|
||||
|
||||
# HNSW benchmark
|
||||
hnsw_results = benchmark_backend(
|
||||
backend_name="hnsw",
|
||||
texts=texts,
|
||||
test_queries=test_queries,
|
||||
backend_kwargs={
|
||||
"is_recompute": True, # Enable recompute for fair comparison
|
||||
"M": 16,
|
||||
"efConstruction": 200,
|
||||
},
|
||||
)
|
||||
|
||||
# DiskANN benchmark
|
||||
diskann_results = benchmark_backend(
|
||||
backend_name="diskann",
|
||||
texts=texts,
|
||||
test_queries=test_queries,
|
||||
backend_kwargs={
|
||||
"is_recompute": True, # Enable graph partitioning
|
||||
"num_neighbors": 32,
|
||||
"search_list_size": 50,
|
||||
},
|
||||
)
|
||||
|
||||
# Performance comparison
|
||||
print("\n📈 Performance Comparison Results")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"{'Metric':<25} {'HNSW':<15} {'DiskANN':<15} {'Speedup':<10}")
|
||||
print(f"{'-' * 60}")
|
||||
|
||||
# Build time comparison
|
||||
build_speedup = hnsw_results["build_time"] / diskann_results["build_time"]
|
||||
print(
|
||||
f"{'Build Time (s)':<25} {hnsw_results['build_time']:<15.2f} {diskann_results['build_time']:<15.2f} {build_speedup:<10.2f}x"
|
||||
)
|
||||
|
||||
# Search time comparison
|
||||
search_speedup = hnsw_results["avg_search_time_ms"] / diskann_results["avg_search_time_ms"]
|
||||
print(
|
||||
f"{'Search Time (ms)':<25} {hnsw_results['avg_search_time_ms']:<15.1f} {diskann_results['avg_search_time_ms']:<15.1f} {search_speedup:<10.2f}x"
|
||||
)
|
||||
|
||||
# Index size comparison
|
||||
size_ratio = diskann_results["index_size_mb"] / hnsw_results["index_size_mb"]
|
||||
print(
|
||||
f"{'Index Size (MB)':<25} {hnsw_results['index_size_mb']:<15.1f} {diskann_results['index_size_mb']:<15.1f} {size_ratio:<10.2f}x"
|
||||
)
|
||||
|
||||
# Score validity
|
||||
print(
|
||||
f"{'Score Validity (%)':<25} {hnsw_results['score_validity_rate'] * 100:<15.1f} {diskann_results['score_validity_rate'] * 100:<15.1f}"
|
||||
)
|
||||
|
||||
print(f"{'=' * 60}")
|
||||
print("\n🎯 Summary:")
|
||||
if search_speedup > 1:
|
||||
print(f" DiskANN is {search_speedup:.2f}x faster than HNSW for search")
|
||||
else:
|
||||
print(f" HNSW is {1 / search_speedup:.2f}x faster than DiskANN for search")
|
||||
|
||||
if size_ratio > 1:
|
||||
print(f" DiskANN uses {size_ratio:.2f}x more storage than HNSW")
|
||||
else:
|
||||
print(f" DiskANN uses {1 / size_ratio:.2f}x less storage than HNSW")
|
||||
|
||||
print(
|
||||
f" Both backends achieved {min(hnsw_results['score_validity_rate'], diskann_results['score_validity_rate']) * 100:.1f}% score validity"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
try:
|
||||
# Handle help request
|
||||
if len(sys.argv) > 1 and sys.argv[1] in ["-h", "--help", "help"]:
|
||||
print("DiskANN vs HNSW Performance Comparison")
|
||||
print("=" * 50)
|
||||
print(f"Usage: python {sys.argv[0]} [n_docs] [n_queries]")
|
||||
print()
|
||||
print("Arguments:")
|
||||
print(" n_docs Number of documents to index (default: 500)")
|
||||
print(" n_queries Number of test queries to run (default: 10)")
|
||||
print()
|
||||
print("Examples:")
|
||||
print(" python benchmarks/diskann_vs_hnsw_speed_comparison.py")
|
||||
print(" python benchmarks/diskann_vs_hnsw_speed_comparison.py 1000")
|
||||
print(" python benchmarks/diskann_vs_hnsw_speed_comparison.py 2000 20")
|
||||
sys.exit(0)
|
||||
|
||||
# Parse command line arguments
|
||||
n_docs = int(sys.argv[1]) if len(sys.argv) > 1 else 500
|
||||
n_queries = int(sys.argv[2]) if len(sys.argv) > 2 else 10
|
||||
|
||||
print("DiskANN vs HNSW Performance Comparison")
|
||||
print("=" * 50)
|
||||
print(f"Dataset: {n_docs} documents, {n_queries} queries")
|
||||
print()
|
||||
|
||||
run_comparison(n_docs=n_docs, n_queries=n_queries)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n⚠️ Benchmark interrupted by user")
|
||||
sys.exit(130)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Benchmark failed: {e}")
|
||||
sys.exit(1)
|
||||
finally:
|
||||
# Ensure clean exit
|
||||
try:
|
||||
gc.collect()
|
||||
print("\n🧹 Cleanup completed")
|
||||
except Exception:
|
||||
pass
|
||||
sys.exit(0)
|
||||
@@ -97,16 +97,30 @@ ollama pull nomic-embed-text

```

### DiskANN
**Best for**: Large datasets (> 10M vectors, 10GB+ index size) - **⚠️ Beta version, still in active development**
- Uses Product Quantization (PQ) for coarse filtering during graph traversal
- Novel approach: stores only PQ codes, performs rerank with exact computation in final step
- Implements a corner case of double-queue: prunes all neighbors and recomputes at the end
**Best for**: Performance-critical applications and large datasets - **Production-ready with automatic graph partitioning**

**How it works:**
- **Product Quantization (PQ) + Real-time Reranking**: Uses compressed PQ codes for fast graph traversal, then recomputes exact embeddings for final candidates
- **Automatic Graph Partitioning**: When `is_recompute=True`, automatically partitions large indices and safely removes redundant files to save storage
- **Superior Speed-Accuracy Trade-off**: Faster search than HNSW while maintaining high accuracy

**Trade-offs compared to HNSW:**
- ✅ **Faster search latency** (typically 2-8x speedup)
- ✅ **Better scaling** for large datasets
- ✅ **Smart storage management** with automatic partitioning
- ✅ **Better graph locality** with `--ldg-times` parameter for SSD optimization
- ⚠️ **Slightly larger index size** due to PQ tables and graph metadata

```bash
# For billion-scale deployments
# Recommended for most use cases
--backend-name diskann --graph-degree 32 --build-complexity 64

# For large-scale deployments
--backend-name diskann --graph-degree 64 --build-complexity 128
```

**Performance Benchmark**: Run `python benchmarks/diskann_vs_hnsw_speed_comparison.py` to compare DiskANN and HNSW on your system.
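The CLI flags above have Python-level counterparts. As a rough sketch, the benchmark script added in this PR builds and queries a DiskANN index like this; the documents and paths are placeholders, and the mapping of `num_neighbors`/`search_list_size` to the `--graph-degree`/`--build-complexity` flags is an assumption:

```python
from leann.api import LeannBuilder, LeannSearcher

builder = LeannBuilder(
    backend_name="diskann",
    embedding_model="facebook/contriever",
    embedding_mode="sentence-transformers",
    is_recompute=True,     # triggers automatic graph partitioning after the build
    num_neighbors=32,      # assumed counterpart of --graph-degree
    search_list_size=50,   # assumed counterpart of --build-complexity
)

for text in ["placeholder document one", "placeholder document two"]:
    builder.add_text(text)

builder.build_index("./my_docs.leann")  # placeholder path

searcher = LeannSearcher("./my_docs.leann")
results = searcher.search("example query", top_k=5)
```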
## LLM Selection: Engine and Model Comparison

### LLM Engines
@@ -259,83 +273,28 @@ Every configuration choice involves trade-offs:

The key is finding the right balance for your specific use case. Start small and simple, measure performance, then scale up only where needed.

## Low-resource setups
## Deep Dive: Critical Configuration Decisions

If you don’t have a local GPU or builds/searches are too slow, use one or more of the options below.
### When to Disable Recomputation

### 1) Use OpenAI embeddings (no local compute)

Fastest path with zero local GPU requirements. Set your API key and use OpenAI embeddings during build and search:
LEANN's recomputation feature provides exact distance calculations but can be disabled for extreme QPS requirements:

```bash
export OPENAI_API_KEY=sk-...

# Build with OpenAI embeddings
leann build my-index \
  --embedding-mode openai \
  --embedding-model text-embedding-3-small

# Search with OpenAI embeddings (recompute at query time)
leann search my-index "your query" \
  --recompute-embeddings
  --no-recompute # Disable selective recomputation
```

### 2) Run remote builds with SkyPilot (cloud GPU)
**Trade-offs**:
- **With recomputation** (default): Exact distances, best quality, higher latency, minimal storage (only stores metadata, recomputes embeddings on-demand)
- **Without recomputation**: Must store full embeddings, significantly higher memory and storage usage (10-100x more), but faster search

Offload embedding generation and index building to a GPU VM using SkyPilot. A template is provided at `sky/leann-build.yaml`.

```bash
# One-time: install and configure SkyPilot
pip install skypilot
sky launch -c leann-gpu sky/leann-build.yaml

# Build remotely (template installs uv + leann CLI)
sky exec leann-gpu -- "leann build my-index --docs ~/leann-data --backend hnsw --complexity 64 --graph-degree 32"
```

Details: see “Running Builds on SkyPilot (Optional)” below.

### 3) Disable recomputation to trade storage for speed

If you need lower latency and have more storage/memory, disable recomputation. This stores full embeddings and avoids recomputing at search time.

```bash
# Build without recomputation (HNSW requires non-compact in this mode)
leann build my-index --no-recompute --no-compact

# Search without recomputation
leann search my-index "your query" --no-recompute
```

Trade-offs: lower query-time latency, but significantly higher storage usage.

## Running Builds on SkyPilot (Optional)

You can offload embedding generation and index building to a cloud GPU VM using SkyPilot, without changing any LEANN code. This is useful when your local machine lacks a GPU or you want faster throughput.

### Quick Start

1) Install SkyPilot by following their docs (`pip install skypilot`), then configure cloud credentials.

2) Use the provided SkyPilot template:

```bash
sky launch -c leann-gpu sky/leann-build.yaml
```

3) On the remote, either put your data under the mounted path or adjust `file_mounts` in `sky/leann-build.yaml`. Then run the LEANN build:

```bash
sky exec leann-gpu -- "leann build my-index --docs ~/leann-data --backend hnsw --complexity 64 --graph-degree 32"
```

Notes:
- The template installs `uv` and the `leann` CLI globally on the remote instance.
- Change the `accelerators` and `cloud` settings in `sky/leann-build.yaml` to match your budget/availability (e.g., `A10G:1`, `A100:1`, or CPU-only if you prefer).
- You can also build with `diskann` by switching `--backend diskann`.
**Disable when**:
- You have abundant storage and memory
- Need extremely low latency (< 100ms)
- Running a read-heavy workload where storage cost is acceptable
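At the Python API level the same switch is the `is_recompute` flag that the benchmark script passes to `LeannBuilder` (set to `True` there). A hedged sketch of the no-recompute variant, which stores full embeddings instead of recomputing them at query time:

```python
from leann.api import LeannBuilder

# Sketch only: is_recompute=False is assumed to mirror the CLI's --no-recompute;
# the CLI also requires --no-compact for HNSW in this mode, and the corresponding
# builder kwarg is not shown in this diff.
builder = LeannBuilder(
    backend_name="hnsw",
    embedding_model="facebook/contriever",
    embedding_mode="sentence-transformers",
    is_recompute=False,  # trade storage for lower query-time latency
)
builder.add_text("placeholder document")
builder.build_index("./no_recompute.leann")  # placeholder path
```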
## Further Reading

- [Lessons Learned Developing LEANN](https://yichuan-w.github.io/blog/lessons_learned_in_dev_leann/)
- [LEANN Technical Paper](https://arxiv.org/abs/2506.08276)
- [DiskANN Original Paper](https://papers.nips.cc/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf)
- [SSD-based Graph Partitioning](https://github.com/SonglinLife/SSD_BASED_PLAN)
`packages/leann-backend-diskann/CMakeLists.txt` (new file): 8 lines
@@ -0,0 +1,8 @@

# packages/leann-backend-diskann/CMakeLists.txt (simplified version)

cmake_minimum_required(VERSION 3.20)
project(leann_backend_diskann_wrapper)

# Tell CMake to directly enter the DiskANN submodule and execute its own CMakeLists.txt
# DiskANN will handle everything itself, including compiling Python bindings
add_subdirectory(src/third_party/DiskANN)
@@ -1 +1,7 @@

from . import diskann_backend as diskann_backend
from . import graph_partition

# Export main classes and functions
from .graph_partition import GraphPartitioner, partition_graph

__all__ = ["GraphPartitioner", "diskann_backend", "graph_partition", "partition_graph"]
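For reference, the `partition_graph` helper exported here is the same function that `DiskannBuilder.build` calls when `is_recompute=True` (see the diff below). A minimal standalone sketch with placeholder paths:

```python
from leann_backend_diskann import partition_graph

# Placeholder prefix: the builder passes the absolute index prefix, i.e. the
# path shared by DiskANN files such as <prefix>_disk.index.
disk_graph_path, partition_bin_path = partition_graph(
    index_prefix_path="/path/to/index_dir/my_index",
    output_dir="/path/to/index_dir",
    partition_prefix="my_index",
)
print(disk_graph_path)     # expected: /path/to/index_dir/my_index_disk_graph.index
print(partition_bin_path)  # expected: /path/to/index_dir/my_index_partition.bin
```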
@@ -137,6 +137,71 @@ class DiskannBuilder(LeannBackendBuilderInterface):

    def __init__(self, **kwargs):
        self.build_params = kwargs

    def _safe_cleanup_after_partition(self, index_dir: Path, index_prefix: str):
        """
        Safely cleanup files after partition.
        In partition mode, C++ doesn't read _disk.index content,
        so we can delete it if all derived files exist.
        """
        disk_index_file = index_dir / f"{index_prefix}_disk.index"
        beam_search_file = index_dir / f"{index_prefix}_disk_beam_search.index"

        # Required files that C++ partition mode needs
        # Note: C++ generates these with _disk.index suffix
        disk_suffix = "_disk.index"
        required_files = [
            f"{index_prefix}{disk_suffix}_medoids.bin",  # Critical: assert fails if missing
            # Note: _centroids.bin is not created in single-shot build - C++ handles this automatically
            f"{index_prefix}_pq_pivots.bin",  # PQ table
            f"{index_prefix}_pq_compressed.bin",  # PQ compressed vectors
        ]

        # Check if all required files exist
        missing_files = []
        for filename in required_files:
            file_path = index_dir / filename
            if not file_path.exists():
                missing_files.append(filename)

        if missing_files:
            logger.warning(
                f"Cannot safely delete _disk.index - missing required files: {missing_files}"
            )
            logger.info("Keeping all original files for safety")
            return

        # Calculate space savings
        space_saved = 0
        files_to_delete = []

        if disk_index_file.exists():
            space_saved += disk_index_file.stat().st_size
            files_to_delete.append(disk_index_file)

        if beam_search_file.exists():
            space_saved += beam_search_file.stat().st_size
            files_to_delete.append(beam_search_file)

        # Safe to delete!
        for file_to_delete in files_to_delete:
            try:
                os.remove(file_to_delete)
                logger.info(f"✅ Safely deleted: {file_to_delete.name}")
            except Exception as e:
                logger.warning(f"Failed to delete {file_to_delete.name}: {e}")

        if space_saved > 0:
            space_saved_mb = space_saved / (1024 * 1024)
            logger.info(f"💾 Space saved: {space_saved_mb:.1f} MB")

        # Show what files are kept
        logger.info("📁 Kept essential files for partition mode:")
        for filename in required_files:
            file_path = index_dir / filename
            if file_path.exists():
                size_mb = file_path.stat().st_size / (1024 * 1024)
                logger.info(f" - {filename} ({size_mb:.1f} MB)")

    def build(self, data: np.ndarray, ids: list[str], index_path: str, **kwargs):
        path = Path(index_path)
        index_dir = path.parent
@@ -151,6 +216,17 @@ class DiskannBuilder(LeannBackendBuilderInterface):

        _write_vectors_to_bin(data, index_dir / data_filename)

        build_kwargs = {**self.build_params, **kwargs}

        # Extract is_recompute from nested backend_kwargs if needed
        is_recompute = build_kwargs.get("is_recompute", False)
        if not is_recompute and "backend_kwargs" in build_kwargs:
            is_recompute = build_kwargs["backend_kwargs"].get("is_recompute", False)

        # Flatten all backend_kwargs parameters to top level for compatibility
        if "backend_kwargs" in build_kwargs:
            nested_params = build_kwargs.pop("backend_kwargs")
            build_kwargs.update(nested_params)

        metric_enum = _get_diskann_metrics().get(
            build_kwargs.get("distance_metric", "mips").lower()
        )

@@ -185,6 +261,30 @@ class DiskannBuilder(LeannBackendBuilderInterface):

                build_kwargs.get("pq_disk_bytes", 0),
                "",
            )

            # Auto-partition if is_recompute is enabled
            if build_kwargs.get("is_recompute", False):
                logger.info("is_recompute=True, starting automatic graph partitioning...")
                from .graph_partition import partition_graph

                # Partition the index using absolute paths
                # Convert to absolute paths to avoid issues with working directory changes
                absolute_index_dir = Path(index_dir).resolve()
                absolute_index_prefix_path = str(absolute_index_dir / index_prefix)
                disk_graph_path, partition_bin_path = partition_graph(
                    index_prefix_path=absolute_index_prefix_path,
                    output_dir=str(absolute_index_dir),
                    partition_prefix=index_prefix,
                )

                # Safe cleanup: In partition mode, C++ doesn't read _disk.index content
                # but still needs the derived files (_medoids.bin, _centroids.bin, etc.)
                self._safe_cleanup_after_partition(index_dir, index_prefix)

                logger.info("✅ Graph partitioning completed successfully!")
                logger.info(f" - Disk graph: {disk_graph_path}")
                logger.info(f" - Partition file: {partition_bin_path}")

        finally:
            temp_data_file = index_dir / data_filename
            if temp_data_file.exists():
@@ -213,7 +313,26 @@ class DiskannSearcher(BaseSearcher):

        # For DiskANN, we need to reinitialize the index when zmq_port changes
        # Store the initialization parameters for later use
        full_index_prefix = str(self.index_dir / self.index_path.stem)
        # Note: C++ load method expects the BASE path (without _disk.index suffix)
        # C++ internally constructs: index_prefix + "_disk.index"
        index_name = self.index_path.stem  # "simple_test.leann" -> "simple_test"
        diskann_index_prefix = str(self.index_dir / index_name)  # /path/to/simple_test
        full_index_prefix = diskann_index_prefix  # /path/to/simple_test (base path)

        # Auto-detect partition files and set partition_prefix
        partition_graph_file = self.index_dir / f"{index_name}_disk_graph.index"
        partition_bin_file = self.index_dir / f"{index_name}_partition.bin"

        partition_prefix = ""
        if partition_graph_file.exists() and partition_bin_file.exists():
            # C++ expects full path prefix, not just filename
            partition_prefix = str(self.index_dir / index_name)  # /path/to/simple_test
            logger.info(
                f"✅ Detected partition files, using partition_prefix='{partition_prefix}'"
            )
        else:
            logger.debug("No partition files detected, using standard index files")

        self._init_params = {
            "metric_enum": metric_enum,
            "full_index_prefix": full_index_prefix,

@@ -221,8 +340,14 @@ class DiskannSearcher(BaseSearcher):

            "num_nodes_to_cache": kwargs.get("num_nodes_to_cache", 0),
            "cache_mechanism": 1,
            "pq_prefix": "",
            "partition_prefix": "",
            "partition_prefix": partition_prefix,
        }

        # Log partition configuration for debugging
        if partition_prefix:
            logger.info(
                f"✅ Detected partition files, using partition_prefix='{partition_prefix}'"
            )
        self._diskannpy = diskannpy
        self._current_zmq_port = None
        self._index = None
@@ -334,3 +459,25 @@ class DiskannSearcher(BaseSearcher):

        string_labels = [[str(int_label) for int_label in batch_labels] for batch_labels in labels]

        return {"labels": string_labels, "distances": distances}

    def cleanup(self):
        """Cleanup DiskANN-specific resources including C++ index."""
        # Call parent cleanup first
        super().cleanup()

        # Delete the C++ index to trigger destructors
        try:
            if hasattr(self, "_index") and self._index is not None:
                del self._index
                self._index = None
                self._current_zmq_port = None
        except Exception:
            pass

        # Force garbage collection to ensure C++ objects are destroyed
        try:
            import gc

            gc.collect()
        except Exception:
            pass
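The new `cleanup()` method is meant to be paired with explicit teardown on the caller side (the benchmark script and CI tests take care to avoid leftover embedding-server processes). A short usage sketch, assuming an existing index at a placeholder path and that the searcher object exposes `cleanup()`:

```python
from leann.api import LeannSearcher

searcher = LeannSearcher("./my_docs.leann")  # placeholder path
try:
    results = searcher.search("example query", top_k=5)
finally:
    # Guarded call: cleanup() is defined on the DiskANN backend searcher; the
    # public wrapper may or may not forward it, hence the hasattr check.
    if hasattr(searcher, "cleanup"):
        searcher.cleanup()
```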
@@ -81,7 +81,8 @@ def create_diskann_embedding_server(

    with open(passages_file) as f:
        meta = json.load(f)

    passages = PassageManager(meta["passage_sources"])
    logger.info(f"Loading PassageManager with metadata_file_path: {passages_file}")
    passages = PassageManager(meta["passage_sources"], metadata_file_path=passages_file)
    logger.info(
        f"Loaded PassageManager with {len(passages.global_offset_map)} passages from metadata"
    )

@@ -99,6 +100,7 @@ def create_diskann_embedding_server(

    socket = context.socket(
        zmq.REP
    )  # REP socket for both BaseSearcher and DiskANN C++ REQ clients
    socket.setsockopt(zmq.LINGER, 0)  # Don't block on close
    socket.bind(f"tcp://*:{zmq_port}")
    logger.info(f"DiskANN ZMQ REP server listening on port {zmq_port}")
@@ -0,0 +1,299 @@

#!/usr/bin/env python3
"""
Graph Partition Module for LEANN DiskANN Backend

This module provides Python bindings for the graph partition functionality
of DiskANN, allowing users to partition disk-based indices for better
performance.
"""

import os
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Optional


class GraphPartitioner:
    """
    A Python interface for DiskANN's graph partition functionality.

    This class provides methods to partition disk-based indices for improved
    search performance and memory efficiency.
    """
def __init__(self, build_type: str = "release"):
|
||||
"""
|
||||
Initialize the GraphPartitioner.
|
||||
|
||||
Args:
|
||||
build_type: Build type for the executables ("debug" or "release")
|
||||
"""
|
||||
self.build_type = build_type
|
||||
self._ensure_executables()
|
||||
|
||||
def _get_executable_path(self, name: str) -> str:
|
||||
"""Get the path to a graph partition executable."""
|
||||
# Get the directory where this Python module is located
|
||||
module_dir = Path(__file__).parent
|
||||
# Navigate to the graph_partition directory
|
||||
graph_partition_dir = module_dir.parent / "third_party" / "DiskANN" / "graph_partition"
|
||||
executable_path = graph_partition_dir / "build" / self.build_type / "graph_partition" / name
|
||||
|
||||
if not executable_path.exists():
|
||||
raise FileNotFoundError(f"Executable {name} not found at {executable_path}")
|
||||
|
||||
return str(executable_path)
|
||||
|
||||
def _ensure_executables(self):
|
||||
"""Ensure that the required executables are built."""
|
||||
try:
|
||||
self._get_executable_path("partitioner")
|
||||
self._get_executable_path("index_relayout")
|
||||
except FileNotFoundError:
|
||||
# Try to build the executables automatically
|
||||
print("Executables not found, attempting to build them...")
|
||||
self._build_executables()
|
||||
|
||||
def _build_executables(self):
|
||||
"""Build the required executables."""
|
||||
graph_partition_dir = (
|
||||
Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition"
|
||||
)
|
||||
original_dir = os.getcwd()
|
||||
|
||||
try:
|
||||
os.chdir(graph_partition_dir)
|
||||
|
||||
# Clean any existing build
|
||||
if (graph_partition_dir / "build").exists():
|
||||
shutil.rmtree(graph_partition_dir / "build")
|
||||
|
||||
# Run the build script
|
||||
cmd = ["./build.sh", self.build_type, "split_graph", "/tmp/dummy"]
|
||||
subprocess.run(cmd, capture_output=True, text=True, cwd=graph_partition_dir)
|
||||
|
||||
# Check if executables were created
|
||||
partitioner_path = self._get_executable_path("partitioner")
|
||||
relayout_path = self._get_executable_path("index_relayout")
|
||||
|
||||
print(f"✅ Built partitioner: {partitioner_path}")
|
||||
print(f"✅ Built index_relayout: {relayout_path}")
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to build executables: {e}")
|
||||
finally:
|
||||
os.chdir(original_dir)
|
||||
|
||||
def partition_graph(
|
||||
self,
|
||||
index_prefix_path: str,
|
||||
output_dir: Optional[str] = None,
|
||||
partition_prefix: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> tuple[str, str]:
|
||||
"""
|
||||
Partition a disk-based index for improved performance.
|
||||
|
||||
Args:
|
||||
index_prefix_path: Path to the index prefix (e.g., "/path/to/index")
|
||||
output_dir: Output directory for results (defaults to parent of index_prefix_path)
|
||||
partition_prefix: Prefix for output files (defaults to basename of index_prefix_path)
|
||||
**kwargs: Additional parameters for graph partitioning:
|
||||
- gp_times: Number of LDG partition iterations (default: 10)
|
||||
- lock_nums: Number of lock nodes (default: 10)
|
||||
- cut: Cut adjacency list degree (default: 100)
|
||||
- scale_factor: Scale factor (default: 1)
|
||||
- data_type: Data type (default: "float")
|
||||
- thread_nums: Number of threads (default: 10)
|
||||
|
||||
Returns:
|
||||
Tuple of (disk_graph_index_path, partition_bin_path)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the partitioning process fails
|
||||
"""
|
||||
# Set default parameters
|
||||
params = {
|
||||
"gp_times": 10,
|
||||
"lock_nums": 10,
|
||||
"cut": 100,
|
||||
"scale_factor": 1,
|
||||
"data_type": "float",
|
||||
"thread_nums": 10,
|
||||
**kwargs,
|
||||
}
|
||||
|
||||
# Determine output directory
|
||||
if output_dir is None:
|
||||
output_dir = str(Path(index_prefix_path).parent)
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
Path(output_dir).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Determine partition prefix
|
||||
if partition_prefix is None:
|
||||
partition_prefix = Path(index_prefix_path).name
|
||||
|
||||
# Get executable paths
|
||||
partitioner_path = self._get_executable_path("partitioner")
|
||||
relayout_path = self._get_executable_path("index_relayout")
|
||||
|
||||
# Create temporary directory for processing
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Change to the graph_partition directory for temporary files
|
||||
graph_partition_dir = (
|
||||
Path(__file__).parent.parent / "third_party" / "DiskANN" / "graph_partition"
|
||||
)
|
||||
original_dir = os.getcwd()
|
||||
|
||||
try:
|
||||
os.chdir(graph_partition_dir)
|
||||
|
||||
# Create temporary data directory
|
||||
temp_data_dir = Path(temp_dir) / "data"
|
||||
temp_data_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Set up paths for temporary files
|
||||
graph_path = temp_data_dir / "starling" / "_M_R_L_B" / "GRAPH"
|
||||
graph_gp_path = (
|
||||
graph_path
|
||||
/ f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}"
|
||||
)
|
||||
graph_gp_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Find input index file
|
||||
old_index_file = f"{index_prefix_path}_disk_beam_search.index"
|
||||
if not os.path.exists(old_index_file):
|
||||
old_index_file = f"{index_prefix_path}_disk.index"
|
||||
|
||||
if not os.path.exists(old_index_file):
|
||||
raise RuntimeError(f"Index file not found: {old_index_file}")
|
||||
|
||||
# Run partitioner
|
||||
gp_file_path = graph_gp_path / "_part.bin"
|
||||
partitioner_cmd = [
|
||||
partitioner_path,
|
||||
"--index_file",
|
||||
old_index_file,
|
||||
"--data_type",
|
||||
params["data_type"],
|
||||
"--gp_file",
|
||||
str(gp_file_path),
|
||||
"-T",
|
||||
str(params["thread_nums"]),
|
||||
"--ldg_times",
|
||||
str(params["gp_times"]),
|
||||
"--scale",
|
||||
str(params["scale_factor"]),
|
||||
"--mode",
|
||||
"1",
|
||||
]
|
||||
|
||||
print(f"Running partitioner: {' '.join(partitioner_cmd)}")
|
||||
result = subprocess.run(
|
||||
partitioner_cmd, capture_output=True, text=True, cwd=graph_partition_dir
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"Partitioner failed with return code {result.returncode}.\n"
|
||||
f"stdout: {result.stdout}\n"
|
||||
f"stderr: {result.stderr}"
|
||||
)
|
||||
|
||||
# Run relayout
|
||||
part_tmp_index = graph_gp_path / "_part_tmp.index"
|
||||
relayout_cmd = [
|
||||
relayout_path,
|
||||
old_index_file,
|
||||
str(gp_file_path),
|
||||
params["data_type"],
|
||||
"1",
|
||||
]
|
||||
|
||||
print(f"Running relayout: {' '.join(relayout_cmd)}")
|
||||
result = subprocess.run(
|
||||
relayout_cmd, capture_output=True, text=True, cwd=graph_partition_dir
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"Relayout failed with return code {result.returncode}.\n"
|
||||
f"stdout: {result.stdout}\n"
|
||||
f"stderr: {result.stderr}"
|
||||
)
|
||||
|
||||
# Copy results to output directory
|
||||
disk_graph_path = Path(output_dir) / f"{partition_prefix}_disk_graph.index"
|
||||
partition_bin_path = Path(output_dir) / f"{partition_prefix}_partition.bin"
|
||||
|
||||
shutil.copy2(part_tmp_index, disk_graph_path)
|
||||
shutil.copy2(gp_file_path, partition_bin_path)
|
||||
|
||||
print(f"Results copied to: {output_dir}")
|
||||
return str(disk_graph_path), str(partition_bin_path)
|
||||
|
||||
finally:
|
||||
os.chdir(original_dir)
|
||||
|
||||
def get_partition_info(self, partition_bin_path: str) -> dict:
    """
    Get information about a partition file.

    Args:
        partition_bin_path: Path to the partition binary file

    Returns:
        Dictionary containing partition information
    """
    if not os.path.exists(partition_bin_path):
        raise FileNotFoundError(f"Partition file not found: {partition_bin_path}")

    # For now, return basic file information
    # In the future, this could parse the binary file for detailed info
    stat = os.stat(partition_bin_path)
    return {
        "file_size": stat.st_size,
        "file_path": partition_bin_path,
        "modified_time": stat.st_mtime,
    }
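# Illustrative usage (sketch only; the index path is a placeholder, everything else
# is defined above in this module): partition an existing DiskANN index and then
# inspect the resulting partition file.
gp = GraphPartitioner(build_type="release")
disk_graph, partition_bin = gp.partition_graph(
    "/path/to/index_prefix", gp_times=10, lock_nums=10, cut=100
)
info = gp.get_partition_info(partition_bin)
print(f"partition file: {info['file_path']} ({info['file_size']} bytes)")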
|
||||
|
||||
|
||||
def partition_graph(
    index_prefix_path: str,
    output_dir: Optional[str] = None,
    partition_prefix: Optional[str] = None,
    build_type: str = "release",
    **kwargs,
) -> tuple[str, str]:
    """
    Convenience function to partition a graph index.

    Args:
        index_prefix_path: Path to the index prefix
        output_dir: Output directory (defaults to parent of index_prefix_path)
        partition_prefix: Prefix for output files (defaults to basename of index_prefix_path)
        build_type: Build type for executables ("debug" or "release")
        **kwargs: Additional parameters for graph partitioning

    Returns:
        Tuple of (disk_graph_index_path, partition_bin_path)
    """
    partitioner = GraphPartitioner(build_type=build_type)
    return partitioner.partition_graph(index_prefix_path, output_dir, partition_prefix, **kwargs)
|
||||
|
||||
|
||||
# Example usage:
if __name__ == "__main__":
    # Example: partition an index
    try:
        disk_graph_path, partition_bin_path = partition_graph(
            "/path/to/your/index_prefix", gp_times=10, lock_nums=10, cut=100
        )
        print("Partitioning completed successfully!")
        print(f"Disk graph index: {disk_graph_path}")
        print(f"Partition binary: {partition_bin_path}")
    except Exception as e:
        print(f"Partitioning failed: {e}")
|
||||
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
"""
Simplified Graph Partition Module for LEANN DiskANN Backend

This module provides a simple Python interface for graph partitioning
that directly calls the existing executables.
"""

import os
import subprocess
import tempfile
from pathlib import Path
from typing import Optional


def partition_graph_simple(
    index_prefix_path: str, output_dir: Optional[str] = None, **kwargs
) -> tuple[str, str]:
|
||||
"""
|
||||
Simple function to partition a graph index.
|
||||
|
||||
Args:
|
||||
index_prefix_path: Path to the index prefix (e.g., "/path/to/index")
|
||||
output_dir: Output directory (defaults to parent of index_prefix_path)
|
||||
**kwargs: Additional parameters for graph partitioning
|
||||
|
||||
Returns:
|
||||
Tuple of (disk_graph_index_path, partition_bin_path)
|
||||
"""
|
||||
# Set default parameters
|
||||
params = {
|
||||
"gp_times": 10,
|
||||
"lock_nums": 10,
|
||||
"cut": 100,
|
||||
"scale_factor": 1,
|
||||
"data_type": "float",
|
||||
"thread_nums": 10,
|
||||
**kwargs,
|
||||
}
|
||||
|
||||
# Determine output directory
|
||||
if output_dir is None:
|
||||
output_dir = str(Path(index_prefix_path).parent)
|
||||
|
||||
# Find the graph_partition directory
|
||||
current_file = Path(__file__)
|
||||
graph_partition_dir = current_file.parent.parent / "third_party" / "DiskANN" / "graph_partition"
|
||||
|
||||
if not graph_partition_dir.exists():
|
||||
raise RuntimeError(f"Graph partition directory not found: {graph_partition_dir}")
|
||||
|
||||
# Find input index file
|
||||
old_index_file = f"{index_prefix_path}_disk_beam_search.index"
|
||||
if not os.path.exists(old_index_file):
|
||||
old_index_file = f"{index_prefix_path}_disk.index"
|
||||
|
||||
if not os.path.exists(old_index_file):
|
||||
raise RuntimeError(f"Index file not found: {old_index_file}")
|
||||
|
||||
# Create temporary directory for processing
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
temp_data_dir = Path(temp_dir) / "data"
|
||||
temp_data_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Set up paths for temporary files
|
||||
graph_path = temp_data_dir / "starling" / "_M_R_L_B" / "GRAPH"
|
||||
graph_gp_path = (
|
||||
graph_path
|
||||
/ f"GP_TIMES_{params['gp_times']}_LOCK_{params['lock_nums']}_GP_USE_FREQ0_CUT{params['cut']}_SCALE{params['scale_factor']}"
|
||||
)
|
||||
graph_gp_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Run the build script with our parameters
|
||||
cmd = [str(graph_partition_dir / "build.sh"), "release", "split_graph", index_prefix_path]
|
||||
|
||||
# Set environment variables for parameters
|
||||
env = os.environ.copy()
|
||||
env.update(
|
||||
{
|
||||
"GP_TIMES": str(params["gp_times"]),
|
||||
"GP_LOCK_NUMS": str(params["lock_nums"]),
|
||||
"GP_CUT": str(params["cut"]),
|
||||
"GP_SCALE_F": str(params["scale_factor"]),
|
||||
"DATA_TYPE": params["data_type"],
|
||||
"GP_T": str(params["thread_nums"]),
|
||||
}
|
||||
)
|
||||
|
||||
print(f"Running graph partition with command: {' '.join(cmd)}")
|
||||
print(f"Working directory: {graph_partition_dir}")
|
||||
|
||||
# Run the command
|
||||
result = subprocess.run(
|
||||
cmd, env=env, capture_output=True, text=True, cwd=graph_partition_dir
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
print(f"Command failed with return code {result.returncode}")
|
||||
print(f"stdout: {result.stdout}")
|
||||
print(f"stderr: {result.stderr}")
|
||||
raise RuntimeError(
|
||||
f"Graph partitioning failed with return code {result.returncode}.\n"
|
||||
f"stdout: {result.stdout}\n"
|
||||
f"stderr: {result.stderr}"
|
||||
)
|
||||
|
||||
# Check if output files were created
|
||||
disk_graph_path = Path(output_dir) / "_disk_graph.index"
|
||||
partition_bin_path = Path(output_dir) / "_partition.bin"
|
||||
|
||||
if not disk_graph_path.exists():
|
||||
raise RuntimeError(f"Expected output file not found: {disk_graph_path}")
|
||||
|
||||
if not partition_bin_path.exists():
|
||||
raise RuntimeError(f"Expected output file not found: {partition_bin_path}")
|
||||
|
||||
print("✅ Partitioning completed successfully!")
|
||||
print(f" Disk graph index: {disk_graph_path}")
|
||||
print(f" Partition binary: {partition_bin_path}")
|
||||
|
||||
return str(disk_graph_path), str(partition_bin_path)
|
||||
|
||||
|
||||
# Example usage
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
disk_graph_path, partition_bin_path = partition_graph_simple(
|
||||
"/Users/yichuan/Desktop/release2/leann/diskannbuild/test_doc_files",
|
||||
gp_times=5,
|
||||
lock_nums=5,
|
||||
cut=50,
|
||||
)
|
||||
print("Success! Output files:")
|
||||
print(f" - {disk_graph_path}")
|
||||
print(f" - {partition_bin_path}")
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"

[project]
name = "leann-backend-diskann"
version = "0.2.9"
dependencies = ["leann-core==0.2.9", "numpy", "protobuf>=3.19.0"]
version = "0.2.7"
dependencies = ["leann-core==0.2.7", "numpy", "protobuf>=3.19.0"]

[tool.scikit-build]
# Key: simplified CMake path
@@ -17,5 +17,3 @@ editable.mode = "redirect"
cmake.build-type = "Release"
build.verbose = true
build.tool-args = ["-j8"]
# Let CMake find packages via Homebrew prefix
cmake.define = {CMAKE_PREFIX_PATH = {env = "CMAKE_PREFIX_PATH"}, OpenMP_ROOT = {env = "OpenMP_ROOT"}}
|
||||
|
||||
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: 04048bb302...b2dc4ea2c7
@@ -5,20 +5,11 @@ set(CMAKE_CXX_COMPILER_WORKS 1)
|
||||
|
||||
# Set OpenMP path for macOS
|
||||
if(APPLE)
|
||||
# Detect Homebrew installation path (Apple Silicon vs Intel)
|
||||
if(EXISTS "/opt/homebrew/opt/libomp")
|
||||
set(HOMEBREW_PREFIX "/opt/homebrew")
|
||||
elseif(EXISTS "/usr/local/opt/libomp")
|
||||
set(HOMEBREW_PREFIX "/usr/local")
|
||||
else()
|
||||
message(FATAL_ERROR "Could not find libomp installation. Please install with: brew install libomp")
|
||||
endif()
|
||||
|
||||
set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
|
||||
set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include")
|
||||
set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/opt/libomp/include")
|
||||
set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/opt/libomp/include")
|
||||
set(OpenMP_C_LIB_NAMES "omp")
|
||||
set(OpenMP_CXX_LIB_NAMES "omp")
|
||||
set(OpenMP_omp_LIBRARY "${HOMEBREW_PREFIX}/opt/libomp/lib/libomp.dylib")
|
||||
set(OpenMP_omp_LIBRARY "/opt/homebrew/opt/libomp/lib/libomp.dylib")
|
||||
|
||||
# Force use of system libc++ to avoid version mismatch
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import argparse
|
||||
import gc # Import garbage collector interface
|
||||
import logging
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
@@ -7,6 +8,12 @@ import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
# Set up logging to avoid print buffer issues
|
||||
logger = logging.getLogger(__name__)
|
||||
LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper()
|
||||
log_level = getattr(logging, LOG_LEVEL, logging.WARNING)
|
||||
logger.setLevel(log_level)
|
||||
|
||||
# --- FourCCs (add more if needed) ---
|
||||
INDEX_HNSW_FLAT_FOURCC = int.from_bytes(b"IHNf", "little")
|
||||
# Add other HNSW fourccs if you expect different storage types inside HNSW
|
||||
@@ -243,6 +250,12 @@ def convert_hnsw_graph_to_csr(input_filename, output_filename, prune_embeddings=
|
||||
output_filename: Output CSR index file
|
||||
prune_embeddings: Whether to prune embedding storage (write NULL storage marker)
|
||||
"""
|
||||
# Disable buffering for print statements to avoid deadlock in CI/pytest
|
||||
import functools
|
||||
|
||||
global print
|
||||
print = functools.partial(print, flush=True)
|
||||
|
||||
print(f"Starting conversion: {input_filename} -> {output_filename}")
|
||||
start_time = time.time()
|
||||
original_hnsw_data = {}
|
||||
|
||||
@@ -245,3 +245,25 @@ class HNSWSearcher(BaseSearcher):
|
||||
string_labels = [[str(int_label) for int_label in batch_labels] for batch_labels in labels]
|
||||
|
||||
return {"labels": string_labels, "distances": distances}
|
||||
|
||||
def cleanup(self):
|
||||
"""Cleanup HNSW-specific resources including C++ ZMQ connections."""
|
||||
# Call parent cleanup first
|
||||
super().cleanup()
|
||||
|
||||
# Additional cleanup for C++ side ZMQ connections
|
||||
# The ZmqDistanceComputer in C++ uses ZMQ connections that need cleanup
|
||||
try:
|
||||
# Delete the index to trigger C++ destructors
|
||||
if hasattr(self, "index"):
|
||||
del self.index
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Force garbage collection to ensure C++ objects are destroyed
|
||||
try:
|
||||
import gc
|
||||
|
||||
gc.collect()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -10,7 +10,7 @@ import sys
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
from typing import Optional
|
||||
|
||||
import msgpack
|
||||
import numpy as np
|
||||
@@ -34,7 +34,7 @@ if not logger.handlers:
|
||||
|
||||
|
||||
def create_hnsw_embedding_server(
|
||||
passages_file: Union[str, None] = None,
|
||||
passages_file: Optional[str] = None,
|
||||
zmq_port: int = 5555,
|
||||
model_name: str = "sentence-transformers/all-mpnet-base-v2",
|
||||
distance_metric: str = "mips",
|
||||
@@ -82,21 +82,8 @@ def create_hnsw_embedding_server(
|
||||
with open(passages_file) as f:
|
||||
meta = json.load(f)
|
||||
|
||||
# Convert relative paths to absolute paths based on metadata file location
|
||||
metadata_dir = Path(passages_file).parent.parent # Go up one level from the metadata file
|
||||
passage_sources = []
|
||||
for source in meta["passage_sources"]:
|
||||
source_copy = source.copy()
|
||||
# Convert relative paths to absolute paths
|
||||
if not Path(source_copy["path"]).is_absolute():
|
||||
source_copy["path"] = str(metadata_dir / source_copy["path"])
|
||||
if not Path(source_copy["index_path"]).is_absolute():
|
||||
source_copy["index_path"] = str(metadata_dir / source_copy["index_path"])
|
||||
passage_sources.append(source_copy)
|
||||
|
||||
passages = PassageManager(passage_sources)
|
||||
# Use index dimensions from metadata for shaping fallback responses
|
||||
embedding_dim: int = int(meta.get("dimensions", 0))
|
||||
# Let PassageManager handle path resolution uniformly
|
||||
passages = PassageManager(meta["passage_sources"], metadata_file_path=passages_file)
|
||||
logger.info(
|
||||
f"Loaded PassageManager with {len(passages.global_offset_map)} passages from metadata"
|
||||
)
|
||||
@@ -105,15 +92,13 @@ def create_hnsw_embedding_server(
|
||||
"""ZMQ server thread"""
|
||||
context = zmq.Context()
|
||||
socket = context.socket(zmq.REP)
|
||||
socket.setsockopt(zmq.LINGER, 0) # Don't block on close
|
||||
socket.bind(f"tcp://*:{zmq_port}")
|
||||
logger.info(f"HNSW ZMQ server listening on port {zmq_port}")
|
||||
|
||||
socket.setsockopt(zmq.RCVTIMEO, 300000)
|
||||
socket.setsockopt(zmq.SNDTIMEO, 300000)
|
||||
|
||||
# Track last request type for safe fallback responses on exceptions
|
||||
last_request_type = "unknown" # one of: 'text', 'distance', 'embedding', 'unknown'
|
||||
last_request_length = 0
|
||||
while True:
|
||||
try:
|
||||
message_bytes = socket.recv()
|
||||
@@ -126,8 +111,6 @@ def create_hnsw_embedding_server(
|
||||
if isinstance(request_payload, list) and len(request_payload) > 0:
|
||||
# Check if this is a direct text request (list of strings)
|
||||
if all(isinstance(item, str) for item in request_payload):
|
||||
last_request_type = "text"
|
||||
last_request_length = len(request_payload)
|
||||
logger.info(
|
||||
f"Processing direct text embedding request for {len(request_payload)} texts in {embedding_mode} mode"
|
||||
)
|
||||
@@ -152,66 +135,43 @@ def create_hnsw_embedding_server(
|
||||
):
|
||||
node_ids = request_payload[0]
|
||||
query_vector = np.array(request_payload[1], dtype=np.float32)
|
||||
last_request_type = "distance"
|
||||
last_request_length = len(node_ids)
|
||||
|
||||
logger.debug("Distance calculation request received")
|
||||
logger.debug(f" Node IDs: {node_ids}")
|
||||
logger.debug(f" Query vector dim: {len(query_vector)}")
|
||||
|
||||
# Get embeddings for node IDs, tolerate missing IDs
|
||||
texts: list[str] = []
|
||||
found_indices: list[int] = []
|
||||
for idx, nid in enumerate(node_ids):
|
||||
# Get embeddings for node IDs
|
||||
texts = []
|
||||
for nid in node_ids:
|
||||
try:
|
||||
passage_data = passages.get_passage(str(nid))
|
||||
txt = passage_data.get("text", "")
|
||||
if isinstance(txt, str) and len(txt) > 0:
|
||||
texts.append(txt)
|
||||
found_indices.append(idx)
|
||||
else:
|
||||
logger.error(f"Empty text for passage ID {nid}")
|
||||
txt = passage_data["text"]
|
||||
texts.append(txt)
|
||||
except KeyError:
|
||||
logger.error(f"Passage ID {nid} not found")
|
||||
raise RuntimeError(f"FATAL: Passage with ID {nid} not found")
|
||||
except Exception as e:
|
||||
logger.error(f"Exception looking up passage ID {nid}: {e}")
|
||||
raise
|
||||
|
||||
# Prepare full-length response distances with safe fallbacks
|
||||
large_distance = 1e9
|
||||
response_distances = [large_distance] * len(node_ids)
|
||||
|
||||
if texts:
|
||||
try:
|
||||
# Process embeddings only for found indices
|
||||
embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
|
||||
logger.info(
|
||||
f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
|
||||
)
|
||||
|
||||
# Calculate distances for found embeddings only
|
||||
if distance_metric == "l2":
|
||||
partial_distances = np.sum(
|
||||
np.square(embeddings - query_vector.reshape(1, -1)), axis=1
|
||||
)
|
||||
else: # mips or cosine
|
||||
partial_distances = -np.dot(embeddings, query_vector)
|
||||
|
||||
# Place computed distances back into the full response array
|
||||
for pos, dval in zip(
|
||||
found_indices, partial_distances.flatten().tolist()
|
||||
):
|
||||
response_distances[pos] = float(dval)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Distance computation error, falling back to large distances: {e}"
|
||||
)
|
||||
|
||||
# Always reply with exactly len(node_ids) distances
|
||||
response_bytes = msgpack.packb([response_distances], use_single_float=True)
|
||||
logger.debug(
|
||||
f"Sending distance response with {len(response_distances)} distances (found={len(found_indices)})"
|
||||
# Process embeddings
|
||||
embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
|
||||
logger.info(
|
||||
f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
|
||||
)
|
||||
|
||||
# Calculate distances
|
||||
if distance_metric == "l2":
|
||||
distances = np.sum(
|
||||
np.square(embeddings - query_vector.reshape(1, -1)), axis=1
|
||||
)
|
||||
else: # mips or cosine
|
||||
distances = -np.dot(embeddings, query_vector)
|
||||
|
||||
response_payload = distances.flatten().tolist()
|
||||
response_bytes = msgpack.packb([response_payload], use_single_float=True)
|
||||
logger.debug(f"Sending distance response with {len(distances)} distances")
|
||||
|
||||
socket.send(response_bytes)
|
||||
e2e_end = time.time()
|
||||
logger.info(f"⏱️ Distance calculation E2E time: {e2e_end - e2e_start:.6f}s")
|
||||
@@ -231,61 +191,40 @@ def create_hnsw_embedding_server(
|
||||
|
||||
node_ids = request_payload[0]
|
||||
logger.debug(f"Request for {len(node_ids)} node embeddings")
|
||||
last_request_type = "embedding"
|
||||
last_request_length = len(node_ids)
|
||||
|
||||
# Allocate output buffer (B, D) and fill with zeros for robustness
|
||||
if embedding_dim <= 0:
|
||||
logger.error("Embedding dimension unknown; cannot serve embedding request")
|
||||
dims = [0, 0]
|
||||
data = []
|
||||
else:
|
||||
dims = [len(node_ids), embedding_dim]
|
||||
data = [0.0] * (dims[0] * dims[1])
|
||||
|
||||
# Look up texts by node IDs; compute embeddings where available
|
||||
texts: list[str] = []
|
||||
found_indices: list[int] = []
|
||||
for idx, nid in enumerate(node_ids):
|
||||
# Look up texts by node IDs
|
||||
texts = []
|
||||
for nid in node_ids:
|
||||
try:
|
||||
passage_data = passages.get_passage(str(nid))
|
||||
txt = passage_data.get("text", "")
|
||||
if isinstance(txt, str) and len(txt) > 0:
|
||||
texts.append(txt)
|
||||
found_indices.append(idx)
|
||||
else:
|
||||
logger.error(f"Empty text for passage ID {nid}")
|
||||
txt = passage_data["text"]
|
||||
if not txt:
|
||||
raise RuntimeError(f"FATAL: Empty text for passage ID {nid}")
|
||||
texts.append(txt)
|
||||
except KeyError:
|
||||
logger.error(f"Passage with ID {nid} not found")
|
||||
raise RuntimeError(f"FATAL: Passage with ID {nid} not found")
|
||||
except Exception as e:
|
||||
logger.error(f"Exception looking up passage ID {nid}: {e}")
|
||||
raise
|
||||
|
||||
if texts:
|
||||
try:
|
||||
# Process embeddings for found texts only
|
||||
embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
|
||||
logger.info(
|
||||
f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
|
||||
)
|
||||
# Process embeddings
|
||||
embeddings = compute_embeddings(texts, model_name, mode=embedding_mode)
|
||||
logger.info(
|
||||
f"Computed embeddings for {len(texts)} texts, shape: {embeddings.shape}"
|
||||
)
|
||||
|
||||
if np.isnan(embeddings).any() or np.isinf(embeddings).any():
|
||||
logger.error(
|
||||
f"NaN or Inf detected in embeddings! Requested IDs: {node_ids[:5]}..."
|
||||
)
|
||||
dims = [0, embedding_dim]
|
||||
data = []
|
||||
else:
|
||||
# Copy computed embeddings into the correct positions
|
||||
emb_f32 = np.ascontiguousarray(embeddings, dtype=np.float32)
|
||||
flat = emb_f32.flatten().tolist()
|
||||
for j, pos in enumerate(found_indices):
|
||||
start = pos * embedding_dim
|
||||
end = start + embedding_dim
|
||||
data[start:end] = flat[j * embedding_dim : (j + 1) * embedding_dim]
|
||||
except Exception as e:
|
||||
logger.error(f"Embedding computation error, returning zeros: {e}")
|
||||
# Serialization and response
|
||||
if np.isnan(embeddings).any() or np.isinf(embeddings).any():
|
||||
logger.error(
|
||||
f"NaN or Inf detected in embeddings! Requested IDs: {node_ids[:5]}..."
|
||||
)
|
||||
raise AssertionError()
|
||||
|
||||
response_payload = [dims, data]
|
||||
hidden_contiguous_f32 = np.ascontiguousarray(embeddings, dtype=np.float32)
|
||||
response_payload = [
|
||||
list(hidden_contiguous_f32.shape),
|
||||
hidden_contiguous_f32.flatten().tolist(),
|
||||
]
|
||||
response_bytes = msgpack.packb(response_payload, use_single_float=True)
|
||||
|
||||
socket.send(response_bytes)
|
||||
@@ -300,22 +239,7 @@ def create_hnsw_embedding_server(
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
# Fallback to a safe, minimal-structure response to avoid client crashes
|
||||
if last_request_type == "distance":
|
||||
# Return a vector of large distances with the expected length
|
||||
fallback_len = max(0, int(last_request_length))
|
||||
large_distance = 1e9
|
||||
safe_response = [[large_distance] * fallback_len]
|
||||
elif last_request_type == "embedding":
|
||||
# Return an empty embedding block with known dimension if available
|
||||
if embedding_dim > 0:
|
||||
safe_response = [[0, embedding_dim], []]
|
||||
else:
|
||||
safe_response = [[0, 0], []]
|
||||
else:
|
||||
# Unknown request type: default to empty embedding structure
|
||||
safe_response = [[0, int(embedding_dim) if embedding_dim > 0 else 0], []]
|
||||
socket.send(msgpack.packb(safe_response, use_single_float=True))
|
||||
socket.send(msgpack.packb([[], []]))
|
||||
|
||||
zmq_thread = threading.Thread(target=zmq_server_thread, daemon=True)
|
||||
zmq_thread.start()
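# Illustrative client-side sketch (not part of the diff): the fallback replies sent
# above preserve the msgpack shapes the C++ searcher expects, so a decoder can treat
# a failed request like a normal one. Shapes, taken from the code above:
#   distance request  -> [[1e9, 1e9, ...]]              one large distance per node id
#   embedding request -> [[0, dim], []] or [[0, 0], []] an empty block with known dim
import msgpack


def decode_distance_reply(reply_bytes: bytes) -> list:
    """Unpack a distance reply; works for both normal and fallback responses."""
    (distances,) = msgpack.unpackb(reply_bytes)
    return [float(d) for d in distances]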
|
||||
|
||||
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
|
||||
|
||||
[project]
|
||||
name = "leann-backend-hnsw"
|
||||
version = "0.2.9"
|
||||
version = "0.2.7"
|
||||
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
|
||||
dependencies = [
|
||||
"leann-core==0.2.9",
|
||||
"leann-core==0.2.7",
|
||||
"numpy",
|
||||
"pyzmq>=23.0.0",
|
||||
"msgpack>=1.0.0",
|
||||
@@ -22,8 +22,6 @@ cmake.build-type = "Release"
|
||||
build.verbose = true
|
||||
build.tool-args = ["-j8"]
|
||||
|
||||
# CMake definitions to optimize compilation and find Homebrew packages
|
||||
# CMake definitions to optimize compilation
|
||||
[tool.scikit-build.cmake.define]
|
||||
CMAKE_BUILD_PARALLEL_LEVEL = "8"
|
||||
CMAKE_PREFIX_PATH = {env = "CMAKE_PREFIX_PATH"}
|
||||
OpenMP_ROOT = {env = "OpenMP_ROOT"}
|
||||
|
||||
Submodule packages/leann-backend-hnsw/third_party/faiss updated: 4a2c0d67d3...ff22e2c86b
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "leann-core"
|
||||
version = "0.2.9"
|
||||
version = "0.2.7"
|
||||
description = "Core API and plugin system for LEANN"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.9"
|
||||
@@ -33,8 +33,8 @@ dependencies = [
|
||||
"pdfplumber>=0.10.0",
|
||||
"nbconvert>=7.0.0", # For .ipynb file support
|
||||
"gitignore-parser>=0.1.12", # For proper .gitignore handling
|
||||
"mlx>=0.26.3; sys_platform == 'darwin' and platform_machine == 'arm64'",
|
||||
"mlx-lm>=0.26.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
|
||||
"mlx>=0.26.3; sys_platform == 'darwin'",
|
||||
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -87,21 +87,26 @@ def compute_embeddings_via_server(chunks: list[str], model_name: str, port: int)
|
||||
# Connect to embedding server
|
||||
context = zmq.Context()
|
||||
socket = context.socket(zmq.REQ)
|
||||
socket.setsockopt(zmq.LINGER, 0) # Don't block on close
|
||||
socket.setsockopt(zmq.RCVTIMEO, 1000) # 1s timeout on receive
|
||||
socket.setsockopt(zmq.SNDTIMEO, 1000) # 1s timeout on send
|
||||
socket.setsockopt(zmq.IMMEDIATE, 1) # Don't wait for connection
|
||||
socket.connect(f"tcp://localhost:{port}")
|
||||
|
||||
# Send chunks to server for embedding computation
|
||||
request = chunks
|
||||
socket.send(msgpack.packb(request))
|
||||
try:
|
||||
# Send chunks to server for embedding computation
|
||||
request = chunks
|
||||
socket.send(msgpack.packb(request))
|
||||
|
||||
# Receive embeddings from server
|
||||
response = socket.recv()
|
||||
embeddings_list = msgpack.unpackb(response)
|
||||
# Receive embeddings from server
|
||||
response = socket.recv()
|
||||
embeddings_list = msgpack.unpackb(response)
|
||||
|
||||
# Convert back to numpy array
|
||||
embeddings = np.array(embeddings_list, dtype=np.float32)
|
||||
|
||||
socket.close()
|
||||
context.term()
|
||||
# Convert back to numpy array
|
||||
embeddings = np.array(embeddings_list, dtype=np.float32)
|
||||
finally:
|
||||
socket.close(linger=0)
|
||||
context.term()
|
||||
|
||||
return embeddings
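# Illustrative usage (port and model name are placeholders; assumes an embedding
# server is already listening on that ZMQ port): the helper above returns a float32
# array of shape (num_chunks, embedding_dim). With the 1s send/receive timeouts and
# IMMEDIATE=1, the call raises zmq.error.Again quickly instead of hanging when no
# server is reachable.
chunks = ["LEANN keeps the index small.", "Embeddings are recomputed on demand."]
embeddings = compute_embeddings_via_server(chunks, "my-embedding-model", port=5557)
print(embeddings.shape, embeddings.dtype)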
|
||||
|
||||
@@ -115,7 +120,9 @@ class SearchResult:
|
||||
|
||||
|
||||
class PassageManager:
|
||||
def __init__(self, passage_sources: list[dict[str, Any]]):
|
||||
def __init__(
|
||||
self, passage_sources: list[dict[str, Any]], metadata_file_path: Optional[str] = None
|
||||
):
|
||||
self.offset_maps = {}
|
||||
self.passage_files = {}
|
||||
self.global_offset_map = {} # Combined map for fast lookup
|
||||
@@ -125,10 +132,26 @@ class PassageManager:
|
||||
passage_file = source["path"]
|
||||
index_file = source["index_path"] # .idx file
|
||||
|
||||
# Fix path resolution for Colab and other environments
|
||||
# Fix path resolution - relative paths should be relative to metadata file directory
|
||||
if not Path(index_file).is_absolute():
|
||||
# If relative path, try to resolve it properly
|
||||
index_file = str(Path(index_file).resolve())
|
||||
if metadata_file_path:
|
||||
# Resolve relative to metadata file directory
|
||||
metadata_dir = Path(metadata_file_path).parent
|
||||
logger.debug(
|
||||
f"PassageManager: Resolving relative paths from metadata_dir: {metadata_dir}"
|
||||
)
|
||||
index_file = str((metadata_dir / index_file).resolve())
|
||||
passage_file = str((metadata_dir / passage_file).resolve())
|
||||
logger.debug(f"PassageManager: Resolved index_file: {index_file}")
|
||||
else:
|
||||
# Fallback to current directory resolution (legacy behavior)
|
||||
logger.warning(
|
||||
"PassageManager: No metadata_file_path provided, using fallback resolution from cwd"
|
||||
)
|
||||
logger.debug(f"PassageManager: Current working directory: {Path.cwd()}")
|
||||
index_file = str(Path(index_file).resolve())
|
||||
passage_file = str(Path(passage_file).resolve())
|
||||
logger.debug(f"PassageManager: Fallback resolved index_file: {index_file}")
|
||||
|
||||
if not Path(index_file).exists():
|
||||
raise FileNotFoundError(f"Passage index file not found: {index_file}")
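# Illustrative example (paths and filenames are hypothetical): with the new
# metadata_file_path argument, relative entries in passage_sources are resolved
# against the directory of the metadata file rather than the current working
# directory. For a layout like
#
#   /data/indexes/my-docs/
#       my-docs.meta.json        <- metadata file
#       my-docs.passages.jsonl   <- "path": "my-docs.passages.jsonl"
#       my-docs.passages.idx     <- "index_path": "my-docs.passages.idx"
#
# the resolution performed above reduces to:
from pathlib import Path

metadata_dir = Path("/data/indexes/my-docs/my-docs.meta.json").parent
passage_file = str((metadata_dir / "my-docs.passages.jsonl").resolve())
index_file = str((metadata_dir / "my-docs.passages.idx").resolve())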
|
||||
@@ -314,8 +337,8 @@ class LeannBuilder:
|
||||
"passage_sources": [
|
||||
{
|
||||
"type": "jsonl",
|
||||
"path": str(passages_file),
|
||||
"index_path": str(offset_file),
|
||||
"path": passages_file.name, # Use relative path (just filename)
|
||||
"index_path": offset_file.name, # Use relative path (just filename)
|
||||
}
|
||||
],
|
||||
}
|
||||
@@ -430,8 +453,8 @@ class LeannBuilder:
|
||||
"passage_sources": [
|
||||
{
|
||||
"type": "jsonl",
|
||||
"path": str(passages_file),
|
||||
"index_path": str(offset_file),
|
||||
"path": passages_file.name, # Use relative path (just filename)
|
||||
"index_path": offset_file.name, # Use relative path (just filename)
|
||||
}
|
||||
],
|
||||
"built_from_precomputed_embeddings": True,
|
||||
@@ -473,7 +496,9 @@ class LeannSearcher:
|
||||
self.embedding_model = self.meta_data["embedding_model"]
|
||||
# Support both old and new format
|
||||
self.embedding_mode = self.meta_data.get("embedding_mode", "sentence-transformers")
|
||||
self.passage_manager = PassageManager(self.meta_data.get("passage_sources", []))
|
||||
self.passage_manager = PassageManager(
|
||||
self.meta_data.get("passage_sources", []), metadata_file_path=self.meta_path_str
|
||||
)
|
||||
backend_factory = BACKEND_REGISTRY.get(backend_name)
|
||||
if backend_factory is None:
|
||||
raise ValueError(f"Backend '{backend_name}' not found.")
|
||||
@@ -546,13 +571,13 @@ class LeannSearcher:
|
||||
zmq_port=zmq_port,
|
||||
**kwargs,
|
||||
)
|
||||
time.time() - start_time
|
||||
# logger.info(f" Search time: {search_time} seconds")
|
||||
logger.info(f" Backend returned: labels={len(results.get('labels', [[]])[0])} results")
|
||||
|
||||
enriched_results = []
|
||||
if "labels" in results and "distances" in results:
|
||||
logger.info(f" Processing {len(results['labels'][0])} passage IDs:")
|
||||
# Python 3.9 does not support zip(strict=...); lengths are expected to match
|
||||
for i, (string_id, dist) in enumerate(
|
||||
zip(results["labels"][0], results["distances"][0])
|
||||
):
|
||||
@@ -580,13 +605,39 @@ class LeannSearcher:
|
||||
)
|
||||
except KeyError:
|
||||
RED = "\033[91m"
|
||||
RESET = "\033[0m"
|
||||
logger.error(
|
||||
f" {RED}✗{RESET} [{i + 1:2d}] ID: '{string_id}' -> {RED}ERROR: Passage not found!{RESET}"
|
||||
)
|
||||
|
||||
# Define color codes outside the loop for final message
|
||||
GREEN = "\033[92m"
|
||||
RESET = "\033[0m"
|
||||
logger.info(f" {GREEN}✓ Final enriched results: {len(enriched_results)} passages{RESET}")
|
||||
return enriched_results
|
||||
|
||||
def cleanup(self):
|
||||
"""Explicitly cleanup embedding server and ZMQ resources.
|
||||
|
||||
This method should be called after you're done using the searcher,
|
||||
especially in test environments or batch processing scenarios.
|
||||
"""
|
||||
# Stop embedding server
|
||||
if hasattr(self.backend_impl, "embedding_server_manager"):
|
||||
self.backend_impl.embedding_server_manager.stop_server()
|
||||
|
||||
# Set ZMQ linger but don't terminate global context
|
||||
try:
|
||||
import zmq
|
||||
|
||||
# Just set linger on the global instance
|
||||
ctx = zmq.Context.instance()
|
||||
ctx.linger = 0
|
||||
# NEVER call ctx.term() or destroy() on the global instance
|
||||
# That would block waiting for all sockets to close
|
||||
except Exception:
|
||||
pass
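# Illustrative usage (the index path is a placeholder and the constructor/search
# signatures are assumed from their definitions in this module): tests and batch
# jobs should call cleanup() explicitly so the embedding server and ZMQ resources
# are released promptly rather than at interpreter exit.
searcher = LeannSearcher("/path/to/my-docs.index")
try:
    results = searcher.search("what does LEANN recompute at query time?")
finally:
    searcher.cleanup()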
|
||||
|
||||
|
||||
class LeannChat:
|
||||
def __init__(
|
||||
@@ -656,3 +707,12 @@ class LeannChat:
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
print("\nGoodbye!")
|
||||
break
|
||||
|
||||
def cleanup(self):
|
||||
"""Explicitly cleanup embedding server resources.
|
||||
|
||||
This method should be called after you're done using the chat interface,
|
||||
especially in test environments or batch processing scenarios.
|
||||
"""
|
||||
if hasattr(self.searcher, "cleanup"):
|
||||
self.searcher.cleanup()
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
from llama_index.core import SimpleDirectoryReader
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from tqdm import tqdm
|
||||
|
||||
from .api import LeannBuilder, LeannChat, LeannSearcher
|
||||
|
||||
@@ -76,14 +74,11 @@ class LeannCLI:
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
leann build my-docs --docs ./documents # Build index from directory
|
||||
leann build my-code --docs ./src ./tests ./config # Build index from multiple directories
|
||||
leann build my-files --docs ./file1.py ./file2.txt ./docs/ # Build index from files and directories
|
||||
leann build my-mixed --docs ./readme.md ./src/ ./config.json # Build index from mixed files/dirs
|
||||
leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
|
||||
leann search my-docs "query" # Search in my-docs index
|
||||
leann ask my-docs "question" # Ask my-docs index
|
||||
leann list # List all stored indexes
|
||||
leann build my-docs --docs ./documents # Build index named my-docs
|
||||
leann build my-ppts --docs ./ --file-types .pptx,.pdf # Index only PowerPoint and PDF files
|
||||
leann search my-docs "query" # Search in my-docs index
|
||||
leann ask my-docs "question" # Ask my-docs index
|
||||
leann list # List all stored indexes
|
||||
""",
|
||||
)
|
||||
|
||||
@@ -95,11 +90,7 @@ Examples:
|
||||
"index_name", nargs="?", help="Index name (default: current directory name)"
|
||||
)
|
||||
build_parser.add_argument(
|
||||
"--docs",
|
||||
type=str,
|
||||
nargs="+",
|
||||
default=["."],
|
||||
help="Documents directories and/or files (default: current directory)",
|
||||
"--docs", type=str, default=".", help="Documents directory (default: current directory)"
|
||||
)
|
||||
build_parser.add_argument(
|
||||
"--backend", type=str, default="hnsw", choices=["hnsw", "diskann"]
|
||||
@@ -117,19 +108,7 @@ Examples:
|
||||
build_parser.add_argument("--complexity", type=int, default=64)
|
||||
build_parser.add_argument("--num-threads", type=int, default=1)
|
||||
build_parser.add_argument("--compact", action="store_true", default=True)
|
||||
build_parser.add_argument(
|
||||
"--no-compact",
|
||||
dest="compact",
|
||||
action="store_false",
|
||||
help="Disable compact index storage (store full embeddings; higher storage)",
|
||||
)
|
||||
build_parser.add_argument("--recompute", action="store_true", default=True)
|
||||
build_parser.add_argument(
|
||||
"--no-recompute",
|
||||
dest="recompute",
|
||||
action="store_false",
|
||||
help="Disable embedding recomputation (store full embeddings; lower query latency)",
|
||||
)
|
||||
build_parser.add_argument(
|
||||
"--file-types",
|
||||
type=str,
|
||||
@@ -150,18 +129,6 @@ Examples:
|
||||
default=True,
|
||||
help="Recompute embeddings (default: True)",
|
||||
)
|
||||
search_parser.add_argument(
|
||||
"--no-recompute-embeddings",
|
||||
dest="recompute_embeddings",
|
||||
action="store_false",
|
||||
help="Disable embedding recomputation during search",
|
||||
)
|
||||
search_parser.add_argument(
|
||||
"--no-recompute",
|
||||
dest="recompute_embeddings",
|
||||
action="store_false",
|
||||
help="Alias for --no-recompute-embeddings",
|
||||
)
|
||||
search_parser.add_argument(
|
||||
"--pruning-strategy",
|
||||
choices=["global", "local", "proportional"],
|
||||
@@ -190,18 +157,6 @@ Examples:
|
||||
default=True,
|
||||
help="Recompute embeddings (default: True)",
|
||||
)
|
||||
ask_parser.add_argument(
|
||||
"--no-recompute-embeddings",
|
||||
dest="recompute_embeddings",
|
||||
action="store_false",
|
||||
help="Disable embedding recomputation during ask",
|
||||
)
|
||||
ask_parser.add_argument(
|
||||
"--no-recompute",
|
||||
dest="recompute_embeddings",
|
||||
action="store_false",
|
||||
help="Alias for --no-recompute-embeddings",
|
||||
)
|
||||
ask_parser.add_argument(
|
||||
"--pruning-strategy",
|
||||
choices=["global", "local", "proportional"],
|
||||
@@ -279,32 +234,6 @@ Examples:
|
||||
"""Check if a file should be excluded using gitignore parser."""
|
||||
return gitignore_matches(str(relative_path))
|
||||
|
||||
def _is_git_submodule(self, path: Path) -> bool:
|
||||
"""Check if a path is a git submodule."""
|
||||
try:
|
||||
# Find the git repo root
|
||||
current_dir = Path.cwd()
|
||||
while current_dir != current_dir.parent:
|
||||
if (current_dir / ".git").exists():
|
||||
gitmodules_path = current_dir / ".gitmodules"
|
||||
if gitmodules_path.exists():
|
||||
# Read .gitmodules to check if this path is a submodule
|
||||
gitmodules_content = gitmodules_path.read_text()
|
||||
# Convert path to relative to git root
|
||||
try:
|
||||
relative_path = path.resolve().relative_to(current_dir)
|
||||
# Check if this path appears in .gitmodules
|
||||
return f"path = {relative_path}" in gitmodules_content
|
||||
except ValueError:
|
||||
# Path is not under git root
|
||||
return False
|
||||
break
|
||||
current_dir = current_dir.parent
|
||||
return False
|
||||
except Exception:
|
||||
# If anything goes wrong, assume it's not a submodule
|
||||
return False
|
||||
|
||||
def list_indexes(self):
|
||||
print("Stored LEANN indexes:")
|
||||
|
||||
@@ -334,9 +263,7 @@ Examples:
|
||||
valid_projects.append(current_path)
|
||||
|
||||
if not valid_projects:
|
||||
print(
|
||||
"No indexes found. Use 'leann build <name> --docs <dir> [<dir2> ...]' to create one."
|
||||
)
|
||||
print("No indexes found. Use 'leann build <name> --docs <dir>' to create one.")
|
||||
return
|
||||
|
||||
total_indexes = 0
|
||||
@@ -383,88 +310,56 @@ Examples:
|
||||
print(f' leann search {example_name} "your query"')
|
||||
print(f" leann ask {example_name} --interactive")
|
||||
|
||||
def load_documents(
|
||||
self, docs_paths: Union[str, list], custom_file_types: Union[str, None] = None
|
||||
):
|
||||
# Handle both single path (string) and multiple paths (list) for backward compatibility
|
||||
if isinstance(docs_paths, str):
|
||||
docs_paths = [docs_paths]
|
||||
|
||||
# Separate files and directories
|
||||
files = []
|
||||
directories = []
|
||||
for path in docs_paths:
|
||||
path_obj = Path(path)
|
||||
if path_obj.is_file():
|
||||
files.append(str(path_obj))
|
||||
elif path_obj.is_dir():
|
||||
# Check if this is a git submodule - if so, skip it
|
||||
if self._is_git_submodule(path_obj):
|
||||
print(f"⚠️ Skipping git submodule: {path}")
|
||||
continue
|
||||
directories.append(str(path_obj))
|
||||
else:
|
||||
print(f"⚠️ Warning: Path '{path}' does not exist, skipping...")
|
||||
continue
|
||||
|
||||
# Print summary of what we're processing
|
||||
total_items = len(files) + len(directories)
|
||||
items_desc = []
|
||||
if files:
|
||||
items_desc.append(f"{len(files)} file{'s' if len(files) > 1 else ''}")
|
||||
if directories:
|
||||
items_desc.append(
|
||||
f"{len(directories)} director{'ies' if len(directories) > 1 else 'y'}"
|
||||
)
|
||||
|
||||
print(f"Loading documents from {' and '.join(items_desc)} ({total_items} total):")
|
||||
if files:
|
||||
print(f" 📄 Files: {', '.join([Path(f).name for f in files])}")
|
||||
if directories:
|
||||
print(f" 📁 Directories: {', '.join(directories)}")
|
||||
|
||||
def load_documents(self, docs_dir: str, custom_file_types: str | None = None):
|
||||
print(f"Loading documents from {docs_dir}...")
|
||||
if custom_file_types:
|
||||
print(f"Using custom file types: {custom_file_types}")
|
||||
|
||||
all_documents = []
|
||||
# Build gitignore parser
|
||||
gitignore_matches = self._build_gitignore_parser(docs_dir)
|
||||
|
||||
# First, process individual files if any
|
||||
if files:
|
||||
print(f"\n🔄 Processing {len(files)} individual file{'s' if len(files) > 1 else ''}...")
|
||||
# Try to use better PDF parsers first, but only if PDFs are requested
|
||||
documents = []
|
||||
docs_path = Path(docs_dir)
|
||||
|
||||
# Load individual files using SimpleDirectoryReader with input_files
|
||||
# Note: We skip gitignore filtering for explicitly specified files
|
||||
try:
|
||||
# Group files by their parent directory for efficient loading
|
||||
from collections import defaultdict
|
||||
# Check if we should process PDFs
|
||||
should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
|
||||
|
||||
files_by_dir = defaultdict(list)
|
||||
for file_path in files:
|
||||
parent_dir = str(Path(file_path).parent)
|
||||
files_by_dir[parent_dir].append(file_path)
|
||||
if should_process_pdfs:
|
||||
for file_path in docs_path.rglob("*.pdf"):
|
||||
# Check if file matches any exclude pattern
|
||||
relative_path = file_path.relative_to(docs_path)
|
||||
if self._should_exclude_file(relative_path, gitignore_matches):
|
||||
continue
|
||||
|
||||
# Load files from each parent directory
|
||||
for parent_dir, file_list in files_by_dir.items():
|
||||
print(
|
||||
f" Loading {len(file_list)} file{'s' if len(file_list) > 1 else ''} from {parent_dir}"
|
||||
)
|
||||
print(f"Processing PDF: {file_path}")
|
||||
|
||||
# Try PyMuPDF first (best quality)
|
||||
text = extract_pdf_text_with_pymupdf(str(file_path))
|
||||
if text is None:
|
||||
# Try pdfplumber
|
||||
text = extract_pdf_text_with_pdfplumber(str(file_path))
|
||||
|
||||
if text:
|
||||
# Create a simple document structure
|
||||
from llama_index.core import Document
|
||||
|
||||
doc = Document(text=text, metadata={"source": str(file_path)})
|
||||
documents.append(doc)
|
||||
else:
|
||||
# Fallback to default reader
|
||||
print(f"Using default reader for {file_path}")
|
||||
try:
|
||||
file_docs = SimpleDirectoryReader(
|
||||
parent_dir,
|
||||
input_files=file_list,
|
||||
default_docs = SimpleDirectoryReader(
|
||||
str(file_path.parent),
|
||||
filename_as_id=True,
|
||||
required_exts=[file_path.suffix],
|
||||
).load_data()
|
||||
all_documents.extend(file_docs)
|
||||
print(
|
||||
f" ✅ Loaded {len(file_docs)} document{'s' if len(file_docs) > 1 else ''}"
|
||||
)
|
||||
documents.extend(default_docs)
|
||||
except Exception as e:
|
||||
print(f" ❌ Warning: Could not load files from {parent_dir}: {e}")
|
||||
print(f"Warning: Could not process {file_path}: {e}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error processing individual files: {e}")
|
||||
|
||||
# Define file extensions to process
|
||||
# Load other file types with default reader
|
||||
if custom_file_types:
|
||||
# Parse custom file types from comma-separated string
|
||||
code_extensions = [ext.strip() for ext in custom_file_types.split(",") if ext.strip()]
|
||||
@@ -526,106 +421,41 @@ Examples:
|
||||
".py",
|
||||
".jl",
|
||||
]
|
||||
# Try to load other file types, but don't fail if none are found
|
||||
try:
|
||||
# Create a custom file filter function using our PathSpec
|
||||
def file_filter(file_path: str) -> bool:
|
||||
"""Return True if file should be included (not excluded)"""
|
||||
try:
|
||||
docs_path_obj = Path(docs_dir)
|
||||
file_path_obj = Path(file_path)
|
||||
relative_path = file_path_obj.relative_to(docs_path_obj)
|
||||
return not self._should_exclude_file(relative_path, gitignore_matches)
|
||||
except (ValueError, OSError):
|
||||
return True # Include files that can't be processed
|
||||
|
||||
# Process each directory
|
||||
if directories:
|
||||
print(
|
||||
f"\n🔄 Processing {len(directories)} director{'ies' if len(directories) > 1 else 'y'}..."
|
||||
)
|
||||
other_docs = SimpleDirectoryReader(
|
||||
docs_dir,
|
||||
recursive=True,
|
||||
encoding="utf-8",
|
||||
required_exts=code_extensions,
|
||||
file_extractor={}, # Use default extractors
|
||||
filename_as_id=True,
|
||||
).load_data(show_progress=True)
|
||||
|
||||
for docs_dir in directories:
|
||||
print(f"Processing directory: {docs_dir}")
|
||||
# Build gitignore parser for each directory
|
||||
gitignore_matches = self._build_gitignore_parser(docs_dir)
|
||||
# Filter documents after loading based on gitignore rules
|
||||
filtered_docs = []
|
||||
for doc in other_docs:
|
||||
file_path = doc.metadata.get("file_path", "")
|
||||
if file_filter(file_path):
|
||||
filtered_docs.append(doc)
|
||||
|
||||
# Try to use better PDF parsers first, but only if PDFs are requested
|
||||
documents = []
|
||||
docs_path = Path(docs_dir)
|
||||
|
||||
# Check if we should process PDFs
|
||||
should_process_pdfs = custom_file_types is None or ".pdf" in custom_file_types
|
||||
|
||||
if should_process_pdfs:
|
||||
for file_path in docs_path.rglob("*.pdf"):
|
||||
# Check if file matches any exclude pattern
|
||||
try:
|
||||
relative_path = file_path.relative_to(docs_path)
|
||||
if self._should_exclude_file(relative_path, gitignore_matches):
|
||||
continue
|
||||
except ValueError:
|
||||
# Skip files that can't be made relative to docs_path
|
||||
print(f"⚠️ Skipping file outside directory scope: {file_path}")
|
||||
continue
|
||||
|
||||
print(f"Processing PDF: {file_path}")
|
||||
|
||||
# Try PyMuPDF first (best quality)
|
||||
text = extract_pdf_text_with_pymupdf(str(file_path))
|
||||
if text is None:
|
||||
# Try pdfplumber
|
||||
text = extract_pdf_text_with_pdfplumber(str(file_path))
|
||||
|
||||
if text:
|
||||
# Create a simple document structure
|
||||
from llama_index.core import Document
|
||||
|
||||
doc = Document(text=text, metadata={"source": str(file_path)})
|
||||
documents.append(doc)
|
||||
else:
|
||||
# Fallback to default reader
|
||||
print(f"Using default reader for {file_path}")
|
||||
try:
|
||||
default_docs = SimpleDirectoryReader(
|
||||
str(file_path.parent),
|
||||
filename_as_id=True,
|
||||
required_exts=[file_path.suffix],
|
||||
).load_data()
|
||||
documents.extend(default_docs)
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not process {file_path}: {e}")
|
||||
|
||||
# Load other file types with default reader
|
||||
try:
|
||||
# Create a custom file filter function using our PathSpec
|
||||
def file_filter(
|
||||
file_path: str, docs_dir=docs_dir, gitignore_matches=gitignore_matches
|
||||
) -> bool:
|
||||
"""Return True if file should be included (not excluded)"""
|
||||
try:
|
||||
docs_path_obj = Path(docs_dir)
|
||||
file_path_obj = Path(file_path)
|
||||
relative_path = file_path_obj.relative_to(docs_path_obj)
|
||||
return not self._should_exclude_file(relative_path, gitignore_matches)
|
||||
except (ValueError, OSError):
|
||||
return True # Include files that can't be processed
|
||||
|
||||
other_docs = SimpleDirectoryReader(
|
||||
docs_dir,
|
||||
recursive=True,
|
||||
encoding="utf-8",
|
||||
required_exts=code_extensions,
|
||||
file_extractor={}, # Use default extractors
|
||||
filename_as_id=True,
|
||||
).load_data(show_progress=True)
|
||||
|
||||
# Filter documents after loading based on gitignore rules
|
||||
filtered_docs = []
|
||||
for doc in other_docs:
|
||||
file_path = doc.metadata.get("file_path", "")
|
||||
if file_filter(file_path):
|
||||
filtered_docs.append(doc)
|
||||
|
||||
documents.extend(filtered_docs)
|
||||
except ValueError as e:
|
||||
if "No files found" in str(e):
|
||||
print(f"No additional files found for other supported types in {docs_dir}.")
|
||||
else:
|
||||
raise e
|
||||
|
||||
all_documents.extend(documents)
|
||||
print(f"Loaded {len(documents)} documents from {docs_dir}")
|
||||
|
||||
documents = all_documents
|
||||
documents.extend(filtered_docs)
|
||||
except ValueError as e:
|
||||
if "No files found" in str(e):
|
||||
print("No additional files found for other supported types.")
|
||||
else:
|
||||
raise e
|
||||
|
||||
all_texts = []
|
||||
|
||||
@@ -676,9 +506,7 @@ Examples:
|
||||
".jl",
|
||||
}
|
||||
|
||||
print("start chunking documents")
|
||||
# Add progress bar for document chunking
|
||||
for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
|
||||
for doc in documents:
|
||||
# Check if this is a code file based on source path
|
||||
source_path = doc.metadata.get("source", "")
|
||||
is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)
|
||||
@@ -694,7 +522,7 @@ Examples:
|
||||
return all_texts
|
||||
|
||||
async def build_index(self, args):
|
||||
docs_paths = args.docs
|
||||
docs_dir = args.docs
|
||||
# Use current directory name if index_name not provided
|
||||
if args.index_name:
|
||||
index_name = args.index_name
|
||||
@@ -705,25 +533,13 @@ Examples:
|
||||
index_dir = self.indexes_dir / index_name
|
||||
index_path = self.get_index_path(index_name)
|
||||
|
||||
# Display all paths being indexed with file/directory distinction
|
||||
files = [p for p in docs_paths if Path(p).is_file()]
|
||||
directories = [p for p in docs_paths if Path(p).is_dir()]
|
||||
|
||||
print(f"📂 Indexing {len(docs_paths)} path{'s' if len(docs_paths) > 1 else ''}:")
|
||||
if files:
|
||||
print(f" 📄 Files ({len(files)}):")
|
||||
for i, file_path in enumerate(files, 1):
|
||||
print(f" {i}. {Path(file_path).resolve()}")
|
||||
if directories:
|
||||
print(f" 📁 Directories ({len(directories)}):")
|
||||
for i, dir_path in enumerate(directories, 1):
|
||||
print(f" {i}. {Path(dir_path).resolve()}")
|
||||
print(f"📂 Indexing: {Path(docs_dir).resolve()}")
|
||||
|
||||
if index_dir.exists() and not args.force:
|
||||
print(f"Index '{index_name}' already exists. Use --force to rebuild.")
|
||||
return
|
||||
|
||||
all_texts = self.load_documents(docs_paths, args.file_types)
|
||||
all_texts = self.load_documents(docs_dir, args.file_types)
|
||||
if not all_texts:
|
||||
print("No documents found")
|
||||
return
|
||||
@@ -759,7 +575,7 @@ Examples:
|
||||
|
||||
if not self.index_exists(index_name):
|
||||
print(
|
||||
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
|
||||
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
|
||||
)
|
||||
return
|
||||
|
||||
@@ -786,7 +602,7 @@ Examples:
|
||||
|
||||
if not self.index_exists(index_name):
|
||||
print(
|
||||
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir> [<dir2> ...]' to create it."
|
||||
f"Index '{index_name}' not found. Use 'leann build {index_name} --docs <dir>' to create it."
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ Preserves all optimization parameters to ensure performance
|
||||
|
||||
import logging
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
@@ -373,9 +374,7 @@ def compute_embeddings_ollama(
|
||||
texts: list[str], model_name: str, is_build: bool = False, host: str = "http://localhost:11434"
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Compute embeddings using Ollama API with simplified batch processing.
|
||||
|
||||
Uses batch size of 32 for MPS/CPU and 128 for CUDA to optimize performance.
|
||||
Compute embeddings using Ollama API.
|
||||
|
||||
Args:
|
||||
texts: List of texts to compute embeddings for
|
||||
@@ -439,19 +438,12 @@ def compute_embeddings_ollama(
|
||||
if any(emb in base_name for emb in ["embed", "bge", "minilm", "e5"]):
|
||||
embedding_models.append(model)
|
||||
|
||||
# Check if model exists (handle versioned names) and resolve to full name
|
||||
resolved_model_name = None
|
||||
for name in model_names:
|
||||
# Exact match
|
||||
if model_name == name:
|
||||
resolved_model_name = name
|
||||
break
|
||||
# Match without version tag (use the versioned name)
|
||||
elif model_name == name.split(":")[0]:
|
||||
resolved_model_name = name
|
||||
break
|
||||
# Check if model exists (handle versioned names)
|
||||
model_found = any(
|
||||
model_name == name.split(":")[0] or model_name == name for name in model_names
|
||||
)
|
||||
|
||||
if not resolved_model_name:
|
||||
if not model_found:
|
||||
error_msg = f"❌ Model '{model_name}' not found in local Ollama.\n\n"
|
||||
|
||||
# Suggest pulling the model
|
||||
@@ -473,11 +465,6 @@ def compute_embeddings_ollama(
|
||||
error_msg += "\n📚 Browse more: https://ollama.com/library"
|
||||
raise ValueError(error_msg)
|
||||
|
||||
# Use the resolved model name for all subsequent operations
|
||||
if resolved_model_name != model_name:
|
||||
logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
|
||||
model_name = resolved_model_name
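# Illustrative check (model names are hypothetical): the resolution above keeps the
# full name that Ollama reports, accepting either an exact match or a match on the
# unversioned base name.
from typing import Optional


def resolve_model_name(requested: str, available: list) -> Optional[str]:
    """Return the first entry matching exactly or by its unversioned base name."""
    for name in available:
        if requested == name or requested == name.split(":")[0]:
            return name
    return None


assert resolve_model_name("nomic-embed-text", ["nomic-embed-text:latest"]) == "nomic-embed-text:latest"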
|
||||
|
||||
# Verify the model supports embeddings by testing it
|
||||
try:
|
||||
test_response = requests.post(
|
||||
@@ -498,148 +485,138 @@ def compute_embeddings_ollama(
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.warning(f"Could not verify model existence: {e}")
|
||||
|
||||
# Determine batch size based on device availability
|
||||
# Check for CUDA/MPS availability using torch if available
|
||||
batch_size = 32 # Default for MPS/CPU
|
||||
try:
|
||||
import torch
|
||||
# Process embeddings with optimized concurrent processing
|
||||
import requests
|
||||
|
||||
if torch.cuda.is_available():
|
||||
batch_size = 128 # CUDA gets larger batch size
|
||||
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
||||
batch_size = 32 # MPS gets smaller batch size
|
||||
except ImportError:
|
||||
# If torch is not available, use conservative batch size
|
||||
batch_size = 32
|
||||
def get_single_embedding(text_idx_tuple):
|
||||
"""Helper function to get embedding for a single text."""
|
||||
text, idx = text_idx_tuple
|
||||
max_retries = 3
|
||||
retry_count = 0
|
||||
|
||||
logger.info(f"Using batch size: {batch_size}")
|
||||
# Truncate very long texts to avoid API issues
|
||||
truncated_text = text[:8000] if len(text) > 8000 else text
|
||||
|
||||
def get_batch_embeddings(batch_texts):
|
||||
"""Get embeddings for a batch of texts."""
|
||||
all_embeddings = []
|
||||
failed_indices = []
|
||||
while retry_count < max_retries:
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{host}/api/embeddings",
|
||||
json={"model": model_name, "prompt": truncated_text},
|
||||
timeout=30,
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
for i, text in enumerate(batch_texts):
|
||||
max_retries = 3
|
||||
retry_count = 0
|
||||
result = response.json()
|
||||
embedding = result.get("embedding")
|
||||
|
||||
# Truncate very long texts to avoid API issues
|
||||
truncated_text = text[:8000] if len(text) > 8000 else text
|
||||
while retry_count < max_retries:
|
||||
try:
|
||||
response = requests.post(
|
||||
f"{host}/api/embeddings",
|
||||
json={"model": model_name, "prompt": truncated_text},
|
||||
timeout=30,
|
||||
if embedding is None:
|
||||
raise ValueError(f"No embedding returned for text {idx}")
|
||||
|
||||
return idx, embedding
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
retry_count += 1
|
||||
if retry_count >= max_retries:
|
||||
logger.warning(f"Timeout for text {idx} after {max_retries} retries")
|
||||
return idx, None
|
||||
|
||||
except Exception as e:
|
||||
if retry_count >= max_retries - 1:
|
||||
logger.error(f"Failed to get embedding for text {idx}: {e}")
|
||||
return idx, None
|
||||
retry_count += 1
|
||||
|
||||
return idx, None
|
||||
|
||||
# Determine if we should use concurrent processing
|
||||
use_concurrent = (
|
||||
len(texts) > 5 and not is_build
|
||||
) # Don't use concurrent in build mode to avoid overwhelming
|
||||
max_workers = min(4, len(texts)) # Limit concurrent requests to avoid overwhelming Ollama
|
||||
|
||||
all_embeddings = [None] * len(texts) # Pre-allocate list to maintain order
|
||||
failed_indices = []
|
||||
|
||||
if use_concurrent:
|
||||
logger.info(
|
||||
f"Using concurrent processing with {max_workers} workers for {len(texts)} texts"
|
||||
)
|
||||
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
# Submit all tasks
|
||||
future_to_idx = {
|
||||
executor.submit(get_single_embedding, (text, idx)): idx
|
||||
for idx, text in enumerate(texts)
|
||||
}
|
||||
|
||||
# Add progress bar for concurrent processing
|
||||
try:
|
||||
if is_build or len(texts) > 10:
|
||||
from tqdm import tqdm
|
||||
|
||||
futures_iterator = tqdm(
|
||||
as_completed(future_to_idx),
|
||||
total=len(texts),
|
||||
desc="Computing Ollama embeddings",
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
embedding = result.get("embedding")
|
||||
|
||||
if embedding is None:
|
||||
raise ValueError(f"No embedding returned for text {i}")
|
||||
|
||||
if not isinstance(embedding, list) or len(embedding) == 0:
|
||||
raise ValueError(f"Invalid embedding format for text {i}")
|
||||
|
||||
all_embeddings.append(embedding)
|
||||
break
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
retry_count += 1
|
||||
if retry_count >= max_retries:
|
||||
logger.warning(f"Timeout for text {i} after {max_retries} retries")
|
||||
failed_indices.append(i)
|
||||
all_embeddings.append(None)
|
||||
break
|
||||
else:
|
||||
futures_iterator = as_completed(future_to_idx)
|
||||
except ImportError:
|
||||
futures_iterator = as_completed(future_to_idx)
|
||||
|
||||
# Collect results as they complete
|
||||
for future in futures_iterator:
|
||||
try:
|
||||
idx, embedding = future.result()
|
||||
if embedding is not None:
|
||||
all_embeddings[idx] = embedding
|
||||
else:
|
||||
failed_indices.append(idx)
|
||||
except Exception as e:
|
||||
retry_count += 1
|
||||
if retry_count >= max_retries:
|
||||
logger.error(f"Failed to get embedding for text {i}: {e}")
|
||||
failed_indices.append(i)
|
||||
all_embeddings.append(None)
|
||||
break
|
||||
return all_embeddings, failed_indices
|
||||
idx = future_to_idx[future]
|
||||
logger.error(f"Exception for text {idx}: {e}")
|
||||
failed_indices.append(idx)
|
||||
|
||||
# Process texts in batches
|
||||
all_embeddings = []
|
||||
all_failed_indices = []
|
||||
|
||||
# Setup progress bar if needed
|
||||
show_progress = is_build or len(texts) > 10
|
||||
try:
|
||||
if show_progress:
|
||||
from tqdm import tqdm
|
||||
except ImportError:
|
||||
show_progress = False
|
||||
|
||||
# Process batches
|
||||
num_batches = (len(texts) + batch_size - 1) // batch_size
|
||||
|
||||
if show_progress:
|
||||
batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
|
||||
else:
|
||||
batch_iterator = range(num_batches)
|
||||
# Sequential processing with progress bar
|
||||
show_progress = is_build or len(texts) > 10
|
||||
|
||||
for batch_idx in batch_iterator:
|
||||
start_idx = batch_idx * batch_size
|
||||
end_idx = min(start_idx + batch_size, len(texts))
|
||||
batch_texts = texts[start_idx:end_idx]
|
||||
try:
|
||||
if show_progress:
|
||||
from tqdm import tqdm
|
||||
|
||||
batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)
|
||||
iterator = tqdm(
|
||||
enumerate(texts), total=len(texts), desc="Computing Ollama embeddings"
|
||||
)
|
||||
else:
|
||||
iterator = enumerate(texts)
|
||||
except ImportError:
|
||||
iterator = enumerate(texts)
|
||||
|
||||
# Adjust failed indices to global indices
|
||||
global_failed = [start_idx + idx for idx in batch_failed]
|
||||
all_failed_indices.extend(global_failed)
|
||||
all_embeddings.extend(batch_embeddings)
|
||||
for idx, text in iterator:
|
||||
result_idx, embedding = get_single_embedding((text, idx))
|
||||
if embedding is not None:
|
||||
all_embeddings[idx] = embedding
|
||||
else:
|
||||
failed_indices.append(idx)
|
||||
|
||||
# Handle failed embeddings
|
||||
if all_failed_indices:
|
||||
if len(all_failed_indices) == len(texts):
|
||||
if failed_indices:
|
||||
if len(failed_indices) == len(texts):
|
||||
raise RuntimeError("Failed to compute any embeddings")
|
||||
|
||||
logger.warning(
|
||||
f"Failed to compute embeddings for {len(all_failed_indices)}/{len(texts)} texts"
|
||||
)
|
||||
logger.warning(f"Failed to compute embeddings for {len(failed_indices)}/{len(texts)} texts")
|
||||
|
||||
# Use zero embeddings as fallback for failed ones
|
||||
valid_embedding = next((e for e in all_embeddings if e is not None), None)
|
||||
if valid_embedding:
|
||||
embedding_dim = len(valid_embedding)
|
||||
for i, embedding in enumerate(all_embeddings):
|
||||
if embedding is None:
|
||||
all_embeddings[i] = [0.0] * embedding_dim
|
||||
for idx in failed_indices:
|
||||
all_embeddings[idx] = [0.0] * embedding_dim
|
||||
|
||||
# Remove None values
|
||||
# Remove None values and convert to numpy array
|
||||
all_embeddings = [e for e in all_embeddings if e is not None]
|
||||
|
||||
if not all_embeddings:
|
||||
raise RuntimeError("No valid embeddings were computed")
|
||||
|
||||
# Validate embedding dimensions
|
||||
expected_dim = len(all_embeddings[0])
|
||||
inconsistent_dims = []
|
||||
for i, embedding in enumerate(all_embeddings):
|
||||
if len(embedding) != expected_dim:
|
||||
inconsistent_dims.append((i, len(embedding)))
|
||||
|
||||
if inconsistent_dims:
|
||||
error_msg = f"Ollama returned inconsistent embedding dimensions. Expected {expected_dim}, but got:\n"
|
||||
for idx, dim in inconsistent_dims[:10]: # Show first 10 inconsistent ones
|
||||
error_msg += f" - Text {idx}: {dim} dimensions\n"
|
||||
if len(inconsistent_dims) > 10:
|
||||
error_msg += f" ... and {len(inconsistent_dims) - 10} more\n"
|
||||
error_msg += f"\nThis is likely an Ollama API bug with model '{model_name}'. Please try:\n"
|
||||
error_msg += "1. Restart Ollama service: 'ollama serve'\n"
|
||||
error_msg += f"2. Re-pull the model: 'ollama pull {model_name}'\n"
|
||||
error_msg += (
|
||||
"3. Use sentence-transformers instead: --embedding-mode sentence-transformers\n"
|
||||
)
|
||||
error_msg += "4. Report this issue to Ollama: https://github.com/ollama/ollama/issues"
|
||||
raise ValueError(error_msg)
|
||||
|
||||
# Convert to numpy array and normalize
|
||||
embeddings = np.array(all_embeddings, dtype=np.float32)
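For reference, the per-text request pattern used above boils down to a single POST per passage. A minimal standalone sketch, assuming a local Ollama server with an embedding-capable model already pulled (the model name and prompt below are placeholders):

```python
import numpy as np
import requests

# One POST per passage against a local Ollama server; mirrors the happy path
# of the concurrent worker above, without retries.
resp = requests.post(
    "http://localhost:11434/api/embeddings",
    json={"model": "nomic-embed-text", "prompt": "LEANN keeps vector indexes small."},
    timeout=30,
)
resp.raise_for_status()
embedding = np.array(resp.json()["embedding"], dtype=np.float32)
print(embedding.shape)
```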
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import atexit
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
@@ -304,13 +305,24 @@ class EmbeddingServerManager:
|
||||
project_root = Path(__file__).parent.parent.parent.parent.parent
|
||||
logger.info(f"Command: {' '.join(command)}")
|
||||
|
||||
# Let server output go directly to console
|
||||
# The server will respect LEANN_LOG_LEVEL environment variable
|
||||
# In CI environment, redirect output to avoid buffer deadlock
|
||||
# Embedding servers use many print statements that can fill buffers
|
||||
is_ci = os.environ.get("CI") == "true"
|
||||
if is_ci:
|
||||
stdout_target = subprocess.DEVNULL
|
||||
stderr_target = subprocess.DEVNULL
|
||||
logger.info("CI environment detected, redirecting embedding server output to DEVNULL")
|
||||
else:
|
||||
stdout_target = None # Direct to console for visible logs
|
||||
stderr_target = None # Direct to console for visible logs
|
||||
|
||||
# IMPORTANT: Use a new session so we can manage the whole process group reliably
|
||||
self.server_process = subprocess.Popen(
|
||||
command,
|
||||
cwd=project_root,
|
||||
stdout=None, # Direct to console
|
||||
stderr=None, # Direct to console
|
||||
stdout=stdout_target,
|
||||
stderr=stderr_target,
|
||||
start_new_session=True,
|
||||
)
|
||||
self.server_port = port
|
||||
logger.info(f"Server process started with PID: {self.server_process.pid}")
|
||||
@@ -352,7 +364,13 @@ class EmbeddingServerManager:
|
||||
logger.info(
|
||||
f"Terminating server process (PID: {self.server_process.pid}) for backend {self.backend_module_name}..."
|
||||
)
|
||||
self.server_process.terminate()
|
||||
# Try terminating the whole process group first (POSIX)
|
||||
try:
|
||||
pgid = os.getpgid(self.server_process.pid)
|
||||
os.killpg(pgid, signal.SIGTERM)
|
||||
except Exception:
|
||||
# Fallback to terminating just the process
|
||||
self.server_process.terminate()
|
||||
|
||||
try:
|
||||
self.server_process.wait(timeout=3)
|
||||
@@ -361,7 +379,11 @@ class EmbeddingServerManager:
|
||||
logger.warning(
|
||||
f"Server process {self.server_process.pid} did not terminate gracefully within 3 seconds, killing it."
|
||||
)
|
||||
self.server_process.kill()
|
||||
try:
|
||||
pgid = os.getpgid(self.server_process.pid)
|
||||
os.killpg(pgid, signal.SIGKILL)
|
||||
except Exception:
|
||||
self.server_process.kill()
|
||||
try:
|
||||
self.server_process.wait(timeout=2)
|
||||
logger.info(f"Server process {self.server_process.pid} killed successfully.")
|
||||
@@ -371,23 +393,21 @@ class EmbeddingServerManager:
|
||||
)
|
||||
# Don't hang indefinitely
|
||||
|
||||
# Clean up process resources to prevent resource tracker warnings
|
||||
try:
|
||||
self.server_process.wait() # Ensure process is fully cleaned up
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Clean up process resources without waiting
|
||||
# The process should already be terminated/killed above
|
||||
# Don't wait here as it can hang CI indefinitely
|
||||
self.server_process = None
|
||||
|
||||
def _launch_server_process_colab(self, command: list, port: int) -> None:
|
||||
"""Launch the server process with Colab-specific settings."""
|
||||
logger.info(f"Colab Command: {' '.join(command)}")
|
||||
|
||||
# In Colab, we need to be more careful about process management
|
||||
# In Colab, redirect to DEVNULL to avoid pipe blocking
|
||||
# PIPE without reading can cause hangs
|
||||
self.server_process = subprocess.Popen(
|
||||
command,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
text=True,
|
||||
)
|
||||
self.server_port = port
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Literal, Union
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -35,7 +35,7 @@ class LeannBackendSearcherInterface(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def _ensure_server_running(
|
||||
self, passages_source_file: str, port: Union[int, None], **kwargs
|
||||
self, passages_source_file: str, port: Optional[int], **kwargs
|
||||
) -> int:
|
||||
"""Ensure server is running"""
|
||||
pass
|
||||
@@ -50,7 +50,7 @@ class LeannBackendSearcherInterface(ABC):
|
||||
prune_ratio: float = 0.0,
|
||||
recompute_embeddings: bool = False,
|
||||
pruning_strategy: Literal["global", "local", "proportional"] = "global",
|
||||
zmq_port: Union[int, None] = None,
|
||||
zmq_port: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> dict[str, Any]:
|
||||
"""Search for nearest neighbors
|
||||
@@ -76,7 +76,7 @@ class LeannBackendSearcherInterface(ABC):
|
||||
self,
|
||||
query: str,
|
||||
use_server_if_available: bool = True,
|
||||
zmq_port: Union[int, None] = None,
|
||||
zmq_port: Optional[int] = None,
|
||||
) -> np.ndarray:
|
||||
"""Compute embedding for a query string
|
||||
|
||||
|
||||
@@ -132,10 +132,15 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
import msgpack
|
||||
import zmq
|
||||
|
||||
context = None
|
||||
socket = None
|
||||
try:
|
||||
context = zmq.Context()
|
||||
socket = context.socket(zmq.REQ)
|
||||
socket.setsockopt(zmq.RCVTIMEO, 30000) # 30 second timeout
|
||||
socket.setsockopt(zmq.LINGER, 0) # Don't block on close
|
||||
socket.setsockopt(zmq.RCVTIMEO, 5000) # 5 second timeout
|
||||
socket.setsockopt(zmq.SNDTIMEO, 5000) # 5 second timeout
|
||||
socket.setsockopt(zmq.IMMEDIATE, 1) # Don't wait for connection
|
||||
socket.connect(f"tcp://localhost:{zmq_port}")
|
||||
|
||||
# Send embedding request
|
||||
@@ -147,9 +152,6 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
response_bytes = socket.recv()
|
||||
response = msgpack.unpackb(response_bytes)
|
||||
|
||||
socket.close()
|
||||
context.term()
|
||||
|
||||
# Convert response to numpy array
|
||||
if isinstance(response, list) and len(response) > 0:
|
||||
return np.array(response, dtype=np.float32)
|
||||
@@ -158,6 +160,11 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to compute embeddings via server: {e}")
|
||||
finally:
|
||||
if socket:
|
||||
socket.close(linger=0)
|
||||
if context:
|
||||
context.term()
|
||||
|
||||
@abstractmethod
|
||||
def search(
|
||||
@@ -191,7 +198,27 @@ class BaseSearcher(LeannBackendSearcherInterface, ABC):
|
||||
"""
|
||||
pass
|
||||
|
||||
def __del__(self):
|
||||
"""Ensures the embedding server is stopped when the searcher is destroyed."""
|
||||
def cleanup(self):
|
||||
"""Cleanup resources including embedding server and ZMQ connections."""
|
||||
# Stop embedding server
|
||||
if hasattr(self, "embedding_server_manager"):
|
||||
self.embedding_server_manager.stop_server()
|
||||
|
||||
# Set ZMQ linger but don't terminate global context
|
||||
try:
|
||||
import zmq
|
||||
|
||||
# Just set linger on the global instance
|
||||
ctx = zmq.Context.instance()
|
||||
ctx.linger = 0
|
||||
# NEVER call ctx.term() on the global instance
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def __del__(self):
|
||||
"""Ensures resources are cleaned up when the searcher is destroyed."""
|
||||
try:
|
||||
self.cleanup()
|
||||
except Exception:
|
||||
# Ignore errors during destruction
|
||||
pass
|
||||
|
||||
@@ -4,12 +4,20 @@ Transform your development workflow with intelligent code assistance using LEANN

## Prerequisites

Install LEANN globally for MCP integration (with default backend):
**Step 1:** First, complete the basic LEANN installation following the [📦 Installation guide](../../README.md#installation) in the root README:

```bash
uv tool install leann-core --with leann
uv venv
source .venv/bin/activate
uv pip install leann
```
This installs the `leann` CLI into an isolated tool environment and includes both backends so `leann build` works out-of-the-box.

**Step 2:** Install LEANN globally for MCP integration:
```bash
uv tool install leann-core
```

This makes the `leann` command available system-wide, which `leann_mcp` requires.

## 🚀 Quick Setup

@@ -37,42 +45,6 @@ leann build my-project --docs ./
claude
```

## 🚀 Advanced Usage Examples

### Index Entire Git Repository
```bash
# Index all tracked files in your git repository (submodules are currently skipped)
leann build my-repo --docs $(git ls-files) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Index only specific file types from git
leann build my-python-code --docs $(git ls-files "*.py") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
```

### Multiple Directories and Files
```bash
# Index multiple directories
leann build my-codebase --docs ./src ./tests ./docs ./config --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Mix files and directories
leann build my-project --docs ./README.md ./src/ ./package.json ./docs/ --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Specific files only
leann build my-configs --docs ./tsconfig.json ./package.json ./webpack.config.js --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
```

### Advanced Git Integration
```bash
# Index recently modified files
leann build recent-changes --docs $(git diff --name-only HEAD~10..HEAD) --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Index files matching pattern
leann build frontend --docs $(git ls-files "*.tsx" "*.ts" "*.jsx" "*.js") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw

# Index documentation and config files
leann build docs-and-configs --docs $(git ls-files "*.md" "*.yml" "*.yaml" "*.json" "*.toml") --embedding-mode sentence-transformers --embedding-model all-MiniLM-L6-v2 --backend hnsw
```


**Try this in Claude Code:**
```
Help me understand this codebase. List available indexes and search for authentication patterns.

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "leann"
version = "0.2.9"
version = "0.2.7"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md"
requires-python = ">=3.9"

@@ -40,9 +40,10 @@ dependencies = [
# Other dependencies
"ipykernel==6.29.5",
"msgpack>=1.1.1",
"mlx>=0.26.3; sys_platform == 'darwin' and platform_machine == 'arm64'",
"mlx-lm>=0.26.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
"mlx>=0.26.3; sys_platform == 'darwin'",
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
"psutil>=5.8.0",
"pybind11>=3.0.0",
"pathspec>=0.12.1",
"nbconvert>=7.16.6",
"gitignore-parser>=0.1.12",
@@ -50,19 +51,21 @@ dependencies = [

[project.optional-dependencies]
dev = [
"pytest>=7.0",
"pytest-cov>=4.0",
"pytest-xdist>=3.0", # For parallel test execution
"pytest>=8.3.0", # Minimum version for Python 3.13 support
"pytest-cov>=5.0",
"pytest-xdist>=3.5", # For parallel test execution
"black>=23.0",
"ruff>=0.1.0",
"ruff==0.12.7", # Fixed version to ensure consistent formatting across all environments
"matplotlib",
"huggingface-hub>=0.20.0",
"pre-commit>=3.5.0",
]

test = [
"pytest>=7.0",
"pytest-timeout>=2.0",
"pytest>=8.3.0", # Minimum version for Python 3.13 support
"pytest-timeout>=2.3",
"anyio>=4.0", # For async test support (includes pytest plugin)
"psutil>=5.9.0", # For process cleanup in tests
"llama-index-core>=0.12.0",
"llama-index-readers-file>=0.4.0",
"python-dotenv>=1.0.0",
@@ -154,7 +157,8 @@ markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"openai: marks tests that require OpenAI API key",
]
timeout = 600
timeout = 300 # Reduced from 600s (10min) to 300s (5min) for CI safety
timeout_method = "thread" # Use thread method to avoid non-daemon thread issues
addopts = [
"-v",
"--tb=short",

103
scripts/diagnose_hang.sh
Executable file
@@ -0,0 +1,103 @@
|
||||
#!/bin/bash
|
||||
# Diagnostic script for debugging CI hangs
|
||||
|
||||
echo "========================================="
|
||||
echo " CI HANG DIAGNOSTIC SCRIPT"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
|
||||
echo "📅 Current time: $(date)"
|
||||
echo "🖥️ Hostname: $(hostname)"
|
||||
echo "👤 User: $(whoami)"
|
||||
echo "📂 Working directory: $(pwd)"
|
||||
echo ""
|
||||
|
||||
echo "=== PYTHON ENVIRONMENT ==="
|
||||
python --version 2>&1 || echo "Python not found"
|
||||
pip list 2>&1 | head -20 || echo "pip not available"
|
||||
echo ""
|
||||
|
||||
echo "=== PROCESS INFORMATION ==="
|
||||
echo "Current shell PID: $$"
|
||||
echo "Parent PID: $PPID"
|
||||
echo ""
|
||||
|
||||
echo "All Python processes:"
|
||||
ps aux | grep -E "[p]ython" || echo "No Python processes"
|
||||
echo ""
|
||||
|
||||
echo "All pytest processes:"
|
||||
ps aux | grep -E "[p]ytest" || echo "No pytest processes"
|
||||
echo ""
|
||||
|
||||
echo "Embedding server processes:"
|
||||
ps aux | grep -E "[e]mbedding_server" || echo "No embedding server processes"
|
||||
echo ""
|
||||
|
||||
echo "Zombie processes:"
|
||||
ps aux | grep "<defunct>" || echo "No zombie processes"
|
||||
echo ""
|
||||
|
||||
echo "=== NETWORK INFORMATION ==="
|
||||
echo "Network listeners on typical embedding server ports:"
|
||||
ss -ltn 2>/dev/null | grep -E ":555[0-9]|:556[0-9]" || netstat -ltn 2>/dev/null | grep -E ":555[0-9]|:556[0-9]" || echo "No listeners on embedding ports"
|
||||
echo ""
|
||||
|
||||
echo "All network listeners:"
|
||||
ss -ltn 2>/dev/null | head -20 || netstat -ltn 2>/dev/null | head -20 || echo "Cannot get network info"
|
||||
echo ""
|
||||
|
||||
echo "=== FILE DESCRIPTORS ==="
|
||||
echo "Open files for current shell:"
|
||||
lsof -p $$ 2>/dev/null | head -20 || echo "lsof not available"
|
||||
echo ""
|
||||
|
||||
if [ -d "/proc/$$" ]; then
|
||||
echo "File descriptors for current shell (/proc/$$/fd):"
|
||||
ls -la /proc/$$/fd 2>/dev/null | head -20 || echo "Cannot access /proc/$$/fd"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
echo "=== SYSTEM RESOURCES ==="
|
||||
echo "Memory usage:"
|
||||
free -h 2>/dev/null || vm_stat 2>/dev/null || echo "Cannot get memory info"
|
||||
echo ""
|
||||
|
||||
echo "Disk usage:"
|
||||
df -h . 2>/dev/null || echo "Cannot get disk info"
|
||||
echo ""
|
||||
|
||||
echo "CPU info:"
|
||||
nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo "Cannot get CPU info"
|
||||
echo ""
|
||||
|
||||
echo "=== PYTHON SPECIFIC CHECKS ==="
|
||||
python -c "
|
||||
import sys
|
||||
import os
|
||||
print(f'Python executable: {sys.executable}')
|
||||
print(f'Python path: {sys.path[:3]}...')
|
||||
print(f'Environment PYTHONPATH: {os.environ.get(\"PYTHONPATH\", \"Not set\")}')
|
||||
print(f'Site packages: {[p for p in sys.path if \"site-packages\" in p][:2]}')
|
||||
" 2>&1 || echo "Cannot run Python diagnostics"
|
||||
echo ""
|
||||
|
||||
echo "=== ZMQ SPECIFIC CHECKS ==="
|
||||
python -c "
|
||||
try:
|
||||
import zmq
|
||||
print(f'ZMQ version: {zmq.zmq_version()}')
|
||||
print(f'PyZMQ version: {zmq.pyzmq_version()}')
|
||||
ctx = zmq.Context.instance()
|
||||
print(f'ZMQ context instance: {ctx}')
|
||||
except Exception as e:
|
||||
print(f'ZMQ check failed: {e}')
|
||||
" 2>&1 || echo "Cannot check ZMQ"
|
||||
echo ""
|
||||
|
||||
echo "=== PYTEST CHECK ==="
|
||||
pytest --version 2>&1 || echo "pytest not found"
|
||||
echo ""
|
||||
|
||||
echo "=== END OF DIAGNOSTICS ==="
|
||||
echo "Generated at: $(date)"
|
||||
@@ -1,62 +0,0 @@
|
||||
name: leann-build
|
||||
|
||||
resources:
|
||||
# Choose a GPU for fast embeddings (examples: L4, A10G, A100). CPU also works but is slower.
|
||||
accelerators: L4:1
|
||||
# Optionally pin a cloud, otherwise SkyPilot will auto-select
|
||||
# cloud: aws
|
||||
disk_size: 100
|
||||
|
||||
env:
|
||||
# Build parameters (override with: sky launch -c leann-gpu sky/leann-build.yaml -e key=value)
|
||||
index_name: my-index
|
||||
docs: ./data
|
||||
backend: hnsw # hnsw | diskann
|
||||
complexity: 64
|
||||
graph_degree: 32
|
||||
num_threads: 8
|
||||
# Embedding selection
|
||||
embedding_mode: sentence-transformers # sentence-transformers | openai | mlx | ollama
|
||||
embedding_model: facebook/contriever
|
||||
# Storage/latency knobs
|
||||
recompute: true # true => selective recomputation; false => store full embeddings
|
||||
compact: true # for HNSW only: false when recompute=false
|
||||
# Optional pass-through
|
||||
extra_args: ""
|
||||
|
||||
# Sync local paths to the remote VM. Adjust as needed.
|
||||
file_mounts:
|
||||
# Example: mount your local data directory used for building
|
||||
~/leann-data: ${docs}
|
||||
|
||||
setup: |
|
||||
set -e
|
||||
# Install uv (package manager)
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
export PATH="$HOME/.local/bin:$PATH"
|
||||
|
||||
# Install the LEANN CLI globally on the remote machine
|
||||
uv tool install leann
|
||||
|
||||
run: |
|
||||
export PATH="$HOME/.local/bin:$PATH"
|
||||
# Derive flags from env
|
||||
recompute_flag=""
|
||||
if [ "${recompute}" = "false" ] || [ "${recompute}" = "0" ]; then
|
||||
recompute_flag="--no-recompute"
|
||||
fi
|
||||
compact_flag=""
|
||||
if [ "${compact}" = "false" ] || [ "${compact}" = "0" ]; then
|
||||
compact_flag="--no-compact"
|
||||
fi
|
||||
|
||||
# Build command
|
||||
leann build ${index_name} \
|
||||
--docs ~/leann-data \
|
||||
--backend ${backend} \
|
||||
--complexity ${complexity} \
|
||||
--graph-degree ${graph_degree} \
|
||||
--num-threads ${num_threads} \
|
||||
--embedding-mode ${embedding_mode} \
|
||||
--embedding-model ${embedding_model} \
|
||||
${recompute_flag} ${compact_flag} ${extra_args}
|
||||
@@ -6,10 +6,11 @@ This directory contains automated tests for the LEANN project using pytest.

### `test_readme_examples.py`
Tests the examples shown in README.md:
- The basic example code that users see first
- The basic example code that users see first (parametrized for both HNSW and DiskANN backends)
- Import statements work correctly
- Different backend options (HNSW, DiskANN)
- Different LLM configuration options
- Different LLM configuration options (parametrized for both backends)
- **All main README examples are tested with both HNSW and DiskANN backends using pytest parametrization** (see the sketch below)
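The parametrization mentioned above is plain `pytest.mark.parametrize`. A minimal sketch of the pattern; the test name, texts, and query below are illustrative rather than copied from the suite:

```python
import pytest

from leann.api import LeannBuilder, LeannSearcher


# One test body covers both backends: pytest runs it once per backend name.
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
def test_basic_roundtrip(backend_name, tmp_path):
    index_path = str(tmp_path / f"demo_{backend_name}.leann")

    builder = LeannBuilder(backend_name=backend_name)
    builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
    builder.build_index(index_path)

    results = LeannSearcher(index_path).search("storage savings", top_k=1)
    assert len(results) == 1
```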

### `test_basic.py`
Basic functionality tests that verify:
@@ -25,6 +26,16 @@ Tests the document RAG example functionality:
- Tests error handling with invalid parameters
- Verifies that normalized embeddings are detected and cosine distance is used

### `test_diskann_partition.py`
Tests DiskANN graph partitioning functionality:
- Tests DiskANN index building without partitioning (baseline)
- Tests automatic graph partitioning with `is_recompute=True` (see the sketch after this list)
- Verifies that partition files are created and large files are cleaned up for storage saving
- Tests search functionality with partitioned indices
- Validates medoid and max_base_norm file generation and usage
- Includes performance comparison between DiskANN (with partition) and HNSW
- **Note**: These tests are skipped in CI due to hardware requirements and computation time
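A minimal sketch of the partitioning toggle these tests exercise. The model, texts, and counts mirror `tests/test_diskann_partition.py` but are illustrative, and (as the note above says) such builds need local hardware rather than CI:

```python
import tempfile
from pathlib import Path

from leann.api import LeannBuilder, LeannSearcher

with tempfile.TemporaryDirectory() as temp_dir:
    index_path = str(Path(temp_dir) / "partitioned.leann")

    # is_recompute=True triggers automatic graph partitioning and cleanup of the
    # large *_disk.index file; False keeps the classic DiskANN layout.
    builder = LeannBuilder(
        backend_name="diskann",
        embedding_model="facebook/contriever",
        embedding_mode="sentence-transformers",
        is_recompute=True,
    )
    for i in range(500):
        builder.add_text(f"Document {i} about topic {i % 10}.")
    builder.build_index(index_path)

    results = LeannSearcher(index_path).search("topic 3", top_k=3)
    assert all(r.score != float("-inf") for r in results)
```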

## Running Tests

### Install test dependencies:
@@ -54,15 +65,23 @@ pytest tests/ -m "not openai"

# Skip slow tests
pytest tests/ -m "not slow"

# Run DiskANN partition tests (requires local machine, not CI)
pytest tests/test_diskann_partition.py
```

### Run with specific backend:
```bash
# Test only HNSW backend
pytest tests/test_basic.py::test_backend_basic[hnsw]
pytest tests/test_readme_examples.py::test_readme_basic_example[hnsw]

# Test only DiskANN backend
pytest tests/test_basic.py::test_backend_basic[diskann]
pytest tests/test_readme_examples.py::test_readme_basic_example[diskann]

# All DiskANN tests (parametrized + specialized partition tests)
pytest tests/ -k diskann
```

## CI/CD Integration

301
tests/conftest.py
Normal file
@@ -0,0 +1,301 @@
|
||||
"""Global test configuration and cleanup fixtures."""
|
||||
|
||||
import faulthandler
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
from collections.abc import Generator
|
||||
|
||||
import pytest
|
||||
|
||||
# Enable faulthandler to dump stack traces
|
||||
faulthandler.enable()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def _ci_backtraces():
|
||||
"""Dump stack traces before CI timeout to diagnose hanging."""
|
||||
if os.getenv("CI") == "true":
|
||||
# Dump stack traces 10s before the 180s timeout
|
||||
faulthandler.dump_traceback_later(170, repeat=True)
|
||||
yield
|
||||
faulthandler.cancel_dump_traceback_later()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def global_test_cleanup() -> Generator:
|
||||
"""Global cleanup fixture that runs after all tests.
|
||||
|
||||
This ensures all ZMQ connections and child processes are properly cleaned up,
|
||||
preventing the test runner from hanging on exit.
|
||||
"""
|
||||
yield
|
||||
|
||||
# Cleanup after all tests
|
||||
print("\n🧹 Running global test cleanup...")
|
||||
|
||||
# 1. Force cleanup of any LeannSearcher instances
|
||||
try:
|
||||
import gc
|
||||
|
||||
# Force garbage collection to trigger __del__ methods
|
||||
gc.collect()
|
||||
time.sleep(0.2)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# 2. Set ZMQ linger but DON'T term Context.instance()
|
||||
# Terminating the global instance can block if other code still has sockets
|
||||
try:
|
||||
import zmq
|
||||
|
||||
# Just set linger on the global instance, don't terminate it
|
||||
ctx = zmq.Context.instance()
|
||||
ctx.linger = 0
|
||||
# Do NOT call ctx.term() or ctx.destroy() on the global instance!
|
||||
# That would block waiting for all sockets to close
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Kill any leftover child processes (including grandchildren)
|
||||
try:
|
||||
import psutil
|
||||
|
||||
current_process = psutil.Process()
|
||||
# Get ALL descendants recursively
|
||||
children = current_process.children(recursive=True)
|
||||
|
||||
if children:
|
||||
print(f"\n⚠️ Cleaning up {len(children)} leftover child processes...")
|
||||
|
||||
# First try to terminate gracefully
|
||||
for child in children:
|
||||
try:
|
||||
print(f" Terminating {child.pid} ({child.name()})")
|
||||
child.terminate()
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
||||
pass
|
||||
|
||||
# Wait a bit for processes to terminate
|
||||
gone, alive = psutil.wait_procs(children, timeout=2)
|
||||
|
||||
# Force kill any remaining processes
|
||||
for child in alive:
|
||||
try:
|
||||
print(f" Force killing process {child.pid} ({child.name()})")
|
||||
child.kill()
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
||||
pass
|
||||
|
||||
# Final wait to ensure cleanup
|
||||
psutil.wait_procs(alive, timeout=1)
|
||||
except ImportError:
|
||||
# psutil not installed, try basic process cleanup
|
||||
try:
|
||||
# Send SIGTERM to all child processes
|
||||
os.killpg(os.getpgid(os.getpid()), signal.SIGTERM)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
print(f"Warning: Error during process cleanup: {e}")
|
||||
|
||||
# List and clean up remaining threads
|
||||
try:
|
||||
import threading
|
||||
|
||||
threads = [t for t in threading.enumerate() if t is not threading.main_thread()]
|
||||
if threads:
|
||||
print(f"\n⚠️ {len(threads)} non-main threads still running:")
|
||||
for t in threads:
|
||||
print(f" - {t.name} (daemon={t.daemon})")
|
||||
|
||||
# Force cleanup of pytest-timeout threads that block exit
|
||||
if "pytest_timeout" in t.name and not t.daemon:
|
||||
print(f" 🔧 Converting pytest-timeout thread to daemon: {t.name}")
|
||||
try:
|
||||
t.daemon = True
|
||||
print(" ✓ Converted to daemon thread")
|
||||
except Exception as e:
|
||||
print(f" ✗ Failed: {e}")
|
||||
|
||||
# Check if only daemon threads remain
|
||||
non_daemon = [
|
||||
t for t in threading.enumerate() if t is not threading.main_thread() and not t.daemon
|
||||
]
|
||||
if non_daemon:
|
||||
print(f"\n⚠️ {len(non_daemon)} non-daemon threads still blocking exit")
|
||||
# Force exit in CI to prevent hanging
|
||||
if os.environ.get("CI") == "true":
|
||||
print("🔨 Forcing exit in CI environment...")
|
||||
os._exit(0)
|
||||
except Exception as e:
|
||||
print(f"Thread cleanup error: {e}")
|
||||
|
||||
|
||||
@pytest.fixture
def auto_cleanup_searcher():
"""Fixture that automatically cleans up LeannSearcher instances."""
searchers = []

def register(searcher):
"""Register a searcher for cleanup."""
searchers.append(searcher)
return searcher

yield register

# Cleanup all registered searchers
for searcher in searchers:
try:
searcher.cleanup()
except Exception:
pass

# Force garbage collection
import gc

gc.collect()
time.sleep(0.1)
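A hypothetical test showing how this fixture is intended to be used; the index contents and query are placeholders, not part of the suite:

```python
from leann.api import LeannBuilder, LeannSearcher


# Wrap every searcher in register() so it is cleaned up even if assertions fail.
def test_search_with_auto_cleanup(auto_cleanup_searcher, tmp_path):
    index_path = str(tmp_path / "demo.leann")
    builder = LeannBuilder(backend_name="hnsw")
    builder.add_text("LEANN is a storage-efficient vector index.")
    builder.build_index(index_path)

    searcher = auto_cleanup_searcher(LeannSearcher(index_path))
    results = searcher.search("storage efficient", top_k=1)
    assert len(results) == 1
```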
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def _reap_children():
|
||||
"""Reap all child processes at session end as a safety net."""
|
||||
yield
|
||||
|
||||
# Final aggressive cleanup
|
||||
try:
|
||||
import psutil
|
||||
|
||||
me = psutil.Process()
|
||||
kids = me.children(recursive=True)
|
||||
for p in kids:
|
||||
try:
|
||||
p.terminate()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
_, alive = psutil.wait_procs(kids, timeout=2)
|
||||
for p in alive:
|
||||
try:
|
||||
p.kill()
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def cleanup_after_each_test():
|
||||
"""Cleanup after each test to prevent resource leaks."""
|
||||
yield
|
||||
|
||||
# Force garbage collection to trigger any __del__ methods
|
||||
import gc
|
||||
|
||||
gc.collect()
|
||||
|
||||
# Give a moment for async cleanup
|
||||
time.sleep(0.1)
|
||||
|
||||
|
||||
def pytest_configure(config):
|
||||
"""Configure pytest with better timeout handling."""
|
||||
# Set default timeout method to thread if not specified
|
||||
if not config.getoption("--timeout-method", None):
|
||||
config.option.timeout_method = "thread"
|
||||
|
||||
# Add more logging
|
||||
print(f"🔧 Pytest configured at {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f" Python version: {os.sys.version}")
|
||||
print(f" Platform: {os.sys.platform}")
|
||||
|
||||
|
||||
def pytest_sessionstart(session):
|
||||
"""Called after the Session object has been created."""
|
||||
print(f"🏁 Pytest session starting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f" Session ID: {id(session)}")
|
||||
|
||||
# Show initial process state
|
||||
try:
|
||||
import psutil
|
||||
|
||||
current = psutil.Process()
|
||||
print(f" Current PID: {current.pid}")
|
||||
print(f" Parent PID: {current.ppid()}")
|
||||
children = current.children(recursive=True)
|
||||
if children:
|
||||
print(f" ⚠️ Already have {len(children)} child processes at start!")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def pytest_sessionfinish(session, exitstatus):
|
||||
"""Called after whole test run finished."""
|
||||
print(f"🏁 Pytest session finishing at {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
print(f" Exit status: {exitstatus}")
|
||||
|
||||
# Aggressive cleanup before pytest exits
|
||||
print("🧹 Starting aggressive cleanup...")
|
||||
|
||||
# First, clean up child processes
|
||||
try:
|
||||
import psutil
|
||||
|
||||
current = psutil.Process()
|
||||
children = current.children(recursive=True)
|
||||
|
||||
if children:
|
||||
print(f" Found {len(children)} child processes to clean up:")
|
||||
for child in children:
|
||||
try:
|
||||
print(f" - PID {child.pid}: {child.name()} (status: {child.status()})")
|
||||
child.terminate()
|
||||
except Exception as e:
|
||||
print(f" - Failed to terminate {child.pid}: {e}")
|
||||
|
||||
# Wait briefly then kill
|
||||
time.sleep(0.5)
|
||||
_, alive = psutil.wait_procs(children, timeout=1)
|
||||
|
||||
for child in alive:
|
||||
try:
|
||||
print(f" - Force killing {child.pid}")
|
||||
child.kill()
|
||||
except Exception:
|
||||
pass
|
||||
else:
|
||||
print(" No child processes found")
|
||||
|
||||
except Exception as e:
|
||||
print(f" Process cleanup error: {e}")
|
||||
|
||||
# Second, clean up problematic threads
|
||||
try:
|
||||
import threading
|
||||
|
||||
threads = [t for t in threading.enumerate() if t is not threading.main_thread()]
|
||||
if threads:
|
||||
print(f" Found {len(threads)} non-main threads:")
|
||||
for t in threads:
|
||||
print(f" - {t.name} (daemon={t.daemon})")
|
||||
# Convert pytest-timeout threads to daemon so they don't block exit
|
||||
if "pytest_timeout" in t.name and not t.daemon:
|
||||
try:
|
||||
t.daemon = True
|
||||
print(" ✓ Converted to daemon")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Force exit if non-daemon threads remain in CI
|
||||
non_daemon = [
|
||||
t for t in threading.enumerate() if t is not threading.main_thread() and not t.daemon
|
||||
]
|
||||
if non_daemon and os.environ.get("CI") == "true":
|
||||
print(f" ⚠️ {len(non_daemon)} non-daemon threads remain, forcing exit...")
|
||||
os._exit(exitstatus or 0)
|
||||
|
||||
except Exception as e:
|
||||
print(f" Thread cleanup error: {e}")
|
||||
|
||||
print(f"✅ Pytest exiting at {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
@@ -7,6 +7,7 @@ import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from test_timeout import ci_timeout
|
||||
|
||||
|
||||
def test_imports():
|
||||
@@ -19,6 +20,7 @@ def test_imports():
|
||||
os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
|
||||
)
|
||||
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
|
||||
@ci_timeout(120) # 2 minute timeout for backend tests
|
||||
def test_backend_basic(backend_name):
|
||||
"""Test basic functionality for each backend."""
|
||||
from leann.api import LeannBuilder, LeannSearcher, SearchResult
|
||||
@@ -68,6 +70,7 @@ def test_backend_basic(backend_name):
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("CI") == "true", reason="Skip model tests in CI to avoid MPS memory issues"
|
||||
)
|
||||
@ci_timeout(180) # 3 minute timeout for large index test
|
||||
def test_large_index():
|
||||
"""Test with larger dataset."""
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
369
tests/test_diskann_partition.py
Normal file
369
tests/test_diskann_partition.py
Normal file
@@ -0,0 +1,369 @@
|
||||
"""
|
||||
Test DiskANN graph partitioning functionality.
|
||||
|
||||
Tests the automatic graph partitioning feature that was implemented to save
|
||||
storage space by partitioning large DiskANN indices and safely deleting
|
||||
redundant files while maintaining search functionality.
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("CI") == "true",
|
||||
reason="Skip DiskANN partition tests in CI - requires specific hardware and large memory",
|
||||
)
|
||||
def test_diskann_without_partition():
|
||||
"""Test DiskANN index building without partition (baseline)."""
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
index_path = str(Path(temp_dir) / "test_no_partition.leann")
|
||||
|
||||
# Test data - enough to trigger index building
|
||||
texts = [
|
||||
f"Document {i} discusses topic {i % 10} with detailed analysis of subject {i // 10}."
|
||||
for i in range(500)
|
||||
]
|
||||
|
||||
# Build without partition (is_recompute=False)
|
||||
builder = LeannBuilder(
|
||||
backend_name="diskann",
|
||||
embedding_model="facebook/contriever",
|
||||
embedding_mode="sentence-transformers",
|
||||
num_neighbors=32,
|
||||
search_list_size=50,
|
||||
is_recompute=False, # No partition
|
||||
)
|
||||
|
||||
for text in texts:
|
||||
builder.add_text(text)
|
||||
|
||||
builder.build_index(index_path)
|
||||
|
||||
# Verify index was created
|
||||
index_dir = Path(index_path).parent
|
||||
assert index_dir.exists()
|
||||
|
||||
# Check that traditional DiskANN files exist
|
||||
index_prefix = Path(index_path).stem
|
||||
# Core DiskANN files (beam search index may not be created for small datasets)
|
||||
required_files = [
|
||||
f"{index_prefix}_disk.index",
|
||||
f"{index_prefix}_pq_compressed.bin",
|
||||
f"{index_prefix}_pq_pivots.bin",
|
||||
]
|
||||
|
||||
# Check all generated files first for debugging
|
||||
generated_files = [f.name for f in index_dir.glob(f"{index_prefix}*")]
|
||||
print(f"Generated files: {generated_files}")
|
||||
|
||||
for required_file in required_files:
|
||||
file_path = index_dir / required_file
|
||||
assert file_path.exists(), f"Required file {required_file} not found"
|
||||
|
||||
# Ensure no partition files exist in non-partition mode
|
||||
partition_files = [f"{index_prefix}_disk_graph.index", f"{index_prefix}_partition.bin"]
|
||||
|
||||
for partition_file in partition_files:
|
||||
file_path = index_dir / partition_file
|
||||
assert not file_path.exists(), (
|
||||
f"Partition file {partition_file} should not exist in non-partition mode"
|
||||
)
|
||||
|
||||
# Test search functionality
|
||||
searcher = LeannSearcher(index_path)
|
||||
results = searcher.search("topic 3 analysis", top_k=3)
|
||||
|
||||
assert len(results) > 0
|
||||
assert all(result.score is not None and result.score != float("-inf") for result in results)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("CI") == "true",
|
||||
reason="Skip DiskANN partition tests in CI - requires specific hardware and large memory",
|
||||
)
|
||||
def test_diskann_with_partition():
|
||||
"""Test DiskANN index building with automatic graph partitioning."""
|
||||
from leann.api import LeannBuilder
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
index_path = str(Path(temp_dir) / "test_with_partition.leann")
|
||||
|
||||
# Test data - enough to trigger partitioning
|
||||
texts = [
|
||||
f"Document {i} explores subject {i % 15} with comprehensive coverage of area {i // 15}."
|
||||
for i in range(500)
|
||||
]
|
||||
|
||||
# Build with partition (is_recompute=True)
|
||||
builder = LeannBuilder(
|
||||
backend_name="diskann",
|
||||
embedding_model="facebook/contriever",
|
||||
embedding_mode="sentence-transformers",
|
||||
num_neighbors=32,
|
||||
search_list_size=50,
|
||||
is_recompute=True, # Enable automatic partitioning
|
||||
)
|
||||
|
||||
for text in texts:
|
||||
builder.add_text(text)
|
||||
|
||||
builder.build_index(index_path)
|
||||
|
||||
# Verify index was created
|
||||
index_dir = Path(index_path).parent
|
||||
assert index_dir.exists()
|
||||
|
||||
# Check that partition files exist
|
||||
index_prefix = Path(index_path).stem
|
||||
partition_files = [
|
||||
f"{index_prefix}_disk_graph.index", # Partitioned graph
|
||||
f"{index_prefix}_partition.bin", # Partition metadata
|
||||
f"{index_prefix}_pq_compressed.bin",
|
||||
f"{index_prefix}_pq_pivots.bin",
|
||||
]
|
||||
|
||||
for partition_file in partition_files:
|
||||
file_path = index_dir / partition_file
|
||||
assert file_path.exists(), f"Expected partition file {partition_file} not found"
|
||||
|
||||
# Check that large files were cleaned up (storage saving goal)
|
||||
large_files = [f"{index_prefix}_disk.index", f"{index_prefix}_disk_beam_search.index"]
|
||||
|
||||
for large_file in large_files:
|
||||
file_path = index_dir / large_file
|
||||
assert not file_path.exists(), (
|
||||
f"Large file {large_file} should have been deleted for storage saving"
|
||||
)
|
||||
|
||||
# Verify required auxiliary files for partition mode exist
|
||||
required_files = [
|
||||
f"{index_prefix}_disk.index_medoids.bin",
|
||||
f"{index_prefix}_disk.index_max_base_norm.bin",
|
||||
]
|
||||
|
||||
for req_file in required_files:
|
||||
file_path = index_dir / req_file
|
||||
assert file_path.exists(), (
|
||||
f"Required auxiliary file {req_file} missing for partition mode"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("CI") == "true",
|
||||
reason="Skip DiskANN partition tests in CI - requires specific hardware and large memory",
|
||||
)
|
||||
def test_diskann_partition_search_functionality():
|
||||
"""Test that search works correctly with partitioned indices."""
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
index_path = str(Path(temp_dir) / "test_partition_search.leann")
|
||||
|
||||
# Create diverse test data
|
||||
texts = [
|
||||
"LEANN is a storage-efficient approximate nearest neighbor search system.",
|
||||
"Graph partitioning helps reduce memory usage in large scale vector search.",
|
||||
"DiskANN provides high-performance disk-based approximate nearest neighbor search.",
|
||||
"Vector embeddings enable semantic search over unstructured text data.",
|
||||
"Approximate nearest neighbor algorithms trade accuracy for speed and storage.",
|
||||
] * 100 # Repeat to get enough data
|
||||
|
||||
# Build with partitioning
|
||||
builder = LeannBuilder(
|
||||
backend_name="diskann",
|
||||
embedding_model="facebook/contriever",
|
||||
embedding_mode="sentence-transformers",
|
||||
is_recompute=True, # Enable partitioning
|
||||
)
|
||||
|
||||
for text in texts:
|
||||
builder.add_text(text)
|
||||
|
||||
builder.build_index(index_path)
|
||||
|
||||
# Test search with partitioned index
|
||||
searcher = LeannSearcher(index_path)
|
||||
|
||||
# Test various queries
|
||||
test_queries = [
|
||||
("vector search algorithms", 5),
|
||||
("LEANN storage efficiency", 3),
|
||||
("graph partitioning memory", 4),
|
||||
("approximate nearest neighbor", 7),
|
||||
]
|
||||
|
||||
for query, top_k in test_queries:
|
||||
results = searcher.search(query, top_k=top_k)
|
||||
|
||||
# Verify search results
|
||||
assert len(results) == top_k, f"Expected {top_k} results for query '{query}'"
|
||||
assert all(result.score is not None for result in results), (
|
||||
"All results should have scores"
|
||||
)
|
||||
assert all(result.score != float("-inf") for result in results), (
|
||||
"No result should have -inf score"
|
||||
)
|
||||
assert all(result.text is not None for result in results), (
|
||||
"All results should have text"
|
||||
)
|
||||
|
||||
# Scores should be in descending order (higher similarity first)
|
||||
scores = [result.score for result in results]
|
||||
assert scores == sorted(scores, reverse=True), (
|
||||
"Results should be sorted by score descending"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("CI") == "true",
|
||||
reason="Skip DiskANN partition tests in CI - requires specific hardware and large memory",
|
||||
)
|
||||
def test_diskann_medoid_and_norm_files():
|
||||
"""Test that medoid and max_base_norm files are correctly generated and used."""
|
||||
import struct
|
||||
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
index_path = str(Path(temp_dir) / "test_medoid_norm.leann")
|
||||
|
||||
# Small but sufficient dataset
|
||||
texts = [f"Test document {i} with content about subject {i % 10}." for i in range(200)]
|
||||
|
||||
builder = LeannBuilder(
|
||||
backend_name="diskann",
|
||||
embedding_model="facebook/contriever",
|
||||
embedding_mode="sentence-transformers",
|
||||
is_recompute=True,
|
||||
)
|
||||
|
||||
for text in texts:
|
||||
builder.add_text(text)
|
||||
|
||||
builder.build_index(index_path)
|
||||
|
||||
index_dir = Path(index_path).parent
|
||||
index_prefix = Path(index_path).stem
|
||||
|
||||
# Test medoids file
|
||||
medoids_file = index_dir / f"{index_prefix}_disk.index_medoids.bin"
|
||||
assert medoids_file.exists(), "Medoids file should be generated"
|
||||
|
||||
# Read and validate medoids file format
|
||||
with open(medoids_file, "rb") as f:
|
||||
nshards = struct.unpack("<I", f.read(4))[0]
|
||||
one_val = struct.unpack("<I", f.read(4))[0]
|
||||
medoid_id = struct.unpack("<I", f.read(4))[0]
|
||||
|
||||
assert nshards == 1, "Single-shot build should have 1 shard"
|
||||
assert one_val == 1, "Expected value should be 1"
|
||||
assert medoid_id >= 0, "Medoid ID should be valid (not hardcoded 0)"
|
||||
|
||||
# Test max_base_norm file
|
||||
norm_file = index_dir / f"{index_prefix}_disk.index_max_base_norm.bin"
|
||||
assert norm_file.exists(), "Max base norm file should be generated"
|
||||
|
||||
# Read and validate norm file
|
||||
with open(norm_file, "rb") as f:
|
||||
npts = struct.unpack("<I", f.read(4))[0]
|
||||
ndims = struct.unpack("<I", f.read(4))[0]
|
||||
norm_val = struct.unpack("<f", f.read(4))[0]
|
||||
|
||||
assert npts == 1, "Should have 1 norm point"
|
||||
assert ndims == 1, "Should have 1 dimension"
|
||||
assert norm_val > 0, "Norm value should be positive"
|
||||
assert norm_val != float("inf"), "Norm value should be finite"
|
||||
|
||||
# Test that search works with these files
|
||||
searcher = LeannSearcher(index_path)
|
||||
results = searcher.search("test subject", top_k=3)
|
||||
|
||||
# Verify that scores are not -inf (which indicates norm file was loaded correctly)
|
||||
assert len(results) > 0
|
||||
assert all(result.score != float("-inf") for result in results), (
|
||||
"Scores should not be -inf when norm file is correct"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("CI") == "true",
|
||||
reason="Skip performance comparison in CI - requires significant compute time",
|
||||
)
|
||||
def test_diskann_vs_hnsw_performance():
|
||||
"""Compare DiskANN (with partition) vs HNSW performance."""
|
||||
import time
|
||||
|
||||
from leann.api import LeannBuilder, LeannSearcher
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Test data
|
||||
texts = [
|
||||
f"Performance test document {i} covering topic {i % 20} in detail." for i in range(1000)
|
||||
]
|
||||
query = "performance topic test"
|
||||
|
||||
# Test DiskANN with partitioning
|
||||
diskann_path = str(Path(temp_dir) / "perf_diskann.leann")
|
||||
diskann_builder = LeannBuilder(
|
||||
backend_name="diskann",
|
||||
embedding_model="facebook/contriever",
|
||||
embedding_mode="sentence-transformers",
|
||||
is_recompute=True,
|
||||
)
|
||||
|
||||
for text in texts:
|
||||
diskann_builder.add_text(text)
|
||||
|
||||
start_time = time.time()
|
||||
diskann_builder.build_index(diskann_path)
|
||||
|
||||
# Test HNSW
|
||||
hnsw_path = str(Path(temp_dir) / "perf_hnsw.leann")
|
||||
hnsw_builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
embedding_model="facebook/contriever",
|
||||
embedding_mode="sentence-transformers",
|
||||
is_recompute=True,
|
||||
)
|
||||
|
||||
for text in texts:
|
||||
hnsw_builder.add_text(text)
|
||||
|
||||
start_time = time.time()
|
||||
hnsw_builder.build_index(hnsw_path)
|
||||
|
||||
# Compare search performance
|
||||
diskann_searcher = LeannSearcher(diskann_path)
|
||||
hnsw_searcher = LeannSearcher(hnsw_path)
|
||||
|
||||
# Warm up searches
|
||||
diskann_searcher.search(query, top_k=5)
|
||||
hnsw_searcher.search(query, top_k=5)
|
||||
|
||||
# Timed searches
|
||||
start_time = time.time()
|
||||
diskann_results = diskann_searcher.search(query, top_k=10)
|
||||
diskann_search_time = time.time() - start_time
|
||||
|
||||
start_time = time.time()
|
||||
hnsw_results = hnsw_searcher.search(query, top_k=10)
|
||||
hnsw_search_time = time.time() - start_time
|
||||
|
||||
# Basic assertions
|
||||
assert len(diskann_results) == 10
|
||||
assert len(hnsw_results) == 10
|
||||
assert all(r.score != float("-inf") for r in diskann_results)
|
||||
assert all(r.score != float("-inf") for r in hnsw_results)
|
||||
|
||||
# Performance ratio (informational)
|
||||
if hnsw_search_time > 0:
|
||||
speed_ratio = hnsw_search_time / diskann_search_time
|
||||
print(f"DiskANN search time: {diskann_search_time:.4f}s")
|
||||
print(f"HNSW search time: {hnsw_search_time:.4f}s")
|
||||
print(f"DiskANN is {speed_ratio:.2f}x faster than HNSW")
|
||||
@@ -9,6 +9,7 @@ import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from test_timeout import ci_timeout
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -58,6 +59,10 @@ def test_document_rag_simulated(test_data_dir):
|
||||
|
||||
|
||||
@pytest.mark.skipif(not os.environ.get("OPENAI_API_KEY"), reason="OpenAI API key not available")
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("CI") == "true", reason="Skip OpenAI embedding tests in CI to avoid hanging"
|
||||
)
|
||||
@ci_timeout(60) # 60 second timeout to avoid hanging on OpenAI API calls
|
||||
def test_document_rag_openai(test_data_dir):
|
||||
"""Test document_rag with OpenAI embeddings."""
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
|
||||
@@ -8,10 +8,13 @@ import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from test_timeout import ci_timeout
|
||||
|
||||
|
||||
def test_readme_basic_example():
|
||||
"""Test the basic example from README.md."""
|
||||
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
|
||||
@ci_timeout(90) # 90 second timeout for this comprehensive test
|
||||
def test_readme_basic_example(backend_name):
|
||||
"""Test the basic example from README.md with both backends."""
|
||||
# Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
|
||||
if os.environ.get("CI") == "true" and platform.system() == "Darwin":
|
||||
pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")
|
||||
@@ -21,18 +24,18 @@ def test_readme_basic_example():
|
||||
from leann.api import SearchResult
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
INDEX_PATH = str(Path(temp_dir) / "demo.leann")
|
||||
INDEX_PATH = str(Path(temp_dir) / f"demo_{backend_name}.leann")
|
||||
|
||||
# Build an index
|
||||
# In CI, use a smaller model to avoid memory issues
|
||||
if os.environ.get("CI") == "true":
|
||||
builder = LeannBuilder(
|
||||
backend_name="hnsw",
|
||||
backend_name=backend_name,
|
||||
embedding_model="sentence-transformers/all-MiniLM-L6-v2", # Smaller model
|
||||
dimensions=384, # Smaller dimensions
|
||||
)
|
||||
else:
|
||||
builder = LeannBuilder(backend_name="hnsw")
|
||||
builder = LeannBuilder(backend_name=backend_name)
|
||||
builder.add_text("LEANN saves 97% storage compared to traditional vector databases.")
|
||||
builder.add_text("Tung Tung Tung Sahur called—they need their banana-crocodile hybrid back")
|
||||
builder.build_index(INDEX_PATH)
|
||||
@@ -52,6 +55,9 @@ def test_readme_basic_example():
|
||||
# Verify search results
|
||||
assert len(results) > 0
|
||||
assert isinstance(results[0], SearchResult)
|
||||
assert results[0].score != float("-inf"), (
|
||||
f"should return valid scores, got {results[0].score}"
|
||||
)
|
||||
# The second text about banana-crocodile should be more relevant
|
||||
assert "banana" in results[0].text or "crocodile" in results[0].text
|
||||
|
||||
@@ -75,6 +81,7 @@ def test_readme_imports():
    assert callable(LeannChat)


@ci_timeout(60)  # 60 second timeout
def test_backend_options():
    """Test different backend options mentioned in documentation."""
    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
@@ -110,26 +117,32 @@ def test_backend_options():
        assert len(list(Path(diskann_path).parent.glob(f"{Path(diskann_path).stem}.*"))) > 0


def test_llm_config_simulated():
    """Test simulated LLM configuration option."""
@pytest.mark.parametrize("backend_name", ["hnsw", "diskann"])
@ci_timeout(75)  # 75 second timeout for LLM tests
def test_llm_config_simulated(backend_name):
    """Test simulated LLM configuration option with both backends."""
    # Skip on macOS CI due to MPS environment issues with all-MiniLM-L6-v2
    if os.environ.get("CI") == "true" and platform.system() == "Darwin":
        pytest.skip("Skipping on macOS CI due to MPS environment issues with all-MiniLM-L6-v2")

    # Skip DiskANN tests in CI due to hardware requirements
    if os.environ.get("CI") == "true" and backend_name == "diskann":
        pytest.skip("Skip DiskANN tests in CI - requires specific hardware and large memory")

    from leann import LeannBuilder, LeannChat

    with tempfile.TemporaryDirectory() as temp_dir:
        # Build a simple index
        index_path = str(Path(temp_dir) / "test.leann")
        index_path = str(Path(temp_dir) / f"test_{backend_name}.leann")
        # Use smaller model in CI to avoid memory issues
        if os.environ.get("CI") == "true":
            builder = LeannBuilder(
                backend_name="hnsw",
                backend_name=backend_name,
                embedding_model="sentence-transformers/all-MiniLM-L6-v2",
                dimensions=384,
            )
        else:
            builder = LeannBuilder(backend_name="hnsw")
            builder = LeannBuilder(backend_name=backend_name)
        builder.add_text("Test document for LLM testing")
        builder.build_index(index_path)
129
tests/test_timeout.py
Normal file
@@ -0,0 +1,129 @@
"""
Test timeout utilities for CI environments.
"""

import functools
import os
import signal
import sys
from typing import Any, Callable


def timeout_test(seconds: int = 30):
    """
    Decorator to add timeout to test functions, especially useful in CI environments.

    Args:
        seconds: Timeout in seconds (default: 30)
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            # Only apply timeout in CI environment
            if os.environ.get("CI") != "true":
                return func(*args, **kwargs)

            # Set up timeout handler
            def timeout_handler(signum, frame):
                print(f"\n❌ Test {func.__name__} timed out after {seconds} seconds in CI!")
                print("This usually indicates a hanging process or infinite loop.")
                # Try to cleanup any hanging processes
                try:
                    import subprocess

                    subprocess.run(
                        ["pkill", "-f", "embedding_server"], capture_output=True, timeout=2
                    )
                    subprocess.run(
                        ["pkill", "-f", "hnsw_embedding"], capture_output=True, timeout=2
                    )
                except Exception:
                    pass
                # Exit with timeout code
                sys.exit(124)  # Standard timeout exit code

            # Set signal handler and alarm
            old_handler = signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(seconds)

            try:
                result = func(*args, **kwargs)
                signal.alarm(0)  # Cancel alarm
                return result
            except Exception:
                signal.alarm(0)  # Cancel alarm on exception
                raise
            finally:
                # Restore original handler
                signal.signal(signal.SIGALRM, old_handler)

        return wrapper

    return decorator


def ci_timeout(seconds: int = 60):
    """
    Timeout decorator specifically for CI environments.
    Uses threading for more reliable timeout handling.

    Args:
        seconds: Timeout in seconds (default: 60)
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            # Only apply in CI
            if os.environ.get("CI") != "true":
                return func(*args, **kwargs)

            import threading

            result = [None]
            exception = [None]
            finished = threading.Event()

            def target():
                try:
                    result[0] = func(*args, **kwargs)
                except Exception as e:
                    exception[0] = e
                finally:
                    finished.set()

            # Start function in thread
            thread = threading.Thread(target=target, daemon=True)
            thread.start()

            # Wait for completion or timeout
            if not finished.wait(timeout=seconds):
                print(f"\n💥 CI TIMEOUT: Test {func.__name__} exceeded {seconds}s limit!")
                print("This usually indicates hanging embedding servers or infinite loops.")

                # Try to cleanup embedding servers
                try:
                    import subprocess

                    subprocess.run(
                        ["pkill", "-9", "-f", "embedding_server"], capture_output=True, timeout=2
                    )
                    subprocess.run(
                        ["pkill", "-9", "-f", "hnsw_embedding"], capture_output=True, timeout=2
                    )
                    print("Attempted to kill hanging embedding servers.")
                except Exception as e:
                    print(f"Cleanup failed: {e}")

                # Raise TimeoutError instead of sys.exit for better pytest integration
                raise TimeoutError(f"Test {func.__name__} timed out after {seconds} seconds")

            if exception[0]:
                raise exception[0]

            return result[0]

        return wrapper

    return decorator
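(Usage note: a minimal sketch of how these decorators would be applied; the test bodies are hypothetical. timeout_test relies on SIGALRM, which is Unix-only and must be installed from the main thread, while ci_timeout runs the test in a daemon thread and raises TimeoutError so pytest reports an ordinary failure. Both decorators are no-ops unless the CI environment variable equals "true".)

import time

from test_timeout import ci_timeout, timeout_test


@timeout_test(10)  # SIGALRM-based; Unix main thread only
def test_quick_unit():
    assert 1 + 1 == 2


@ci_timeout(30)  # thread-based; raises TimeoutError on overrun in CI
def test_slow_integration():
    time.sleep(0.1)  # placeholder for real work
    assert True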
11
uv.lock
generated
@@ -2356,6 +2356,7 @@ dependencies = [
    { name = "pdfplumber" },
    { name = "protobuf" },
    { name = "psutil" },
    { name = "pybind11" },
    { name = "pymupdf" },
    { name = "pypdf2" },
    { name = "pypdfium2" },
@@ -2438,6 +2439,7 @@ requires-dist = [
    { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=3.5.0" },
    { name = "protobuf", specifier = "==4.25.3" },
    { name = "psutil", specifier = ">=5.8.0" },
    { name = "pybind11", specifier = ">=3.0.0" },
    { name = "pymupdf", specifier = ">=1.26.0" },
    { name = "pypdf2", specifier = ">=3.0.0" },
    { name = "pypdfium2", specifier = ">=4.30.0" },
@@ -4358,6 +4360,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/10/15/6b30e77872012bbfe8265d42a01d5b3c17ef0ac0f2fae531ad91b6a6c02e/pyarrow-21.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdc4c17afda4dab2a9c0b79148a43a7f4e1094916b3e18d8975bfd6d6d52241f", size = 26227521 },
]

[[package]]
name = "pybind11"
version = "3.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ef/83/698d120e257a116f2472c710932023ad779409adf2734d2e940f34eea2c5/pybind11-3.0.0.tar.gz", hash = "sha256:c3f07bce3ada51c3e4b76badfa85df11688d12c46111f9d242bc5c9415af7862", size = 544819, upload-time = "2025-07-10T16:52:09.335Z" }
wheels = [
    { url = "https://files.pythonhosted.org/packages/41/9c/85f50a5476832c3efc67b6d7997808388236ae4754bf53e1749b3bc27577/pybind11-3.0.0-py3-none-any.whl", hash = "sha256:7c5cac504da5a701b5163f0e6a7ba736c713a096a5378383c5b4b064b753f607", size = 292118, upload-time = "2025-07-10T16:52:07.828Z" },
]

[[package]]
name = "pycparser"
version = "2.22"