Compare commits

...

21 Commits

Author SHA1 Message Date
GitHub Actions
8375f601ba chore: release v0.1.13 2025-07-27 01:08:17 +00:00
yichuan520030910320
c87c0fe662 update colab install & fix colab path 2025-07-26 18:07:31 -07:00
yichuan520030910320
73927b68ef Merge branch 'main' of https://github.com/yichuan-w/LEANN 2025-07-26 17:09:55 -07:00
yichuan520030910320
cc1a62e5aa update pytoml version again 2025-07-26 17:09:45 -07:00
GitHub Actions
802020cb41 chore: release v0.1.12 2025-07-26 23:35:28 +00:00
yichuan520030910320
cdb92f7cf4 update pytoml version && fix colab env && fix pdf extract in pip 2025-07-26 16:33:13 -07:00
yichuan520030910320
dc69bdec00 Merge branch 'main' of https://github.com/yichuan-w/LEANN 2025-07-25 17:54:43 -07:00
yichuan520030910320
98073e9868 update missing pkg 2025-07-25 17:54:21 -07:00
GitHub Actions
cf2ef48967 chore: release v0.1.11 2025-07-26 00:12:37 +00:00
yichuan520030910320
0692bbf7a2 change workflow 2025-07-25 17:11:56 -07:00
GitHub Actions
52584a171f chore: release v0.1.10 2025-07-25 23:12:16 +00:00
Andy Lee
efd6b5324b fix: add protobuf as a dependency for DiskANN backend
- Fixes 'No module named google' error when starting DiskANN embedding server
- Prevents users from having to manually install protobuf
2025-07-25 16:10:25 -07:00
Andy Lee
2baaa4549b fix: handle relative paths in HNSW embedding server metadata
- Convert relative paths to absolute paths based on metadata file location
- Fixes FileNotFoundError when starting embedding server
- Resolves issue with passages file not found in different working directories
2025-07-25 16:09:53 -07:00
Andy Lee
35310ddd52 fix: pure Python packages not building due to ubuntu-latest check
The build workflow was checking for matrix.os == 'ubuntu-latest',
but we changed the matrix to use 'ubuntu-22.04', causing the
pure Python packages (leann-core and leann) to never be built.

Changed to use pattern matching [[ == ubuntu-* ]] to match any
Ubuntu version.

This explains why v0.1.9 only published the C++ backend packages
but not the pure Python packages.
2025-07-25 15:14:21 -07:00
Andy Lee
fc9c5cb39d fix: make release workflow idempotent
- Check if version is already updated before trying to update
- Check if tag already exists before creating
- Check if GitHub release already exists before creating
- This allows re-running the workflow after partial failures

Previously, if the workflow failed after updating version but before
completing the release, it couldn't be re-run with the same version.
2025-07-25 14:47:35 -07:00
Andy Lee
8f2a1e87ea Merge pull request #7 from yichuan-w/fix/simple-ubuntu22-build
fix: simplify build system for Colab compatibility
2025-07-25 14:08:37 -07:00
Andy Lee
50caf65f28 fix: change ubuntu-latest to ubuntu-22.04 and add Python 3.13
- Explicitly use ubuntu-22.04 instead of ubuntu-latest
- Add Python 3.13 to the build matrix
- This ensures we build on the same OS version as Google Colab
2025-07-25 13:48:59 -07:00
Andy Lee
1b48794ca8 cleanup: remove cibuildwheel workflow files
- Remove ci-cibuildwheel.yml and build-cibuildwheel.yml
- These files were not present in v0.1.5
- Keep only the simple build system
2025-07-25 13:48:08 -07:00
Andy Lee
4aef1d814e revert: simplify build system by removing manylinux/cibuildwheel
- Revert to simple Ubuntu 22.04 builds that should work with Colab
- Remove all manylinux container complexity
- Colab runs on Ubuntu 22.04, so direct builds should be compatible
- Restore build-reusable.yml to v0.1.5 version
- Remove cibuildwheel option from release workflow

This should fix the overcomplicated build issues while maintaining
Colab compatibility through direct Ubuntu 22.04 builds.
2025-07-25 13:46:51 -07:00
GitHub Actions
75ddcd6158 chore: release v0.1.9 2025-07-25 20:04:42 +00:00
Andy Lee
2a4df11f5c fix: absolute path for passages 2025-07-25 11:59:30 -07:00
19 changed files with 3458 additions and 3627 deletions

View File

@@ -1,144 +0,0 @@
name: Build with cibuildwheel
on:
workflow_call:
inputs:
ref:
description: 'Git ref to build'
required: false
type: string
default: ''
jobs:
build_wheels:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
steps:
- uses: actions/checkout@v4
with:
ref: ${{ inputs.ref }}
submodules: recursive
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11' # Version for building pure Python packages
# Build each package separately in our monorepo
- name: Build pure Python packages (leann-core, leann)
if: matrix.os == 'ubuntu-latest' # Only build once, they're platform-independent
run: |
# Install build tools
python -m pip install --upgrade pip build
# Build pure Python packages
python -m build packages/leann-core --outdir wheelhouse/
python -m build packages/leann --outdir wheelhouse/
- name: Build leann-backend-hnsw wheels
uses: pypa/cibuildwheel@v2.16.2
with:
package-dir: packages/leann-backend-hnsw
output-dir: wheelhouse
env:
CIBW_BUILD: cp39-* cp310-* cp311-* cp312-*
CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
CIBW_SKIP: "*-win32 *-manylinux_i686 pp*"
CIBW_BEFORE_ALL_LINUX: |
yum clean all && yum makecache
yum install -y epel-release || true
yum makecache || true
# Install system dependencies
yum install -y \
gcc-c++ \
boost-devel \
protobuf-compiler \
protobuf-devel \
zeromq-devel \
pkgconfig \
openblas-devel \
cmake || echo "Some packages failed, continuing..."
# Verify zmq installation and create pkg-config file if needed
if [ ! -f /usr/lib64/pkgconfig/libzmq.pc ] && [ ! -f /usr/lib/pkgconfig/libzmq.pc ]; then
echo "Creating libzmq.pc file..."
mkdir -p /usr/lib64/pkgconfig
cat > /usr/lib64/pkgconfig/libzmq.pc << 'EOF'
prefix=/usr
exec_prefix=${prefix}
libdir=${exec_prefix}/lib64
includedir=${prefix}/include
Name: libzmq
Description: ZeroMQ library
Version: 4.1.4
Libs: -L${libdir} -lzmq
Cflags: -I${includedir}
EOF
fi
CIBW_BEFORE_ALL_MACOS: |
brew install llvm libomp boost protobuf zeromq
CIBW_ENVIRONMENT_LINUX: |
PKG_CONFIG_PATH=/usr/lib64/pkgconfig:/usr/lib/pkgconfig:$PKG_CONFIG_PATH
CIBW_ENVIRONMENT_MACOS: |
CC=$(brew --prefix llvm)/bin/clang
CXX=$(brew --prefix llvm)/bin/clang++
CIBW_TEST_REQUIRES: leann-core numpy pyzmq msgpack
CIBW_TEST_COMMAND: python -c "import leann_backend_hnsw"
- name: Build leann-backend-diskann wheels
uses: pypa/cibuildwheel@v2.16.2
with:
package-dir: packages/leann-backend-diskann
output-dir: wheelhouse
env:
CIBW_BUILD: cp39-* cp310-* cp311-* cp312-*
CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
CIBW_SKIP: "*-win32 *-manylinux_i686 pp*"
CIBW_BEFORE_ALL_LINUX: |
yum clean all && yum makecache
yum install -y epel-release || true
yum makecache || true
# Install system dependencies for DiskANN
yum install -y \
gcc-c++ \
protobuf-compiler \
protobuf-devel \
openblas-devel \
pkgconfig \
cmake || echo "Some packages failed, continuing..."
yum install -y libaio-devel || echo "libaio-devel not available, continuing..."
CIBW_BEFORE_ALL_MACOS: |
brew install llvm libomp protobuf
CIBW_ENVIRONMENT_LINUX: |
PKG_CONFIG_PATH=/usr/lib64/pkgconfig:/usr/lib/pkgconfig:$PKG_CONFIG_PATH
CIBW_ENVIRONMENT_MACOS: |
CC=$(brew --prefix llvm)/bin/clang
CXX=$(brew --prefix llvm)/bin/clang++
CIBW_TEST_REQUIRES: leann-core numpy
CIBW_TEST_COMMAND: python -c "import leann_backend_diskann"
- name: List built packages
run: |
echo "📦 Built packages:"
ls -la wheelhouse/
- uses: actions/upload-artifact@v4
with:
name: cibw-wheels-${{ matrix.os }}
path: ./wheelhouse/*.whl

View File

@@ -13,107 +13,46 @@ jobs:
build:
name: Build ${{ matrix.os }} Python ${{ matrix.python }}
strategy:
fail-fast: false
matrix:
include:
- os: ubuntu-latest
- os: ubuntu-22.04
python: '3.9'
container: 'quay.io/pypa/manylinux2014_x86_64'
- os: ubuntu-latest
- os: ubuntu-22.04
python: '3.10'
container: 'quay.io/pypa/manylinux2014_x86_64'
- os: ubuntu-latest
- os: ubuntu-22.04
python: '3.11'
container: 'quay.io/pypa/manylinux2014_x86_64'
- os: ubuntu-latest
- os: ubuntu-22.04
python: '3.12'
container: 'quay.io/pypa/manylinux2014_x86_64'
- os: ubuntu-latest
- os: ubuntu-22.04
python: '3.13'
container: 'quay.io/pypa/manylinux2014_x86_64'
- os: macos-latest
python: '3.9'
container: ''
- os: macos-latest
python: '3.10'
container: ''
- os: macos-latest
python: '3.11'
container: ''
- os: macos-latest
python: '3.12'
container: ''
- os: macos-latest
python: '3.13'
container: ''
runs-on: ${{ matrix.os }}
container: ${{ matrix.container }}
steps:
# For manylinux2014 compatibility, we'll handle checkout differently
- uses: actions/checkout@v4
if: matrix.container == ''
with:
ref: ${{ inputs.ref }}
submodules: recursive
# Manual checkout for containers to avoid Node.js compatibility issues
- name: Manual checkout in container
if: matrix.container != ''
run: |
# Install git if not available
yum install -y git || true
# Configure git to handle the directory ownership issue
git config --global --add safe.directory ${GITHUB_WORKSPACE}
git config --global --add safe.directory /__w/LEANN/LEANN
git config --global --add safe.directory /github/workspace
git config --global --add safe.directory $(pwd)
# Clone the repository manually in the container
git init
git remote add origin https://github.com/${GITHUB_REPOSITORY}.git
# Fetch the appropriate ref
if [ -n "${{ inputs.ref }}" ]; then
git fetch --depth=1 origin ${{ inputs.ref }}
else
git fetch --depth=1 origin ${GITHUB_SHA}
fi
git checkout FETCH_HEAD
# Initialize submodules
git submodule update --init --recursive
- name: Setup Python (macOS and regular Ubuntu)
if: matrix.container == ''
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}
- name: Setup Python (manylinux container)
if: matrix.container != ''
run: |
# Use the pre-installed Python version in manylinux container
# Convert Python version format (3.9 -> 39, 3.10 -> 310, etc.)
PY_VER=$(echo "${{ matrix.python }}" | sed 's/\.//g')
/opt/python/cp${PY_VER}-*/bin/python -m pip install --upgrade pip
# Create symlinks for convenience
ln -sf /opt/python/cp${PY_VER}-*/bin/python /usr/local/bin/python
ln -sf /opt/python/cp${PY_VER}-*/bin/pip /usr/local/bin/pip
- name: Install uv (macOS and regular Ubuntu)
if: matrix.container == ''
- name: Install uv
uses: astral-sh/setup-uv@v4
- name: Install uv (manylinux container)
if: matrix.container != ''
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Install system dependencies (Ubuntu - regular)
if: runner.os == 'Linux' && matrix.container == ''
- name: Install system dependencies (Ubuntu)
if: runner.os == 'Linux'
run: |
sudo apt-get update
sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \
@@ -126,64 +65,6 @@ jobs:
echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
- name: Install system dependencies (manylinux container)
if: runner.os == 'Linux' && matrix.container != ''
run: |
# manylinux2014 uses yum instead of apt
# Update yum cache first
yum clean all
yum makecache
# Install EPEL repository
yum install -y epel-release || true
# Update cache again after EPEL
yum makecache || true
# Install development packages
# Note: Some packages might have different names in CentOS 7
yum install -y \
gcc-c++ \
boost-devel \
protobuf-compiler \
protobuf-devel \
zeromq-devel \
pkgconfig \
openblas-devel \
cmake || {
echo "Some packages failed to install, trying alternatives..."
# Try alternative package names
yum install -y libzmq3-devel || true
yum install -y libzmq-devel || true
}
# Install optional packages that might not be available
yum install -y libaio-devel || echo "libaio-devel not available, continuing..."
# Verify zmq installation and create pkg-config file if needed
if [ ! -f /usr/lib64/pkgconfig/libzmq.pc ] && [ ! -f /usr/lib/pkgconfig/libzmq.pc ]; then
echo "Creating libzmq.pc file..."
mkdir -p /usr/lib64/pkgconfig
cat > /usr/lib64/pkgconfig/libzmq.pc << 'EOF'
prefix=/usr
exec_prefix=${prefix}
libdir=${exec_prefix}/lib64
includedir=${prefix}/include
Name: libzmq
Description: ZeroMQ library
Version: 4.1.4
Libs: -L${libdir} -lzmq
Cflags: -I${includedir}
EOF
fi
# Update PKG_CONFIG_PATH
echo "PKG_CONFIG_PATH=/usr/lib64/pkgconfig:/usr/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
# Build tools are pre-installed in manylinux
# MKL is more complex in container, skip for now and use OpenBLAS
- name: Install system dependencies (macOS)
if: runner.os == 'macOS'
run: |
@@ -191,65 +72,44 @@ jobs:
- name: Install build dependencies
run: |
if [[ -n "${{ matrix.container }}" ]]; then
# In manylinux container, use regular pip
pip install scikit-build-core numpy swig Cython pybind11 auditwheel
uv pip install --system scikit-build-core numpy swig Cython pybind11
if [[ "$RUNNER_OS" == "Linux" ]]; then
uv pip install --system auditwheel
else
# Regular environment, use uv
uv pip install --system scikit-build-core numpy swig Cython pybind11
if [[ "$RUNNER_OS" == "Linux" ]]; then
uv pip install --system auditwheel
else
uv pip install --system delocate
fi
uv pip install --system delocate
fi
- name: Build packages
run: |
# Choose build command based on environment
if [[ -n "${{ matrix.container }}" ]]; then
BUILD_CMD="pip wheel . --no-deps -w dist"
else
BUILD_CMD="uv build --wheel --python python"
fi
# Build core (platform independent)
if [ "${{ matrix.os }}" == "ubuntu-latest" ]; then
if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
cd packages/leann-core
if [[ -n "${{ matrix.container }}" ]]; then
pip wheel . --no-deps -w dist
else
uv build
fi
uv build
cd ../..
fi
# Build HNSW backend
cd packages/leann-backend-hnsw
if [ "${{ matrix.os }}" == "macos-latest" ]; then
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ $BUILD_CMD
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
else
eval $BUILD_CMD
uv build --wheel --python python
fi
cd ../..
# Build DiskANN backend
cd packages/leann-backend-diskann
if [ "${{ matrix.os }}" == "macos-latest" ]; then
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ $BUILD_CMD
CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
else
eval $BUILD_CMD
uv build --wheel --python python
fi
cd ../..
# Build meta package (platform independent)
if [ "${{ matrix.os }}" == "ubuntu-latest" ]; then
if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
cd packages/leann
if [[ -n "${{ matrix.container }}" ]]; then
pip wheel . --no-deps -w dist
else
uv build
fi
uv build
cd ../..
fi
@@ -259,9 +119,6 @@ jobs:
# Repair HNSW wheel
cd packages/leann-backend-hnsw
if [ -d dist ]; then
# Show what platform auditwheel will use
auditwheel show dist/*.whl || true
# Let auditwheel auto-detect the appropriate manylinux tag
auditwheel repair dist/*.whl -w dist_repaired
rm -rf dist
mv dist_repaired dist
@@ -271,9 +128,6 @@ jobs:
# Repair DiskANN wheel
cd packages/leann-backend-diskann
if [ -d dist ]; then
# Show what platform auditwheel will use
auditwheel show dist/*.whl || true
# Let auditwheel auto-detect the appropriate manylinux tag
auditwheel repair dist/*.whl -w dist_repaired
rm -rf dist
mv dist_repaired dist
@@ -309,5 +163,5 @@ jobs:
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: packages-${{ matrix.os }}-py${{ matrix.python }}${{ matrix.container && '-manylinux' || '' }}
name: packages-${{ matrix.os }}-py${{ matrix.python }}
path: packages/*/dist/

View File

@@ -1,12 +0,0 @@
name: CI - cibuildwheel (Test)
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
workflow_dispatch: # Allow manual triggering
jobs:
build:
uses: ./.github/workflows/build-cibuildwheel.yml

View File

@@ -7,11 +7,6 @@ on:
description: 'Version to release (e.g., 0.1.2)'
required: true
type: string
use_cibuildwheel:
description: 'Use cibuildwheel for better compatibility (recommended for Colab)'
required: false
type: boolean
default: false
jobs:
update-version:
@@ -27,46 +22,50 @@ jobs:
- name: Validate version
run: |
if ! [[ "${{ inputs.version }}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "❌ Invalid version format"
# Remove 'v' prefix if present for validation
VERSION_CLEAN="${{ inputs.version }}"
VERSION_CLEAN="${VERSION_CLEAN#v}"
if ! [[ "$VERSION_CLEAN" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "❌ Invalid version format. Expected format: X.Y.Z or vX.Y.Z"
exit 1
fi
echo "✅ Version format valid"
echo "✅ Version format valid: ${{ inputs.version }}"
- name: Update versions and push
id: push
run: |
./scripts/bump_version.sh ${{ inputs.version }}
git config user.name "GitHub Actions"
git config user.email "actions@github.com"
git add packages/*/pyproject.toml
git commit -m "chore: release v${{ inputs.version }}"
git push origin main
# Check current version
CURRENT_VERSION=$(grep "^version" packages/leann-core/pyproject.toml | cut -d'"' -f2)
echo "Current version: $CURRENT_VERSION"
echo "Target version: ${{ inputs.version }}"
if [ "$CURRENT_VERSION" = "${{ inputs.version }}" ]; then
echo "⚠️ Version is already ${{ inputs.version }}, skipping update"
COMMIT_SHA=$(git rev-parse HEAD)
else
./scripts/bump_version.sh ${{ inputs.version }}
git config user.name "GitHub Actions"
git config user.email "actions@github.com"
git add packages/*/pyproject.toml
git commit -m "chore: release v${{ inputs.version }}"
git push origin main
COMMIT_SHA=$(git rev-parse HEAD)
echo "✅ Pushed version update: $COMMIT_SHA"
fi
COMMIT_SHA=$(git rev-parse HEAD)
echo "commit-sha=$COMMIT_SHA" >> $GITHUB_OUTPUT
echo "✅ Pushed version update: $COMMIT_SHA"
build-packages-reusable:
name: Build packages (Standard)
build-packages:
name: Build packages
needs: update-version
if: ${{ !inputs.use_cibuildwheel }}
uses: ./.github/workflows/build-reusable.yml
with:
ref: ${{ needs.update-version.outputs.commit-sha }}
build-packages-cibuildwheel:
name: Build packages (cibuildwheel)
needs: update-version
if: ${{ inputs.use_cibuildwheel }}
uses: ./.github/workflows/build-cibuildwheel.yml
with:
ref: ${{ needs.update-version.outputs.commit-sha }}
ref: 'main'
publish:
name: Publish and Release
needs: [update-version, build-packages-reusable, build-packages-cibuildwheel]
if: always() && needs.update-version.result == 'success' && (needs.build-packages-reusable.result == 'success' || needs.build-packages-cibuildwheel.result == 'success')
needs: [update-version, build-packages]
if: always() && needs.update-version.result == 'success' && needs.build-packages.result == 'success'
runs-on: ubuntu-latest
permissions:
contents: write
@@ -74,7 +73,7 @@ jobs:
steps:
- uses: actions/checkout@v4
with:
ref: ${{ needs.update-version.outputs.commit-sha }}
ref: 'main'
- name: Download all artifacts
uses: actions/download-artifact@v4
@@ -107,12 +106,24 @@ jobs:
- name: Create release
run: |
git tag "v${{ inputs.version }}"
git push origin "v${{ inputs.version }}"
# Check if tag already exists
if git rev-parse "v${{ inputs.version }}" >/dev/null 2>&1; then
echo "⚠️ Tag v${{ inputs.version }} already exists, skipping tag creation"
else
git tag "v${{ inputs.version }}"
git push origin "v${{ inputs.version }}"
echo "✅ Created and pushed tag v${{ inputs.version }}"
fi
gh release create "v${{ inputs.version }}" \
--title "Release v${{ inputs.version }}" \
--notes "🚀 Released to PyPI: https://pypi.org/project/leann/${{ inputs.version }}/" \
--latest
# Check if release already exists
if gh release view "v${{ inputs.version }}" >/dev/null 2>&1; then
echo "⚠️ Release v${{ inputs.version }} already exists, skipping release creation"
else
gh release create "v${{ inputs.version }}" \
--title "Release v${{ inputs.version }}" \
--notes "🚀 Released to PyPI: https://pypi.org/project/leann/${{ inputs.version }}/" \
--latest
echo "✅ Created GitHub release v${{ inputs.version }}"
fi
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -195,7 +195,7 @@ Once the index is built, you can ask questions like:
- "Show me emails about travel expenses"
</details>
### 🔍 Time Machine for the Web: RAG Your Entire Google Browser History!
### 🔍 Time Machine for the Web: RAG Your Entire Chrome Browser History!
<p align="center">
<img src="videos/google_clear.gif" alt="LEANN Browser History Search Demo" width="600">

View File

@@ -13,8 +13,12 @@
"metadata": {},
"outputs": [],
"source": [
"# install this if you areusing colab\n",
"! pip install leann"
"# install this if you are using colab\n",
"! uv pip install leann-core leann-backend-hnsw --no-deps\n",
"! uv pip install leann --no-deps\n",
"# For Colab environment, we need to set some environment variables\n",
"import os\n",
"os.environ['LEANN_LOG_LEVEL'] = 'INFO' # Enable more detailed logging"
]
},
{
@@ -26,81 +30,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: Registering backend 'hnsw'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/yichuan/Desktop/code/LEANN/leann/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: facebook/contriever\n",
"WARNING:sentence_transformers.SentenceTransformer:No sentence-transformers model found with name facebook/contriever. Creating a new one with mean pooling.\n",
"Writing passages: 100%|██████████| 5/5 [00:00<00:00, 27887.66chunk/s]\n",
"Batches: 100%|██████████| 1/1 [00:00<00:00, 13.51it/s]\n",
"WARNING:leann_backend_hnsw.hnsw_backend:Converting data to float32, shape: (5, 768)\n",
"INFO:leann_backend_hnsw.hnsw_backend:INFO: Converting HNSW index to CSR-pruned format...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"M: 64 for level: 0\n",
"Starting conversion: knowledge.index -> knowledge.csr.tmp\n",
"[0.00s] Reading Index HNSW header...\n",
"[0.00s] Header read: d=768, ntotal=5\n",
"[0.00s] Reading HNSW struct vectors...\n",
" Reading vector (dtype=<class 'numpy.float64'>, fmt='d')... Count=6, Bytes=48\n",
"[0.00s] Read assign_probas (6)\n",
" Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=7, Bytes=28\n",
"[0.11s] Read cum_nneighbor_per_level (7)\n",
" Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=5, Bytes=20\n",
"[0.21s] Read levels (5)\n",
"[0.30s] Probing for compact storage flag...\n",
"[0.30s] Found compact flag: False\n",
"[0.30s] Compact flag is False, reading original format...\n",
"[0.30s] Probing for potential extra byte before non-compact offsets...\n",
"[0.30s] Found and consumed an unexpected 0x00 byte.\n",
" Reading vector (dtype=<class 'numpy.uint64'>, fmt='Q')... Count=6, Bytes=48\n",
"[0.30s] Read offsets (6)\n",
"[0.40s] Attempting to read neighbors vector...\n",
" Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=320, Bytes=1280\n",
"[0.40s] Read neighbors (320)\n",
"[0.50s] Read scalar params (ep=4, max_lvl=0)\n",
"[0.50s] Checking for storage data...\n",
"[0.50s] Found storage fourcc: 49467849.\n",
"[0.50s] Converting to CSR format...\n",
"[0.50s] Conversion loop finished. \n",
"[0.50s] Running validation checks...\n",
" Checking total valid neighbor count...\n",
" OK: Total valid neighbors = 20\n",
" Checking final pointer indices...\n",
" OK: Final pointers match data size.\n",
"[0.50s] Deleting original neighbors and offsets arrays...\n",
" CSR Stats: |data|=20, |level_ptr|=10\n",
"[0.59s] Writing CSR HNSW graph data in FAISS-compatible order...\n",
" Pruning embeddings: Writing NULL storage marker.\n",
"[0.69s] Conversion complete.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:leann_backend_hnsw.hnsw_backend:✅ CSR conversion successful.\n",
"INFO:leann_backend_hnsw.hnsw_backend:INFO: Replaced original index with CSR-pruned version at 'knowledge.index'\n"
]
}
],
"outputs": [],
"source": [
"from leann.api import LeannBuilder\n",
"\n",
@@ -122,93 +54,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:leann.api:🔍 LeannSearcher.search() called:\n",
"INFO:leann.api: Query: 'programming languages'\n",
"INFO:leann.api: Top_k: 2\n",
"INFO:leann.api: Additional kwargs: {}\n",
"INFO:leann.embedding_server_manager:Port 5557 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Port 5558 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Port 5559 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Using port 5560 instead of 5557\n",
"INFO:leann.embedding_server_manager:Starting embedding server on port 5560...\n",
"INFO:leann.embedding_server_manager:Command: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5560 --model-name facebook/contriever --passages-file knowledge.leann.meta.json\n",
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
"INFO:leann.embedding_server_manager:Server process started with PID: 4574\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[read_HNSW - CSR NL v4] Reading metadata & CSR indices (manual offset)...\n",
"[read_HNSW NL v4] Read levels vector, size: 5\n",
"[read_HNSW NL v4] Reading Compact Storage format indices...\n",
"[read_HNSW NL v4] Read compact_level_ptr, size: 10\n",
"[read_HNSW NL v4] Read compact_node_offsets, size: 6\n",
"[read_HNSW NL v4] Read entry_point: 4, max_level: 0\n",
"[read_HNSW NL v4] Read storage fourcc: 0x6c6c756e\n",
"[read_HNSW NL v4 FIX] Detected FileIOReader. Neighbors size field offset: 326\n",
"[read_HNSW NL v4] Reading neighbors data into memory.\n",
"[read_HNSW NL v4] Read neighbors data, size: 20\n",
"[read_HNSW NL v4] Finished reading metadata and CSR indices.\n",
"INFO: Skipping external storage loading, since is_recompute is true.\n",
"INFO: Registering backend 'hnsw'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:leann.embedding_server_manager:Embedding server is ready!\n",
"INFO:leann.api: Launching server time: 1.078078269958496 seconds\n",
"INFO:leann.embedding_server_manager:Existing server process (PID 4574) is compatible\n",
"INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: facebook/contriever\n",
"WARNING:sentence_transformers.SentenceTransformer:No sentence-transformers model found with name facebook/contriever. Creating a new one with mean pooling.\n",
"INFO:leann.api: Generated embedding shape: (1, 768)\n",
"INFO:leann.api: Embedding time: 2.9307072162628174 seconds\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ZmqDistanceComputer initialized: d=768, metric=0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:leann.api: Search time: 0.27327895164489746 seconds\n",
"INFO:leann.api: Backend returned: labels=2 results\n",
"INFO:leann.api: Processing 2 passage IDs:\n",
"INFO:leann.api: 1. passage_id='0' -> SUCCESS: C# is a powerful programming language and it is good at game development...\n",
"INFO:leann.api: 2. passage_id='1' -> SUCCESS: Python is a powerful programming language and it is good at machine learning tasks...\n",
"INFO:leann.api: Final enriched results: 2 passages\n"
]
},
{
"data": {
"text/plain": [
"[SearchResult(id='0', score=np.float32(0.9874103), text='C# is a powerful programming language and it is good at game development', metadata={}),\n",
" SearchResult(id='1', score=np.float32(0.8922168), text='Python is a powerful programming language and it is good at machine learning tasks', metadata={})]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from leann.api import LeannSearcher\n",
"\n",
@@ -228,79 +76,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:leann.chat:Attempting to create LLM of type='hf' with model='Qwen/Qwen3-0.6B'\n",
"INFO:leann.chat:Initializing HFChat with model='Qwen/Qwen3-0.6B'\n",
"INFO:leann.chat:MPS is available. Using Apple Silicon GPU.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[read_HNSW - CSR NL v4] Reading metadata & CSR indices (manual offset)...\n",
"[read_HNSW NL v4] Read levels vector, size: 5\n",
"[read_HNSW NL v4] Reading Compact Storage format indices...\n",
"[read_HNSW NL v4] Read compact_level_ptr, size: 10\n",
"[read_HNSW NL v4] Read compact_node_offsets, size: 6\n",
"[read_HNSW NL v4] Read entry_point: 4, max_level: 0\n",
"[read_HNSW NL v4] Read storage fourcc: 0x6c6c756e\n",
"[read_HNSW NL v4 FIX] Detected FileIOReader. Neighbors size field offset: 326\n",
"[read_HNSW NL v4] Reading neighbors data into memory.\n",
"[read_HNSW NL v4] Read neighbors data, size: 20\n",
"[read_HNSW NL v4] Finished reading metadata and CSR indices.\n",
"INFO: Skipping external storage loading, since is_recompute is true.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:leann.api:🔍 LeannSearcher.search() called:\n",
"INFO:leann.api: Query: 'Compare the two retrieved programming languages and tell me their advantages.'\n",
"INFO:leann.api: Top_k: 2\n",
"INFO:leann.api: Additional kwargs: {}\n",
"INFO:leann.embedding_server_manager:Port 5557 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Port 5558 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Port 5559 has incompatible server, trying next port...\n",
"INFO:leann.embedding_server_manager:Found compatible server on port 5560\n",
"INFO:leann.embedding_server_manager:Using existing compatible server on port 5560\n",
"INFO:leann.api: Launching server time: 0.04932403564453125 seconds\n",
"INFO:leann.embedding_server_manager:Found compatible server on port 5560\n",
"INFO:leann.embedding_server_manager:Using existing compatible server on port 5560\n",
"INFO:leann.api: Generated embedding shape: (1, 768)\n",
"INFO:leann.api: Embedding time: 0.06902289390563965 seconds\n",
"INFO:leann.api: Search time: 0.026793241500854492 seconds\n",
"INFO:leann.api: Backend returned: labels=2 results\n",
"INFO:leann.api: Processing 2 passage IDs:\n",
"INFO:leann.api: 1. passage_id='0' -> SUCCESS: C# is a powerful programming language and it is good at game development...\n",
"INFO:leann.api: 2. passage_id='1' -> SUCCESS: Python is a powerful programming language and it is good at machine learning tasks...\n",
"INFO:leann.api: Final enriched results: 2 passages\n",
"INFO:leann.chat:Generating with HuggingFace model, config: {'max_new_tokens': 128, 'temperature': 0.7, 'top_p': 0.9, 'do_sample': True, 'pad_token_id': 151645, 'eos_token_id': 151645}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ZmqDistanceComputer initialized: d=768, metric=0\n"
]
},
{
"data": {
"text/plain": [
"\"<think>\\n\\n</think>\\n\\nBased on the context provided, here's a comparison of the two retrieved programming languages:\\n\\n**C#** is known for being a powerful programming language and is well-suited for game development. It is often used in game development and is popular among developers working on Windows applications.\\n\\n**Python**, on the other hand, is also a powerful language and is well-suited for machine learning tasks. It is widely used for data analysis, scientific computing, and other applications that require handling large datasets or performing complex calculations.\\n\\n**Advantages**:\\n- C#: Strong for game development and cross-platform compatibility.\\n- Python: Strong for\""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from leann.api import LeannChat\n",
"\n",

View File

@@ -97,11 +97,13 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i
# Use HNSW backend for better macOS compatibility
builder = LeannBuilder(
backend_name="hnsw",
embedding_model="facebook/contriever",
embedding_model="text-embedding-3-small",
embedding_mode="openai",
graph_degree=32,
complexity=64,
is_compact=True,
is_recompute=True,
is_compact=False,
is_recompute=False,
num_threads=1 # Force single-threaded mode
)

View File

@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-diskann"
version = "0.1.8"
dependencies = ["leann-core==0.1.8", "numpy"]
version = "0.1.13"
dependencies = ["leann-core==0.1.13", "numpy", "protobuf>=3.19.0"]
[tool.scikit-build]
# Key: simplified CMake path

View File

@@ -48,6 +48,10 @@ class HNSWBuilder(LeannBackendBuilderInterface):
self.efConstruction = self.build_params.setdefault("efConstruction", 200)
self.distance_metric = self.build_params.setdefault("distance_metric", "mips")
self.dimensions = self.build_params.get("dimensions")
if not self.is_recompute:
if self.is_compact:
# TODO: support this case @andy
raise ValueError("is_recompute is False, but is_compact is True. This is not compatible now. change is compact to False and you can use the original HNSW index.")
def build(self, data: np.ndarray, ids: List[str], index_path: str, **kwargs):
from . import faiss # type: ignore

View File

@@ -81,7 +81,21 @@ def create_hnsw_embedding_server(
with open(passages_file, "r") as f:
meta = json.load(f)
passages = PassageManager(meta["passage_sources"])
# Convert relative paths to absolute paths based on metadata file location
metadata_dir = Path(
passages_file
).parent.parent # Go up one level from the metadata file
passage_sources = []
for source in meta["passage_sources"]:
source_copy = source.copy()
# Convert relative paths to absolute paths
if not Path(source_copy["path"]).is_absolute():
source_copy["path"] = str(metadata_dir / source_copy["path"])
if not Path(source_copy["index_path"]).is_absolute():
source_copy["index_path"] = str(metadata_dir / source_copy["index_path"])
passage_sources.append(source_copy)
passages = PassageManager(passage_sources)
logger.info(
f"Loaded PassageManager with {len(passages.global_offset_map)} passages from metadata"
)
@@ -270,15 +284,15 @@ def create_hnsw_embedding_server(
if __name__ == "__main__":
import signal
import sys
def signal_handler(sig, frame):
logger.info(f"Received signal {sig}, shutting down gracefully...")
sys.exit(0)
# Register signal handlers for graceful shutdown
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
parser = argparse.ArgumentParser(description="HNSW Embedding service")
parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
parser.add_argument(

View File

@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"
[project]
name = "leann-backend-hnsw"
version = "0.1.8"
version = "0.1.13"
description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
dependencies = [
"leann-core==0.1.8",
"leann-core==0.1.13",
"numpy",
"pyzmq>=23.0.0",
"msgpack>=1.0.0",

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "leann-core"
version = "0.1.8"
version = "0.1.13"
description = "Core API and plugin system for LEANN"
readme = "README.md"
requires-python = ">=3.9"
@@ -21,6 +21,23 @@ dependencies = [
"sentence-transformers>=2.2.0",
"llama-index-core>=0.12.0",
"python-dotenv>=1.0.0",
"openai>=1.0.0",
"huggingface-hub>=0.20.0",
"transformers>=4.30.0",
"requests>=2.25.0",
"accelerate>=0.20.0",
"PyPDF2>=3.0.0",
"pymupdf>=1.23.0",
"pdfplumber>=0.10.0",
"mlx>=0.26.3; sys_platform == 'darwin'",
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
]
[project.optional-dependencies]
colab = [
"torch>=2.0.0,<3.0.0", # 限制torch版本避免冲突
"transformers>=4.30.0,<5.0.0", # 限制transformers版本
"accelerate>=0.20.0,<1.0.0", # 限制accelerate版本
]
[project.scripts]

View File

@@ -117,8 +117,15 @@ class PassageManager:
assert source["type"] == "jsonl", "only jsonl is supported"
passage_file = source["path"]
index_file = source["index_path"] # .idx file
# Fix path resolution for Colab and other environments
if not Path(index_file).is_absolute():
# If relative path, try to resolve it properly
index_file = str(Path(index_file).resolve())
if not Path(index_file).exists():
raise FileNotFoundError(f"Passage index file not found: {index_file}")
with open(index_file, "rb") as f:
offset_map = pickle.load(f)
self.offset_maps[passage_file] = offset_map
@@ -381,6 +388,10 @@ class LeannBuilder:
class LeannSearcher:
def __init__(self, index_path: str, enable_warmup: bool = False, **backend_kwargs):
# Fix path resolution for Colab and other environments
if not Path(index_path).is_absolute():
index_path = str(Path(index_path).resolve())
self.meta_path_str = f"{index_path}.meta.json"
if not Path(self.meta_path_str).exists():
raise FileNotFoundError(

View File

@@ -7,6 +7,33 @@ from llama_index.core.node_parser import SentenceSplitter
from .api import LeannBuilder, LeannSearcher, LeannChat
def extract_pdf_text_with_pymupdf(file_path: str) -> str:
"""Extract text from PDF using PyMuPDF for better quality."""
try:
import fitz # PyMuPDF
doc = fitz.open(file_path)
text = ""
for page in doc:
text += page.get_text()
doc.close()
return text
except ImportError:
# Fallback to default reader
return None
def extract_pdf_text_with_pdfplumber(file_path: str) -> str:
"""Extract text from PDF using pdfplumber for better quality."""
try:
import pdfplumber
text = ""
with pdfplumber.open(file_path) as pdf:
for page in pdf.pages:
text += page.extract_text() or ""
return text
except ImportError:
# Fallback to default reader
return None
class LeannCLI:
def __init__(self):
@@ -145,12 +172,42 @@ Examples:
def load_documents(self, docs_dir: str):
print(f"Loading documents from {docs_dir}...")
documents = SimpleDirectoryReader(
# Try to use better PDF parsers first
documents = []
docs_path = Path(docs_dir)
for file_path in docs_path.rglob("*.pdf"):
print(f"Processing PDF: {file_path}")
# Try PyMuPDF first (best quality)
text = extract_pdf_text_with_pymupdf(str(file_path))
if text is None:
# Try pdfplumber
text = extract_pdf_text_with_pdfplumber(str(file_path))
if text:
# Create a simple document structure
from llama_index.core import Document
doc = Document(text=text, metadata={"source": str(file_path)})
documents.append(doc)
else:
# Fallback to default reader
print(f"Using default reader for {file_path}")
default_docs = SimpleDirectoryReader(
str(file_path.parent),
filename_as_id=True,
required_exts=[file_path.suffix],
).load_data()
documents.extend(default_docs)
# Load other file types with default reader
other_docs = SimpleDirectoryReader(
docs_dir,
recursive=True,
encoding="utf-8",
required_exts=[".pdf", ".txt", ".md", ".docx"],
required_exts=[".txt", ".md", ".docx"],
).load_data(show_progress=True)
documents.extend(other_docs)
all_texts = []
for doc in documents:

View File

@@ -264,9 +264,10 @@ def compute_embeddings_openai(texts: List[str], model_name: str) -> np.ndarray:
logger.info(
f"Computing embeddings for {len(texts)} texts using OpenAI API, model: '{model_name}'"
)
print(f"len of texts: {len(texts)}")
# OpenAI has limits on batch size and input length
max_batch_size = 100 # Conservative batch size
max_batch_size = 1000 # Conservative batch size
all_embeddings = []
try:
@@ -296,6 +297,7 @@ def compute_embeddings_openai(texts: List[str], model_name: str) -> np.ndarray:
logger.info(
f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}"
)
print(f"len of embeddings: {len(embeddings)}")
return embeddings

View File

@@ -18,6 +18,24 @@ logging.basicConfig(
logger = logging.getLogger(__name__)
def _is_colab_environment() -> bool:
"""Check if we're running in Google Colab environment."""
return "COLAB_GPU" in os.environ or "COLAB_TPU" in os.environ
def _get_available_port(start_port: int = 5557) -> int:
"""Get an available port starting from start_port."""
port = start_port
while port < start_port + 100: # Try up to 100 ports
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("localhost", port))
return port
except OSError:
port += 1
raise RuntimeError(f"No available ports found in range {start_port}-{start_port+100}")
def _check_port(port: int) -> bool:
"""Check if a port is in use"""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -175,48 +193,59 @@ class EmbeddingServerManager:
embedding_mode: str = "sentence-transformers",
**kwargs,
) -> tuple[bool, int]:
"""
Starts the embedding server process.
Args:
port (int): The preferred ZMQ port for the server.
model_name (str): The name of the embedding model to use.
**kwargs: Additional arguments for the server.
Returns:
tuple[bool, int]: (success, actual_port_used)
"""
"""Start the embedding server."""
passages_file = kwargs.get("passages_file")
assert isinstance(passages_file, str), "passages_file must be a string"
# Check if we have a compatible running server
# Check if we have a compatible server already running
if self._has_compatible_running_server(model_name, passages_file):
assert self.server_port is not None, (
"a compatible running server should set server_port"
)
return True, self.server_port
logger.info("Found compatible running server!")
return True, port
# Find available port (compatible or free)
try:
actual_port, is_compatible = _find_compatible_port_or_next_available(
port, model_name, passages_file
)
except RuntimeError as e:
logger.error(str(e))
return False, port
# For Colab environment, use a different strategy
if _is_colab_environment():
logger.info("Detected Colab environment, using alternative startup strategy")
return self._start_server_colab(port, model_name, embedding_mode, **kwargs)
# Find a compatible port or next available
actual_port, is_compatible = _find_compatible_port_or_next_available(
port, model_name, passages_file
)
if is_compatible:
logger.info(f"Using existing compatible server on port {actual_port}")
self.server_port = actual_port
self.server_process = None # We don't own this process
logger.info(f"Found compatible server on port {actual_port}")
return True, actual_port
if actual_port != port:
logger.info(f"Using port {actual_port} instead of {port}")
# Start new server
# Start a new server
return self._start_new_server(actual_port, model_name, embedding_mode, **kwargs)
def _start_server_colab(
self,
port: int,
model_name: str,
embedding_mode: str = "sentence-transformers",
**kwargs,
) -> tuple[bool, int]:
"""Start server with Colab-specific configuration."""
# Try to find an available port
try:
actual_port = _get_available_port(port)
except RuntimeError:
logger.error("No available ports found")
return False, port
logger.info(f"Starting server on port {actual_port} for Colab environment")
# Use a simpler startup strategy for Colab
command = self._build_server_command(actual_port, model_name, embedding_mode, **kwargs)
try:
# In Colab, we'll use a more direct approach
self._launch_server_process_colab(command, actual_port)
return self._wait_for_server_ready_colab(actual_port)
except Exception as e:
logger.error(f"Failed to start embedding server in Colab: {e}")
return False, actual_port
def _has_compatible_running_server(
self, model_name: str, passages_file: str
) -> bool:
@@ -269,7 +298,9 @@ class EmbeddingServerManager:
]
if kwargs.get("passages_file"):
command.extend(["--passages-file", str(kwargs["passages_file"])])
# Convert to absolute path to ensure subprocess can find the file
passages_file = Path(kwargs["passages_file"]).resolve()
command.extend(["--passages-file", str(passages_file)])
if embedding_mode != "sentence-transformers":
command.extend(["--embedding-mode", embedding_mode])
@@ -346,3 +377,45 @@ class EmbeddingServerManager:
pass
self.server_process = None
def _launch_server_process_colab(self, command: list, port: int) -> None:
"""Launch the server process with Colab-specific settings."""
logger.info(f"Colab Command: {' '.join(command)}")
# In Colab, we need to be more careful about process management
self.server_process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
self.server_port = port
logger.info(f"Colab server process started with PID: {self.server_process.pid}")
# Register atexit callback
if not self._atexit_registered:
atexit.register(lambda: self.stop_server() if self.server_process else None)
self._atexit_registered = True
def _wait_for_server_ready_colab(self, port: int) -> tuple[bool, int]:
"""Wait for the server to be ready with Colab-specific timeout."""
max_wait, wait_interval = 30, 0.5 # Shorter timeout for Colab
for _ in range(int(max_wait / wait_interval)):
if _check_port(port):
logger.info("Colab embedding server is ready!")
return True, port
if self.server_process and self.server_process.poll() is not None:
# Check for error output
stdout, stderr = self.server_process.communicate()
logger.error(f"Colab server terminated during startup.")
logger.error(f"stdout: {stdout}")
logger.error(f"stderr: {stderr}")
return False, port
time.sleep(wait_interval)
logger.error(f"Colab server failed to start within {max_wait} seconds.")
self.stop_server()
return False, port

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "leann"
version = "0.1.8"
version = "0.1.13"
description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
readme = "README.md"
requires-python = ">=3.9"

View File

@@ -25,14 +25,21 @@ dependencies = [
"requests>=2.25.0",
"sentence-transformers>=2.2.0",
"openai>=1.0.0",
# PDF parsing dependencies - essential for document processing
"PyPDF2>=3.0.0",
"pdfplumber>=0.11.0",
"pymupdf>=1.26.0",
"pypdfium2>=4.30.0",
# LlamaIndex core and readers - updated versions
"llama-index>=0.12.44",
"llama-index-readers-file>=0.4.0", # Essential for PDF parsing
"llama-index-readers-docling",
"llama-index-node-parser-docling",
"ipykernel==6.29.5",
"msgpack>=1.1.1",
"llama-index-vector-stores-faiss>=0.4.0",
"llama-index-embeddings-huggingface>=0.5.5",
# Other dependencies
"ipykernel==6.29.5",
"msgpack>=1.1.1",
"mlx>=0.26.3; sys_platform == 'darwin'",
"mlx-lm>=0.26.0; sys_platform == 'darwin'",
"psutil>=5.8.0",
@@ -52,6 +59,14 @@ diskann = [
"leann-backend-diskann",
]
# Add a new optional dependency group for document processing
documents = [
"beautifulsoup4>=4.13.0", # For HTML parsing
"python-docx>=0.8.11", # For Word documents
"openpyxl>=3.1.0", # For Excel files
"pandas>=2.2.0", # For data processing
]
[tool.setuptools]
py-modules = []

6109
uv.lock generated
View File

File diff suppressed because it is too large Load Diff