chore: release v0.1.13

update colab install & fix colab path
Merge branch 'main' of https://github.com/yichuan-w/LEANN
2025-07-27 01:08:17 +00:00 · 2025-07-26 18:07:31 -07:00 · 2025-07-26 17:09:55 -07:00 · 2025-07-26 17:09:45 -07:00 · 2025-07-26 23:35:28 +00:00 · 2025-07-26 16:33:13 -07:00
19 changed files with 3458 additions and 3627 deletions
--- a/.github/workflows/build-cibuildwheel.yml
+++ b/.github/workflows/build-cibuildwheel.yml
@@ -1,144 +0,0 @@
-name: Build with cibuildwheel
-
-on:
-  workflow_call:
-    inputs:
-      ref:
-        description: 'Git ref to build'
-        required: false
-        type: string
-        default: ''
-
-jobs:
-  build_wheels:
-    name: Build wheels on ${{ matrix.os }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-latest, macos-latest]
-        
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.ref }}
-          submodules: recursive
-      
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'  # Version for building pure Python packages
-      
-      # Build each package separately in our monorepo
-      - name: Build pure Python packages (leann-core, leann)
-        if: matrix.os == 'ubuntu-latest'  # Only build once, they're platform-independent
-        run: |
-          # Install build tools
-          python -m pip install --upgrade pip build
-          
-          # Build pure Python packages
-          python -m build packages/leann-core --outdir wheelhouse/
-          python -m build packages/leann --outdir wheelhouse/
-      
-      - name: Build leann-backend-hnsw wheels  
-        uses: pypa/cibuildwheel@v2.16.2
-        with:
-          package-dir: packages/leann-backend-hnsw
-          output-dir: wheelhouse
-        env:
-          CIBW_BUILD: cp39-* cp310-* cp311-* cp312-*
-          CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
-          CIBW_SKIP: "*-win32 *-manylinux_i686 pp*"
-          
-          CIBW_BEFORE_ALL_LINUX: |
-            yum clean all && yum makecache
-            yum install -y epel-release || true
-            yum makecache || true
-            # Install system dependencies
-            yum install -y \
-              gcc-c++ \
-              boost-devel \
-              protobuf-compiler \
-              protobuf-devel \
-              zeromq-devel \
-              pkgconfig \
-              openblas-devel \
-              cmake || echo "Some packages failed, continuing..."
-            
-            # Verify zmq installation and create pkg-config file if needed
-            if [ ! -f /usr/lib64/pkgconfig/libzmq.pc ] && [ ! -f /usr/lib/pkgconfig/libzmq.pc ]; then
-              echo "Creating libzmq.pc file..."
-              mkdir -p /usr/lib64/pkgconfig
-              cat > /usr/lib64/pkgconfig/libzmq.pc << 'EOF'
-            prefix=/usr
-            exec_prefix=${prefix}
-            libdir=${exec_prefix}/lib64
-            includedir=${prefix}/include
-            
-            Name: libzmq
-            Description: ZeroMQ library
-            Version: 4.1.4
-            Libs: -L${libdir} -lzmq
-            Cflags: -I${includedir}
-            EOF
-            fi
-          
-          CIBW_BEFORE_ALL_MACOS: |
-            brew install llvm libomp boost protobuf zeromq
-          
-          CIBW_ENVIRONMENT_LINUX: |
-            PKG_CONFIG_PATH=/usr/lib64/pkgconfig:/usr/lib/pkgconfig:$PKG_CONFIG_PATH
-          
-          CIBW_ENVIRONMENT_MACOS: |
-            CC=$(brew --prefix llvm)/bin/clang
-            CXX=$(brew --prefix llvm)/bin/clang++
-          
-          CIBW_TEST_REQUIRES: leann-core numpy pyzmq msgpack
-          CIBW_TEST_COMMAND: python -c "import leann_backend_hnsw"
-      
-      - name: Build leann-backend-diskann wheels
-        uses: pypa/cibuildwheel@v2.16.2
-        with:
-          package-dir: packages/leann-backend-diskann
-          output-dir: wheelhouse
-        env:
-          CIBW_BUILD: cp39-* cp310-* cp311-* cp312-*
-          CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
-          CIBW_SKIP: "*-win32 *-manylinux_i686 pp*"
-          
-          CIBW_BEFORE_ALL_LINUX: |
-            yum clean all && yum makecache
-            yum install -y epel-release || true
-            yum makecache || true
-            # Install system dependencies for DiskANN
-            yum install -y \
-              gcc-c++ \
-              protobuf-compiler \
-              protobuf-devel \
-              openblas-devel \
-              pkgconfig \
-              cmake || echo "Some packages failed, continuing..."
-            yum install -y libaio-devel || echo "libaio-devel not available, continuing..."
-          
-          CIBW_BEFORE_ALL_MACOS: |
-            brew install llvm libomp protobuf
-          
-          CIBW_ENVIRONMENT_LINUX: |
-            PKG_CONFIG_PATH=/usr/lib64/pkgconfig:/usr/lib/pkgconfig:$PKG_CONFIG_PATH
-          
-          CIBW_ENVIRONMENT_MACOS: |
-            CC=$(brew --prefix llvm)/bin/clang
-            CXX=$(brew --prefix llvm)/bin/clang++
-          
-          CIBW_TEST_REQUIRES: leann-core numpy
-          CIBW_TEST_COMMAND: python -c "import leann_backend_diskann"
-      
-      - name: List built packages
-        run: |
-          echo "📦 Built packages:"
-          ls -la wheelhouse/
-      
-      - uses: actions/upload-artifact@v4
-        with:
-          name: cibw-wheels-${{ matrix.os }}
-          path: ./wheelhouse/*.whl 
--- a/.github/workflows/build-reusable.yml
+++ b/.github/workflows/build-reusable.yml
@@ -13,107 +13,46 @@ jobs:
  build:
    name: Build ${{ matrix.os }} Python ${{ matrix.python }}
    strategy:
-      fail-fast: false
      matrix:
        include:
-          - os: ubuntu-latest
+          - os: ubuntu-22.04
            python: '3.9'
-            container: 'quay.io/pypa/manylinux2014_x86_64'
-          - os: ubuntu-latest
+          - os: ubuntu-22.04
            python: '3.10'
-            container: 'quay.io/pypa/manylinux2014_x86_64'
-          - os: ubuntu-latest
+          - os: ubuntu-22.04
            python: '3.11'
-            container: 'quay.io/pypa/manylinux2014_x86_64'
-          - os: ubuntu-latest
+          - os: ubuntu-22.04
            python: '3.12'
-            container: 'quay.io/pypa/manylinux2014_x86_64'
-          - os: ubuntu-latest
+          - os: ubuntu-22.04
            python: '3.13'
-            container: 'quay.io/pypa/manylinux2014_x86_64'
          - os: macos-latest
            python: '3.9'
-            container: ''
          - os: macos-latest
            python: '3.10'
-            container: ''
          - os: macos-latest
            python: '3.11'
-            container: ''
          - os: macos-latest
            python: '3.12'
-            container: ''
          - os: macos-latest
            python: '3.13'
-            container: ''
    runs-on: ${{ matrix.os }}
-    container: ${{ matrix.container }}
    
    steps:
-      # For manylinux2014 compatibility, we'll handle checkout differently
      - uses: actions/checkout@v4
-        if: matrix.container == ''
        with:
          ref: ${{ inputs.ref }}
          submodules: recursive
      
-      # Manual checkout for containers to avoid Node.js compatibility issues
-      - name: Manual checkout in container
-        if: matrix.container != ''
-        run: |
-          # Install git if not available
-          yum install -y git || true
-          
-          # Configure git to handle the directory ownership issue
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          git config --global --add safe.directory /__w/LEANN/LEANN
-          git config --global --add safe.directory /github/workspace
-          git config --global --add safe.directory $(pwd)
-          
-          # Clone the repository manually in the container
-          git init
-          git remote add origin https://github.com/${GITHUB_REPOSITORY}.git
-          
-          # Fetch the appropriate ref
-          if [ -n "${{ inputs.ref }}" ]; then
-            git fetch --depth=1 origin ${{ inputs.ref }}
-          else
-            git fetch --depth=1 origin ${GITHUB_SHA}
-          fi
-          git checkout FETCH_HEAD
-          
-          # Initialize submodules
-          git submodule update --init --recursive
-      
-      - name: Setup Python (macOS and regular Ubuntu)
-        if: matrix.container == ''
+      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python }}
      
-      - name: Setup Python (manylinux container)
-        if: matrix.container != ''
-        run: |
-          # Use the pre-installed Python version in manylinux container
-          # Convert Python version format (3.9 -> 39, 3.10 -> 310, etc.)
-          PY_VER=$(echo "${{ matrix.python }}" | sed 's/\.//g')
-          /opt/python/cp${PY_VER}-*/bin/python -m pip install --upgrade pip
-          # Create symlinks for convenience
-          ln -sf /opt/python/cp${PY_VER}-*/bin/python /usr/local/bin/python
-          ln -sf /opt/python/cp${PY_VER}-*/bin/pip /usr/local/bin/pip
-      
-      - name: Install uv (macOS and regular Ubuntu)
-        if: matrix.container == ''
+      - name: Install uv
        uses: astral-sh/setup-uv@v4
      
-      - name: Install uv (manylinux container)
-        if: matrix.container != ''
-        run: |
-          curl -LsSf https://astral.sh/uv/install.sh | sh
-          echo "$HOME/.cargo/bin" >> $GITHUB_PATH
-      
-      - name: Install system dependencies (Ubuntu - regular)
-        if: runner.os == 'Linux' && matrix.container == ''
+      - name: Install system dependencies (Ubuntu)
+        if: runner.os == 'Linux'
        run: |
          sudo apt-get update
          sudo apt-get install -y libomp-dev libboost-all-dev protobuf-compiler libzmq3-dev \
@@ -126,64 +65,6 @@ jobs:
          echo "MKLROOT=/opt/intel/oneapi/mkl/latest" >> $GITHUB_ENV
          echo "LD_LIBRARY_PATH=/opt/intel/oneapi/mkl/latest/lib/intel64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
      
-      - name: Install system dependencies (manylinux container)
-        if: runner.os == 'Linux' && matrix.container != ''
-        run: |
-          # manylinux2014 uses yum instead of apt
-          # Update yum cache first
-          yum clean all
-          yum makecache
-          
-          # Install EPEL repository
-          yum install -y epel-release || true
-          
-          # Update cache again after EPEL
-          yum makecache || true
-          
-          # Install development packages
-          # Note: Some packages might have different names in CentOS 7
-          yum install -y \
-            gcc-c++ \
-            boost-devel \
-            protobuf-compiler \
-            protobuf-devel \
-            zeromq-devel \
-            pkgconfig \
-            openblas-devel \
-            cmake || {
-              echo "Some packages failed to install, trying alternatives..."
-              # Try alternative package names
-              yum install -y libzmq3-devel || true
-              yum install -y libzmq-devel || true
-          }
-          
-          # Install optional packages that might not be available
-          yum install -y libaio-devel || echo "libaio-devel not available, continuing..."
-          
-          # Verify zmq installation and create pkg-config file if needed
-          if [ ! -f /usr/lib64/pkgconfig/libzmq.pc ] && [ ! -f /usr/lib/pkgconfig/libzmq.pc ]; then
-            echo "Creating libzmq.pc file..."
-            mkdir -p /usr/lib64/pkgconfig
-            cat > /usr/lib64/pkgconfig/libzmq.pc << 'EOF'
-          prefix=/usr
-          exec_prefix=${prefix}
-          libdir=${exec_prefix}/lib64
-          includedir=${prefix}/include
-          
-          Name: libzmq
-          Description: ZeroMQ library
-          Version: 4.1.4
-          Libs: -L${libdir} -lzmq
-          Cflags: -I${includedir}
-          EOF
-          fi
-          
-          # Update PKG_CONFIG_PATH
-          echo "PKG_CONFIG_PATH=/usr/lib64/pkgconfig:/usr/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
-          
-          # Build tools are pre-installed in manylinux
-          # MKL is more complex in container, skip for now and use OpenBLAS
-      
      - name: Install system dependencies (macOS)
        if: runner.os == 'macOS'
        run: |
@@ -191,65 +72,44 @@ jobs:
      
      - name: Install build dependencies
        run: |
-          if [[ -n "${{ matrix.container }}" ]]; then
-            # In manylinux container, use regular pip
-            pip install scikit-build-core numpy swig Cython pybind11 auditwheel
+          uv pip install --system scikit-build-core numpy swig Cython pybind11
+          if [[ "$RUNNER_OS" == "Linux" ]]; then
+            uv pip install --system auditwheel
          else
-            # Regular environment, use uv
-            uv pip install --system scikit-build-core numpy swig Cython pybind11
-            if [[ "$RUNNER_OS" == "Linux" ]]; then
-              uv pip install --system auditwheel
-            else
-              uv pip install --system delocate
-            fi
+            uv pip install --system delocate
          fi
      
      - name: Build packages
        run: |
-          # Choose build command based on environment
-          if [[ -n "${{ matrix.container }}" ]]; then
-            BUILD_CMD="pip wheel . --no-deps -w dist"
-          else
-            BUILD_CMD="uv build --wheel --python python"
-          fi
-          
          # Build core (platform independent)
-          if [ "${{ matrix.os }}" == "ubuntu-latest" ]; then
+          if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
            cd packages/leann-core
-            if [[ -n "${{ matrix.container }}" ]]; then
-              pip wheel . --no-deps -w dist
-            else
-              uv build
-            fi
+            uv build
            cd ../..
          fi
          
          # Build HNSW backend
          cd packages/leann-backend-hnsw
          if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ $BUILD_CMD
+            CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
          else
-            eval $BUILD_CMD
+            uv build --wheel --python python
          fi
          cd ../..
          
          # Build DiskANN backend
          cd packages/leann-backend-diskann
          if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ $BUILD_CMD
+            CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ uv build --wheel --python python
          else
-            eval $BUILD_CMD
+            uv build --wheel --python python
          fi
          cd ../..
          
          # Build meta package (platform independent)
-          if [ "${{ matrix.os }}" == "ubuntu-latest" ]; then
+          if [[ "${{ matrix.os }}" == ubuntu-* ]]; then
            cd packages/leann
-            if [[ -n "${{ matrix.container }}" ]]; then
-              pip wheel . --no-deps -w dist
-            else
-              uv build
-            fi
+            uv build
            cd ../..
          fi
      
@@ -259,9 +119,6 @@ jobs:
          # Repair HNSW wheel
          cd packages/leann-backend-hnsw
          if [ -d dist ]; then
-            # Show what platform auditwheel will use
-            auditwheel show dist/*.whl || true
-            # Let auditwheel auto-detect the appropriate manylinux tag
            auditwheel repair dist/*.whl -w dist_repaired
            rm -rf dist
            mv dist_repaired dist
@@ -271,9 +128,6 @@ jobs:
          # Repair DiskANN wheel
          cd packages/leann-backend-diskann
          if [ -d dist ]; then
-            # Show what platform auditwheel will use
-            auditwheel show dist/*.whl || true
-            # Let auditwheel auto-detect the appropriate manylinux tag
            auditwheel repair dist/*.whl -w dist_repaired
            rm -rf dist
            mv dist_repaired dist
@@ -309,5 +163,5 @@ jobs:
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          name: packages-${{ matrix.os }}-py${{ matrix.python }}${{ matrix.container && '-manylinux' || '' }}
+          name: packages-${{ matrix.os }}-py${{ matrix.python }}
          path: packages/*/dist/ 
--- a/.github/workflows/ci-cibuildwheel.yml
+++ b/.github/workflows/ci-cibuildwheel.yml
@@ -1,12 +0,0 @@
-name: CI - cibuildwheel (Test)
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-  workflow_dispatch:  # Allow manual triggering
-
-jobs:
-  build:
-    uses: ./.github/workflows/build-cibuildwheel.yml 
--- a/.github/workflows/release-manual.yml
+++ b/.github/workflows/release-manual.yml
@@ -7,11 +7,6 @@ on:
        description: 'Version to release (e.g., 0.1.2)'
        required: true
        type: string
-      use_cibuildwheel:
-        description: 'Use cibuildwheel for better compatibility (recommended for Colab)'
-        required: false
-        type: boolean
-        default: false

 jobs:
  update-version:
@@ -27,46 +22,50 @@ jobs:
      
      - name: Validate version
        run: |
-          if ! [[ "${{ inputs.version }}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
-            echo "❌ Invalid version format"
+          # Remove 'v' prefix if present for validation
+          VERSION_CLEAN="${{ inputs.version }}"
+          VERSION_CLEAN="${VERSION_CLEAN#v}"
+          if ! [[ "$VERSION_CLEAN" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+            echo "❌ Invalid version format. Expected format: X.Y.Z or vX.Y.Z"
            exit 1
          fi
-          echo "✅ Version format valid"
+          echo "✅ Version format valid: ${{ inputs.version }}"
      
      - name: Update versions and push
        id: push
        run: |
-          ./scripts/bump_version.sh ${{ inputs.version }}
-          git config user.name "GitHub Actions"
-          git config user.email "actions@github.com"
-          git add packages/*/pyproject.toml
-          git commit -m "chore: release v${{ inputs.version }}"
-          git push origin main
+          # Check current version
+          CURRENT_VERSION=$(grep "^version" packages/leann-core/pyproject.toml | cut -d'"' -f2)
+          echo "Current version: $CURRENT_VERSION"
+          echo "Target version: ${{ inputs.version }}"
+          
+          if [ "$CURRENT_VERSION" = "${{ inputs.version }}" ]; then
+            echo "⚠️  Version is already ${{ inputs.version }}, skipping update"
+            COMMIT_SHA=$(git rev-parse HEAD)
+          else
+            ./scripts/bump_version.sh ${{ inputs.version }}
+            git config user.name "GitHub Actions"
+            git config user.email "actions@github.com"
+            git add packages/*/pyproject.toml
+            git commit -m "chore: release v${{ inputs.version }}"
+            git push origin main
+            COMMIT_SHA=$(git rev-parse HEAD)
+            echo "✅ Pushed version update: $COMMIT_SHA"
+          fi
          
-          COMMIT_SHA=$(git rev-parse HEAD)
          echo "commit-sha=$COMMIT_SHA" >> $GITHUB_OUTPUT
-          echo "✅ Pushed version update: $COMMIT_SHA"

-  build-packages-reusable:
-    name: Build packages (Standard)
+  build-packages:
+    name: Build packages
    needs: update-version
-    if: ${{ !inputs.use_cibuildwheel }}
    uses: ./.github/workflows/build-reusable.yml
    with:
-      ref: ${{ needs.update-version.outputs.commit-sha }}
-  
-  build-packages-cibuildwheel:
-    name: Build packages (cibuildwheel)
-    needs: update-version
-    if: ${{ inputs.use_cibuildwheel }}
-    uses: ./.github/workflows/build-cibuildwheel.yml
-    with:
-      ref: ${{ needs.update-version.outputs.commit-sha }}
+      ref: 'main' 

  publish:
    name: Publish and Release
-    needs: [update-version, build-packages-reusable, build-packages-cibuildwheel]
-    if: always() && needs.update-version.result == 'success' && (needs.build-packages-reusable.result == 'success' || needs.build-packages-cibuildwheel.result == 'success')
+    needs: [update-version, build-packages]
+    if: always() && needs.update-version.result == 'success' && needs.build-packages.result == 'success'
    runs-on: ubuntu-latest
    permissions:
      contents: write
@@ -74,7 +73,7 @@ jobs:
    steps:
      - uses: actions/checkout@v4
        with:
-          ref: ${{ needs.update-version.outputs.commit-sha }}
+          ref: 'main' 
      
      - name: Download all artifacts
        uses: actions/download-artifact@v4
@@ -107,12 +106,24 @@ jobs:
      
      - name: Create release
        run: |
-          git tag "v${{ inputs.version }}"
-          git push origin "v${{ inputs.version }}"
+          # Check the remote for the tag; the default checkout fetches no tags, so a local lookup would always miss
+          if git ls-remote --exit-code --tags origin "refs/tags/v${{ inputs.version }}" >/dev/null 2>&1; then
+            echo "⚠️  Tag v${{ inputs.version }} already exists, skipping tag creation"
+          else
+            git tag "v${{ inputs.version }}"
+            git push origin "v${{ inputs.version }}"
+            echo "✅ Created and pushed tag v${{ inputs.version }}"
+          fi
          
-          gh release create "v${{ inputs.version }}" \
-            --title "Release v${{ inputs.version }}" \
-            --notes "🚀 Released to PyPI: https://pypi.org/project/leann/${{ inputs.version }}/" \
-            --latest
+          # Check if release already exists
+          if gh release view "v${{ inputs.version }}" >/dev/null 2>&1; then
+            echo "⚠️  Release v${{ inputs.version }} already exists, skipping release creation"
+          else
+            gh release create "v${{ inputs.version }}" \
+              --title "Release v${{ inputs.version }}" \
+              --notes "🚀 Released to PyPI: https://pypi.org/project/leann/${{ inputs.version }}/" \
+              --latest
+            echo "✅ Created GitHub release v${{ inputs.version }}"
+          fi
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/README.md
+++ b/README.md
@@ -195,7 +195,7 @@ Once the index is built, you can ask questions like:
 - "Show me emails about travel expenses"
 </details>

-### 🔍 Time Machine for the Web: RAG Your Entire Google Browser History!
+### 🔍 Time Machine for the Web: RAG Your Entire Chrome Browser History!

 <p align="center">
  <img src="videos/google_clear.gif" alt="LEANN Browser History Search Demo" width="600">
--- a/demo.ipynb
+++ b/demo.ipynb
@@ -13,8 +13,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# install this if you areusing colab\n",
-    "! pip install leann"
+    "# install this if you are using colab\n",
+    "! uv pip install leann-core leann-backend-hnsw --no-deps\n",
+    "! uv pip install leann --no-deps\n",
+    "# For Colab environment, we need to set some environment variables\n",
+    "import os\n",
+    "os.environ['LEANN_LOG_LEVEL'] = 'INFO'  # Enable more detailed logging"
   ]
  },
  {
@@ -26,81 +30,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO: Registering backend 'hnsw'\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/yichuan/Desktop/code/LEANN/leann/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n",
-      "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: facebook/contriever\n",
-      "WARNING:sentence_transformers.SentenceTransformer:No sentence-transformers model found with name facebook/contriever. Creating a new one with mean pooling.\n",
-      "Writing passages: 100%|██████████| 5/5 [00:00<00:00, 27887.66chunk/s]\n",
-      "Batches: 100%|██████████| 1/1 [00:00<00:00, 13.51it/s]\n",
-      "WARNING:leann_backend_hnsw.hnsw_backend:Converting data to float32, shape: (5, 768)\n",
-      "INFO:leann_backend_hnsw.hnsw_backend:INFO: Converting HNSW index to CSR-pruned format...\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "M: 64 for level: 0\n",
-      "Starting conversion: knowledge.index -> knowledge.csr.tmp\n",
-      "[0.00s] Reading Index HNSW header...\n",
-      "[0.00s]   Header read: d=768, ntotal=5\n",
-      "[0.00s] Reading HNSW struct vectors...\n",
-      "  Reading vector (dtype=<class 'numpy.float64'>, fmt='d')... Count=6, Bytes=48\n",
-      "[0.00s]   Read assign_probas (6)\n",
-      "  Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=7, Bytes=28\n",
-      "[0.11s]   Read cum_nneighbor_per_level (7)\n",
-      "  Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=5, Bytes=20\n",
-      "[0.21s]   Read levels (5)\n",
-      "[0.30s]   Probing for compact storage flag...\n",
-      "[0.30s]   Found compact flag: False\n",
-      "[0.30s]   Compact flag is False, reading original format...\n",
-      "[0.30s]   Probing for potential extra byte before non-compact offsets...\n",
-      "[0.30s]   Found and consumed an unexpected 0x00 byte.\n",
-      "  Reading vector (dtype=<class 'numpy.uint64'>, fmt='Q')... Count=6, Bytes=48\n",
-      "[0.30s]   Read offsets (6)\n",
-      "[0.40s]   Attempting to read neighbors vector...\n",
-      "  Reading vector (dtype=<class 'numpy.int32'>, fmt='i')... Count=320, Bytes=1280\n",
-      "[0.40s]   Read neighbors (320)\n",
-      "[0.50s]   Read scalar params (ep=4, max_lvl=0)\n",
-      "[0.50s] Checking for storage data...\n",
-      "[0.50s]   Found storage fourcc: 49467849.\n",
-      "[0.50s] Converting to CSR format...\n",
-      "[0.50s]   Conversion loop finished.                        \n",
-      "[0.50s] Running validation checks...\n",
-      "    Checking total valid neighbor count...\n",
-      "    OK: Total valid neighbors = 20\n",
-      "    Checking final pointer indices...\n",
-      "    OK: Final pointers match data size.\n",
-      "[0.50s] Deleting original neighbors and offsets arrays...\n",
-      "    CSR Stats: |data|=20, |level_ptr|=10\n",
-      "[0.59s] Writing CSR HNSW graph data in FAISS-compatible order...\n",
-      "   Pruning embeddings: Writing NULL storage marker.\n",
-      "[0.69s] Conversion complete.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:leann_backend_hnsw.hnsw_backend:✅ CSR conversion successful.\n",
-      "INFO:leann_backend_hnsw.hnsw_backend:INFO: Replaced original index with CSR-pruned version at 'knowledge.index'\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from leann.api import LeannBuilder\n",
    "\n",
@@ -122,93 +54,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:leann.api:🔍 LeannSearcher.search() called:\n",
-      "INFO:leann.api:  Query: 'programming languages'\n",
-      "INFO:leann.api:  Top_k: 2\n",
-      "INFO:leann.api:  Additional kwargs: {}\n",
-      "INFO:leann.embedding_server_manager:Port 5557 has incompatible server, trying next port...\n",
-      "INFO:leann.embedding_server_manager:Port 5558 has incompatible server, trying next port...\n",
-      "INFO:leann.embedding_server_manager:Port 5559 has incompatible server, trying next port...\n",
-      "INFO:leann.embedding_server_manager:Using port 5560 instead of 5557\n",
-      "INFO:leann.embedding_server_manager:Starting embedding server on port 5560...\n",
-      "INFO:leann.embedding_server_manager:Command: /Users/yichuan/Desktop/code/LEANN/leann/.venv/bin/python -m leann_backend_hnsw.hnsw_embedding_server --zmq-port 5560 --model-name facebook/contriever --passages-file knowledge.leann.meta.json\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "INFO:leann.embedding_server_manager:Server process started with PID: 4574\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[read_HNSW - CSR NL v4] Reading metadata & CSR indices (manual offset)...\n",
-      "[read_HNSW NL v4] Read levels vector, size: 5\n",
-      "[read_HNSW NL v4] Reading Compact Storage format indices...\n",
-      "[read_HNSW NL v4] Read compact_level_ptr, size: 10\n",
-      "[read_HNSW NL v4] Read compact_node_offsets, size: 6\n",
-      "[read_HNSW NL v4] Read entry_point: 4, max_level: 0\n",
-      "[read_HNSW NL v4] Read storage fourcc: 0x6c6c756e\n",
-      "[read_HNSW NL v4 FIX] Detected FileIOReader. Neighbors size field offset: 326\n",
-      "[read_HNSW NL v4] Reading neighbors data into memory.\n",
-      "[read_HNSW NL v4] Read neighbors data, size: 20\n",
-      "[read_HNSW NL v4] Finished reading metadata and CSR indices.\n",
-      "INFO: Skipping external storage loading, since is_recompute is true.\n",
-      "INFO: Registering backend 'hnsw'\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:leann.embedding_server_manager:Embedding server is ready!\n",
-      "INFO:leann.api:  Launching server time: 1.078078269958496 seconds\n",
-      "INFO:leann.embedding_server_manager:Existing server process (PID 4574) is compatible\n",
-      "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: facebook/contriever\n",
-      "WARNING:sentence_transformers.SentenceTransformer:No sentence-transformers model found with name facebook/contriever. Creating a new one with mean pooling.\n",
-      "INFO:leann.api:  Generated embedding shape: (1, 768)\n",
-      "INFO:leann.api:  Embedding time: 2.9307072162628174 seconds\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "ZmqDistanceComputer initialized: d=768, metric=0\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:leann.api:  Search time: 0.27327895164489746 seconds\n",
-      "INFO:leann.api:  Backend returned: labels=2 results\n",
-      "INFO:leann.api:  Processing 2 passage IDs:\n",
-      "INFO:leann.api:    1. passage_id='0' -> SUCCESS: C# is a powerful programming language and it is good at game development...\n",
-      "INFO:leann.api:    2. passage_id='1' -> SUCCESS: Python is a powerful programming language and it is good at machine learning tasks...\n",
-      "INFO:leann.api:  Final enriched results: 2 passages\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "[SearchResult(id='0', score=np.float32(0.9874103), text='C# is a powerful programming language and it is good at game development', metadata={}),\n",
-       " SearchResult(id='1', score=np.float32(0.8922168), text='Python is a powerful programming language and it is good at machine learning tasks', metadata={})]"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from leann.api import LeannSearcher\n",
    "\n",
@@ -228,79 +76,7 @@
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:leann.chat:Attempting to create LLM of type='hf' with model='Qwen/Qwen3-0.6B'\n",
-      "INFO:leann.chat:Initializing HFChat with model='Qwen/Qwen3-0.6B'\n",
-      "INFO:leann.chat:MPS is available. Using Apple Silicon GPU.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[read_HNSW - CSR NL v4] Reading metadata & CSR indices (manual offset)...\n",
-      "[read_HNSW NL v4] Read levels vector, size: 5\n",
-      "[read_HNSW NL v4] Reading Compact Storage format indices...\n",
-      "[read_HNSW NL v4] Read compact_level_ptr, size: 10\n",
-      "[read_HNSW NL v4] Read compact_node_offsets, size: 6\n",
-      "[read_HNSW NL v4] Read entry_point: 4, max_level: 0\n",
-      "[read_HNSW NL v4] Read storage fourcc: 0x6c6c756e\n",
-      "[read_HNSW NL v4 FIX] Detected FileIOReader. Neighbors size field offset: 326\n",
-      "[read_HNSW NL v4] Reading neighbors data into memory.\n",
-      "[read_HNSW NL v4] Read neighbors data, size: 20\n",
-      "[read_HNSW NL v4] Finished reading metadata and CSR indices.\n",
-      "INFO: Skipping external storage loading, since is_recompute is true.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:leann.api:🔍 LeannSearcher.search() called:\n",
-      "INFO:leann.api:  Query: 'Compare the two retrieved programming languages and tell me their advantages.'\n",
-      "INFO:leann.api:  Top_k: 2\n",
-      "INFO:leann.api:  Additional kwargs: {}\n",
-      "INFO:leann.embedding_server_manager:Port 5557 has incompatible server, trying next port...\n",
-      "INFO:leann.embedding_server_manager:Port 5558 has incompatible server, trying next port...\n",
-      "INFO:leann.embedding_server_manager:Port 5559 has incompatible server, trying next port...\n",
-      "INFO:leann.embedding_server_manager:Found compatible server on port 5560\n",
-      "INFO:leann.embedding_server_manager:Using existing compatible server on port 5560\n",
-      "INFO:leann.api:  Launching server time: 0.04932403564453125 seconds\n",
-      "INFO:leann.embedding_server_manager:Found compatible server on port 5560\n",
-      "INFO:leann.embedding_server_manager:Using existing compatible server on port 5560\n",
-      "INFO:leann.api:  Generated embedding shape: (1, 768)\n",
-      "INFO:leann.api:  Embedding time: 0.06902289390563965 seconds\n",
-      "INFO:leann.api:  Search time: 0.026793241500854492 seconds\n",
-      "INFO:leann.api:  Backend returned: labels=2 results\n",
-      "INFO:leann.api:  Processing 2 passage IDs:\n",
-      "INFO:leann.api:    1. passage_id='0' -> SUCCESS: C# is a powerful programming language and it is good at game development...\n",
-      "INFO:leann.api:    2. passage_id='1' -> SUCCESS: Python is a powerful programming language and it is good at machine learning tasks...\n",
-      "INFO:leann.api:  Final enriched results: 2 passages\n",
-      "INFO:leann.chat:Generating with HuggingFace model, config: {'max_new_tokens': 128, 'temperature': 0.7, 'top_p': 0.9, 'do_sample': True, 'pad_token_id': 151645, 'eos_token_id': 151645}\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "ZmqDistanceComputer initialized: d=768, metric=0\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "\"<think>\\n\\n</think>\\n\\nBased on the context provided, here's a comparison of the two retrieved programming languages:\\n\\n**C#** is known for being a powerful programming language and is well-suited for game development. It is often used in game development and is popular among developers working on Windows applications.\\n\\n**Python**, on the other hand, is also a powerful language and is well-suited for machine learning tasks. It is widely used for data analysis, scientific computing, and other applications that require handling large datasets or performing complex calculations.\\n\\n**Advantages**:\\n- C#: Strong for game development and cross-platform compatibility.\\n- Python: Strong for\""
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from leann.api import LeannChat\n",
    "\n",
--- a/examples/google_history_reader_leann.py
+++ b/examples/google_history_reader_leann.py
@@ -97,11 +97,13 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i
        # Use HNSW backend for better macOS compatibility
        builder = LeannBuilder(
            backend_name="hnsw",
-            embedding_model="facebook/contriever",
+            embedding_model="text-embedding-3-small",
+            embedding_mode="openai",
+            
            graph_degree=32, 
            complexity=64,
-            is_compact=True,
-            is_recompute=True,
+            is_compact=False,
+            is_recompute=False,
            num_threads=1  # Force single-threaded mode
        )

--- a/packages/leann-backend-diskann/pyproject.toml
+++ b/packages/leann-backend-diskann/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-diskann"
-version = "0.1.8"
-dependencies = ["leann-core==0.1.8", "numpy"]
+version = "0.1.13"
+dependencies = ["leann-core==0.1.13", "numpy", "protobuf>=3.19.0"]

 [tool.scikit-build]
 # Key: simplified CMake path
--- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py
+++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_backend.py
@@ -48,6 +48,10 @@ class HNSWBuilder(LeannBackendBuilderInterface):
        self.efConstruction = self.build_params.setdefault("efConstruction", 200)
        self.distance_metric = self.build_params.setdefault("distance_metric", "mips")
        self.dimensions = self.build_params.get("dimensions")
+        if not self.is_recompute:
+            if self.is_compact:
+                # TODO: support this case @andy
+                raise ValueError("is_recompute is False, but is_compact is True. This is not compatible now. change is compact to False and you can use the original HNSW index.")

    def build(self, data: np.ndarray, ids: List[str], index_path: str, **kwargs):
        from . import faiss  # type: ignore
--- a/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py
+++ b/packages/leann-backend-hnsw/leann_backend_hnsw/hnsw_embedding_server.py
@@ -81,7 +81,21 @@ def create_hnsw_embedding_server(
    with open(passages_file, "r") as f:
        meta = json.load(f)

-    passages = PassageManager(meta["passage_sources"])
+    # Convert relative paths to absolute paths based on metadata file location
+    metadata_dir = Path(
+        passages_file
+    ).parent.parent  # Go up one level from the metadata file
+    passage_sources = []
+    for source in meta["passage_sources"]:
+        source_copy = source.copy()
+        # Convert relative paths to absolute paths
+        if not Path(source_copy["path"]).is_absolute():
+            source_copy["path"] = str(metadata_dir / source_copy["path"])
+        if not Path(source_copy["index_path"]).is_absolute():
+            source_copy["index_path"] = str(metadata_dir / source_copy["index_path"])
+        passage_sources.append(source_copy)
+
+    passages = PassageManager(passage_sources)
    logger.info(
        f"Loaded PassageManager with {len(passages.global_offset_map)} passages from metadata"
    )
@@ -270,15 +284,15 @@ def create_hnsw_embedding_server(
 if __name__ == "__main__":
    import signal
    import sys
-    
+
    def signal_handler(sig, frame):
        logger.info(f"Received signal {sig}, shutting down gracefully...")
        sys.exit(0)
-    
+
    # Register signal handlers for graceful shutdown
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
-    
+
    parser = argparse.ArgumentParser(description="HNSW Embedding service")
    parser.add_argument("--zmq-port", type=int, default=5555, help="ZMQ port to run on")
    parser.add_argument(
--- a/packages/leann-backend-hnsw/pyproject.toml
+++ b/packages/leann-backend-hnsw/pyproject.toml
@@ -6,10 +6,10 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "leann-backend-hnsw"
-version = "0.1.8"
+version = "0.1.13"
 description = "Custom-built HNSW (Faiss) backend for the Leann toolkit."
 dependencies = [
-    "leann-core==0.1.8", 
+    "leann-core==0.1.13", 
    "numpy",
    "pyzmq>=23.0.0",
    "msgpack>=1.0.0",
--- a/packages/leann-core/pyproject.toml
+++ b/packages/leann-core/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "leann-core"
-version = "0.1.8"
+version = "0.1.13"
 description = "Core API and plugin system for LEANN"
 readme = "README.md"
 requires-python = ">=3.9"
@@ -21,6 +21,23 @@ dependencies = [
    "sentence-transformers>=2.2.0",
    "llama-index-core>=0.12.0",
    "python-dotenv>=1.0.0",
+    "openai>=1.0.0",
+    "huggingface-hub>=0.20.0",
+    "transformers>=4.30.0",
+    "requests>=2.25.0",
+    "accelerate>=0.20.0",
+    "PyPDF2>=3.0.0",
+    "pymupdf>=1.23.0",
+    "pdfplumber>=0.10.0",
+    "mlx>=0.26.3; sys_platform == 'darwin'",
+    "mlx-lm>=0.26.0; sys_platform == 'darwin'",
+]
+
+[project.optional-dependencies]
+colab = [
+    "torch>=2.0.0,<3.0.0",  # Constrain the torch version to avoid conflicts
+    "transformers>=4.30.0,<5.0.0",  # Constrain the transformers version
+    "accelerate>=0.20.0,<1.0.0",  # Constrain the accelerate version
 ]

 [project.scripts]
--- a/packages/leann-core/src/leann/api.py
+++ b/packages/leann-core/src/leann/api.py
@@ -117,8 +117,15 @@ class PassageManager:
            assert source["type"] == "jsonl", "only jsonl is supported"
            passage_file = source["path"]
            index_file = source["index_path"]  # .idx file
+            
+            # Fix path resolution for Colab and other environments
+            if not Path(index_file).is_absolute():
+                # If relative path, try to resolve it properly
+                index_file = str(Path(index_file).resolve())
+            
            if not Path(index_file).exists():
                raise FileNotFoundError(f"Passage index file not found: {index_file}")
+            
            with open(index_file, "rb") as f:
                offset_map = pickle.load(f)
                self.offset_maps[passage_file] = offset_map
@@ -381,6 +388,10 @@ class LeannBuilder:

 class LeannSearcher:
    def __init__(self, index_path: str, enable_warmup: bool = False, **backend_kwargs):
+        # Fix path resolution for Colab and other environments
+        if not Path(index_path).is_absolute():
+            index_path = str(Path(index_path).resolve())
+        
        self.meta_path_str = f"{index_path}.meta.json"
        if not Path(self.meta_path_str).exists():
            raise FileNotFoundError(
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -7,6 +7,33 @@ from llama_index.core.node_parser import SentenceSplitter

 from .api import LeannBuilder, LeannSearcher, LeannChat

+def extract_pdf_text_with_pymupdf(file_path: str) -> "str | None":
+    """Extract text from a PDF with PyMuPDF; return None if PyMuPDF is not installed."""
+    try:
+        import fitz  # PyMuPDF
+    except ImportError:
+        return None
+    doc = fitz.open(file_path)
+    try:
+        # Join the per-page text once instead of growing a string in a loop
+        return "".join(page.get_text() for page in doc)
+    finally:
+        # Close the document even when text extraction raises
+        doc.close()
+
+def extract_pdf_text_with_pdfplumber(file_path: str) -> "str | None":
+    """Extract text from a PDF with pdfplumber; return None if pdfplumber is not installed."""
+    try:
+        import pdfplumber
+    except ImportError:
+        # Let the caller fall back to the default reader
+        return None
+    with pdfplumber.open(file_path) as pdf:
+        # A page with no extractable text yields a falsy value; count it as ""
+        page_texts = [page.extract_text() or "" for page in pdf.pages]
+        # Join once instead of growing a string page by page
+        return "".join(page_texts)
+

 class LeannCLI:
    def __init__(self):
@@ -145,12 +172,42 @@ Examples:
    def load_documents(self, docs_dir: str):
        print(f"Loading documents from {docs_dir}...")

-        documents = SimpleDirectoryReader(
+        # Try to use better PDF parsers first
+        documents = []
+        docs_path = Path(docs_dir)
+        
+        for file_path in docs_path.rglob("*.pdf"):
+            print(f"Processing PDF: {file_path}")
+            
+            # Try PyMuPDF first (best quality)
+            text = extract_pdf_text_with_pymupdf(str(file_path))
+            if text is None:
+                # Try pdfplumber
+                text = extract_pdf_text_with_pdfplumber(str(file_path))
+            
+            if text:
+                # Create a simple document structure
+                from llama_index.core import Document
+                doc = Document(text=text, metadata={"source": str(file_path)})
+                documents.append(doc)
+            else:
+                # Fallback to default reader
+                print(f"Using default reader for {file_path}")
+                default_docs = SimpleDirectoryReader(
+                    str(file_path.parent),
+                    filename_as_id=True,
+                    required_exts=[file_path.suffix],
+                ).load_data()
+                documents.extend(default_docs)
+
+        # Load other file types with default reader
+        other_docs = SimpleDirectoryReader(
            docs_dir,
            recursive=True,
            encoding="utf-8",
-            required_exts=[".pdf", ".txt", ".md", ".docx"],
+            required_exts=[".txt", ".md", ".docx"],
        ).load_data(show_progress=True)
+        documents.extend(other_docs)

        all_texts = []
        for doc in documents:
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -264,9 +264,10 @@ def compute_embeddings_openai(texts: List[str], model_name: str) -> np.ndarray:
    logger.info(
        f"Computing embeddings for {len(texts)} texts using OpenAI API, model: '{model_name}'"
    )
+    print(f"len of texts: {len(texts)}")

    # OpenAI has limits on batch size and input length
-    max_batch_size = 100  # Conservative batch size
+    max_batch_size = 1000  # Conservative batch size
    all_embeddings = []

    try:
@@ -296,6 +297,7 @@ def compute_embeddings_openai(texts: List[str], model_name: str) -> np.ndarray:
    logger.info(
        f"Generated {len(embeddings)} embeddings, dimension: {embeddings.shape[1]}"
    )
+    print(f"len of embeddings: {len(embeddings)}")
    return embeddings


--- a/packages/leann-core/src/leann/embedding_server_manager.py
+++ b/packages/leann-core/src/leann/embedding_server_manager.py
@@ -18,6 +18,24 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


+def _is_colab_environment() -> bool:
+    """Return True when COLAB_GPU or COLAB_TPU is present in the process environment."""
+    return any(marker in os.environ for marker in ("COLAB_GPU", "COLAB_TPU"))
+
+
+def _get_available_port(start_port: int = 5557) -> int:
+    """Return the first of the 100 ports from start_port that binds on localhost, else raise RuntimeError."""
+    for port in range(start_port, start_port + 100):
+        try:
+            # The probe socket closes on return, so the port is only known free at check time
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                s.bind(("localhost", port))
+                return port
+        except OSError:
+            continue
+    raise RuntimeError(f"No available ports found in range {start_port}-{start_port + 99}")
+
+
 def _check_port(port: int) -> bool:
    """Check if a port is in use"""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -175,48 +193,59 @@ class EmbeddingServerManager:
        embedding_mode: str = "sentence-transformers",
        **kwargs,
    ) -> tuple[bool, int]:
-        """
-        Starts the embedding server process.
-
-        Args:
-            port (int): The preferred ZMQ port for the server.
-            model_name (str): The name of the embedding model to use.
-            **kwargs: Additional arguments for the server.
-
-        Returns:
-            tuple[bool, int]: (success, actual_port_used)
-        """
+        """Start the embedding server."""
        passages_file = kwargs.get("passages_file")
-        assert isinstance(passages_file, str), "passages_file must be a string"

-        # Check if we have a compatible running server
+        # Check if we have a compatible server already running
        if self._has_compatible_running_server(model_name, passages_file):
-            assert self.server_port is not None, (
-                "a compatible running server should set server_port"
-            )
-            return True, self.server_port
+            logger.info("Found compatible running server!")
+            return True, port

-        # Find available port (compatible or free)
-        try:
-            actual_port, is_compatible = _find_compatible_port_or_next_available(
-                port, model_name, passages_file
-            )
-        except RuntimeError as e:
-            logger.error(str(e))
-            return False, port
+        # For Colab environment, use a different strategy
+        if _is_colab_environment():
+            logger.info("Detected Colab environment, using alternative startup strategy")
+            return self._start_server_colab(port, model_name, embedding_mode, **kwargs)
+
+        # Find a compatible port or next available
+        actual_port, is_compatible = _find_compatible_port_or_next_available(
+            port, model_name, passages_file
+        )

        if is_compatible:
-            logger.info(f"Using existing compatible server on port {actual_port}")
-            self.server_port = actual_port
-            self.server_process = None  # We don't own this process
+            logger.info(f"Found compatible server on port {actual_port}")
            return True, actual_port

-        if actual_port != port:
-            logger.info(f"Using port {actual_port} instead of {port}")
-
-        # Start new server
+        # Start a new server
        return self._start_new_server(actual_port, model_name, embedding_mode, **kwargs)

+    def _start_server_colab(
+        self,
+        port: int,
+        model_name: str,
+        embedding_mode: str = "sentence-transformers",
+        **kwargs,
+    ) -> tuple[bool, int]:
+        """Colab startup path: pick a bindable port, launch the server, wait for it; returns (success, port)."""
+        # Probe upward from the requested port; on failure the requested port is reported back
+        try:
+            actual_port = _get_available_port(port)
+        except RuntimeError:
+            logger.error("No available ports found")
+            return False, port
+
+        logger.info(f"Starting server on port {actual_port} for Colab environment")
+        
+        # Build the launch command for the chosen port
+        command = self._build_server_command(actual_port, model_name, embedding_mode, **kwargs)
+        
+        try:
+            # Launch, then wait for readiness; any exception is logged and reported as failure
+            self._launch_server_process_colab(command, actual_port)
+            return self._wait_for_server_ready_colab(actual_port)
+        except Exception as e:
+            logger.error(f"Failed to start embedding server in Colab: {e}")
+            return False, actual_port
+
    def _has_compatible_running_server(
        self, model_name: str, passages_file: str
    ) -> bool:
@@ -269,7 +298,9 @@ class EmbeddingServerManager:
        ]

        if kwargs.get("passages_file"):
-            command.extend(["--passages-file", str(kwargs["passages_file"])])
+            # Convert to absolute path to ensure subprocess can find the file
+            passages_file = Path(kwargs["passages_file"]).resolve()
+            command.extend(["--passages-file", str(passages_file)])
        if embedding_mode != "sentence-transformers":
            command.extend(["--embedding-mode", embedding_mode])

@@ -346,3 +377,45 @@ class EmbeddingServerManager:
            pass

        self.server_process = None
+
+    def _launch_server_process_colab(self, command: list, port: int) -> None:
+        """Spawn the server with piped, text-mode stdout/stderr and record the process and port on self."""
+        logger.info(f"Colab Command: {' '.join(command)}")
+
+        # NOTE(review): nothing in view drains these pipes while the server runs; a chatty server could block once a pipe fills — confirm
+        self.server_process = subprocess.Popen(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        self.server_port = port
+        logger.info(f"Colab server process started with PID: {self.server_process.pid}")
+
+        # Register the exit hook once per manager; it calls stop_server() only while a process is recorded
+        if not self._atexit_registered:
+            atexit.register(lambda: self.stop_server() if self.server_process else None)
+            self._atexit_registered = True
+
+    def _wait_for_server_ready_colab(self, port: int) -> tuple[bool, int]:
+        """Poll until _check_port(port) succeeds, the process exits, or the polls run out; returns (ready, port)."""
+        max_wait, wait_interval = 30, 0.5  # 60 polls, 0.5 s apart
+        
+        for _ in range(int(max_wait / wait_interval)):
+            if _check_port(port):
+                logger.info("Colab embedding server is ready!")
+                return True, port
+
+            if self.server_process and self.server_process.poll() is not None:
+                # The process has exited: collect its remaining output for the error log
+                stdout, stderr = self.server_process.communicate()
+                logger.error(f"Colab server terminated during startup.")
+                logger.error(f"stdout: {stdout}")
+                logger.error(f"stderr: {stderr}")
+                return False, port
+
+            time.sleep(wait_interval)
+
+        logger.error(f"Colab server failed to start within {max_wait} seconds.")
+        self.stop_server()
+        return False, port
--- a/packages/leann/pyproject.toml
+++ b/packages/leann/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "leann"
-version = "0.1.8"
+version = "0.1.13"
 description = "LEANN - The smallest vector index in the world. RAG Everything with LEANN!"
 readme = "README.md"
 requires-python = ">=3.9"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,14 +25,21 @@ dependencies = [
    "requests>=2.25.0",
    "sentence-transformers>=2.2.0",
    "openai>=1.0.0",
+    # PDF parsing dependencies - essential for document processing
    "PyPDF2>=3.0.0",
+    "pdfplumber>=0.11.0",
+    "pymupdf>=1.26.0",
+    "pypdfium2>=4.30.0",
+    # LlamaIndex core and readers - updated versions
    "llama-index>=0.12.44",
+    "llama-index-readers-file>=0.4.0",  # Essential for PDF parsing
    "llama-index-readers-docling",
    "llama-index-node-parser-docling",
-    "ipykernel==6.29.5",
-    "msgpack>=1.1.1",
    "llama-index-vector-stores-faiss>=0.4.0",
    "llama-index-embeddings-huggingface>=0.5.5",
+    # Other dependencies
+    "ipykernel==6.29.5",
+    "msgpack>=1.1.1",
    "mlx>=0.26.3; sys_platform == 'darwin'",
    "mlx-lm>=0.26.0; sys_platform == 'darwin'",
    "psutil>=5.8.0",
@@ -52,6 +59,14 @@ diskann = [
    "leann-backend-diskann",
 ]

+# Add a new optional dependency group for document processing
+documents = [
+    "beautifulsoup4>=4.13.0",  # For HTML parsing
+    "python-docx>=0.8.11",     # For Word documents
+    "openpyxl>=3.1.0",         # For Excel files
+    "pandas>=2.2.0",           # For data processing
+]
+
 [tool.setuptools]
 py-modules = []

--- a/uv.lock
+++ b/uv.lock
Author	SHA1	Message	Date
GitHub Actions	8375f601ba	chore: release v0.1.13	2025-07-27 01:08:17 +00:00
yichuan520030910320	c87c0fe662	update colab install & fix colab path	2025-07-26 18:07:31 -07:00
yichuan520030910320	73927b68ef	Merge branch 'main' of https://github.com/yichuan-w/LEANN	2025-07-26 17:09:55 -07:00
yichuan520030910320	cc1a62e5aa	update pytoml version again	2025-07-26 17:09:45 -07:00
GitHub Actions	802020cb41	chore: release v0.1.12	2025-07-26 23:35:28 +00:00
yichuan520030910320	cdb92f7cf4	update pytoml version && fix colab env && fix pdf extract in pip	2025-07-26 16:33:13 -07:00
yichuan520030910320	dc69bdec00	Merge branch 'main' of https://github.com/yichuan-w/LEANN	2025-07-25 17:54:43 -07:00
yichuan520030910320	98073e9868	update missing pkg	2025-07-25 17:54:21 -07:00
GitHub Actions	cf2ef48967	chore: release v0.1.11	2025-07-26 00:12:37 +00:00
yichuan520030910320	0692bbf7a2	change workflow	2025-07-25 17:11:56 -07:00
GitHub Actions	52584a171f	chore: release v0.1.10	2025-07-25 23:12:16 +00:00
Andy Lee	efd6b5324b	fix: add protobuf as a dependency for DiskANN backend - Fixes 'No module named google' error when starting DiskANN embedding server - Prevents users from having to manually install protobuf	2025-07-25 16:10:25 -07:00
Andy Lee	2baaa4549b	fix: handle relative paths in HNSW embedding server metadata - Convert relative paths to absolute paths based on metadata file location - Fixes FileNotFoundError when starting embedding server - Resolves issue with passages file not found in different working directories	2025-07-25 16:09:53 -07:00
Andy Lee	35310ddd52	fix: pure Python packages not building due to ubuntu-latest check The build workflow was checking for matrix.os == 'ubuntu-latest', but we changed the matrix to use 'ubuntu-22.04', causing the pure Python packages (leann-core and leann) to never be built. Changed to use pattern matching [[ == ubuntu-* ]] to match any Ubuntu version. This explains why v0.1.9 only published the C++ backend packages but not the pure Python packages.	2025-07-25 15:14:21 -07:00
Andy Lee	fc9c5cb39d	fix: make release workflow idempotent - Check if version is already updated before trying to update - Check if tag already exists before creating - Check if GitHub release already exists before creating - This allows re-running the workflow after partial failures Previously, if the workflow failed after updating version but before completing the release, it couldn't be re-run with the same version.	2025-07-25 14:47:35 -07:00
Andy Lee	8f2a1e87ea	Merge pull request #7 from yichuan-w/fix/simple-ubuntu22-build fix: simplify build system for Colab compatibility	2025-07-25 14:08:37 -07:00
Andy Lee	50caf65f28	fix: change ubuntu-latest to ubuntu-22.04 and add Python 3.13 - Explicitly use ubuntu-22.04 instead of ubuntu-latest - Add Python 3.13 to the build matrix - This ensures we build on the same OS version as Google Colab	2025-07-25 13:48:59 -07:00
Andy Lee	1b48794ca8	cleanup: remove cibuildwheel workflow files - Remove ci-cibuildwheel.yml and build-cibuildwheel.yml - These files were not present in v0.1.5 - Keep only the simple build system	2025-07-25 13:48:08 -07:00
Andy Lee	4aef1d814e	revert: simplify build system by removing manylinux/cibuildwheel - Revert to simple Ubuntu 22.04 builds that should work with Colab - Remove all manylinux container complexity - Colab runs on Ubuntu 22.04, so direct builds should be compatible - Restore build-reusable.yml to v0.1.5 version - Remove cibuildwheel option from release workflow This should fix the overcomplicated build issues while maintaining Colab compatibility through direct Ubuntu 22.04 builds.	2025-07-25 13:46:51 -07:00
GitHub Actions	75ddcd6158	chore: release v0.1.9	2025-07-25 20:04:42 +00:00
Andy Lee	2a4df11f5c	fix: absolute path for passages	2025-07-25 11:59:30 -07:00